1
22
23 package com.liferay.portal.lucene;
24
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.GetterUtil;
import com.liferay.portal.kernel.util.StringMaker;
import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsUtil;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.jackrabbit.extractor.HTMLTextExtractor;
import org.apache.jackrabbit.extractor.MsExcelTextExtractor;
import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor;
import org.apache.jackrabbit.extractor.MsWordTextExtractor;
import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor;
import org.apache.jackrabbit.extractor.PdfTextExtractor;
import org.apache.jackrabbit.extractor.PlainTextExtractor;
import org.apache.jackrabbit.extractor.RTFTextExtractor;
import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.extractor.XMLTextExtractor;
import org.apache.lucene.document.Field;
53
54
60 public class LuceneFileExtractor {
61
62 public Field getFile(String field, InputStream is, String fileExt) {
63 String text = null;
64
65 try {
66 fileExt = GetterUtil.getString(fileExt).toLowerCase();
67
68 TextExtractor extractor = null;
69
70 String contentType = null;
71 String encoding = System.getProperty("encoding");
72
73 if (fileExt.equals(".doc")) {
74 extractor = new MsWordTextExtractor();
75
76 contentType = "application/vnd.ms-word";
77 }
78 else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
79 extractor = new HTMLTextExtractor();
80
81 contentType = "text/html";
82 }
83 else if (fileExt.equals(".odb") || fileExt.equals(".odf") ||
84 fileExt.equals(".odg") || fileExt.equals(".odp") ||
85 fileExt.equals(".ods") || fileExt.equals(".odt")) {
86
87 extractor = new OpenOfficeTextExtractor();
88
89 contentType = "application/vnd.oasis.opendocument.";
90
91 if (fileExt.equals(".odb")) {
92 contentType += "database";
93 }
94 else if (fileExt.equals(".odf")) {
95 contentType += "formula";
96 }
97 else if (fileExt.equals(".odg")) {
98 contentType += "graphics";
99 }
100 else if (fileExt.equals(".odp")) {
101 contentType += "presentation";
102 }
103 else if (fileExt.equals(".ods")) {
104 contentType += "spreadsheet";
105 }
106 else if (fileExt.equals(".odt")) {
107 contentType += "text";
108 }
109 }
110 else if (fileExt.equals(".pdf")) {
111 extractor = new PdfTextExtractor();
112
113 contentType = "application/pdf";
114 }
115 else if (fileExt.equals(".ppt")) {
116 extractor = new MsPowerPointTextExtractor();
117
118 contentType = "application/vnd.ms-powerpoint";
119 }
120 else if (fileExt.equals(".rtf")) {
121 extractor = new RTFTextExtractor();
122
123 contentType = "application/rtf";
124 }
125 else if (fileExt.equals(".txt")) {
126 extractor = new PlainTextExtractor();
127
128 contentType = "text/plain";
129 }
130 else if (fileExt.equals(".xls")) {
131 extractor = new MsExcelTextExtractor();
132
133 contentType = "application/vnd.ms-excel";
134 }
135 else if (fileExt.equals(".xml")) {
136 extractor = new XMLTextExtractor();
137
138 contentType = "text/xml";
139 }
140
141 if (extractor != null) {
142 if (_log.isInfoEnabled()) {
143 _log.info(
144 "Using extractor " + extractor.getClass().getName() +
145 " for extension " + fileExt);
146 }
147
148 StringMaker sm = new StringMaker();
149
150 BufferedReader reader = new BufferedReader(
151 extractor.extractText(is, contentType, encoding));
152
153 int i;
154
155 while ((i = reader.read()) != -1) {
156 sm.append((char)i);
157 }
158
159 reader.close();
160
161 text = sm.toString();
162
163 if (Validator.isNotNull(_REGEXP_STRIP)) {
164 text = regexpStrip(text);
165 }
166 }
167 else {
168 if (_log.isInfoEnabled()) {
169 _log.info("No extractor found for extension " + fileExt);
170 }
171 }
172 }
173 catch (Exception e) {
174 _log.error(e);
175 }
176
177 if (_log.isDebugEnabled()) {
178 _log.debug("Extractor returned text:\n\n" + text);
179 }
180
181 if (text == null) {
182 text = StringPool.BLANK;
183 }
184
185 return LuceneFields.getText(field, text);
186 }
187
188 public Field getFile(String field, byte[] byteArray, String fileExt)
189 throws IOException {
190
191 InputStream in = new BufferedInputStream(
192 new ByteArrayInputStream(byteArray));
193
194 return getFile(field, in, fileExt);
195 }
196
197 public Field getFile(String field, File file, String fileExt)
198 throws IOException {
199
200 InputStream in = new FileInputStream(file);
201
202 return getFile(field, in, fileExt);
203 }
204
205 protected String regexpStrip(String text) {
206 char[] array = text.toCharArray();
207
208 for (int i = 0; i < array.length; i++) {
209 String s = String.valueOf(array[i]);
210
211 if (!s.matches(_REGEXP_STRIP)) {
212 array[i] = CharPool.SPACE;
213 }
214 }
215
216 return new String(array);
217 }
218
219 private static final String _REGEXP_STRIP = PropsUtil.get(
220 PropsUtil.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP);
221
222 private static Log _log = LogFactory.getLog(LuceneFileExtractor.class);
223
224 }