1
22
23 package com.liferay.portal.lucene;
24
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.GetterUtil;
import com.liferay.portal.kernel.util.StringMaker;
import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.jackrabbit.extractor.MsExcelTextExtractor;
import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor;
import org.apache.jackrabbit.extractor.MsWordTextExtractor;
import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor;
import org.apache.jackrabbit.extractor.PdfTextExtractor;
import org.apache.jackrabbit.extractor.PlainTextExtractor;
import org.apache.jackrabbit.extractor.RTFTextExtractor;
import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.extractor.XMLTextExtractor;
import org.apache.lucene.document.Field;
52
53
59 public class LuceneFileExtractor {
60
61 public Field getFile(String field, InputStream is, String fileExt) {
62 String text = null;
63
64 try {
65 fileExt = GetterUtil.getString(fileExt).toLowerCase();
66
67 TextExtractor extractor = null;
68
69 String contentType = null;
70 String encoding = System.getProperty("encoding");
71
72 if (fileExt.equals(".doc")) {
73 extractor = new MsWordTextExtractor();
74
75 contentType = "application/vnd.ms-word";
76 }
77 else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
78 extractor = new JerichoHTMLTextExtractor();
79
80 contentType = "text/html";
81 }
82 else if (fileExt.equals(".odb") || fileExt.equals(".odf") ||
83 fileExt.equals(".odg") || fileExt.equals(".odp") ||
84 fileExt.equals(".ods") || fileExt.equals(".odt")) {
85
86 extractor = new OpenOfficeTextExtractor();
87
88 contentType = "application/vnd.oasis.opendocument.";
89
90 if (fileExt.equals(".odb")) {
91 contentType += "database";
92 }
93 else if (fileExt.equals(".odf")) {
94 contentType += "formula";
95 }
96 else if (fileExt.equals(".odg")) {
97 contentType += "graphics";
98 }
99 else if (fileExt.equals(".odp")) {
100 contentType += "presentation";
101 }
102 else if (fileExt.equals(".ods")) {
103 contentType += "spreadsheet";
104 }
105 else if (fileExt.equals(".odt")) {
106 contentType += "text";
107 }
108 }
109 else if (fileExt.equals(".pdf")) {
110 extractor = new PdfTextExtractor();
111
112 contentType = "application/pdf";
113 }
114 else if (fileExt.equals(".ppt")) {
115 extractor = new MsPowerPointTextExtractor();
116
117 contentType = "application/vnd.ms-powerpoint";
118 }
119 else if (fileExt.equals(".rtf")) {
120 extractor = new RTFTextExtractor();
121
122 contentType = "application/rtf";
123 }
124 else if (fileExt.equals(".txt")) {
125 extractor = new PlainTextExtractor();
126
127 contentType = "text/plain";
128 }
129 else if (fileExt.equals(".xls")) {
130 extractor = new MsExcelTextExtractor();
131
132 contentType = "application/vnd.ms-excel";
133 }
134 else if (fileExt.equals(".xml")) {
135 extractor = new XMLTextExtractor();
136
137 contentType = "text/xml";
138 }
139
140 if (extractor != null) {
141 if (_log.isInfoEnabled()) {
142 _log.info(
143 "Using extractor " + extractor.getClass().getName() +
144 " for extension " + fileExt);
145 }
146
147 StringMaker sm = new StringMaker();
148
149 BufferedReader reader = new BufferedReader(
150 extractor.extractText(is, contentType, encoding));
151
152 int i;
153
154 while ((i = reader.read()) != -1) {
155 sm.append((char)i);
156 }
157
158 reader.close();
159
160 text = sm.toString();
161
162 if (Validator.isNotNull(
163 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
164
165 text = regexpStrip(text);
166 }
167 }
168 else {
169 if (_log.isInfoEnabled()) {
170 _log.info("No extractor found for extension " + fileExt);
171 }
172 }
173 }
174 catch (Exception e) {
175 _log.error(e);
176 }
177
178 if (_log.isDebugEnabled()) {
179 _log.debug("Extractor returned text:\n\n" + text);
180 }
181
182 if (text == null) {
183 text = StringPool.BLANK;
184 }
185
186 return LuceneFields.getText(field, text);
187 }
188
189 public Field getFile(String field, byte[] byteArray, String fileExt)
190 throws IOException {
191
192 InputStream in = new BufferedInputStream(
193 new ByteArrayInputStream(byteArray));
194
195 return getFile(field, in, fileExt);
196 }
197
198 public Field getFile(String field, File file, String fileExt)
199 throws IOException {
200
201 InputStream in = new FileInputStream(file);
202
203 return getFile(field, in, fileExt);
204 }
205
206 protected String regexpStrip(String text) {
207 char[] array = text.toCharArray();
208
209 for (int i = 0; i < array.length; i++) {
210 String s = String.valueOf(array[i]);
211
212 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
213 array[i] = CharPool.SPACE;
214 }
215 }
216
217 return new String(array);
218 }
219
220 private static Log _log = LogFactory.getLog(LuceneFileExtractor.class);
221
222 }