1   /**
2    * Copyright (c) 2000-2007 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portal.lucene;
24  
25  import com.liferay.portal.kernel.util.CharPool;
26  import com.liferay.portal.kernel.util.GetterUtil;
27  import com.liferay.portal.kernel.util.StringMaker;
28  import com.liferay.portal.kernel.util.StringPool;
29  import com.liferay.portal.kernel.util.Validator;
30  import com.liferay.portal.util.PropsUtil;
31  
32  import java.io.BufferedInputStream;
33  import java.io.BufferedReader;
34  import java.io.ByteArrayInputStream;
35  import java.io.File;
36  import java.io.FileInputStream;
37  import java.io.IOException;
38  import java.io.InputStream;
39  
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.jackrabbit.extractor.HTMLTextExtractor;
43  import org.apache.jackrabbit.extractor.MsExcelTextExtractor;
44  import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor;
45  import org.apache.jackrabbit.extractor.MsWordTextExtractor;
46  import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor;
47  import org.apache.jackrabbit.extractor.PdfTextExtractor;
48  import org.apache.jackrabbit.extractor.PlainTextExtractor;
49  import org.apache.jackrabbit.extractor.RTFTextExtractor;
50  import org.apache.jackrabbit.extractor.TextExtractor;
51  import org.apache.jackrabbit.extractor.XMLTextExtractor;
52  import org.apache.lucene.document.Field;
53  
54  /**
55   * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
56   *
57   * @author Brian Wing Shun Chan
58   *
59   */
60  public class LuceneFileExtractor {
61  
62      public Field getFile(String field, InputStream is, String fileExt) {
63          String text = null;
64  
65          try {
66              fileExt = GetterUtil.getString(fileExt).toLowerCase();
67  
68              TextExtractor extractor = null;
69  
70              String contentType = null;
71              String encoding = System.getProperty("encoding");
72  
73              if (fileExt.equals(".doc")) {
74                  extractor = new MsWordTextExtractor();
75  
76                  contentType = "application/vnd.ms-word";
77              }
78              else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
79                  extractor = new HTMLTextExtractor();
80  
81                  contentType = "text/html";
82              }
83              else if (fileExt.equals(".odb") || fileExt.equals(".odf") ||
84                       fileExt.equals(".odg") || fileExt.equals(".odp") ||
85                       fileExt.equals(".ods") || fileExt.equals(".odt")) {
86  
87                  extractor = new OpenOfficeTextExtractor();
88  
89                  contentType = "application/vnd.oasis.opendocument.";
90  
91                  if (fileExt.equals(".odb")) {
92                      contentType += "database";
93                  }
94                  else if (fileExt.equals(".odf")) {
95                      contentType += "formula";
96                  }
97                  else if (fileExt.equals(".odg")) {
98                      contentType += "graphics";
99                  }
100                 else if (fileExt.equals(".odp")) {
101                     contentType += "presentation";
102                 }
103                 else if (fileExt.equals(".ods")) {
104                     contentType += "spreadsheet";
105                 }
106                 else if (fileExt.equals(".odt")) {
107                     contentType += "text";
108                 }
109             }
110             else if (fileExt.equals(".pdf")) {
111                 extractor = new PdfTextExtractor();
112 
113                 contentType = "application/pdf";
114             }
115             else if (fileExt.equals(".ppt")) {
116                 extractor = new MsPowerPointTextExtractor();
117 
118                 contentType = "application/vnd.ms-powerpoint";
119             }
120             else if (fileExt.equals(".rtf")) {
121                 extractor = new RTFTextExtractor();
122 
123                 contentType = "application/rtf";
124             }
125             else if (fileExt.equals(".txt")) {
126                 extractor = new PlainTextExtractor();
127 
128                 contentType = "text/plain";
129             }
130             else if (fileExt.equals(".xls")) {
131                 extractor = new MsExcelTextExtractor();
132 
133                 contentType = "application/vnd.ms-excel";
134             }
135             else if (fileExt.equals(".xml")) {
136                 extractor = new XMLTextExtractor();
137 
138                 contentType = "text/xml";
139             }
140 
141             if (extractor != null) {
142                 if (_log.isInfoEnabled()) {
143                     _log.info(
144                         "Using extractor " + extractor.getClass().getName() +
145                             " for extension " + fileExt);
146                 }
147 
148                 StringMaker sm = new StringMaker();
149 
150                 BufferedReader reader = new BufferedReader(
151                     extractor.extractText(is, contentType, encoding));
152 
153                 int i;
154 
155                 while ((i = reader.read()) != -1) {
156                     sm.append((char)i);
157                 }
158 
159                 reader.close();
160 
161                 text = sm.toString();
162 
163                 if (Validator.isNotNull(_REGEXP_STRIP)) {
164                     text = regexpStrip(text);
165                 }
166             }
167             else {
168                 if (_log.isInfoEnabled()) {
169                     _log.info("No extractor found for extension " + fileExt);
170                 }
171             }
172         }
173         catch (Exception e) {
174             _log.error(e);
175         }
176 
177         if (_log.isDebugEnabled()) {
178             _log.debug("Extractor returned text:\n\n" + text);
179         }
180 
181         if (text == null) {
182             text = StringPool.BLANK;
183         }
184 
185         return LuceneFields.getText(field, text);
186     }
187 
188     public Field getFile(String field, byte[] byteArray, String fileExt)
189         throws IOException {
190 
191         InputStream in = new BufferedInputStream(
192             new ByteArrayInputStream(byteArray));
193 
194         return getFile(field, in, fileExt);
195     }
196 
197     public Field getFile(String field, File file, String fileExt)
198         throws IOException {
199 
200         InputStream in = new FileInputStream(file);
201 
202         return getFile(field, in, fileExt);
203     }
204 
205     protected String regexpStrip(String text) {
206         char[] array = text.toCharArray();
207 
208         for (int i = 0; i < array.length; i++) {
209             String s = String.valueOf(array[i]);
210 
211             if (!s.matches(_REGEXP_STRIP)) {
212                 array[i] = CharPool.SPACE;
213             }
214         }
215 
216         return new String(array);
217     }
218 
219     private static final String _REGEXP_STRIP = PropsUtil.get(
220         PropsUtil.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP);
221 
222     private static Log _log = LogFactory.getLog(LuceneFileExtractor.class);
223 
224 }