1   /**
2    * Copyright (c) 2000-2008 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portal.lucene;
24  
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.GetterUtil;
import com.liferay.portal.kernel.util.StringMaker;
import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.jackrabbit.extractor.MsExcelTextExtractor;
import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor;
import org.apache.jackrabbit.extractor.MsWordTextExtractor;
import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor;
import org.apache.jackrabbit.extractor.PdfTextExtractor;
import org.apache.jackrabbit.extractor.PlainTextExtractor;
import org.apache.jackrabbit.extractor.RTFTextExtractor;
import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.extractor.XMLTextExtractor;
import org.apache.lucene.document.Field;
52  
53  /**
54   * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
55   *
56   * @author Brian Wing Shun Chan
57   *
58   */
59  public class LuceneFileExtractor {
60  
61      public Field getFile(String field, InputStream is, String fileExt) {
62          String text = null;
63  
64          try {
65              fileExt = GetterUtil.getString(fileExt).toLowerCase();
66  
67              TextExtractor extractor = null;
68  
69              String contentType = null;
70              String encoding = System.getProperty("encoding");
71  
72              if (fileExt.equals(".doc")) {
73                  extractor = new MsWordTextExtractor();
74  
75                  contentType = "application/vnd.ms-word";
76              }
77              else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
78                  extractor = new JerichoHTMLTextExtractor();
79  
80                  contentType = "text/html";
81              }
82              else if (fileExt.equals(".odb") || fileExt.equals(".odf") ||
83                       fileExt.equals(".odg") || fileExt.equals(".odp") ||
84                       fileExt.equals(".ods") || fileExt.equals(".odt")) {
85  
86                  extractor = new OpenOfficeTextExtractor();
87  
88                  contentType = "application/vnd.oasis.opendocument.";
89  
90                  if (fileExt.equals(".odb")) {
91                      contentType += "database";
92                  }
93                  else if (fileExt.equals(".odf")) {
94                      contentType += "formula";
95                  }
96                  else if (fileExt.equals(".odg")) {
97                      contentType += "graphics";
98                  }
99                  else if (fileExt.equals(".odp")) {
100                     contentType += "presentation";
101                 }
102                 else if (fileExt.equals(".ods")) {
103                     contentType += "spreadsheet";
104                 }
105                 else if (fileExt.equals(".odt")) {
106                     contentType += "text";
107                 }
108             }
109             else if (fileExt.equals(".pdf")) {
110                 extractor = new PdfTextExtractor();
111 
112                 contentType = "application/pdf";
113             }
114             else if (fileExt.equals(".ppt")) {
115                 extractor = new MsPowerPointTextExtractor();
116 
117                 contentType = "application/vnd.ms-powerpoint";
118             }
119             else if (fileExt.equals(".rtf")) {
120                 extractor = new RTFTextExtractor();
121 
122                 contentType = "application/rtf";
123             }
124             else if (fileExt.equals(".txt")) {
125                 extractor = new PlainTextExtractor();
126 
127                 contentType = "text/plain";
128             }
129             else if (fileExt.equals(".xls")) {
130                 extractor = new MsExcelTextExtractor();
131 
132                 contentType = "application/vnd.ms-excel";
133             }
134             else if (fileExt.equals(".xml")) {
135                 extractor = new XMLTextExtractor();
136 
137                 contentType = "text/xml";
138             }
139 
140             if (extractor != null) {
141                 if (_log.isInfoEnabled()) {
142                     _log.info(
143                         "Using extractor " + extractor.getClass().getName() +
144                             " for extension " + fileExt);
145                 }
146 
147                 StringMaker sm = new StringMaker();
148 
149                 BufferedReader reader = new BufferedReader(
150                     extractor.extractText(is, contentType, encoding));
151 
152                 int i;
153 
154                 while ((i = reader.read()) != -1) {
155                     sm.append((char)i);
156                 }
157 
158                 reader.close();
159 
160                 text = sm.toString();
161 
162                 if (Validator.isNotNull(
163                         PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
164 
165                     text = regexpStrip(text);
166                 }
167             }
168             else {
169                 if (_log.isInfoEnabled()) {
170                     _log.info("No extractor found for extension " + fileExt);
171                 }
172             }
173         }
174         catch (Exception e) {
175             _log.error(e);
176         }
177 
178         if (_log.isDebugEnabled()) {
179             _log.debug("Extractor returned text:\n\n" + text);
180         }
181 
182         if (text == null) {
183             text = StringPool.BLANK;
184         }
185 
186         return LuceneFields.getText(field, text);
187     }
188 
189     public Field getFile(String field, byte[] byteArray, String fileExt)
190         throws IOException {
191 
192         InputStream in = new BufferedInputStream(
193             new ByteArrayInputStream(byteArray));
194 
195         return getFile(field, in, fileExt);
196     }
197 
198     public Field getFile(String field, File file, String fileExt)
199         throws IOException {
200 
201         InputStream in = new FileInputStream(file);
202 
203         return getFile(field, in, fileExt);
204     }
205 
206     protected String regexpStrip(String text) {
207         char[] array = text.toCharArray();
208 
209         for (int i = 0; i < array.length; i++) {
210             String s = String.valueOf(array[i]);
211 
212             if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
213                 array[i] = CharPool.SPACE;
214             }
215         }
216 
217         return new String(array);
218     }
219 
220     private static Log _log = LogFactory.getLog(LuceneFileExtractor.class);
221 
222 }