1   /**
2    * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portal.search.lucene;
24  
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
38  
39  /**
40   * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
41   *
42   * @author Brian Wing Shun Chan
43   *
44   */
45  public class LuceneFileExtractor {
46  
47      public Field getFile(String field, InputStream is, String fileExt) {
48          String text = FileUtil.extractText(is, fileExt);
49  
50          if (Validator.isNotNull(
51                  PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
52  
53              text = regexpStrip(text);
54          }
55  
56          return LuceneFields.getText(field, text);
57      }
58  
59      public Field getFile(String field, byte[] bytes, String fileExt) {
60          InputStream is = new BufferedInputStream(
61              new ByteArrayInputStream(bytes));
62  
63          return getFile(field, is, fileExt);
64      }
65  
66      public Field getFile(String field, File file, String fileExt)
67          throws IOException {
68  
69          InputStream is = new FileInputStream(file);
70  
71          return getFile(field, is, fileExt);
72      }
73  
74      protected String regexpStrip(String text) {
75          char[] array = text.toCharArray();
76  
77          for (int i = 0; i < array.length; i++) {
78              String s = String.valueOf(array[i]);
79  
80              if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
81                  array[i] = CharPool.SPACE;
82              }
83          }
84  
85          return new String(array);
86      }
87  
88  }