1   /**
2    * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3    *
4    * The contents of this file are subject to the terms of the Liferay Enterprise
5    * Subscription License ("License"). You may not use this file except in
6    * compliance with the License. You can obtain a copy of the License by
7    * contacting Liferay, Inc. See the License for the specific language governing
8    * permissions and limitations under the License, including but not limited to
9    * distribution rights of the Software.
10   *
11   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17   * SOFTWARE.
18   */
19  
20  package com.liferay.portal.search.lucene;
21  
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
35  
36  /**
37   * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
38   *
39   * @author Brian Wing Shun Chan
40   *
41   */
42  public class LuceneFileExtractor {
43  
44      public Field getFile(String field, InputStream is, String fileExt) {
45          String text = FileUtil.extractText(is, fileExt);
46  
47          if (Validator.isNotNull(
48                  PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
49  
50              text = regexpStrip(text);
51          }
52  
53          return LuceneFields.getText(field, text);
54      }
55  
56      public Field getFile(String field, byte[] bytes, String fileExt) {
57          InputStream is = new BufferedInputStream(
58              new ByteArrayInputStream(bytes));
59  
60          return getFile(field, is, fileExt);
61      }
62  
63      public Field getFile(String field, File file, String fileExt)
64          throws IOException {
65  
66          InputStream is = new FileInputStream(file);
67  
68          return getFile(field, is, fileExt);
69      }
70  
71      protected String regexpStrip(String text) {
72          char[] array = text.toCharArray();
73  
74          for (int i = 0; i < array.length; i++) {
75              String s = String.valueOf(array[i]);
76  
77              if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
78                  array[i] = CharPool.SPACE;
79              }
80          }
81  
82          return new String(array);
83      }
84  
85  }