1   /**
2    * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
3    *
4    * This library is free software; you can redistribute it and/or modify it under
5    * the terms of the GNU Lesser General Public License as published by the Free
6    * Software Foundation; either version 2.1 of the License, or (at your option)
7    * any later version.
8    *
9    * This library is distributed in the hope that it will be useful, but WITHOUT
10   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11   * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12   * details.
13   */
14  
15  package com.liferay.portal.search.lucene;
16  
import com.liferay.portal.kernel.io.unsync.UnsyncByteArrayInputStream;
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
29  
30  /**
31   * <a href="LuceneFileExtractor.java.html"><b><i>View Source</i></b></a>
32   *
33   * @author Brian Wing Shun Chan
34   */
35  public class LuceneFileExtractor {
36  
37      public Field getFile(String field, InputStream is, String fileExt) {
38          String text = FileUtil.extractText(is, fileExt);
39  
40          if (Validator.isNotNull(
41                  PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
42  
43              text = regexpStrip(text);
44          }
45  
46          return LuceneFields.getText(field, text);
47      }
48  
49      public Field getFile(String field, byte[] bytes, String fileExt) {
50          InputStream is = new UnsyncByteArrayInputStream(bytes);
51  
52          return getFile(field, is, fileExt);
53      }
54  
55      public Field getFile(String field, File file, String fileExt)
56          throws IOException {
57  
58          InputStream is = new FileInputStream(file);
59  
60          return getFile(field, is, fileExt);
61      }
62  
63      protected String regexpStrip(String text) {
64          char[] array = text.toCharArray();
65  
66          for (int i = 0; i < array.length; i++) {
67              String s = String.valueOf(array[i]);
68  
69              if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
70                  array[i] = CharPool.SPACE;
71              }
72          }
73  
74          return new String(array);
75      }
76  
77  }