1
14
15 package com.liferay.portal.search.lucene;
16
import com.liferay.portal.kernel.io.unsync.UnsyncByteArrayInputStream;
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
29
30
35 public class LuceneFileExtractor {
36
37 public Field getFile(String field, InputStream is, String fileExt) {
38 String text = FileUtil.extractText(is, fileExt);
39
40 if (Validator.isNotNull(
41 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
42
43 text = regexpStrip(text);
44 }
45
46 return LuceneFields.getText(field, text);
47 }
48
49 public Field getFile(String field, byte[] bytes, String fileExt) {
50 InputStream is = new UnsyncByteArrayInputStream(bytes);
51
52 return getFile(field, is, fileExt);
53 }
54
55 public Field getFile(String field, File file, String fileExt)
56 throws IOException {
57
58 InputStream is = new FileInputStream(file);
59
60 return getFile(field, is, fileExt);
61 }
62
63 protected String regexpStrip(String text) {
64 char[] array = text.toCharArray();
65
66 for (int i = 0; i < array.length; i++) {
67 String s = String.valueOf(array[i]);
68
69 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
70 array[i] = CharPool.SPACE;
71 }
72 }
73
74 return new String(array);
75 }
76
77 }