1
22
23 package com.liferay.portal.search.lucene;
24
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
38
39
44 public class LuceneFileExtractor {
45
46 public Field getFile(String field, InputStream is, String fileExt) {
47 String text = FileUtil.extractText(is, fileExt);
48
49 if (Validator.isNotNull(
50 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
51
52 text = regexpStrip(text);
53 }
54
55 return LuceneFields.getText(field, text);
56 }
57
58 public Field getFile(String field, byte[] bytes, String fileExt) {
59 InputStream is = new BufferedInputStream(
60 new ByteArrayInputStream(bytes));
61
62 return getFile(field, is, fileExt);
63 }
64
65 public Field getFile(String field, File file, String fileExt)
66 throws IOException {
67
68 InputStream is = new FileInputStream(file);
69
70 return getFile(field, is, fileExt);
71 }
72
73 protected String regexpStrip(String text) {
74 char[] array = text.toCharArray();
75
76 for (int i = 0; i < array.length; i++) {
77 String s = String.valueOf(array[i]);
78
79 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
80 array[i] = CharPool.SPACE;
81 }
82 }
83
84 return new String(array);
85 }
86
87 }