001
014
015 package com.liferay.portal.search.lucene;
016
import com.liferay.portal.kernel.io.unsync.UnsyncByteArrayInputStream;
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.FileUtil;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.document.Field;
029
030
033 public class LuceneFileExtractor {
034
035 public Field getFile(String field, InputStream is, String fileExt) {
036 String text = FileUtil.extractText(is, fileExt);
037
038 if (Validator.isNotNull(
039 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
040
041 text = regexpStrip(text);
042 }
043
044 return LuceneFields.getText(field, text);
045 }
046
047 public Field getFile(String field, byte[] bytes, String fileExt) {
048 InputStream is = new UnsyncByteArrayInputStream(bytes);
049
050 return getFile(field, is, fileExt);
051 }
052
053 public Field getFile(String field, File file, String fileExt)
054 throws IOException {
055
056 InputStream is = new FileInputStream(file);
057
058 return getFile(field, is, fileExt);
059 }
060
061 protected String regexpStrip(String text) {
062 char[] array = text.toCharArray();
063
064 for (int i = 0; i < array.length; i++) {
065 String s = String.valueOf(array[i]);
066
067 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
068 array[i] = CharPool.SPACE;
069 }
070 }
071
072 return new String(array);
073 }
074
075 }