JerichoHTMLTextExtractor.java |
1 /** 2 * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved. 3 * 4 * This library is free software; you can redistribute it and/or modify it under 5 * the terms of the GNU Lesser General Public License as published by the Free 6 * Software Foundation; either version 2.1 of the License, or (at your option) 7 * any later version. 8 * 9 * This library is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 11 * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more 12 * details. 13 */ 14 15 package com.liferay.util.lucene; 16 17 import com.liferay.portal.kernel.io.unsync.UnsyncStringReader; 18 19 import java.io.IOException; 20 import java.io.InputStream; 21 import java.io.Reader; 22 23 import net.htmlparser.jericho.Source; 24 25 import org.apache.jackrabbit.extractor.HTMLTextExtractor; 26 27 /** 28 * <a href="JerichoHTMLTextExtractor.java.html"><b><i>View Source</i></b></a> 29 * 30 * @author Brian Wing Shun Chan 31 */ 32 public class JerichoHTMLTextExtractor extends HTMLTextExtractor { 33 34 public Reader extractText(InputStream stream, String type, String encoding) 35 throws IOException { 36 37 Source source = new Source(stream); 38 39 return new UnsyncStringReader(source.getTextExtractor().toString()); 40 } 41 42 }