// JerichoHTMLTextExtractor.java
/**
 * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
 *
 * The contents of this file are subject to the terms of the Liferay Enterprise
 * Subscription License ("License"). You may not use this file except in
 * compliance with the License. You can obtain a copy of the License by
 * contacting Liferay, Inc. See the License for the specific language governing
 * permissions and limitations under the License, including but not limited to
 * distribution rights of the Software.
 *
 *
 *
 */

package com.liferay.util.lucene;

import com.liferay.portal.kernel.io.unsync.UnsyncStringReader;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;

import net.htmlparser.jericho.Source;

import org.apache.jackrabbit.extractor.HTMLTextExtractor;

/**
 * <a href="JerichoHTMLTextExtractor.java.html"><b><i>View Source</i></b></a>
 *
 * <p>
 * HTML text extractor that replaces the parsing done by the Jackrabbit
 * {@link HTMLTextExtractor} with the Jericho HTML parser.
 * </p>
 *
 * @author Brian Wing Shun Chan
 */
public class JerichoHTMLTextExtractor extends HTMLTextExtractor {

	/**
	 * Parses the given HTML stream with Jericho and returns a reader over the
	 * text produced by the parsed document's text extractor.
	 *
	 * <p>
	 * The whole document is parsed and its text materialized as a string
	 * before this method returns; the returned reader only wraps that string.
	 * </p>
	 *
	 * @param  stream the HTML content to parse; it is closed before this
	 *         method returns
	 * @param  type the content type of the stream; not used by this
	 *         implementation
	 * @param  encoding the character encoding of the stream, or
	 *         <code>null</code> if unknown. When it is <code>null</code> or
	 *         names a charset the JVM does not support, charset detection is
	 *         left to Jericho.
	 * @return a reader over the extracted text
	 * @throws IOException if the stream cannot be read or closed
	 */
	public Reader extractText(InputStream stream, String type, String encoding)
		throws IOException {

		try {
			Source source = null;

			if (encoding != null) {
				try {

					// Decode with the encoding supplied by the caller instead
					// of letting the parser guess it from the raw bytes.

					source = new Source(
						new InputStreamReader(stream, encoding));
				}
				catch (UnsupportedEncodingException ignored) {

					// The InputStreamReader constructor rejects an unknown
					// charset name before reading any bytes, so the stream is
					// still untouched and auto-detection below is safe.

				}
			}

			if (source == null) {
				source = new Source(stream);
			}

			return new UnsyncStringReader(
				source.getTextExtractor().toString());
		}
		finally {

			// NOTE(review): the Jackrabbit extractors appear to close the
			// stream themselves -- confirm against the TextExtractor
			// contract. Closing an already closed stream is harmless.

			stream.close();
		}
	}

}