001    /**
002     * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
003     *
004     * The contents of this file are subject to the terms of the Liferay Enterprise
005     * Subscription License ("License"). You may not use this file except in
006     * compliance with the License. You can obtain a copy of the License by
007     * contacting Liferay, Inc. See the License for the specific language governing
008     * permissions and limitations under the License, including but not limited to
009     * distribution rights of the Software.
010     *
011     *
012     *
013     */
014    
015    package com.liferay.util.lucene;
016    
017    import com.liferay.portal.kernel.io.unsync.UnsyncStringReader;
018    
019    import java.io.IOException;
020    import java.io.InputStream;
021    import java.io.Reader;
022    
023    import net.htmlparser.jericho.Source;
024    
025    import org.apache.jackrabbit.extractor.HTMLTextExtractor;
026    
027    /**
028     * @author Brian Wing Shun Chan
029     */
030    public class JerichoHTMLTextExtractor extends HTMLTextExtractor {
031    
032            public Reader extractText(InputStream stream, String type, String encoding)
033                    throws IOException {
034    
035                    Source source = new Source(stream);
036    
037                    return new UnsyncStringReader(source.getTextExtractor().toString());
038            }
039    
040    }