1   /**
2    * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
3    *
4    * This library is free software; you can redistribute it and/or modify it under
5    * the terms of the GNU Lesser General Public License as published by the Free
6    * Software Foundation; either version 2.1 of the License, or (at your option)
7    * any later version.
8    *
9    * This library is distributed in the hope that it will be useful, but WITHOUT
10   * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11   * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12   * details.
13   */
14  
15  package com.liferay.util.lucene;
16  
17  import com.liferay.portal.kernel.io.unsync.UnsyncStringReader;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import java.io.Reader;
22  
23  import net.htmlparser.jericho.Source;
24  
25  import org.apache.jackrabbit.extractor.HTMLTextExtractor;
26  
27  /**
28   * <a href="JerichoHTMLTextExtractor.java.html"><b><i>View Source</i></b></a>
29   *
30   * @author Brian Wing Shun Chan
31   */
32  public class JerichoHTMLTextExtractor extends HTMLTextExtractor {
33  
34      public Reader extractText(InputStream stream, String type, String encoding)
35          throws IOException {
36  
37          Source source = new Source(stream);
38  
39          return new UnsyncStringReader(source.getTextExtractor().toString());
40      }
41  
42  }