1   /**
2    * Copyright (c) 2000-2009 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portal.util;
24  
25  import au.id.jericho.lib.html.Source;
26  
27  import com.liferay.portal.kernel.util.Html;
28  import com.liferay.portal.kernel.util.StringPool;
29  import com.liferay.portal.kernel.util.StringUtil;
30  
31  /**
32   * <a href="HtmlImpl.java.html"><b><i>View Source</i></b></a>
33   *
34   * @author Brian Wing Shun Chan
35   * @author Clarence Shen
36   * @author Harry Mark
37   *
38   */
39  public class HtmlImpl implements Html {
40  
41      public String escape(String text) {
42          if (text == null) {
43              return null;
44          }
45  
46          // Escape using XSS recommendations from
47          // http://www.owasp.org/index.php/Cross_Site_Scripting
48          // #How_to_Protect_Yourself
49  
50          StringBuilder sb = new StringBuilder(text.length());
51  
52          for (int i = 0; i < text.length(); i++) {
53              char c = text.charAt(i);
54  
55              switch (c) {
56                  case '<':
57                      sb.append("&lt;");
58  
59                      break;
60  
61                  case '>':
62                      sb.append("&gt;");
63  
64                      break;
65  
66                  case '&':
67                      sb.append("&amp;");
68  
69                      break;
70  
71                  case '"':
72                      sb.append("&#034;");
73  
74                      break;
75  
76                  case '\'':
77                      sb.append("&#039;");
78  
79                      break;
80  
81                  default:
82                      sb.append(c);
83  
84                      break;
85              }
86          }
87  
88          return sb.toString();
89      }
90  
91      public String extractText(String html) {
92          if (html == null) {
93              return null;
94          }
95  
96          Source source = new Source(html);
97  
98          return source.getTextExtractor().toString();
99      }
100 
101     public String fromInputSafe(String text) {
102         return StringUtil.replace(text, "&amp;", "&");
103     }
104 
105     public String replaceMsWordCharacters(String text) {
106         return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
107     }
108 
109     public String stripBetween(String text, String tag) {
110         return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
111     }
112 
113     public String stripComments(String text) {
114         return StringUtil.stripBetween(text, "<!--", "-->");
115     }
116 
117     public String stripHtml(String text) {
118         if (text == null) {
119             return null;
120         }
121 
122         text = stripComments(text);
123 
124         StringBuilder sb = new StringBuilder(text.length());
125 
126         int x = 0;
127         int y = text.indexOf("<");
128 
129         while (y != -1) {
130             sb.append(text.substring(x, y));
131             sb.append(StringPool.SPACE);
132 
133             // Look for text enclosed by <script></script>
134 
135             boolean scriptFound = isScriptTag(text, y + 1);
136 
137             if (scriptFound) {
138                 int pos = y + _TAG_SCRIPT.length;
139 
140                 // Find end of the tag
141 
142                 pos = text.indexOf(">", pos);
143 
144                 if (pos >= 0) {
145 
146                     // Check if preceding character is / (i.e. is this instance
147                     // of <script/>)
148 
149                     if (text.charAt(pos-1) != '/') {
150 
151                         // Search for the ending </script> tag
152 
153                         for (;;) {
154                             pos = text.indexOf("</", pos);
155 
156                             if (pos >= 0) {
157                                 if (isScriptTag(text, pos + 2)) {
158                                     y = pos;
159 
160                                     break;
161                                 }
162                                 else {
163 
164                                     // Skip past "</"
165 
166                                     pos += 2;
167                                 }
168                             }
169                             else {
170                                 break;
171                             }
172                         }
173                     }
174                 }
175             }
176 
177             x = text.indexOf(">", y);
178 
179             if (x == -1) {
180                 break;
181             }
182 
183             x++;
184 
185             if (x < y) {
186 
187                 // <b>Hello</b
188 
189                 break;
190             }
191 
192             y = text.indexOf("<", x);
193         }
194 
195         if (y == -1) {
196             sb.append(text.substring(x, text.length()));
197         }
198 
199         return sb.toString();
200     }
201 
202     public String toInputSafe(String text) {
203         return StringUtil.replace(
204             text,
205             new String[] {"&", "\""},
206             new String[] {"&amp;", "&quot;"});
207     }
208 
209     public String unescape(String text) {
210         if (text == null) {
211             return null;
212         }
213 
214         // Optimize this
215 
216         text = StringUtil.replace(text, "&lt;", "<");
217         text = StringUtil.replace(text, "&gt;", ">");
218         text = StringUtil.replace(text, "&amp;", "&");
219         text = StringUtil.replace(text, "&#034;", "\"");
220         text = StringUtil.replace(text, "&#039;", "'");
221         text = StringUtil.replace(text, "&#040;", "(");
222         text = StringUtil.replace(text, "&#041;", ")");
223         text = StringUtil.replace(text, "&#035;", "#");
224         text = StringUtil.replace(text, "&#037;", "%");
225         text = StringUtil.replace(text, "&#059;", ";");
226         text = StringUtil.replace(text, "&#043;", "+");
227         text = StringUtil.replace(text, "&#045;", "-");
228 
229         return text;
230     }
231 
232     protected boolean isScriptTag(String text, int pos) {
233         if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
234             char item;
235 
236             for (int i = 0; i < _TAG_SCRIPT.length; i++) {
237                 item = text.charAt(pos++);
238 
239                 if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
240                     return false;
241                 }
242             }
243 
244             item = text.charAt(pos);
245 
246             // Check that char after "script" is not a letter (i.e. another tag)
247 
248             return !Character.isLetter(item);
249         }
250         else {
251             return false;
252         }
253     }
254 
255     private static final String[] _MS_WORD_UNICODE = new String[] {
256         "\u00ae", "\u2019", "\u201c", "\u201d"
257     };
258 
259     private static final String[] _MS_WORD_HTML = new String[] {
260         "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
261     };
262 
263     private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
264 
265 }