1   /**
2    * Copyright (c) 2000-2008 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portal.util;
24  
25  import au.id.jericho.lib.html.Source;
26  
27  import com.liferay.portal.kernel.util.Html;
28  import com.liferay.portal.kernel.util.StringMaker;
29  import com.liferay.portal.kernel.util.StringPool;
30  import com.liferay.portal.kernel.util.StringUtil;
31  
32  /**
33   * <a href="HtmlImpl.java.html"><b><i>View Source</i></b></a>
34   *
35   * @author Brian Wing Shun Chan
36   * @author Clarence Shen
37   * @author Harry Mark
38   *
39   */
40  public class HtmlImpl implements Html {
41  
42      public String escape(String text) {
43          if (text == null) {
44              return null;
45          }
46  
47          // Escape using XSS recommendations from
48          // http://www.owasp.org/index.php/Cross_Site_Scripting
49          // #How_to_Protect_Yourself
50  
51          StringMaker sm = new StringMaker(text.length());
52  
53          for (int i = 0; i < text.length(); i++) {
54              char c = text.charAt(i);
55  
56              switch (c) {
57                  case '<':
58                      sm.append("&lt;");
59  
60                      break;
61  
62                  case '>':
63                      sm.append("&gt;");
64  
65                      break;
66  
67                  case '&':
68                      sm.append("&amp;");
69  
70                      break;
71  
72                  case '"':
73                      sm.append("&#034;");
74  
75                      break;
76  
77                  case '\'':
78                      sm.append("&#039;");
79  
80                      break;
81  
82                  case '(':
83                      sm.append("&#040;");
84  
85                      break;
86  
87                  case ')':
88                      sm.append("&#041;");
89  
90                      break;
91  
92                  case '#':
93                      sm.append("&#035;");
94  
95                      break;
96  
97                  case '%':
98                      sm.append("&#037;");
99  
100                     break;
101 
102                 case ';':
103                     sm.append("&#059;");
104 
105                     break;
106 
107                 case '+':
108                     sm.append("&#043;");
109 
110                     break;
111 
112                 case '-':
113                     sm.append("&#045;");
114 
115                     break;
116 
117                 default:
118                     sm.append(c);
119 
120                     break;
121             }
122         }
123 
124         return sm.toString();
125     }
126 
127     public String extractText(String html) {
128         if (html == null) {
129             return null;
130         }
131 
132         Source source = new Source(html);
133 
134         return source.getTextExtractor().toString();
135     }
136 
137     public String fromInputSafe(String text) {
138         return StringUtil.replace(text, "&amp;", "&");
139     }
140 
141     public String replaceMsWordCharacters(String text) {
142         return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
143     }
144 
145     public String stripBetween(String text, String tag) {
146         return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
147     }
148 
149     public String stripComments(String text) {
150         return StringUtil.stripBetween(text, "<!--", "-->");
151     }
152 
153     public String stripHtml(String text) {
154         if (text == null) {
155             return null;
156         }
157 
158         text = stripComments(text);
159 
160         StringMaker sm = new StringMaker(text.length());
161 
162         int x = 0;
163         int y = text.indexOf("<");
164 
165         while (y != -1) {
166             sm.append(text.substring(x, y));
167             sm.append(StringPool.SPACE);
168 
169             // Look for text enclosed by <script></script>
170 
171             boolean scriptFound = isScriptTag(text, y + 1);
172 
173             if (scriptFound) {
174                 int pos = y + _TAG_SCRIPT.length;
175 
176                 // Find end of the tag
177 
178                 pos = text.indexOf(">", pos);
179 
180                 if (pos >= 0) {
181 
182                     // Check if preceding character is / (i.e. is this instance
183                     // of <script/>)
184 
185                     if (text.charAt(pos-1) != '/') {
186 
187                         // Search for the ending </script> tag
188 
189                         for (;;) {
190                             pos = text.indexOf("</", pos);
191 
192                             if (pos >= 0) {
193                                 if (isScriptTag(text, pos + 2)) {
194                                     y = pos;
195 
196                                     break;
197                                 }
198                                 else {
199 
200                                     // Skip past "</"
201 
202                                     pos += 2;
203                                 }
204                             }
205                             else {
206                                 break;
207                             }
208                         }
209                     }
210                 }
211             }
212 
213             x = text.indexOf(">", y);
214 
215             if (x == -1) {
216                 break;
217             }
218 
219             x++;
220 
221             if (x < y) {
222 
223                 // <b>Hello</b
224 
225                 break;
226             }
227 
228             y = text.indexOf("<", x);
229         }
230 
231         if (y == -1) {
232             sm.append(text.substring(x, text.length()));
233         }
234 
235         return sm.toString();
236     }
237 
238     public String toInputSafe(String text) {
239         return StringUtil.replace(
240             text,
241             new String[] {"&", "\""},
242             new String[] {"&amp;", "&quot;"});
243     }
244 
245     public String unescape(String text) {
246         if (text == null) {
247             return null;
248         }
249 
250         // Optimize this
251 
252         text = StringUtil.replace(text, "&lt;", "<");
253         text = StringUtil.replace(text, "&gt;", ">");
254         text = StringUtil.replace(text, "&amp;", "&");
255         text = StringUtil.replace(text, "&#034;", "\"");
256         text = StringUtil.replace(text, "&#039;", "'");
257         text = StringUtil.replace(text, "&#040;", "(");
258         text = StringUtil.replace(text, "&#041;", ")");
259         text = StringUtil.replace(text, "&#035;", "#");
260         text = StringUtil.replace(text, "&#037;", "%");
261         text = StringUtil.replace(text, "&#059;", ";");
262         text = StringUtil.replace(text, "&#043;", "+");
263         text = StringUtil.replace(text, "&#045;", "-");
264 
265         return text;
266     }
267 
268     protected boolean isScriptTag(String text, int pos) {
269         if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
270             char item;
271 
272             for (int i = 0; i < _TAG_SCRIPT.length; i++) {
273                 item = text.charAt(pos++);
274 
275                 if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
276                     return false;
277                 }
278             }
279 
280             item = text.charAt(pos);
281 
282             // Check that char after "script" is not a letter (i.e. another tag)
283 
284             return !Character.isLetter(item);
285         }
286         else {
287             return false;
288         }
289     }
290 
291     private static final String[] _MS_WORD_UNICODE = new String[] {
292         "\u00ae", "\u2019", "\u201c", "\u201d"
293     };
294 
295     private static final String[] _MS_WORD_HTML = new String[] {
296         "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
297     };
298 
299     private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
300 
301 }