001    /**
002     * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
003     *
004     * The contents of this file are subject to the terms of the Liferay Enterprise
005     * Subscription License ("License"). You may not use this file except in
006     * compliance with the License. You can obtain a copy of the License by
007     * contacting Liferay, Inc. See the License for the specific language governing
008     * permissions and limitations under the License, including but not limited to
009     * distribution rights of the Software.
010     *
011     *
012     *
013     */
014    
015    package com.liferay.portal.util;
016    
017    import com.liferay.portal.kernel.util.CharPool;
018    import com.liferay.portal.kernel.util.Html;
019    import com.liferay.portal.kernel.util.HttpUtil;
020    import com.liferay.portal.kernel.util.StringBundler;
021    import com.liferay.portal.kernel.util.StringPool;
022    import com.liferay.portal.kernel.util.StringUtil;
023    
024    import java.util.regex.Matcher;
025    import java.util.regex.Pattern;
026    
027    import net.htmlparser.jericho.Source;
028    
029    /**
030     * @author Brian Wing Shun Chan
031     * @author Clarence Shen
032     * @author Harry Mark
033     * @author Samuel Kong
034     * @author Connor McKay
035     */
036    public class HtmlImpl implements Html {
037    
038            public static final int ESCAPE_MODE_ATTRIBUTE = 1;
039    
040            public static final int ESCAPE_MODE_CSS = 2;
041    
042            public static final int ESCAPE_MODE_JS = 3;
043    
044            public static final int ESCAPE_MODE_TEXT = 4;
045    
046            public static final int ESCAPE_MODE_URL = 5;
047    
048            public String escape(String text) {
049                    if (text == null) {
050                            return null;
051                    }
052    
053                    if (text.length() == 0) {
054                            return StringPool.BLANK;
055                    }
056    
057                    // Escape using XSS recommendations from
058                    // http://www.owasp.org/index.php/Cross_Site_Scripting
059                    // #How_to_Protect_Yourself
060    
061                    StringBuilder sb = new StringBuilder(text.length());
062    
063                    for (int i = 0; i < text.length(); i++) {
064                            char c = text.charAt(i);
065    
066                            switch (c) {
067                                    case '<':
068                                            sb.append("&lt;");
069    
070                                            break;
071    
072                                    case '>':
073                                            sb.append("&gt;");
074    
075                                            break;
076    
077                                    case '&':
078                                            sb.append("&amp;");
079    
080                                            break;
081    
082                                    case '"':
083                                            sb.append("&#034;");
084    
085                                            break;
086    
087                                    case '\'':
088                                            sb.append("&#039;");
089    
090                                            break;
091    
092                                    case '\u00bb': // '�'
093                                            sb.append("&raquo;");
094    
095                                            break;
096    
097                                    default:
098                                            sb.append(c);
099    
100                                            break;
101                            }
102                    }
103    
104                    if (sb.length() == text.length()) {
105                            return text;
106                    }
107                    else {
108                            return sb.toString();
109                    }
110            }
111    
112            public String escape(String text, int type) {
113                    if (text == null) {
114                            return null;
115                    }
116    
117                    if (text.length() == 0) {
118                            return StringPool.BLANK;
119                    }
120    
121                    String prefix = StringPool.BLANK;
122                    String postfix = StringPool.BLANK;
123    
124                    if (type == ESCAPE_MODE_ATTRIBUTE) {
125                            prefix = "&#x";
126                            postfix = StringPool.SEMICOLON;
127                    }
128                    else if (type == ESCAPE_MODE_CSS) {
129                            prefix = StringPool.BACK_SLASH;
130                    }
131                    else if (type == ESCAPE_MODE_JS) {
132                            prefix = "\\x";
133                    }
134                    else if (type == ESCAPE_MODE_URL) {
135                            return HttpUtil.encodeURL(text, true);
136                    }
137                    else {
138                            return escape(text);
139                    }
140    
141                    StringBuilder sb = new StringBuilder();
142    
143                    for (int i = 0; i < text.length(); i++) {
144                            char c = text.charAt(i);
145    
146                            if ((Character.isLetterOrDigit(c)) ||
147                                    (c == CharPool.DASH) || (c == CharPool.UNDERLINE)) {
148    
149                                    sb.append(c);
150                            }
151                            else {
152                                    sb.append(prefix);
153                                    sb.append(Integer.toHexString(c));
154                                    sb.append(postfix);
155                            }
156                    }
157    
158                    if (sb.length() == text.length()) {
159                            return text;
160                    }
161                    else {
162                            return sb.toString();
163                    }
164            }
165    
166            public String escapeAttribute(String attribute) {
167                    return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
168            }
169    
170            public String escapeCSS(String css) {
171                    return escape(css, ESCAPE_MODE_CSS);
172            }
173    
174            public String escapeHREF(String href) {
175                    if (href == null) {
176                            return null;
177                    }
178    
179                    if (href.length() == 0) {
180                            return StringPool.BLANK;
181                    }
182    
183                    if (href.indexOf(StringPool.COLON) == 10) {
184                            String protocol = href.substring(0, 10).toLowerCase();
185    
186                            if (protocol.equals("javascript")) {
187                                    return StringUtil.replaceFirst(href, StringPool.COLON, "%3a");
188                            }
189                    }
190    
191                    return href;
192            }
193    
194            public String escapeJS(String js) {
195                    return escape(js, ESCAPE_MODE_JS);
196            }
197    
198            public String escapeURL(String url) {
199                    return escape(url, ESCAPE_MODE_URL);
200            }
201    
202            public String extractText(String html) {
203                    if (html == null) {
204                            return null;
205                    }
206    
207                    Source source = new Source(html);
208    
209                    return source.getTextExtractor().toString();
210            }
211    
212            public String fromInputSafe(String text) {
213                    return StringUtil.replace(text, "&amp;", "&");
214            }
215    
216            public String replaceMsWordCharacters(String text) {
217                    return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
218            }
219    
220            public String stripBetween(String text, String tag) {
221                    return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
222            }
223    
224            public String stripComments(String text) {
225                    return StringUtil.stripBetween(text, "<!--", "-->");
226            }
227    
228            public String stripHtml(String text) {
229                    if (text == null) {
230                            return null;
231                    }
232    
233                    text = stripComments(text);
234    
235                    StringBuilder sb = new StringBuilder(text.length());
236    
237                    int x = 0;
238                    int y = text.indexOf("<");
239    
240                    while (y != -1) {
241                            sb.append(text.substring(x, y));
242                            sb.append(StringPool.SPACE);
243    
244                            // Look for text enclosed by <script></script>
245    
246                            boolean scriptFound = isScriptTag(text, y + 1);
247    
248                            if (scriptFound) {
249                                    int pos = y + _TAG_SCRIPT.length;
250    
251                                    // Find end of the tag
252    
253                                    pos = text.indexOf(">", pos);
254    
255                                    if (pos >= 0) {
256    
257                                            // Check if preceding character is / (i.e. is this instance
258                                            // of <script/>)
259    
260                                            if (text.charAt(pos-1) != '/') {
261    
262                                                    // Search for the ending </script> tag
263    
264                                                    for (;;) {
265                                                            pos = text.indexOf("</", pos);
266    
267                                                            if (pos >= 0) {
268                                                                    if (isScriptTag(text, pos + 2)) {
269                                                                            y = pos;
270    
271                                                                            break;
272                                                                    }
273                                                                    else {
274    
275                                                                            // Skip past "</"
276    
277                                                                            pos += 2;
278                                                                    }
279                                                            }
280                                                            else {
281                                                                    break;
282                                                            }
283                                                    }
284                                            }
285                                    }
286                            }
287    
288                            x = text.indexOf(">", y);
289    
290                            if (x == -1) {
291                                    break;
292                            }
293    
294                            x++;
295    
296                            if (x < y) {
297    
298                                    // <b>Hello</b
299    
300                                    break;
301                            }
302    
303                            y = text.indexOf("<", x);
304                    }
305    
306                    if (y == -1) {
307                            sb.append(text.substring(x, text.length()));
308                    }
309    
310                    return sb.toString();
311            }
312    
313            public String toInputSafe(String text) {
314                    return StringUtil.replace(
315                            text,
316                            new String[] {"&", "\""},
317                            new String[] {"&amp;", "&quot;"});
318            }
319    
320            public String unescape(String text) {
321                    if (text == null) {
322                            return null;
323                    }
324    
325                    if (text.length() == 0) {
326                            return StringPool.BLANK;
327                    }
328    
329                    // Optimize this
330    
331                    text = StringUtil.replace(text, "&lt;", "<");
332                    text = StringUtil.replace(text, "&gt;", ">");
333                    text = StringUtil.replace(text, "&amp;", "&");
334                    text = StringUtil.replace(text, "&#034;", "\"");
335                    text = StringUtil.replace(text, "&#039;", "'");
336                    text = StringUtil.replace(text, "&#040;", "(");
337                    text = StringUtil.replace(text, "&#041;", ")");
338                    text = StringUtil.replace(text, "&#044;", ",");
339                    text = StringUtil.replace(text, "&#035;", "#");
340                    text = StringUtil.replace(text, "&#037;", "%");
341                    text = StringUtil.replace(text, "&#059;", ";");
342                    text = StringUtil.replace(text, "&#061;", "=");
343                    text = StringUtil.replace(text, "&#043;", "+");
344                    text = StringUtil.replace(text, "&#045;", "-");
345    
346                    return text;
347            }
348    
349            public String wordBreak(String text, int columns) {
350                    StringBundler sb = new StringBundler();
351    
352                    int length = 0;
353                    int lastWrite = 0;
354                    int pos = 0;
355    
356                    Pattern pattern = Pattern.compile("([\\s<&]|$)");
357    
358                    Matcher matcher = pattern.matcher(text);
359    
360                    while (matcher.find()) {
361                            if (matcher.start() < pos) {
362                                    continue;
363                            }
364    
365                            while ((length + matcher.start() - pos) >= columns) {
366                                    pos += columns - length;
367    
368                                    sb.append(text.substring(lastWrite, pos));
369                                    sb.append("<wbr/>");
370    
371                                    length = 0;
372                                    lastWrite = pos;
373                            }
374    
375                            length += matcher.start() - pos;
376    
377                            String group = matcher.group();
378    
379                            if (group.equals(StringPool.AMPERSAND)) {
380                                    int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
381    
382                                    if (x != -1) {
383                                            length++;
384                                            pos = x + 1;
385                                    }
386    
387                                    continue;
388                            }
389    
390                            if (group.equals(StringPool.LESS_THAN)) {
391                                    int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
392    
393                                    if (x != -1) {
394                                            pos = x + 1;
395                                    }
396    
397                                    continue;
398                            }
399    
400                            if (group.equals(StringPool.SPACE) ||
401                                    group.equals(StringPool.NEW_LINE)) {
402    
403                                    length = 0;
404                                    pos = matcher.start() + 1;
405                            }
406                    }
407    
408                    sb.append(text.substring(lastWrite));
409    
410                    return sb.toString();
411            }
412    
413            protected boolean isScriptTag(String text, int pos) {
414                    if (pos + _TAG_SCRIPT.length + 1 <= text.length()) {
415                            char item;
416    
417                            for (int i = 0; i < _TAG_SCRIPT.length; i++) {
418                                    item = text.charAt(pos++);
419    
420                                    if (Character.toLowerCase(item) != _TAG_SCRIPT[i]) {
421                                            return false;
422                                    }
423                            }
424    
425                            item = text.charAt(pos);
426    
427                            // Check that char after "script" is not a letter (i.e. another tag)
428    
429                            return !Character.isLetter(item);
430                    }
431                    else {
432                            return false;
433                    }
434            }
435    
436            private static final String[] _MS_WORD_UNICODE = new String[] {
437                    "\u00ae", "\u2019", "\u201c", "\u201d"
438            };
439    
440            private static final String[] _MS_WORD_HTML = new String[] {
441                    "&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
442            };
443    
444            private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};
445    
446    }