1   /**
2    * Copyright (c) 2000-2010 Liferay, Inc. All rights reserved.
3    *
4    * The contents of this file are subject to the terms of the Liferay Enterprise
5    * Subscription License ("License"). You may not use this file except in
6    * compliance with the License. You can obtain a copy of the License by
7    * contacting Liferay, Inc. See the License for the specific language governing
8    * permissions and limitations under the License, including but not limited to
9    * distribution rights of the Software.
10   *
11   *
12   *
13   */
14  
15  package com.liferay.portlet.wiki.translators;
16  
17  import com.liferay.portal.kernel.util.StringPool;
18  import com.liferay.portlet.wiki.importers.mediawiki.MediaWikiImporter;
19  
20  import java.util.regex.Matcher;
21  import java.util.regex.Pattern;
22  
23  /**
24   * <a href="MediaWikiToCreoleTranslator.java.html"><b><i>View Source</i></b></a>
25   *
26   * @author Jorge Ferrer
27   */
28  public class MediaWikiToCreoleTranslator extends BaseTranslator {
29  
30      public static final String TABLE_OF_CONTENTS = "<<TableOfContents>>\n\n";
31  
32      public MediaWikiToCreoleTranslator() {
33          initRegexps();
34          initNowikiRegexps();
35      }
36  
37      protected void initNowikiRegexps() {
38  
39          // Preformat protected
40  
41          nowikiRegexps.add("(<nowiki>)(.*?)(</nowiki>)");
42          nowikiRegexps.add("(<pre>)(.*?)(</pre>)");
43  
44          // Escape protected
45  
46          nowikiRegexps.add(
47              "~(\\*\\*|~|//|-|#|\\{\\{|}}|\\\\|~\\[~~[|]]|----|=|\\|)");
48      }
49  
50      protected void initRegexps() {
51  
52          // Clean unnecessary header emphasis
53  
54          regexps.put("= '''([^=]+)''' =", "= $1 =");
55          regexps.put("== '''([^=]+)''' ==", "== $1 ==");
56          regexps.put("== '''([^=]+)''' ===", "=== $1 ===");
57  
58          // Unscape angle brackets
59  
60          regexps.put("&lt;", "<");
61          regexps.put("&gt;", ">");
62  
63          // Remove categories
64  
65          regexps.put("\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*", "");
66  
67          // Remove disambiguations
68  
69          regexps.put("\\{{2}OtherTopics\\|([^\\}]*)\\}{2}", StringPool.BLANK);
70  
71          // Remove work in progress
72  
73          regexps.put("\\{{2}Work in progress\\}{2}", StringPool.BLANK);
74  
75          // Bold and italics
76  
77          regexps.put(
78              "''''((?s:.)*?)(''''|(\n\n|\r\r|\r\n\r\n))", "**//$1//**$3");
79  
80          // Bold
81  
82          regexps.put("'''((?s:.)*?)('''|(\n\n|\r\r|\r\n\r\n))", "**$1**$3");
83  
84          // Italics
85  
86          regexps.put("''((?s:.)*?)(''|(\n\n|\r\r|\r\n\r\n))", "//$1//$3");
87  
88          // Normalize URLs
89  
90          regexps.put("\\[{2}((http|ftp)[^ ]*) ([^\\]]*)\\]{2}", "[$1 $3]");
91  
92          // URL
93  
94          regexps.put("\\[((http|ftp)[^ ]*)\\]", "[[$1]]");
95  
96          // URL with label
97  
98          regexps.put("\\[((http|ftp)[^ ]*) ([^\\]]*)\\]", "[[$1|$3]]");
99  
100         // Term and definition
101 
102         regexps.put("^\\t([\\w]+):\\t(.*)", "**$1**:\n$2");
103 
104         // Indented paragraph
105 
106         regexps.put("^\\t:\\t(.*)", "$1");
107 
108         // Monospace
109 
110         regexps.put("(^ (.+))(\\n (.+))*", "{{{\n$0\n}}}");
111 
112         // No wiki
113 
114         regexps.put("<nowiki>([^<]*)</nowiki>", "{{{$1}}}");
115 
116         // HTML PRE
117 
118         regexps.put("<pre>([^<]*)</pre>", "{{{$1}}}");
119 
120         // User reference
121 
122         regexps.put("[-]*\\[{2}User:([^\\]]*)\\]{2}", "$1");
123     }
124 
125     protected String postProcess(String content) {
126 
127         // LEP-6118
128 
129         Matcher matcher = Pattern.compile(
130             "^=([^=]+)=", Pattern.MULTILINE).matcher(content);
131 
132         if (matcher.find()) {
133             content = runRegexp(content, "^===([^=]+)===", "====$1====");
134             content = runRegexp(content, "^==([^=]+)==", "===$1===");
135             content = runRegexp(content, "^=([^=]+)=", "==$1==");
136         }
137 
138         // Remove HTML tags
139 
140         for (int i = 0; i < _HTML_TAGS.length; i++) {
141             content = content.replaceAll(_HTML_TAGS[i], StringPool.BLANK);
142         }
143 
144         // Images
145 
146         matcher = Pattern.compile(
147             "\\[{2}Image:([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
148 
149         StringBuffer sb = new StringBuffer(content);
150 
151         int offset = 0;
152 
153         while (matcher.find()) {
154             String image =
155                 "{{" + MediaWikiImporter.SHARED_IMAGES_TITLE + "/" +
156                     matcher.group(1).toLowerCase() + "}}";
157 
158             sb.replace(
159                 matcher.start(0) + offset, matcher.end(0) + offset, image);
160 
161             offset += MediaWikiImporter.SHARED_IMAGES_TITLE.length() - 5;
162         }
163 
164         content = sb.toString();
165 
166         // Remove underscores from links
167 
168         matcher = Pattern.compile(
169             "\\[{2}([^\\]]*)\\]{2}", Pattern.DOTALL).matcher(content);
170 
171         sb = new StringBuffer(content);
172 
173         while (matcher.find()) {
174             String link = matcher.group(1).replace(
175                 StringPool.UNDERLINE, StringPool.SPACE);
176 
177             sb.replace(matcher.start(1), matcher.end(1), link);
178         }
179 
180         return TABLE_OF_CONTENTS + super.postProcess(sb.toString());
181     }
182 
183     private static final String[] _HTML_TAGS = {
184         "<blockquote>", "</blockquote>", "<br>", "<br/>", "<br />",  "<center>",
185         "</center>", "<cite>", "</cite>","<code>", "</code>", "<div[^>]*>",
186         "</div>", "<font[^>]*>", "</font>", "<hr>", "<hr/>", "<hr />", "<p>",
187         "</p>", "<tt>", "</tt>", "<var>", "</var>"};
188 
189 }