1
2 package org.catacomb.xdoc;
3
4
5 import org.catacomb.report.E;
6 import org.catacomb.util.FileUtil;
7
8 import java.io.BufferedReader;
9 import java.io.File;
10 import java.io.IOException;
11 import java.io.StringReader;
12 import java.util.StringTokenizer;
13 import java.util.regex.Matcher;
14 import java.util.regex.Pattern;
15
16
17
18 public class TextTagger {
19
20 static String[][] legalizers = {{"<", "<"},
21 {">", ">"},
22 {"'", "'"},
23 {"\"", """}
24 };
25
26
27
28
29 static String[][] linkizers = {
30 {"((\\s)(http://[\\w\\.]*)(\\s))", " <ain href=\"$3\">$3</ain> "},
31 {"((\\s)(\\w+\\.\\w+\\.\\w+)(\\s))", " <ain href=\"http://$3\">$3</ain> "},
32 {"(\\{(.+?)\\|(.+?)\\})", " <ain href=\"$3\">$2</ain> "},
33 {"(?m)^file: *(.+) *$", "file: <ai href\"($1)\">$1</ai><br/>\\\n"},
34 {" #(\\d*)", " <CR>$1</CR>"}
35 };
36
37
38 static String[][] emphasizers = {
39 {"((\\s_)([\\w\\s]+)_([\\s\\p{Punct}]))", " <u>$3</u>$4"},
40 {"((\\s\\*)([\\w\\s]+)\\*([\\s\\p{Punct}]))", " <b>$3</b>$4"},
41 {"((\\s/)([\\w\\s]+)/([\\s\\p{Punct}]))", " <i>$3</i>$4"}
42 };
43
44
45
46
47 static String[][] headerizers = {
48 {"^H:(.*)$", "\\\n\\\n<h2>$1</h2>\\\n\\\n "},
49 {"^H1:(.*)$", "\\\n\\\n<h1>$1</h1>\\\n\\\n"},
50 {"^H2:(.*)$", "\\\n\\\n<h2>$1</h2>\\\n\\\n"},
51 {"^H3:(.*)$", "\\\n\\\n<h3>$1</h3>\\\n\\\n"},
52 {"^H4:(.*)$", "\\\n\\\n<h4>$1</h4>\\\n\\\n"}
53 };
54
55
56 static String[][] elementizers = {
57 {"(^(\\w+?): *(.*?) *$)", "<element name=\"$2\" value=\"$3\"/>\n"}
58 };
59
60
61
62
63
64
65 static String paragraphSeparator = "(?<=(\r\n|\r|\n))([ \\t]*$)+";
66 static String paragraphMatch = "(^.*\\S+.*$)+";
67
68
69 static String listItemPrefix = " *[+*-] +";
70
71 static String attributeLine = "(^(\\w+?) *= *(.*?) *$)";
72 static String elementTag="^(\\w+?):";
73
74
75 static TextTagger tagger;
76
77 public static TextTagger getTagger() {
78 if (tagger == null) {
79 tagger = new TextTagger();
80 }
81 return tagger;
82 }
83
84 public TextTagger() {
85
86 }
87
88
89 public void init() {
90
91 }
92
93
94
95 public String textToXML(String plainText, String defaultRoot) {
96 String xtxt = plainText;
97
98
99
100
101 xtxt = legalize(xtxt);
102
103
104
105
106 String[] wrapper = new String[2];
107 xtxt = extractAttributes(xtxt, defaultRoot, wrapper);
108
109
110
111 xtxt = headerize(xtxt);
112
113
114 xtxt=elementize(xtxt);
115
116
117
118 xtxt = listize(xtxt);
119
120
121
122 xtxt = paragraphize(xtxt);
123
124 xtxt = linkize(xtxt);
125
126 xtxt = emphasize(xtxt);
127
128
129
130 xtxt = xmlwrap(xtxt, wrapper);
131
132
133 return xtxt;
134 }
135
136
137 public String tagText(String plainText) {
138 String xtxt = plainText;
139
140 xtxt = xtxt.replaceAll("-p-", "\n\n");
141
142
143 xtxt = legalize(xtxt);
144
145
146
147
148 xtxt = headerize(xtxt);
149
150
151
152
153
154 xtxt = listize(xtxt);
155
156
157
158 xtxt = paragraphize(xtxt);
159
160 xtxt = linkize(xtxt);
161
162 xtxt = emphasize(xtxt);
163
164 return xtxt;
165 }
166
167
168
169
170 public String textToEdit(String plainText, String defaultRoot) {
171
172 String xtxt = plainText;
173
174 String fullxml = textToXML(plainText, defaultRoot);
175
176
177
178 xtxt = legalize(xtxt);
179
180
181
182
183 String[] wrapper = new String[2];
184 xtxt = extractAttributesAsElements(xtxt, "source", wrapper);
185
186
187
188
189 xtxt = preWrap(xtxt);
190
191 xtxt = xmlwrap(xtxt, wrapper);
192
193
194 StringBuffer sb = new StringBuffer();
195 sb.append("<editsource>\n");
196 sb.append(xtxt);
197 sb.append("\n");
198 sb.append(" <output>\n");
199 sb.append(fullxml);
200 sb.append("\n");
201 sb.append(" </output>\n");
202 sb.append("</editsource>\n");
203
204
205 return sb.toString();
206 }
207
208
209
210
211 public void textToXML(File ftxt, File fxml) {
212 String txt = FileUtil.readStringFromFile(ftxt);
213 String xtxt = textToXML(txt, "default");
214 FileUtil.writeStringToFile(xtxt, fxml);
215 }
216
217
218 public void textToEdit(File ftxt, File fedt) {
219 String txt = FileUtil.readStringFromFile(ftxt);
220 String etxt = textToEdit(txt, "default");
221 FileUtil.writeStringToFile(etxt, fedt);
222 }
223
224
225
226
227 public String applyReplacements(String txtin, String[][] reps) {
228 String txt = txtin;
229 for (int i = 0; i < reps.length; i++) {
230 String[] lp = reps[i];
231 txt = txt.replaceAll(lp[0], lp[1]);
232 }
233 return txt;
234 }
235
236
237 public String applyReplacementsMultiline(String txtin, String[][] reps) {
238 String txt = txtin;
239 for (int i = 0; i < reps.length; i++) {
240 String[] lp = reps[i];
241 Pattern pattern = Pattern.compile(lp[0], Pattern.MULTILINE);
242 Matcher matcher = pattern.matcher(txt);
243 String output = matcher.replaceAll(lp[1]);
244 txt = output;
245 }
246 return txt;
247 }
248
249
250
251
252 public String legalize(String txt) {
253 return applyReplacements(txt, legalizers);
254 }
255
256
257 public String linkize(String txt) {
258 return applyReplacements(txt, linkizers);
259 }
260
261
262 public String emphasize(String txt) {
263 return applyReplacements(txt, emphasizers);
264 }
265
266
267 public String headerize(String txt) {
268 return applyReplacementsMultiline(txt, headerizers);
269 }
270
271
272 public String elementize(String txt) {
273
274 return elementize2(txt);
275 }
276
277
278
279 public String htmlizeParagraph(String apar) {
280 String stmp = legalize(apar);
281 String ret = emphasize(stmp);
282 return ret;
283 }
284
285
286
287
288
289
290
291 public String elementize2(String txt) {
292 Pattern pat = Pattern.compile(elementTag);
293 Matcher matcher = pat.matcher("");
294
295 StringBuffer sb = new StringBuffer();
296
297 try {
298 BufferedReader sr = new BufferedReader(new StringReader(txt));
299 while (sr.ready()) {
300 String line = sr.readLine();
301 if (line == null) {
302 break;
303 } else {
304 matcher.reset(line);
305 if (matcher.find() && matcher.start() == 0) {
306 String tag = matcher.group(1);
307 sb.append(makeElement(tag, line));
308 sb.append("\n");
309 } else {
310 sb.append(line);
311 sb.append("\n");
312 }
313 }
314
315 }
316 } catch (IOException ex) {
317 E.error("io exception listizing ");
318 ex.printStackTrace();
319 }
320
321 return sb.toString();
322 }
323
324
325
326 private String makeElement(String tag, String line) {
327 String ret = "";
328 if (line.startsWith(tag + ":")) {
329 StringBuffer sb = new StringBuffer();
330 sb.append("<");
331 sb.append(tag);
332
333 String rest = line.substring(tag.length()+1, line.length());
334 StringTokenizer st = new StringTokenizer(rest, ",");
335 boolean doneclose = false;
336 while (st.hasMoreTokens()) {
337 String stok = st.nextToken();
338
339 if (doneclose) {
340 sb.append(stok);
341 } else {
342 int ieq = stok.indexOf("=");
343 if (ieq > 0) {
344 String nm = stok.substring(0, ieq);
345 String val = stok.substring(ieq+1, stok.length());
346 nm = nm.trim();
347 nm = nm.replaceAll(" ", "_");
348 sb.append(" ");
349 sb.append(nm);
350 sb.append("=\"");
351 val = val.trim();
352 sb.append(val);
353 sb.append("\"");
354
355 } else {
356 sb.append(">");
357 sb.append(stok);
358 doneclose = true;
359 }
360 }
361
362 }
363 if (!doneclose) {
364 sb.append(">");
365 }
366
367 sb.append("</");
368 sb.append(tag);
369 sb.append(">\n");
370 ret = sb.toString();
371 } else {
372 E.error(" misinterpreted element? " +
373 "---" + tag + "--- ---" + line);
374 }
375 return ret;
376 }
377
378
379
380
381
382 public String paragraphize(String s) {
383 return paragraphize1(s);
384 }
385
386
387
388 public String paragraphize1(String sin) {
389 String s = sin;
390 s = s.replaceAll("-p-", "\n\n");
391 String[] paras = Pattern.compile(paragraphSeparator, Pattern.MULTILINE).split(s);
392
393 StringBuffer sb = new StringBuffer();
394
395 for (int i = 0; i < paras.length; i++) {
396 String par = paras[i];
397 par = par.trim();
398 insertParagraph(sb, par);
399 }
400 return sb.toString();
401 }
402
403
404
405 public String paragraphize2(String s) {
406
407
408 Pattern pattern = Pattern.compile(paragraphMatch, Pattern.MULTILINE);
409 Matcher matcher = pattern.matcher(s);
410
411 StringBuffer sb = new StringBuffer();
412
413 while (matcher.find()) {
414 String para = matcher.group();
415 para = para.trim();
416 insertParagraph(sb, para);
417 }
418 return sb.toString();
419 }
420
421
422 void insertParagraph(StringBuffer sb, String para) {
423 if (para.length() > 0) {
424 if (para.indexOf("<") < 0) {
425
426 sb.append("\n<p>\n");
427 sb.append(para);
428 sb.append("\n</p>\n\n");
429 } else {
430 sb.append(para);
431 sb.append("\n");
432 }
433 }
434 }
435
436
437
438
439 public String listize(String txt) {
440
441
442
443
444
445
446
447
448
449
450
451
452
453 Pattern pat = Pattern.compile(listItemPrefix);
454 Matcher matcher = pat.matcher("");
455
456 StringBuffer sb = new StringBuffer();
457
458 try {
459 BufferedReader sr = new BufferedReader(new StringReader(txt));
460
461 while (sr.ready()) {
462 String line = sr.readLine();
463
464 if (line == null) {
465 break;
466 } else {
467
468 matcher.reset(line);
469 if (matcher.find() && matcher.start() == 0) {
470
471 String itemintro = matcher.group();
472 wrapList(sr, sb, itemintro, line);
473
474 } else {
475 sb.append(line);
476 sb.append("\n");
477 }
478 }
479
480 }
481 } catch (IOException ex) {
482 E.error("io exception listizing ");
483 ex.printStackTrace();
484 }
485
486 return sb.toString();
487 }
488
489
490 public void wrapList(BufferedReader sr, StringBuffer sb,
491 String itemintro, String fl) throws IOException {
492 String firstline = fl;
493 String itemcont = itemintro.replaceFirst("\\S", " ");
494
495 boolean inli = false;
496 boolean inul = false;
497
498 sb.append("\n\n<ul>\n");
499 inul = true;
500
501 sb.append(" <li>\n");
502 inli = true;
503
504 firstline = firstline.substring(itemintro.length(), firstline.length());
505 sb.append(firstline);
506 sb.append("\n");
507
508 String line = "";
509 while (sr.ready()) {
510 line = sr.readLine();
511 if (line == null) {
512 break;
513
514 } else if (line.trim().length() == 0) {
515
516
517 } else if (line.startsWith(itemcont)) {
518 sb.append(line);
519 sb.append("\n");
520 line = null;
521
522 } else if (line.startsWith(itemintro)) {
523 if (inli) {
524 sb.append(" </li>\n");
525 }
526 line = line.substring(itemintro.length(), line.length());
527 sb.append(" <li>\n");
528 sb.append(line);
529 sb.append("\n");
530 line = null;
531
532 } else {
533 break;
534 }
535 }
536
537
538 if (inli) {
539 sb.append("</li>\n");
540 }
541
542 if (inul) {
543 sb.append("</ul>\n\n");
544 }
545
546 if (line != null) {
547 sb.append("\n\n");
548 sb.append(line);
549 sb.append("\n");
550 }
551 }
552
553
554
555
556
557
558
559
560
561 String xmlwrap(String xtxt, String[] wrapper) {
562 StringBuffer sb = new StringBuffer();
563 sb.append(wrapper[0]);
564 sb.append("\n");
565 sb.append(xtxt);
566 sb.append("\n");
567 sb.append(wrapper[1]);
568 sb.append("\n");
569 return sb.toString();
570 }
571
572
573 String preWrap(String s) {
574 StringBuffer sb = new StringBuffer();
575 sb.append("<pre>\n");
576 sb.append(s);
577 sb.append("\n");
578 sb.append("</pre>\n");
579 return sb.toString();
580 }
581
582
583
584
585 String htmlWrap(String s) {
586 StringBuffer sb = new StringBuffer();
587 sb.append("<html>\n");
588 sb.append("<head></head>\n");
589 sb.append("<body>\n");
590 sb.append(s);
591 sb.append("\n");
592 sb.append("</body>\n");
593 return sb.toString();
594 }
595
596
597
598 String readAttributes(String xtxtin, String[][] sat) {
599 String xtxt = xtxtin;
600 Pattern pattern = Pattern.compile(attributeLine, Pattern.MULTILINE);
601
602 int nat = 0;
603 Matcher matcher = pattern.matcher("");
604 while (true) {
605 xtxt = xtxt.trim();
606 matcher.reset(xtxt);
607 if (xtxt.length() > 0 && matcher.find() && matcher.start() == 0) {
608 String group = matcher.group();
609 xtxt = xtxt.substring(group.length(), xtxt.length());
610
611 String name = matcher.group(2);
612 String value = matcher.group(3).trim();
613
614 if (containsAttribute(sat, nat, name)) {
615 E.warning(" ignoringdubplicate attribute " +
616 name + " " + value);
617
618 } else {
619 sat[nat][0] = name;
620 sat[nat][1] = value;
621 nat++;
622 }
623
624 } else {
625 break;
626 }
627 }
628 return xtxt;
629 }
630
631
632
633
634 private boolean containsAttribute(String[][] sat, int nat, String name) {
635 boolean contains = false;
636 for (int i = 0; i<nat; i++) {
637 if (sat[i][0].equals(name)) {
638 contains = true;
639 break;
640 }
641 }
642 return contains;
643 }
644
645
646
647
648 String extractAttributes(String xtxtin, String defaultRoot, String[] wrapper) {
649 String xtxt = xtxtin;
650 String[][] sat = new String[100][2]; /
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735