View Javadoc

1   
2   package org.catacomb.xdoc;
3   
4   
5   import org.catacomb.report.E;
6   import org.catacomb.util.FileUtil;
7   
8   import java.io.BufferedReader;
9   import java.io.File;
10  import java.io.IOException;
11  import java.io.StringReader;
12  import java.util.StringTokenizer;
13  import java.util.regex.Matcher;
14  import java.util.regex.Pattern;
15  
16  
17  
18  public class TextTagger {
19  
20      static String[][] legalizers = {{"<", "&lt;"},
21          {">", "&gt;"},
22          {"'", "&apos;"},
23          {"\"", "&quot;"}
24      };
25  
26      //				   {"&", "&amp;"},
27  
28  
29      static String[][] linkizers = {
30          {"((\\s)(http://[\\w\\.]*)(\\s))", " <ain href=\"$3\">$3</ain> "},
31          {"((\\s)(\\w+\\.\\w+\\.\\w+)(\\s))", " <ain href=\"http://$3\">$3</ain> "},
32          {"(\\{(.+?)\\|(.+?)\\})", " <ain href=\"$3\">$2</ain> "},
33          {"(?m)^file: *(.+) *$", "file: <ai href\"($1)\">$1</ai><br/>\\\n"},
34          {" #(\\d*)", " <CR>$1</CR>"}
35      };
36  
37  
38      static String[][] emphasizers = {
39          {"((\\s_)([\\w\\s]+)_([\\s\\p{Punct}]))", " <u>$3</u>$4"},
40          {"((\\s\\*)([\\w\\s]+)\\*([\\s\\p{Punct}]))", " <b>$3</b>$4"},
41          {"((\\s/)([\\w\\s]+)/([\\s\\p{Punct}]))", " <i>$3</i>$4"}
42      };
43  
44  
45  
46      // should check there is no markup in the header;
47      static String[][] headerizers = {
48          {"^H:(.*)$",  "\\\n\\\n<h2>$1</h2>\\\n\\\n "},
49          {"^H1:(.*)$", "\\\n\\\n<h1>$1</h1>\\\n\\\n"},
50          {"^H2:(.*)$", "\\\n\\\n<h2>$1</h2>\\\n\\\n"},
51          {"^H3:(.*)$", "\\\n\\\n<h3>$1</h3>\\\n\\\n"},
52          {"^H4:(.*)$", "\\\n\\\n<h4>$1</h4>\\\n\\\n"}
53      };
54  
55  
56      static String[][] elementizers = {
57          {"(^(\\w+?): *(.*?) *$)", "<element name=\"$2\" value=\"$3\"/>\n"}
58      };
59  
60  
61  
62  
63  
64      // paragraph patterns need compiling in MULTILINE mode;
65      static String paragraphSeparator = "(?<=(\r\n|\r|\n))([ \\t]*$)+";
66      static String paragraphMatch = "(^.*\\S+.*$)+";   // doesn't work?
67      //   static String listItemPrefix = " +\\S +";
68  
69      static String listItemPrefix = " *[+*-] +";
70  
71      static String attributeLine = "(^(\\w+?) *= *(.*?) *$)";
72      static String elementTag="^(\\w+?):";
73  
74  
75      static TextTagger tagger;
76  
77      public static TextTagger getTagger() {
78          if (tagger == null) {
79              tagger = new TextTagger();
80          }
81          return tagger;
82      }
83  
84      public TextTagger() {
85          // some static variables should be instance variables undre user configuration ****
86      }
87  
88  
89      public void init() {
90  
91      }
92  
93  
94  
95      public String textToXML(String plainText, String defaultRoot) {
96          String xtxt = plainText;
97  
98  
99  
100         // escape any dodgy characters;
101         xtxt = legalize(xtxt);
102 
103 
104 
105         // extract any attributes;
106         String[] wrapper = new String[2];
107         xtxt = extractAttributes(xtxt, defaultRoot, wrapper);
108 
109 
110         // put in headers - mark these up before paragraphizing
111         xtxt = headerize(xtxt);
112 
113         // extract elements of form "name: value "
114         xtxt=elementize(xtxt);
115 
116 
117         // lists
118         xtxt = listize(xtxt);
119 
120 
121         // paragraphs won't wrap existing markup, so do them before the rest
122         xtxt = paragraphize(xtxt);
123 
124         xtxt = linkize(xtxt);
125 
126         xtxt = emphasize(xtxt);
127 
128 
129         // wrap in root element
130         xtxt = xmlwrap(xtxt, wrapper);
131 
132         // finished - should check legality here...
133         return xtxt;
134     }
135 
136 
137     public String tagText(String plainText) {
138         String xtxt = plainText;
139 
140         xtxt = xtxt.replaceAll("-p-", "\n\n");
141 
142         // escape any dodgy characters;
143         xtxt = legalize(xtxt);
144 
145 
146 
147         // put in headers - mark these up before paragraphizing
148         xtxt = headerize(xtxt);
149 
150         // extract elements of form "name: value "
151 //   xtxt=elementize(xtxt);
152 
153         // lists
154         xtxt = listize(xtxt);
155 
156 
157         // paragraphs won't wrap existing markup, so do them before the rest
158         xtxt = paragraphize(xtxt);
159 
160         xtxt = linkize(xtxt);
161 
162         xtxt = emphasize(xtxt);
163 
164         return xtxt;
165     }
166 
167 
168 
169 
170     public String textToEdit(String plainText, String defaultRoot) {
171         // return legal xml with the content untouched and wrapped in a pre;
172         String xtxt = plainText;
173 
174         String fullxml = textToXML(plainText, defaultRoot);
175 
176 
177         // escape any dodgy characters;
178         xtxt = legalize(xtxt);
179 
180 
181 
182         // extract any attributes;
183         String[] wrapper = new String[2];
184         xtxt = extractAttributesAsElements(xtxt, "source", wrapper);
185 
186         // extract elements of form "name: value "
187         //      xtxt=elementize(xtxt);
188 
189         xtxt = preWrap(xtxt);
190 
191         xtxt = xmlwrap(xtxt, wrapper);
192 
193 
194         StringBuffer sb = new StringBuffer();
195         sb.append("<editsource>\n");
196         sb.append(xtxt);
197         sb.append("\n");
198         sb.append("   <output>\n");
199         sb.append(fullxml);
200         sb.append("\n");
201         sb.append("   </output>\n");
202         sb.append("</editsource>\n");
203 
204         // finished - should check legality here...
205         return sb.toString();
206     }
207 
208 
209 
210 
211     public void textToXML(File ftxt, File fxml) {
212         String txt = FileUtil.readStringFromFile(ftxt);
213         String xtxt = textToXML(txt, "default");
214         FileUtil.writeStringToFile(xtxt, fxml);
215     }
216 
217 
218     public void textToEdit(File ftxt, File fedt) {
219         String txt = FileUtil.readStringFromFile(ftxt);
220         String etxt = textToEdit(txt, "default");
221         FileUtil.writeStringToFile(etxt, fedt);
222     }
223 
224 
225 
226 
227     public String applyReplacements(String txtin, String[][] reps) {
228         String txt = txtin;
229         for (int i = 0; i < reps.length; i++) {
230             String[] lp = reps[i];
231             txt = txt.replaceAll(lp[0], lp[1]);
232         }
233         return txt;
234     }
235 
236 
237     public String applyReplacementsMultiline(String txtin, String[][] reps) {
238         String txt = txtin;
239         for (int i = 0; i < reps.length; i++) {
240             String[] lp = reps[i];
241             Pattern pattern = Pattern.compile(lp[0], Pattern.MULTILINE);
242             Matcher matcher = pattern.matcher(txt);
243             String output = matcher.replaceAll(lp[1]);
244             txt = output;
245         }
246         return txt;
247     }
248 
249 
250 
251 
252     public String legalize(String txt) {
253         return applyReplacements(txt, legalizers);
254     }
255 
256 
257     public String linkize(String txt) {
258         return applyReplacements(txt, linkizers);
259     }
260 
261 
262     public String emphasize(String txt) {
263         return applyReplacements(txt, emphasizers);
264     }
265 
266 
267     public String headerize(String txt) {
268         return applyReplacementsMultiline(txt, headerizers);
269     }
270 
271 
272     public String elementize(String txt) {
273         //      return applyReplacementsMultiline(txt, elementizers);
274         return elementize2(txt);
275     }
276 
277 
278 
279     public String htmlizeParagraph(String apar) {
280         String stmp = legalize(apar);
281         String ret = emphasize(stmp);
282         return ret;
283     }
284 
285 
286 
287 
288 
289     // should be 5 lines max !!!! ******
290 
291     public String elementize2(String txt) {
292         Pattern pat = Pattern.compile(elementTag);
293         Matcher matcher = pat.matcher("");
294 
295         StringBuffer sb = new StringBuffer();
296 
297         try {
298             BufferedReader sr = new BufferedReader(new StringReader(txt));
299             while (sr.ready()) {
300                 String line = sr.readLine();
301                 if (line == null) {
302                     break;
303                 } else {
304                     matcher.reset(line);
305                     if (matcher.find() && matcher.start() == 0) {
306                         String tag = matcher.group(1);
307                         sb.append(makeElement(tag, line));
308                         sb.append("\n");
309                     } else {
310                         sb.append(line);
311                         sb.append("\n");
312                     }
313                 }
314 
315             }
316         } catch (IOException ex) {
317             E.error("io exception listizing ");
318             ex.printStackTrace();
319         }
320 
321         return sb.toString();
322     }
323 
324 
325     // do it all with a regex? **********
326     private String makeElement(String tag, String line) {
327         String ret = "";
328         if (line.startsWith(tag + ":")) {
329             StringBuffer sb = new StringBuffer();
330             sb.append("<");
331             sb.append(tag);
332 
333             String rest = line.substring(tag.length()+1, line.length());
334             StringTokenizer st = new StringTokenizer(rest, ",");
335             boolean doneclose = false;
336             while (st.hasMoreTokens()) {
337                 String stok = st.nextToken();
338 
339                 if (doneclose) {
340                     sb.append(stok);
341                 } else {
342                     int ieq = stok.indexOf("=");
343                     if (ieq > 0) {
344                         String nm = stok.substring(0, ieq);
345                         String val = stok.substring(ieq+1, stok.length());
346                         nm = nm.trim();
347                         nm = nm.replaceAll(" ", "_");
348                         sb.append(" ");
349                         sb.append(nm);
350                         sb.append("=\"");
351                         val = val.trim();
352                         sb.append(val);
353                         sb.append("\"");
354 
355                     } else {
356                         sb.append(">");
357                         sb.append(stok);
358                         doneclose = true;
359                     }
360                 }
361 
362             }
363             if (!doneclose) {
364                 sb.append(">");
365             }
366 
367             sb.append("</");
368             sb.append(tag);
369             sb.append(">\n");
370             ret = sb.toString();
371         } else {
372             E.error(" misinterpreted element? " +
373                     "---" + tag + "---  ---" + line);
374         }
375         return ret;
376     }
377 
378 
379 
380 
381 
382     public String paragraphize(String s) {
383         return paragraphize1(s);
384     }
385 
386 
387 
388     public String paragraphize1(String sin) {
389         String s = sin;
390         s = s.replaceAll("-p-", "\n\n");
391         String[] paras = Pattern.compile(paragraphSeparator, Pattern.MULTILINE).split(s);
392 
393         StringBuffer sb = new StringBuffer();
394 
395         for (int i = 0; i < paras.length; i++) {
396             String par = paras[i];
397             par = par.trim();
398             insertParagraph(sb, par);
399         }
400         return sb.toString();
401     }
402 
403 
404 
405     public String paragraphize2(String s) {
406         // alt paragraphizer
407         // Compile the pattern
408         Pattern pattern = Pattern.compile(paragraphMatch, Pattern.MULTILINE);
409         Matcher matcher = pattern.matcher(s);
410 
411         StringBuffer sb = new StringBuffer();
412         // Read the paragraphs
413         while (matcher.find()) {
414             String para = matcher.group();
415             para = para.trim();
416             insertParagraph(sb, para);
417         }
418         return sb.toString();
419     }
420 
421 
422     void insertParagraph(StringBuffer sb, String para) {
423         if (para.length() > 0) {
424             if (para.indexOf("<") < 0) {
425                 // no markup in the paragraph. Must avoid splitting any exiting markup.
426                 sb.append("\n<p>\n");
427                 sb.append(para);
428                 sb.append("\n</p>\n\n");
429             } else {
430                 sb.append(para);
431                 sb.append("\n");
432             }
433         }
434     }
435 
436 
437 
438 
439     public String listize(String txt) {
440         // spot indented blocks beginning with the same symbol, such as
441         /*
442 
443         + point one
444         more on point 1
445         + pont 2 and so on
446         + last one
447 
448         so to initialize list it is one or more spaces, a character, one or more spaces.
449         continuing items have the same indent. New items have indent with character. Blank lines
450         can precede another point, but otherwise end the list as does breaking the indent.
451         */
452 
453         Pattern pat = Pattern.compile(listItemPrefix);
454         Matcher matcher = pat.matcher("");
455 
456         StringBuffer sb = new StringBuffer();
457 
458         try {
459             BufferedReader sr = new BufferedReader(new StringReader(txt));
460 
461             while (sr.ready()) {
462                 String line = sr.readLine();
463 
464                 if (line == null) {
465                     break;
466                 } else {
467 
468                     matcher.reset(line);
469                     if (matcher.find() && matcher.start() == 0) {
470 
471                         String itemintro = matcher.group();
472                         wrapList(sr, sb, itemintro, line);
473 
474                     } else {
475                         sb.append(line);
476                         sb.append("\n");
477                     }
478                 }
479 
480             }
481         } catch (IOException ex) {
482             E.error("io exception listizing ");
483             ex.printStackTrace();
484         }
485 
486         return sb.toString();
487     }
488 
489 
490     public void wrapList(BufferedReader sr, StringBuffer sb,
491                          String itemintro, String fl) throws IOException {
492         String firstline = fl;
493         String itemcont = itemintro.replaceFirst("\\S", " ");
494 
495         boolean inli = false;
496         boolean inul = false;
497 
498         sb.append("\n\n<ul>\n");
499         inul = true;
500 
501         sb.append("  <li>\n");
502         inli = true;
503 
504         firstline = firstline.substring(itemintro.length(), firstline.length());
505         sb.append(firstline);
506         sb.append("\n");
507 
508         String line = "";
509         while (sr.ready()) {
510             line = sr.readLine();
511             if (line == null) {
512                 break;
513 
514             } else if (line.trim().length() == 0) {
515                 // just skip it;
516 
517             } else if (line.startsWith(itemcont)) {
518                 sb.append(line);
519                 sb.append("\n");
520                 line = null;
521 
522             } else if (line.startsWith(itemintro)) {
523                 if (inli) {
524                     sb.append("   </li>\n");
525                 }
526                 line = line.substring(itemintro.length(), line.length());
527                 sb.append("   <li>\n");
528                 sb.append(line);
529                 sb.append("\n");
530                 line = null;
531 
532             } else {
533                 break;
534             }
535         }
536 
537 
538         if (inli) {
539             sb.append("</li>\n");
540         }
541 
542         if (inul) {
543             sb.append("</ul>\n\n");
544         }
545 
546         if (line != null) {
547             sb.append("\n\n");
548             sb.append(line);
549             sb.append("\n");
550         }
551     }
552 
553 
554 
555 
556 
557 
558 
559 
560 
561     String xmlwrap(String xtxt, String[] wrapper) {
562         StringBuffer sb = new StringBuffer();
563         sb.append(wrapper[0]);
564         sb.append("\n");
565         sb.append(xtxt);
566         sb.append("\n");
567         sb.append(wrapper[1]);
568         sb.append("\n");
569         return sb.toString();
570     }
571 
572 
573     String preWrap(String s) {
574         StringBuffer sb = new StringBuffer();
575         sb.append("<pre>\n");
576         sb.append(s);
577         sb.append("\n");
578         sb.append("</pre>\n");
579         return sb.toString();
580     }
581 
582 
583 
584 
585     String htmlWrap(String s) {
586         StringBuffer sb = new StringBuffer();
587         sb.append("<html>\n");
588         sb.append("<head></head>\n");
589         sb.append("<body>\n");
590         sb.append(s);
591         sb.append("\n");
592         sb.append("</body>\n");
593         return sb.toString();
594     }
595 
596 
597 
598     String readAttributes(String xtxtin, String[][] sat) {
599         String xtxt = xtxtin;
600         Pattern pattern = Pattern.compile(attributeLine, Pattern.MULTILINE);
601 
602         int nat = 0;
603         Matcher matcher = pattern.matcher("");
604         while (true) {
605             xtxt = xtxt.trim();
606             matcher.reset(xtxt);
607             if (xtxt.length() > 0 && matcher.find() && matcher.start() == 0) {
608                 String group = matcher.group();
609                 xtxt = xtxt.substring(group.length(), xtxt.length());
610 
611                 String name = matcher.group(2);
612                 String value = matcher.group(3).trim();
613 
614                 if (containsAttribute(sat, nat, name)) {
615                     E.warning(" ignoringdubplicate attribute " +
616                               name + " " + value);
617 
618                 } else {
619                     sat[nat][0] = name;
620                     sat[nat][1] = value;
621                     nat++;
622                 }
623 
624             } else {
625                 break;
626             }
627         }
628         return xtxt;
629     }
630 
631 
632 
633     // ugly - shouldn't use if more than a couple of attributes;
634     private boolean containsAttribute(String[][] sat, int nat, String name) {
635         boolean contains = false;
636         for (int i = 0; i<nat; i++) {
637             if (sat[i][0].equals(name)) {
638                 contains = true;
639                 break;
640             }
641         }
642         return contains;
643     }
644 
645 
646 
647 
648     String extractAttributes(String xtxtin, String defaultRoot, String[] wrapper) {
649         String xtxt = xtxtin;
650         String[][] sat = new String[100][2]; //***
651 
652         xtxt = readAttributes(xtxt, sat);
653 
654         StringBuffer sb = new StringBuffer();
655         String eltname = defaultRoot;
656         if (eltname == null) {
657             eltname = "default";
658         }
659         for (int i = 0; i < sat.length && sat[i][0] != null; i++) {
660             if (sat[i][0].equals("type")) {
661                 eltname = sat[i][1].replaceAll("\\s", "_");
662             } else {
663                 sb.append("\n      ");
664                 sb.append(sat[i][0]);
665                 sb.append("=\"");
666                 sb.append(sat[i][1]);
667                 sb.append("\"");
668             }
669 
670         }
671         wrapper[0] = ("<" + eltname + " src=\"txt\"" + sb.toString() + ">");
672         wrapper[1] = ("</" + eltname + ">");
673         return xtxt;
674     }
675 
676 
677 
678 
679 
680 
681     public String extractAttributesAsElements(String xtxtin, String defaultRoot,
682             String[] wrapper) {
683         String[][] sat = new String[100][2];
684 
685         String xtxt = readAttributes(xtxtin, sat);
686 
687         StringBuffer sb = new StringBuffer();
688         for (int i = 0; i < sat.length && sat[i][0] != null; i++) {
689             sb.append("   <attribute name=\"");
690             sb.append(sat[i][0]);
691             sb.append("\" value=\"");
692             sb.append(sat[i][1]);
693             sb.append("\"/>\n");
694         }
695         wrapper[0] = ("<" + defaultRoot + ">\n" + sb.toString());
696         wrapper[1] = ("</" + defaultRoot + ">");
697         return xtxt;
698     }
699 
700 
701 
702 
703     public static void main(String[] argv) {
704 
705         TextTagger tt = new TextTagger();
706 
707         String sd = argv[0];
708 
709         File fdir = new File(sd);
710         File[] af = fdir.listFiles();
711 
712         for (int i = 0; i < af.length; i++) {
713             String fnm = af[i].getName();
714             if (fnm.endsWith(".txt")) {
715 
716 
717 
718                 File ftxt = af[i];
719                 String sr = ftxt.getName();
720                 sr = sr.substring(0, sr.length() - 4);
721 
722                 File fedt = new File(fdir, sr + ".edt");
723                 tt.textToEdit(ftxt, fedt);
724 
725                 File fxml = new File(fdir, sr + ".xml");
726                 tt.textToXML(ftxt, fxml);
727             }
728         }
729     }
730 
731 
732 
733 
734 
735 }