View Javadoc

1   package org.textensor.xml;
2   
3   import java.io.IOException;
4   import java.io.StreamTokenizer;
5   import java.io.StringReader;
6   
7   import java.util.HashMap;
8   
9   import org.textensor.report.E;
10  
11  
12  public class XMLTokenizer {
13  
14      static int iq;
15      static int ieq;
16      static int iabo;
17      static int iabc;
18      static int iqm;
19      static int iexc;
20      static int ims;
21  
22      StreamTokenizer streamTokenizer;
23  
24      int count;
25  
26  
27      static {
28          String sord = "\"=<>?!-";
29          iq = sord.charAt(0);
30          ieq = sord.charAt(1);
31          iabo = sord.charAt(2);
32          iabc = sord.charAt(3);
33          iqm = sord.charAt(4);
34          iexc = sord.charAt(5);
35          ims = sord.charAt(6);
36      }
37  
38  
39  
40      HashMap<String, String> cdataHM;
41  
42      String srcString;
43  
44      public XMLTokenizer(String s) {
45          // EFF remove this - just for debugging;
46          srcString = extractCDATAs(s);
47  
48  
49          streamTokenizer = new StreamTokenizer(new StringReader(srcString));
50          initializeStreamTokenizer(streamTokenizer);
51      }
52  
53  
54      private String extractCDATAs(String src) {
55          StringBuffer sret = new StringBuffer();
56          int icur = 0;
57          int iscd = src.indexOf("<![CDATA[");
58  
59          while (iscd >= icur) {
60              sret.append(src.substring(icur, iscd));
61              int iecd = src.indexOf("]]>", iscd + 9);
62              if (iecd >= 0) {
63                  String cdata = src.substring(iscd + 9, iecd);
64                  if (cdataHM == null) {
65                      cdataHM = new HashMap<String, String>();
66                  }
67                  String rpl = "xyz" + cdataHM.size();
68                  cdataHM.put(rpl, cdata);
69                  sret.append(rpl);
70  
71              } else {
72                  iecd = iscd + 6;
73                  E.error("no closure of cdata beginning character " + iscd + "? ");
74              }
75              icur = iecd + 3;
76              iscd = src.indexOf("<![CDATA[", icur);
77          }
78          if (icur < src.length()) {
79              sret.append(src.substring(icur, src.length()));
80          }
81          return sret.toString();
82      }
83  
84  
85      private void setStringValue(XMLToken xmlt, String sv) {
86          if (sv.startsWith("xyz")) {
87              if (cdataHM != null && cdataHM.containsKey(sv)) {
88                  sv = cdataHM.get(sv);
89              } else {
90                  E.warning("looks like a CDATA key, but not present? " + sv);
91              }
92          }
93  
94          xmlt.setStringValue(sv);
95      }
96  
97  
98      public int lineno() {
99          return streamTokenizer.lineno();
100     }
101 
102 
103     public void initializeStreamTokenizer(StreamTokenizer st) {
104         st.resetSyntax();
105         st.eolIsSignificant(false);
106         st.slashStarComments(false);
107         st.slashSlashComments(false);
108         st.lowerCaseMode(false);
109         String slim = "AZaz09";
110         st.wordChars(slim.charAt(0), slim.charAt(1));
111         st.wordChars(slim.charAt(2), slim.charAt(3));
112         st.wordChars(slim.charAt(4), slim.charAt(5));
113         // st.wordChars(0x00A0, 0x00FF);
114 
115 
116         String wsc = " \t\n";
117         for (int i = 0; i < wsc.length(); i++) {
118             int ic = wsc.charAt(i);
119             st.whitespaceChars(ic, ic);
120         }
121 
122 
123         st.quoteChar(iq);
124 
125         String swc = "_/.:&;,()\'+-.[]{}$";
126         for (int i = 0; i < swc.length(); i++) {
127             int ic = swc.charAt(i);
128             st.wordChars(ic, ic);
129         }
130     }
131 
132 
133     public XMLToken nextToken() {
134         XMLToken xmlt = new XMLToken();
135         int itok = ntok(streamTokenizer);
136 
137 
138         if (streamTokenizer.ttype == StreamTokenizer.TT_EOF) {
139             xmlt.setType(XMLToken.NONE);
140 
141 
142         } else if (itok == iq) {
143             xmlt.setType(XMLToken.STRING);
144             // quoted string;
145             String sss = streamTokenizer.sval;
146             setStringValue(xmlt, StringEncoder.xmlUnescape(sss));
147 
148 
149         } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
150             xmlt.setType(XMLToken.STRING);
151             setStringValue(xmlt, StringEncoder.xmlUnescape(streamTokenizer.sval));
152 
153         } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
154             xmlt.setType(XMLToken.NUMBER);
155             // boolean, int or double, all as doubles;
156             double d = streamTokenizer.nval;
157             ntok(streamTokenizer);
158             if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
159                     && ((streamTokenizer.sval).startsWith("E-")
160                         || (streamTokenizer.sval).startsWith("E+") || (streamTokenizer.sval).startsWith("E"))) { // POSERR
161                 // -
162                 // catches
163                 // wrong
164                 // things?
165 
166                 String s = streamTokenizer.sval.substring(1, streamTokenizer.sval.length());
167                 int ppp = Integer.parseInt(s);
168                 // err ("st.sval " + st.sval);
169                 // err ("read exponent: " + ppp);
170                 d *= Math.pow(10., ppp);
171             } else {
172                 streamTokenizer.pushBack();
173             }
174             xmlt.setDValue(d);
175 
176 
177         } else if (itok == iabo) {
178             itok = ntok(streamTokenizer);
179             String sv = streamTokenizer.sval;
180 
181             if (itok == iqm) {
182                 // should be the first line of a file - read on until
183                 // the next question mark, just keeping the text in sinfo
184                 // for now;
185                 xmlt.setType(XMLToken.INTRO);
186                 String svalue = "";
187                 itok = -1;
188                 while (itok != iqm) {
189                     itok = ntok(streamTokenizer);
190                     if (streamTokenizer.sval != null)
191                         svalue += streamTokenizer.sval + " ";
192                 }
193                 setStringValue(xmlt, svalue);
194 
195             } else if (itok == iexc) {
196                 itok = ntok(streamTokenizer);
197                 String sval = streamTokenizer.sval;
198 
199                 String svalue = "";
200                 if (sval != null && sval.startsWith("[CDATA[")) {
201                     E.error("shouldn't get CDATA in xml tokenizer");
202 
203                 } else if (sval.startsWith("--")) {
204                     xmlt.setType(XMLToken.COMMENT);
205                     svalue = streamTokenizer.sval.substring(2, streamTokenizer.sval.length()) + " ";
206                     while (itok != iabc || !(svalue.endsWith("--"))) {
207                         itok = ntok(streamTokenizer);
208                         if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
209                             svalue += " " + streamTokenizer.sval;
210                             //  pstok = streamTokenizer.sval;
211                         } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
212                             svalue += " " + streamTokenizer.nval;
213                             //  pstok = "";
214                         }
215                     }
216                     xmlt.setStringValue(svalue.substring(0, svalue.length() -2));
217                     streamTokenizer.pushBack();
218 
219 
220                 } else if (itok == ims) {
221                     itok = ntok(streamTokenizer);
222                     if (itok == ims) {
223                         E.info("reading comment start as separate minus signs");
224                         int[] ipr = new int[3];
225                         while (ipr[0] != ims || ipr[1] != ims || ipr[2] != iabc) {
226                             itok = ntok(streamTokenizer);
227 
228                             if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
229                                 svalue += streamTokenizer.sval + " ";
230                             } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
231                                 svalue += " " + streamTokenizer.nval;
232                             }
233                             if (streamTokenizer.sval != null && streamTokenizer.sval.endsWith("--")) {
234                                 ipr[1] = ims;
235                                 ipr[2] = ims;
236                             } else {
237                                 ipr[0] = ipr[1];
238                                 ipr[1] = ipr[2];
239                                 ipr[2] = itok;
240                             }
241                         }
242                         streamTokenizer.pushBack();
243                     } else {
244                         E.error("found <!- but not followed by -  at " + streamTokenizer.lineno());
245                     }
246                 } else {
247                     E.error("found <! but not followed by -  at " + streamTokenizer.lineno());
248                 }
249                 setStringValue(xmlt, svalue);
250 
251 
252             } else if (sv.startsWith("/")) {
253                 xmlt.setType(XMLToken.CLOSE);
254                 setStringValue(xmlt, sv.substring(1, sv.length()));
255 
256             } else {
257                 if (sv.endsWith("/")) {
258                     xmlt.setType(XMLToken.OPENCLOSE);
259                     setStringValue(xmlt, sv.substring(0, sv.length() - 1));
260                 } else {
261                     xmlt.setType(XMLToken.OPEN);
262                     setStringValue(xmlt, sv);
263                 }
264             }
265 
266             itok = ntok(streamTokenizer);
267             if (itok == iabc) {
268                 // fine - end of tag;
269 
270             } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
271                 String[] attNV = new String[160]; // EFF check eff
272                 int natt = 0;
273 
274                 while (itok != iabc) {
275 
276                     if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
277                         if (streamTokenizer.sval.equals("/")) {
278                             xmlt.setType(XMLToken.OPENCLOSE);
279 
280                         } else {
281                             attNV[2 * natt] = streamTokenizer.sval;
282                             itok = ntok(streamTokenizer);
283                             if (itok == ieq) {
284                                 itok = ntok(streamTokenizer);
285 
286                                 if (itok == iq) {
287                                     attNV[2 * natt + 1] = streamTokenizer.sval;
288                                     natt++;
289                                 } else {
290                                     E.shortError("expecting quoted string " + " while reading atributes "
291                                                  + "but got " + stok(itok) + " sval=" + streamTokenizer.sval
292                                                  + " nval=" + streamTokenizer.nval);
293                                     E.info("original string was " + srcString);
294                                 }
295                             } else {
296                                 E.shortError("at " + streamTokenizer.lineno()
297                                              + " expecting = while reading attributes " + "but got " + stok(itok)
298                                              + " sval=" + streamTokenizer.sval + " nval=" + streamTokenizer.nval);
299                                 E.info("original string was " + srcString);
300                             }
301                         }
302                     } else {
303                         E.shortError("at line " + streamTokenizer.lineno()
304                                      + " found non-word while reading attributes " + stok(itok)
305                                      + "  item so far = " + this);
306                         E.info("original string was " + srcString);
307                     }
308                     itok = ntok(streamTokenizer);
309                 }
310                 String[] sat = new String[2 * natt];
311                 for (int i = 0; i < 2 * natt; i++) {
312                     sat[i] = attNV[i];
313                 }
314                 xmlt.setAttributes(sat);
315 
316             } else {
317                 E.error("expecting word " + stok(itok));
318 
319             }
320 
321         } else {
322             // just return the token as a string;
323             xmlt.setType(XMLToken.STRING);
324             setStringValue(xmlt, stok(itok));
325 
326         }
327         return xmlt;
328     }
329 
330 
331 
332     private int ntok(StreamTokenizer st) {
333         int itok = -1;
334         try {
335             itok = st.nextToken();
336         } catch (IOException e) {
337             err(" " + e);
338             itok = -999;
339         }
340 
341         /*
342          * if (count < 20) { E.info("token " + count + " " + itok + " " + st.sval + " " +
343          * st.nval); count += 1; }
344          */
345 
346         return itok;
347     }
348 
349 
350     private String stok(int itok) {
351         return "" + (char)itok;
352     }
353 
354 
355     private void err(String s) {
356         System.out.println(s);
357     }
358 
359 }