View Javadoc

1   package org.catacomb.serial.xml;
2   
3   import org.catacomb.report.E;
4   
5   import java.io.IOException;
6   import java.io.StreamTokenizer;
7   import java.io.StringReader;
8   
9   import java.util.HashMap;
10  
11  
12  public class XMLTokenizer {
13  
14      static int iq;
15      static int ieq;
16      static int iabo;
17      static int iabc;
18      static int iqm;
19      static int iexc;
20      static int ims;
21  
22      StreamTokenizer streamTokenizer;
23  
24  
25      static {
26          String sord = "\"=<>?!-";
27          iq = sord.charAt(0);
28          ieq = sord.charAt(1);
29          iabo = sord.charAt(2);
30          iabc = sord.charAt(3);
31          iqm = sord.charAt(4);
32          iexc = sord.charAt(5);
33          ims = sord.charAt(6);
34      }
35  
36  
37  
38      HashMap<String, String> cdataHM;
39  
40      String srcString;
41  
42      public XMLTokenizer(String s) {
43          // EFF remove this - just for debugging;
44          srcString = extractCDATAs(s);
45  
46  
47          streamTokenizer = new StreamTokenizer(new StringReader(srcString));
48          initializeStreamTokenizer(streamTokenizer);
49      }
50  
51  
52      private String extractCDATAs(String src) {
53          StringBuffer sret = new StringBuffer();
54          int icur = 0;
55          int iscd = src.indexOf("<![CDATA[");
56  
57          while (iscd >= icur) {
58              sret.append(src.substring(icur, iscd));
59              int iecd = src.indexOf("]]>", iscd + 9);
60              if (iecd >= 0) {
61                  String cdata = src.substring(iscd + 9, iecd);
62                  if (cdataHM == null) {
63                      cdataHM = new HashMap<String, String>();
64                  }
65                  String rpl = "xyz" + cdataHM.size();
66                  cdataHM.put(rpl, cdata);
67                  sret.append(rpl);
68  
69              } else {
70                  iecd = iscd + 6;
71                  E.error("no closure of cdata beginning character " + iscd + "? ");
72              }
73              icur = iecd + 3;
74              iscd = src.indexOf("<![CDATA[", icur);
75          }
76          if (icur < src.length()) {
77              sret.append(src.substring(icur, src.length()));
78          }
79          return sret.toString();
80      }
81  
82  
83      private void setStringValue(XMLToken xmlt, String svin) {
84          String sv = svin;
85          if (sv.startsWith("xyz")) {
86              if (cdataHM != null && cdataHM.containsKey(sv)) {
87                  sv = cdataHM.get(sv);
88              } else {
89                  E.warning("looks like a CDATA key, but not present? " + sv);
90              }
91          }
92  
93          xmlt.setStringValue(sv);
94      }
95  
96  
97      public int lineno() {
98          return streamTokenizer.lineno();
99      }
100 
101 
102     public void initializeStreamTokenizer(StreamTokenizer st) {
103         st.resetSyntax();
104         st.eolIsSignificant(false);
105         st.slashStarComments(false);
106         st.slashSlashComments(false);
107         st.lowerCaseMode(false);
108         String slim = "AZaz09";
109         st.wordChars(slim.charAt(0), slim.charAt(1));
110         st.wordChars(slim.charAt(2), slim.charAt(3));
111         st.wordChars(slim.charAt(4), slim.charAt(5));
112         // st.wordChars(0x00A0, 0x00FF);
113 
114 
115         String wsc = " \t\n";
116         for (int i = 0; i < wsc.length(); i++) {
117             int ic = wsc.charAt(i);
118             st.whitespaceChars(ic, ic);
119         }
120 
121 
122         st.quoteChar(iq);
123 
124         String swc = "_/.:&;,()\'+-.[]{}$";
125         for (int i = 0; i < swc.length(); i++) {
126             int ic = swc.charAt(i);
127             st.wordChars(ic, ic);
128         }
129     }
130 
131 
132     public XMLToken nextToken() {
133         XMLToken xmlt = new XMLToken();
134         int itok = ntok(streamTokenizer);
135 
136 
137         if (streamTokenizer.ttype == StreamTokenizer.TT_EOF) {
138             xmlt.setType(XMLToken.NONE);
139 
140 
141         } else if (itok == iq) {
142             xmlt.setType(XMLToken.STRING);
143             // quoted string;
144             String sss = streamTokenizer.sval;
145             setStringValue(xmlt, StringEncoder.xmlUnescape(sss));
146 
147 
148         } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
149             xmlt.setType(XMLToken.STRING);
150             setStringValue(xmlt, StringEncoder.xmlUnescape(streamTokenizer.sval));
151 
152         } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
153             xmlt.setType(XMLToken.NUMBER);
154             // boolean, int or double, all as doubles;
155             double d = streamTokenizer.nval;
156             ntok(streamTokenizer);
157             if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
158                     && ((streamTokenizer.sval).startsWith("E-")
159                         || (streamTokenizer.sval).startsWith("E+") || (streamTokenizer.sval).startsWith("E"))) { // POSERR
160                 // -
161                 // catches
162                 // wrong
163                 // things?
164 
165                 String s = streamTokenizer.sval.substring(1, streamTokenizer.sval.length());
166                 int ppp = Integer.parseInt(s);
167                 // err ("st.sval " + st.sval);
168                 // err ("read exponent: " + ppp);
169                 d *= Math.pow(10., ppp);
170             } else {
171                 streamTokenizer.pushBack();
172             }
173             xmlt.setDValue(d);
174 
175 
176         } else if (itok == iabo) {
177             itok = ntok(streamTokenizer);
178             String sv = streamTokenizer.sval;
179 
180             if (itok == iqm) {
181                 // should be the first line of a file - read on until
182                 // the next question mark, just keeping the text in sinfo
183                 // for now;
184                 xmlt.setType(XMLToken.INTRO);
185                 String svalue = "";
186                 itok = -1;
187                 while (itok != iqm) {
188                     itok = ntok(streamTokenizer);
189                     if (streamTokenizer.sval != null)
190                         svalue += streamTokenizer.sval + " ";
191                 }
192                 setStringValue(xmlt, svalue);
193 
194             } else if (itok == iexc) {
195                 itok = ntok(streamTokenizer);
196                 String sval = streamTokenizer.sval;
197 
198                 String svalue = "";
199                 if (sval != null && sval.startsWith("[CDATA[")) {
200                     E.error("shouldn't get CDATA in xml tokenizer");
201 
202                 } else if (sval != null && sval.startsWith("--")) {
203                     xmlt.setType(XMLToken.COMMENT);
204                     svalue = streamTokenizer.sval.substring(2, streamTokenizer.sval.length()) + " ";
205                     while (itok != iabc) {
206                         itok = ntok(streamTokenizer);
207                         if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
208                             svalue += streamTokenizer.sval + " ";
209                         } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
210                             svalue += " " + streamTokenizer.nval;
211                         }
212                     }
213                     streamTokenizer.pushBack();
214 
215 
216 
217                 } else if (itok == ims) {
218                     itok = ntok(streamTokenizer);
219                     if (itok == ims) {
220                         int[] ipr = new int[3];
221                         while (ipr[0] != ims || ipr[1] != ims || ipr[2] != iabc) {
222                             itok = ntok(streamTokenizer);
223 
224                             if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
225                                 svalue += streamTokenizer.sval + " ";
226                             } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
227                                 svalue += " " + streamTokenizer.nval;
228                             }
229                             if (streamTokenizer.sval != null && streamTokenizer.sval.endsWith("--")) {
230                                 ipr[1] = ims;
231                                 ipr[2] = ims;
232                             } else {
233                                 ipr[0] = ipr[1];
234                                 ipr[1] = ipr[2];
235                                 ipr[2] = itok;
236                             }
237                         }
238                         streamTokenizer.pushBack();
239                     } else {
240                         E.error("found <!- but not followed by -  at " + streamTokenizer.lineno());
241                     }
242                 } else {
243                     E.error("found <! but not followed by -  at " + streamTokenizer.lineno());
244                 }
245                 setStringValue(xmlt, svalue);
246 
247 
248             } else if (sv.startsWith("/")) {
249                 xmlt.setType(XMLToken.CLOSE);
250                 setStringValue(xmlt, sv.substring(1, sv.length()));
251 
252             } else {
253                 if (sv.endsWith("/")) {
254                     xmlt.setType(XMLToken.OPENCLOSE);
255                     setStringValue(xmlt, sv.substring(0, sv.length() - 1));
256                 } else {
257                     xmlt.setType(XMLToken.OPEN);
258                     setStringValue(xmlt, sv);
259                 }
260             }
261 
262             itok = ntok(streamTokenizer);
263             if (itok == iabc) {
264                 // fine - end of tag;
265 
266             } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
267                 String[] attNV = new String[160]; // EFF check eff
268                 int natt = 0;
269 
270                 while (itok != iabc) {
271 
272                     if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
273                         if (streamTokenizer.sval.equals("/")) {
274                             xmlt.setType(XMLToken.OPENCLOSE);
275 
276                         } else {
277                             attNV[2 * natt] = streamTokenizer.sval;
278                             itok = ntok(streamTokenizer);
279                             if (itok == ieq) {
280                                 itok = ntok(streamTokenizer);
281 
282                                 if (itok == iq) {
283                                     attNV[2 * natt + 1] = streamTokenizer.sval;
284                                     natt++;
285                                 } else {
286                                     E.shortError("expecting quoted string " + " while reading atributes "
287                                                  + "but got " + stok(itok) + " sval=" + streamTokenizer.sval
288                                                  + " nval=" + streamTokenizer.nval);
289                                     E.info("original string was " + srcString);
290                                 }
291                             } else {
292                                 E.shortError("at " + streamTokenizer.lineno()
293                                              + " expecting = while reading attributes " + "but got " + stok(itok)
294                                              + " sval=" + streamTokenizer.sval + " nval=" + streamTokenizer.nval);
295                                 E.info("original string was " + srcString);
296                             }
297                         }
298                     } else {
299                         E.shortError("at line " + streamTokenizer.lineno()
300                                      + " found non-word while reading attributes " + stok(itok)
301                                      + "  item so far = " + this);
302                         E.info("original string was " + srcString);
303                     }
304                     itok = ntok(streamTokenizer);
305                 }
306                 String[] sat = new String[2 * natt];
307                 for (int i = 0; i < 2 * natt; i++) {
308                     sat[i] = attNV[i];
309                 }
310                 xmlt.setAttributes(sat);
311 
312             } else {
313                 E.error("expecting word " + stok(itok));
314 
315             }
316 
317         } else {
318             // just return the token as a string;
319             xmlt.setType(XMLToken.STRING);
320             setStringValue(xmlt, stok(itok));
321 
322         }
323         return xmlt;
324     }
325 
326 
327 
328     private int ntok(StreamTokenizer st) {
329         int itok = -1;
330         try {
331             itok = st.nextToken();
332         } catch (IOException e) {
333             err(" " + e);
334             itok = -999;
335         }
336 
337         /*
338          * if (count < 20) { E.info("token " + count + " " + itok + " " + st.sval + " " +
339          * st.nval); count += 1; }
340          */
341 
342         return itok;
343     }
344 
345 
346     private String stok(int itok) {
347         return "" + (char)itok;
348     }
349 
350 
351     private void err(String s) {
352         System.out.println(s);
353     }
354 
355 }