1 package org.catacomb.serial.xml;
2
import org.catacomb.report.E;

import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
10
11
12 public class XMLTokenizer {
13
14 static int iq;
15 static int ieq;
16 static int iabo;
17 static int iabc;
18 static int iqm;
19 static int iexc;
20 static int ims;
21
22 StreamTokenizer streamTokenizer;
23
24
25 static {
26 String sord = "\"=<>?!-";
27 iq = sord.charAt(0);
28 ieq = sord.charAt(1);
29 iabo = sord.charAt(2);
30 iabc = sord.charAt(3);
31 iqm = sord.charAt(4);
32 iexc = sord.charAt(5);
33 ims = sord.charAt(6);
34 }
35
36
37
38 HashMap<String, String> cdataHM;
39
40 String srcString;
41
42 public XMLTokenizer(String s) {
43
44 srcString = extractCDATAs(s);
45
46
47 streamTokenizer = new StreamTokenizer(new StringReader(srcString));
48 initializeStreamTokenizer(streamTokenizer);
49 }
50
51
52 private String extractCDATAs(String src) {
53 StringBuffer sret = new StringBuffer();
54 int icur = 0;
55 int iscd = src.indexOf("<![CDATA[");
56
57 while (iscd >= icur) {
58 sret.append(src.substring(icur, iscd));
59 int iecd = src.indexOf("]]>", iscd + 9);
60 if (iecd >= 0) {
61 String cdata = src.substring(iscd + 9, iecd);
62 if (cdataHM == null) {
63 cdataHM = new HashMap<String, String>();
64 }
65 String rpl = "xyz" + cdataHM.size();
66 cdataHM.put(rpl, cdata);
67 sret.append(rpl);
68
69 } else {
70 iecd = iscd + 6;
71 E.error("no closure of cdata beginning character " + iscd + "? ");
72 }
73 icur = iecd + 3;
74 iscd = src.indexOf("<![CDATA[", icur);
75 }
76 if (icur < src.length()) {
77 sret.append(src.substring(icur, src.length()));
78 }
79 return sret.toString();
80 }
81
82
83 private void setStringValue(XMLToken xmlt, String svin) {
84 String sv = svin;
85 if (sv.startsWith("xyz")) {
86 if (cdataHM != null && cdataHM.containsKey(sv)) {
87 sv = cdataHM.get(sv);
88 } else {
89 E.warning("looks like a CDATA key, but not present? " + sv);
90 }
91 }
92
93 xmlt.setStringValue(sv);
94 }
95
96
97 public int lineno() {
98 return streamTokenizer.lineno();
99 }
100
101
102 public void initializeStreamTokenizer(StreamTokenizer st) {
103 st.resetSyntax();
104 st.eolIsSignificant(false);
105 st.slashStarComments(false);
106 st.slashSlashComments(false);
107 st.lowerCaseMode(false);
108 String slim = "AZaz09";
109 st.wordChars(slim.charAt(0), slim.charAt(1));
110 st.wordChars(slim.charAt(2), slim.charAt(3));
111 st.wordChars(slim.charAt(4), slim.charAt(5));
112
113
114
115 String wsc = " \t\n";
116 for (int i = 0; i < wsc.length(); i++) {
117 int ic = wsc.charAt(i);
118 st.whitespaceChars(ic, ic);
119 }
120
121
122 st.quoteChar(iq);
123
124 String swc = "_/.:&;,()\'+-.[]{}$";
125 for (int i = 0; i < swc.length(); i++) {
126 int ic = swc.charAt(i);
127 st.wordChars(ic, ic);
128 }
129 }
130
131
132 public XMLToken nextToken() {
133 XMLToken xmlt = new XMLToken();
134 int itok = ntok(streamTokenizer);
135
136
137 if (streamTokenizer.ttype == StreamTokenizer.TT_EOF) {
138 xmlt.setType(XMLToken.NONE);
139
140
141 } else if (itok == iq) {
142 xmlt.setType(XMLToken.STRING);
143
144 String sss = streamTokenizer.sval;
145 setStringValue(xmlt, StringEncoder.xmlUnescape(sss));
146
147
148 } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
149 xmlt.setType(XMLToken.STRING);
150 setStringValue(xmlt, StringEncoder.xmlUnescape(streamTokenizer.sval));
151
152 } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
153 xmlt.setType(XMLToken.NUMBER);
154
155 double d = streamTokenizer.nval;
156 ntok(streamTokenizer);
157 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
158 && ((streamTokenizer.sval).startsWith("E-")
159 || (streamTokenizer.sval).startsWith("E+") || (streamTokenizer.sval).startsWith("E"))) {
160
161
162
163
164
165 String s = streamTokenizer.sval.substring(1, streamTokenizer.sval.length());
166 int ppp = Integer.parseInt(s);
167
168
169 d *= Math.pow(10., ppp);
170 } else {
171 streamTokenizer.pushBack();
172 }
173 xmlt.setDValue(d);
174
175
176 } else if (itok == iabo) {
177 itok = ntok(streamTokenizer);
178 String sv = streamTokenizer.sval;
179
180 if (itok == iqm) {
181
182
183
184 xmlt.setType(XMLToken.INTRO);
185 String svalue = "";
186 itok = -1;
187 while (itok != iqm) {
188 itok = ntok(streamTokenizer);
189 if (streamTokenizer.sval != null)
190 svalue += streamTokenizer.sval + " ";
191 }
192 setStringValue(xmlt, svalue);
193
194 } else if (itok == iexc) {
195 itok = ntok(streamTokenizer);
196 String sval = streamTokenizer.sval;
197
198 String svalue = "";
199 if (sval != null && sval.startsWith("[CDATA[")) {
200 E.error("shouldn't get CDATA in xml tokenizer");
201
202 } else if (sval != null && sval.startsWith("--")) {
203 xmlt.setType(XMLToken.COMMENT);
204 svalue = streamTokenizer.sval.substring(2, streamTokenizer.sval.length()) + " ";
205 while (itok != iabc) {
206 itok = ntok(streamTokenizer);
207 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
208 svalue += streamTokenizer.sval + " ";
209 } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
210 svalue += " " + streamTokenizer.nval;
211 }
212 }
213 streamTokenizer.pushBack();
214
215
216
217 } else if (itok == ims) {
218 itok = ntok(streamTokenizer);
219 if (itok == ims) {
220 int[] ipr = new int[3];
221 while (ipr[0] != ims || ipr[1] != ims || ipr[2] != iabc) {
222 itok = ntok(streamTokenizer);
223
224 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
225 svalue += streamTokenizer.sval + " ";
226 } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
227 svalue += " " + streamTokenizer.nval;
228 }
229 if (streamTokenizer.sval != null && streamTokenizer.sval.endsWith("--")) {
230 ipr[1] = ims;
231 ipr[2] = ims;
232 } else {
233 ipr[0] = ipr[1];
234 ipr[1] = ipr[2];
235 ipr[2] = itok;
236 }
237 }
238 streamTokenizer.pushBack();
239 } else {
240 E.error("found <!- but not followed by - at " + streamTokenizer.lineno());
241 }
242 } else {
243 E.error("found <! but not followed by - at " + streamTokenizer.lineno());
244 }
245 setStringValue(xmlt, svalue);
246
247
248 } else if (sv.startsWith("/")) {
249 xmlt.setType(XMLToken.CLOSE);
250 setStringValue(xmlt, sv.substring(1, sv.length()));
251
252 } else {
253 if (sv.endsWith("/")) {
254 xmlt.setType(XMLToken.OPENCLOSE);
255 setStringValue(xmlt, sv.substring(0, sv.length() - 1));
256 } else {
257 xmlt.setType(XMLToken.OPEN);
258 setStringValue(xmlt, sv);
259 }
260 }
261
262 itok = ntok(streamTokenizer);
263 if (itok == iabc) {
264
265
266 } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
267 String[] attNV = new String[160];
268 int natt = 0;
269
270 while (itok != iabc) {
271
272 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
273 if (streamTokenizer.sval.equals("/")) {
274 xmlt.setType(XMLToken.OPENCLOSE);
275
276 } else {
277 attNV[2 * natt] = streamTokenizer.sval;
278 itok = ntok(streamTokenizer);
279 if (itok == ieq) {
280 itok = ntok(streamTokenizer);
281
282 if (itok == iq) {
283 attNV[2 * natt + 1] = streamTokenizer.sval;
284 natt++;
285 } else {
286 E.shortError("expecting quoted string " + " while reading atributes "
287 + "but got " + stok(itok) + " sval=" + streamTokenizer.sval
288 + " nval=" + streamTokenizer.nval);
289 E.info("original string was " + srcString);
290 }
291 } else {
292 E.shortError("at " + streamTokenizer.lineno()
293 + " expecting = while reading attributes " + "but got " + stok(itok)
294 + " sval=" + streamTokenizer.sval + " nval=" + streamTokenizer.nval);
295 E.info("original string was " + srcString);
296 }
297 }
298 } else {
299 E.shortError("at line " + streamTokenizer.lineno()
300 + " found non-word while reading attributes " + stok(itok)
301 + " item so far = " + this);
302 E.info("original string was " + srcString);
303 }
304 itok = ntok(streamTokenizer);
305 }
306 String[] sat = new String[2 * natt];
307 for (int i = 0; i < 2 * natt; i++) {
308 sat[i] = attNV[i];
309 }
310 xmlt.setAttributes(sat);
311
312 } else {
313 E.error("expecting word " + stok(itok));
314
315 }
316
317 } else {
318
319 xmlt.setType(XMLToken.STRING);
320 setStringValue(xmlt, stok(itok));
321
322 }
323 return xmlt;
324 }
325
326
327
328 private int ntok(StreamTokenizer st) {
329 int itok = -1;
330 try {
331 itok = st.nextToken();
332 } catch (IOException e) {
333 err(" " + e);
334 itok = -999;
335 }
336
337
338
339
340
341
342 return itok;
343 }
344
345
346 private String stok(int itok) {
347 return "" + (char)itok;
348 }
349
350
351 private void err(String s) {
352 System.out.println(s);
353 }
354
355 }