1 package org.textensor.xml;
2
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import org.textensor.report.E;
10
11
12 public class XMLTokenizer {
13
14 static int iq;
15 static int ieq;
16 static int iabo;
17 static int iabc;
18 static int iqm;
19 static int iexc;
20 static int ims;
21
22 StreamTokenizer streamTokenizer;
23
24 int count;
25
26
27 static {
28 String sord = "\"=<>?!-";
29 iq = sord.charAt(0);
30 ieq = sord.charAt(1);
31 iabo = sord.charAt(2);
32 iabc = sord.charAt(3);
33 iqm = sord.charAt(4);
34 iexc = sord.charAt(5);
35 ims = sord.charAt(6);
36 }
37
38
39
40 HashMap<String, String> cdataHM;
41
42 String srcString;
43
44 public XMLTokenizer(String s) {
45
46 srcString = extractCDATAs(s);
47
48
49 streamTokenizer = new StreamTokenizer(new StringReader(srcString));
50 initializeStreamTokenizer(streamTokenizer);
51 }
52
53
54 private String extractCDATAs(String src) {
55 StringBuffer sret = new StringBuffer();
56 int icur = 0;
57 int iscd = src.indexOf("<![CDATA[");
58
59 while (iscd >= icur) {
60 sret.append(src.substring(icur, iscd));
61 int iecd = src.indexOf("]]>", iscd + 9);
62 if (iecd >= 0) {
63 String cdata = src.substring(iscd + 9, iecd);
64 if (cdataHM == null) {
65 cdataHM = new HashMap<String, String>();
66 }
67 String rpl = "xyz" + cdataHM.size();
68 cdataHM.put(rpl, cdata);
69 sret.append(rpl);
70
71 } else {
72 iecd = iscd + 6;
73 E.error("no closure of cdata beginning character " + iscd + "? ");
74 }
75 icur = iecd + 3;
76 iscd = src.indexOf("<![CDATA[", icur);
77 }
78 if (icur < src.length()) {
79 sret.append(src.substring(icur, src.length()));
80 }
81 return sret.toString();
82 }
83
84
85 private void setStringValue(XMLToken xmlt, String sv) {
86 if (sv.startsWith("xyz")) {
87 if (cdataHM != null && cdataHM.containsKey(sv)) {
88 sv = cdataHM.get(sv);
89 } else {
90 E.warning("looks like a CDATA key, but not present? " + sv);
91 }
92 }
93
94 xmlt.setStringValue(sv);
95 }
96
97
98 public int lineno() {
99 return streamTokenizer.lineno();
100 }
101
102
103 public void initializeStreamTokenizer(StreamTokenizer st) {
104 st.resetSyntax();
105 st.eolIsSignificant(false);
106 st.slashStarComments(false);
107 st.slashSlashComments(false);
108 st.lowerCaseMode(false);
109 String slim = "AZaz09";
110 st.wordChars(slim.charAt(0), slim.charAt(1));
111 st.wordChars(slim.charAt(2), slim.charAt(3));
112 st.wordChars(slim.charAt(4), slim.charAt(5));
113
114
115
116 String wsc = " \t\n";
117 for (int i = 0; i < wsc.length(); i++) {
118 int ic = wsc.charAt(i);
119 st.whitespaceChars(ic, ic);
120 }
121
122
123 st.quoteChar(iq);
124
125 String swc = "_/.:&;,()\'+-.[]{}$";
126 for (int i = 0; i < swc.length(); i++) {
127 int ic = swc.charAt(i);
128 st.wordChars(ic, ic);
129 }
130 }
131
132
133 public XMLToken nextToken() {
134 XMLToken xmlt = new XMLToken();
135 int itok = ntok(streamTokenizer);
136
137
138 if (streamTokenizer.ttype == StreamTokenizer.TT_EOF) {
139 xmlt.setType(XMLToken.NONE);
140
141
142 } else if (itok == iq) {
143 xmlt.setType(XMLToken.STRING);
144
145 String sss = streamTokenizer.sval;
146 setStringValue(xmlt, StringEncoder.xmlUnescape(sss));
147
148
149 } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
150 xmlt.setType(XMLToken.STRING);
151 setStringValue(xmlt, StringEncoder.xmlUnescape(streamTokenizer.sval));
152
153 } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
154 xmlt.setType(XMLToken.NUMBER);
155
156 double d = streamTokenizer.nval;
157 ntok(streamTokenizer);
158 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
159 && ((streamTokenizer.sval).startsWith("E-")
160 || (streamTokenizer.sval).startsWith("E+") || (streamTokenizer.sval).startsWith("E"))) {
161
162
163
164
165
166 String s = streamTokenizer.sval.substring(1, streamTokenizer.sval.length());
167 int ppp = Integer.parseInt(s);
168
169
170 d *= Math.pow(10., ppp);
171 } else {
172 streamTokenizer.pushBack();
173 }
174 xmlt.setDValue(d);
175
176
177 } else if (itok == iabo) {
178 itok = ntok(streamTokenizer);
179 String sv = streamTokenizer.sval;
180
181 if (itok == iqm) {
182
183
184
185 xmlt.setType(XMLToken.INTRO);
186 String svalue = "";
187 itok = -1;
188 while (itok != iqm) {
189 itok = ntok(streamTokenizer);
190 if (streamTokenizer.sval != null)
191 svalue += streamTokenizer.sval + " ";
192 }
193 setStringValue(xmlt, svalue);
194
195 } else if (itok == iexc) {
196 itok = ntok(streamTokenizer);
197 String sval = streamTokenizer.sval;
198
199 String svalue = "";
200 if (sval != null && sval.startsWith("[CDATA[")) {
201 E.error("shouldn't get CDATA in xml tokenizer");
202
203 } else if (sval.startsWith("--")) {
204 xmlt.setType(XMLToken.COMMENT);
205 svalue = streamTokenizer.sval.substring(2, streamTokenizer.sval.length()) + " ";
206 while (itok != iabc || !(svalue.endsWith("--"))) {
207 itok = ntok(streamTokenizer);
208 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
209 svalue += " " + streamTokenizer.sval;
210
211 } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
212 svalue += " " + streamTokenizer.nval;
213
214 }
215 }
216 xmlt.setStringValue(svalue.substring(0, svalue.length() -2));
217 streamTokenizer.pushBack();
218
219
220 } else if (itok == ims) {
221 itok = ntok(streamTokenizer);
222 if (itok == ims) {
223 E.info("reading comment start as separate minus signs");
224 int[] ipr = new int[3];
225 while (ipr[0] != ims || ipr[1] != ims || ipr[2] != iabc) {
226 itok = ntok(streamTokenizer);
227
228 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
229 svalue += streamTokenizer.sval + " ";
230 } else if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
231 svalue += " " + streamTokenizer.nval;
232 }
233 if (streamTokenizer.sval != null && streamTokenizer.sval.endsWith("--")) {
234 ipr[1] = ims;
235 ipr[2] = ims;
236 } else {
237 ipr[0] = ipr[1];
238 ipr[1] = ipr[2];
239 ipr[2] = itok;
240 }
241 }
242 streamTokenizer.pushBack();
243 } else {
244 E.error("found <!- but not followed by - at " + streamTokenizer.lineno());
245 }
246 } else {
247 E.error("found <! but not followed by - at " + streamTokenizer.lineno());
248 }
249 setStringValue(xmlt, svalue);
250
251
252 } else if (sv.startsWith("/")) {
253 xmlt.setType(XMLToken.CLOSE);
254 setStringValue(xmlt, sv.substring(1, sv.length()));
255
256 } else {
257 if (sv.endsWith("/")) {
258 xmlt.setType(XMLToken.OPENCLOSE);
259 setStringValue(xmlt, sv.substring(0, sv.length() - 1));
260 } else {
261 xmlt.setType(XMLToken.OPEN);
262 setStringValue(xmlt, sv);
263 }
264 }
265
266 itok = ntok(streamTokenizer);
267 if (itok == iabc) {
268
269
270 } else if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
271 String[] attNV = new String[160];
272 int natt = 0;
273
274 while (itok != iabc) {
275
276 if (streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
277 if (streamTokenizer.sval.equals("/")) {
278 xmlt.setType(XMLToken.OPENCLOSE);
279
280 } else {
281 attNV[2 * natt] = streamTokenizer.sval;
282 itok = ntok(streamTokenizer);
283 if (itok == ieq) {
284 itok = ntok(streamTokenizer);
285
286 if (itok == iq) {
287 attNV[2 * natt + 1] = streamTokenizer.sval;
288 natt++;
289 } else {
290 E.shortError("expecting quoted string " + " while reading atributes "
291 + "but got " + stok(itok) + " sval=" + streamTokenizer.sval
292 + " nval=" + streamTokenizer.nval);
293 E.info("original string was " + srcString);
294 }
295 } else {
296 E.shortError("at " + streamTokenizer.lineno()
297 + " expecting = while reading attributes " + "but got " + stok(itok)
298 + " sval=" + streamTokenizer.sval + " nval=" + streamTokenizer.nval);
299 E.info("original string was " + srcString);
300 }
301 }
302 } else {
303 E.shortError("at line " + streamTokenizer.lineno()
304 + " found non-word while reading attributes " + stok(itok)
305 + " item so far = " + this);
306 E.info("original string was " + srcString);
307 }
308 itok = ntok(streamTokenizer);
309 }
310 String[] sat = new String[2 * natt];
311 for (int i = 0; i < 2 * natt; i++) {
312 sat[i] = attNV[i];
313 }
314 xmlt.setAttributes(sat);
315
316 } else {
317 E.error("expecting word " + stok(itok));
318
319 }
320
321 } else {
322
323 xmlt.setType(XMLToken.STRING);
324 setStringValue(xmlt, stok(itok));
325
326 }
327 return xmlt;
328 }
329
330
331
332 private int ntok(StreamTokenizer st) {
333 int itok = -1;
334 try {
335 itok = st.nextToken();
336 } catch (IOException e) {
337 err(" " + e);
338 itok = -999;
339 }
340
341
342
343
344
345
346 return itok;
347 }
348
349
350 private String stok(int itok) {
351 return "" + (char)itok;
352 }
353
354
355 private void err(String s) {
356 System.out.println(s);
357 }
358
359 }