1 /***
2 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3 * Copyright (C) 2009 <berendona@users.sourceforge.net>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41 package simplespider.simplespider.bot.extractor.html.stream;
42
43 import java.io.IOException;
44 import java.io.Writer;
45 import java.util.Properties;
46
47 import org.apache.commons.logging.Log;
48 import org.apache.commons.logging.LogFactory;
49
50 final class HtmlWriter extends Writer {
51
52 private static final Log LOG = LogFactory.getLog(HtmlWriter.class);
53
54 public static final char lb = '<';
55 public static final char rb = '>';
56 public static final char dash = '-';
57 public static final char excl = '!';
58 public static final char singlequote = '\'';
59 public static final char doublequote = '"';
60
61 private TagWriter filterCont;
62 private Properties filterOpts;
63 private final TagListener scraper;
64 private TagWriter buffer;
65 private String filterTag;
66 private boolean inSingleQuote;
67 private boolean inDoubleQuote;
68 private boolean inComment;
69 private boolean inScript;
70 private boolean inStyle;
71 private boolean binaryUnsuspect;
72 private final boolean passbyIfBinarySuspect;
73
74 public HtmlWriter(final boolean passbyIfBinarySuspect, final TagListener scraper, final int bufferSize) {
75 this.scraper = scraper;
76 this.buffer = new TagWriter(bufferSize);
77 this.inSingleQuote = false;
78 this.inDoubleQuote = false;
79 this.inComment = false;
80 this.inScript = false;
81 this.inStyle = false;
82 this.binaryUnsuspect = true;
83 this.passbyIfBinarySuspect = passbyIfBinarySuspect;
84 this.filterOpts = null;
85 this.filterCont = null;
86 }
87
88 private static boolean binaryHint(final char c) {
89
90
91 if (c > 31) {
92 return false;
93 }
94
95
96
97
98
99
100 if (c > 7 && c <= 13) {
101 return false;
102 }
103
104 return true;
105 }
106
107 public boolean binarySuspect() {
108 return !this.binaryUnsuspect;
109 }
110
111 @Override
112 public void write(final int c) throws IOException {
113 if ((this.binaryUnsuspect) && (binaryHint((char) c))) {
114 this.binaryUnsuspect = false;
115 if (this.passbyIfBinarySuspect) {
116 close();
117 }
118 }
119
120 if (this.binaryUnsuspect || !this.passbyIfBinarySuspect) {
121 if (this.inSingleQuote) {
122 this.buffer.append(c);
123 if (c == singlequote) {
124 this.inSingleQuote = false;
125 }
126
127 if ((c == rb) && (this.buffer.charAt(0) == lb)) {
128 this.inSingleQuote = false;
129
130 filterSentence(this.buffer.getChars(), singlequote);
131
132 this.buffer.reset();
133 }
134 } else if (this.inDoubleQuote) {
135 this.buffer.append(c);
136 if (c == doublequote) {
137 this.inDoubleQuote = false;
138 }
139
140 if (c == rb && this.buffer.charAt(0) == lb) {
141 this.inDoubleQuote = false;
142
143 filterSentence(this.buffer.getChars(), doublequote);
144
145 this.buffer.reset();
146 }
147 } else if (this.inComment) {
148 this.buffer.append(c);
149 if (c == rb && this.buffer.length() > 6 && this.buffer.charAt(this.buffer.length() - 3) == dash) {
150
151 this.inComment = false;
152
153 this.buffer.reset();
154 }
155 } else if (this.inScript) {
156 this.buffer.append(c);
157 final int bufferLength = this.buffer.length();
158 if ((c == rb) && (bufferLength > 14) && (this.buffer.charAt(bufferLength - 9) == lb) && (this.buffer.charAt(bufferLength - 8) == '/')
159 && (this.buffer.charAt(bufferLength - 7) == 's') && (this.buffer.charAt(bufferLength - 6) == 'c')
160 && (this.buffer.charAt(bufferLength - 5) == 'r') && (this.buffer.charAt(bufferLength - 4) == 'i')
161 && (this.buffer.charAt(bufferLength - 3) == 'p') && (this.buffer.charAt(bufferLength - 2) == 't')) {
162
163 this.inScript = false;
164
165 this.buffer.reset();
166 }
167 } else if (this.inStyle) {
168 this.buffer.append(c);
169 final int bufferLength = this.buffer.length();
170 if ((c == rb) && (bufferLength > 13) && (this.buffer.charAt(bufferLength - 8) == lb) && (this.buffer.charAt(bufferLength - 7) == '/')
171 && (this.buffer.charAt(bufferLength - 6) == 's') && (this.buffer.charAt(bufferLength - 5) == 't')
172 && (this.buffer.charAt(bufferLength - 4) == 'y') && (this.buffer.charAt(bufferLength - 3) == 'l')
173 && (this.buffer.charAt(bufferLength - 2) == 'e')) {
174
175 this.inStyle = false;
176
177 this.buffer.reset();
178 }
179 } else {
180 if (this.buffer.length() == 0) {
181 if (c == rb) {
182
183 } else {
184 this.buffer.append(c);
185 }
186 } else if (this.buffer.charAt(0) == lb) {
187 if (c == singlequote) {
188 this.inSingleQuote = true;
189 }
190 if (c == doublequote) {
191 this.inDoubleQuote = true;
192 }
193
194 if ((this.buffer.length() >= 3) && (this.buffer.charAt(1) == excl) && (this.buffer.charAt(2) == dash) && (c == dash)) {
195
196 this.inComment = true;
197 this.buffer.append(c);
198 } else if ((this.buffer.length() >= 6) && (this.buffer.charAt(1) == 's') && (this.buffer.charAt(2) == 'c')
199 && (this.buffer.charAt(3) == 'r') && (this.buffer.charAt(4) == 'i') && (this.buffer.charAt(5) == 'p') && (c == 't')) {
200
201 this.inScript = true;
202 this.buffer.append(c);
203 } else if ((this.buffer.length() >= 5) && (this.buffer.charAt(1) == 's') && (this.buffer.charAt(2) == 't')
204 && (this.buffer.charAt(3) == 'y') && (this.buffer.charAt(4) == 'l') && (c == 'e')) {
205
206 this.inStyle = true;
207 this.buffer.append(c);
208 } else if (c == rb) {
209 this.buffer.append(c);
210
211 filterSentence(this.buffer.getChars(), doublequote);
212
213 this.buffer.reset();
214 } else if (c == lb) {
215
216
217 if (this.buffer.length() > 0) {
218 filterSentence(this.buffer.getChars(), doublequote);
219 }
220
221 this.buffer.reset();
222 this.buffer.append(c);
223 } else {
224 this.buffer.append(c);
225 }
226 } else {
227
228 if (c == lb) {
229
230 if (this.buffer.length() > 0) {
231 filterSentence(this.buffer.getChars(), doublequote);
232 }
233
234 this.buffer.reset();
235 this.buffer.append(c);
236 } else {
237
238 this.buffer.append(c);
239 }
240 }
241 }
242 }
243 }
244
245 private void filterSentence(final char[] in, final char quotechar) {
246 if (in.length == 0) {
247 return;
248 }
249
250 if (in.length > 2 && in[0] == lb) {
251
252
253 String tag;
254 int tagend;
255 if (in[1] == '/') {
256
257 tagend = tagEnd(in, 2);
258 tag = new String(in, 2, tagend - 2);
259 final char[] text = new char[in.length - tagend - 1];
260 System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
261 filterTag(tag, false, text, quotechar);
262 return;
263 }
264
265
266 tagend = tagEnd(in, 1);
267 tag = new String(in, 1, tagend - 1);
268 final char[] text = new char[in.length - tagend - 1];
269 System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
270 filterTag(tag, true, text, quotechar);
271 return;
272 }
273
274
275 filterTag(null, true, in, quotechar);
276 return;
277 }
278
279 private void filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
280 if (this.filterTag == null) {
281
282 if (tag == null) {
283 return;
284 }
285
286
287 if (opening) {
288 if ((this.scraper != null) && (this.scraper.isTagWithoutContent(tag))) {
289
290 final TagWriter charBuffer = new TagWriter(content);
291 this.scraper.scrapeTagWithoutContent(tag, charBuffer.propParser());
292 try {
293 charBuffer.close();
294 } catch (final IOException e) {
295 LOG.warn("Failed to close tag writer", e);
296 }
297 }
298 if (((this.scraper != null) && (this.scraper.isTagWithContent(tag)))) {
299 final TagWriter scb = new TagWriter(content);
300
301 final Properties properties = scb.propParser();
302 try {
303 scb.close();
304 } catch (final IOException e) {
305 LOG.warn("Failed to close tag writer", e);
306 }
307
308 if (content[content.length - 1] == '/') {
309
310 this.scraper.scrapeTagWithContent(tag, properties, null);
311 } else {
312
313 this.filterTag = tag;
314 this.filterOpts = properties;
315 this.filterCont = new TagWriter();
316 }
317
318 return;
319 } else {
320
321 return;
322 }
323 }
324
325
326 return;
327
328 }
329
330
331 if (tag == null) {
332 this.filterCont.append(content);
333 return;
334 }
335
336
337
338
339
340
341
342
343
344 if (this.scraper != null) {
345 this.scraper.scrapeTagWithContent(this.filterTag, this.filterOpts, this.filterCont.getChars());
346 }
347 this.filterTag = null;
348 this.filterOpts = null;
349 this.filterCont = null;
350 return;
351 }
352
353 private static int tagEnd(final char[] tag, final int start) {
354 char c;
355 for (int i = start; i < tag.length; i++) {
356 c = tag[i];
357 if (c != '!' && c != '-' && (c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
358 return i;
359 }
360 }
361 return tag.length - 1;
362 }
363
364 private void filterFinalize(final char quotechar) {
365
366 if (this.scraper != null && this.filterCont != null) {
367 this.scraper.scrapeTagWithContent(this.filterTag, this.filterOpts, this.filterCont.getChars());
368 }
369 this.filterTag = null;
370 this.filterOpts = null;
371 this.filterCont = null;
372 }
373
374 @Override
375 public void close() throws IOException {
376 final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
377 if (this.buffer != null) {
378 if (this.buffer.length() > 0) {
379 filterSentence(this.buffer.getChars(), quotechar);
380 }
381 this.buffer = null;
382 }
383 filterFinalize(quotechar);
384 this.filterTag = null;
385 this.filterOpts = null;
386 this.filterCont = null;
387 }
388
389 @Override
390 public void write(final char b[]) throws IOException {
391 write(b, 0, b.length);
392 }
393
394 @Override
395 public void write(final char b[], final int off, final int len) throws IOException {
396 if ((off | len | (b.length - (len + off)) | (off + len)) < 0) {
397 throw new IndexOutOfBoundsException();
398 }
399 for (int i = off; i < (len - off); i++) {
400 this.write(b[i]);
401 }
402 }
403
404 @Override
405 public void flush() throws IOException {
406
407 }
408 }