View Javadoc

1   /***
2    * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3    * Copyright (C) 2009  <berendona@users.sourceforge.net>
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  // serverCharBuffer.java
19  // ---------------------------
20  // (C) by Michael Peter Christen; mc@yacy.net
21  // first published on http://www.anomic.de
22  // Frankfurt, Germany, 2004
23  //
24  // $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
25  // $LastChangedRevision: 1715 $
26  // $LastChangedBy: borg-0300 $
27  //
28  // This program is free software; you can redistribute it and/or modify
29  // it under the terms of the GNU General Public License as published by
30  // the Free Software Foundation; either version 2 of the License, or
31  // (at your option) any later version.
32  //
33  // This program is distributed in the hope that it will be useful,
34  // but WITHOUT ANY WARRANTY; without even the implied warranty of
35  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
36  // GNU General Public License for more details.
37  //
38  // You should have received a copy of the GNU General Public License
39  // along with this program; if not, write to the Free Software
40  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
41  package simplespider.simplespider.bot.extractor.html.stream;
42  
43  import java.io.IOException;
44  import java.io.Writer;
45  import java.util.Properties;
46  
47  import org.apache.commons.logging.Log;
48  import org.apache.commons.logging.LogFactory;
49  
50  final class HtmlWriter extends Writer {
51  
52  	private static final Log	LOG			= LogFactory.getLog(HtmlWriter.class);
53  
54  	public static final char	lb			= '<';
55  	public static final char	rb			= '>';
56  	public static final char	dash		= '-';
57  	public static final char	excl		= '!';
58  	public static final char	singlequote	= '\'';
59  	public static final char	doublequote	= '"';
60  
61  	private TagWriter			filterCont;
62  	private Properties			filterOpts;
63  	private final TagListener	scraper;
64  	private TagWriter			buffer;
65  	private String				filterTag;
66  	private boolean				inSingleQuote;
67  	private boolean				inDoubleQuote;
68  	private boolean				inComment;
69  	private boolean				inScript;
70  	private boolean				inStyle;
71  	private boolean				binaryUnsuspect;
72  	private final boolean		passbyIfBinarySuspect;
73  
74  	public HtmlWriter(final boolean passbyIfBinarySuspect, final TagListener scraper, final int bufferSize) {
75  		this.scraper = scraper;
76  		this.buffer = new TagWriter(bufferSize);
77  		this.inSingleQuote = false;
78  		this.inDoubleQuote = false;
79  		this.inComment = false;
80  		this.inScript = false;
81  		this.inStyle = false;
82  		this.binaryUnsuspect = true;
83  		this.passbyIfBinarySuspect = passbyIfBinarySuspect;
84  		this.filterOpts = null;
85  		this.filterCont = null;
86  	}
87  
88  	private static boolean binaryHint(final char c) {
89  		// space, punctiation and symbols, letters and digits (ASCII/latin)
90  		//if (c >= 31 && c < 128) return false;
91  		if (c > 31) {
92  			return false;
93  		}
94  		//  8 = backspace
95  		//  9 = horizontal tab
96  		// 10 = new line (line feed)
97  		// 11 = vertical tab
98  		// 12 = new page (form feed)
99  		// 13 = carriage return
100 		if (c > 7 && c <= 13) {
101 			return false;
102 		}
103 
104 		return true;
105 	}
106 
107 	public boolean binarySuspect() {
108 		return !this.binaryUnsuspect;
109 	}
110 
111 	@Override
112 	public void write(final int c) throws IOException {
113 		if ((this.binaryUnsuspect) && (binaryHint((char) c))) {
114 			this.binaryUnsuspect = false;
115 			if (this.passbyIfBinarySuspect) {
116 				close();
117 			}
118 		}
119 
120 		if (this.binaryUnsuspect || !this.passbyIfBinarySuspect) {
121 			if (this.inSingleQuote) {
122 				this.buffer.append(c);
123 				if (c == singlequote) {
124 					this.inSingleQuote = false;
125 				}
126 				// check error cases
127 				if ((c == rb) && (this.buffer.charAt(0) == lb)) {
128 					this.inSingleQuote = false;
129 					// the tag ends here. after filtering: pass on
130 					filterSentence(this.buffer.getChars(), singlequote);
131 					// buffer = new serverByteBuffer();
132 					this.buffer.reset();
133 				}
134 			} else if (this.inDoubleQuote) {
135 				this.buffer.append(c);
136 				if (c == doublequote) {
137 					this.inDoubleQuote = false;
138 				}
139 				// check error cases
140 				if (c == rb && this.buffer.charAt(0) == lb) {
141 					this.inDoubleQuote = false;
142 					// the tag ends here. after filtering: pass on
143 					filterSentence(this.buffer.getChars(), doublequote);
144 					// buffer = new serverByteBuffer();
145 					this.buffer.reset();
146 				}
147 			} else if (this.inComment) {
148 				this.buffer.append(c);
149 				if (c == rb && this.buffer.length() > 6 && this.buffer.charAt(this.buffer.length() - 3) == dash) {
150 					// comment is at end
151 					this.inComment = false;
152 					// buffer = new serverByteBuffer();
153 					this.buffer.reset();
154 				}
155 			} else if (this.inScript) {
156 				this.buffer.append(c);
157 				final int bufferLength = this.buffer.length();
158 				if ((c == rb) && (bufferLength > 14) && (this.buffer.charAt(bufferLength - 9) == lb) && (this.buffer.charAt(bufferLength - 8) == '/')
159 						&& (this.buffer.charAt(bufferLength - 7) == 's') && (this.buffer.charAt(bufferLength - 6) == 'c')
160 						&& (this.buffer.charAt(bufferLength - 5) == 'r') && (this.buffer.charAt(bufferLength - 4) == 'i')
161 						&& (this.buffer.charAt(bufferLength - 3) == 'p') && (this.buffer.charAt(bufferLength - 2) == 't')) {
162 					// script is at end
163 					this.inScript = false;
164 					// buffer = new serverByteBuffer();
165 					this.buffer.reset();
166 				}
167 			} else if (this.inStyle) {
168 				this.buffer.append(c);
169 				final int bufferLength = this.buffer.length();
170 				if ((c == rb) && (bufferLength > 13) && (this.buffer.charAt(bufferLength - 8) == lb) && (this.buffer.charAt(bufferLength - 7) == '/')
171 						&& (this.buffer.charAt(bufferLength - 6) == 's') && (this.buffer.charAt(bufferLength - 5) == 't')
172 						&& (this.buffer.charAt(bufferLength - 4) == 'y') && (this.buffer.charAt(bufferLength - 3) == 'l')
173 						&& (this.buffer.charAt(bufferLength - 2) == 'e')) {
174 					// style is at end
175 					this.inStyle = false;
176 					// buffer = new serverByteBuffer();
177 					this.buffer.reset();
178 				}
179 			} else {
180 				if (this.buffer.length() == 0) {
181 					if (c == rb) {
182 						// very strange error case; we just let it pass
183 					} else {
184 						this.buffer.append(c);
185 					}
186 				} else if (this.buffer.charAt(0) == lb) {
187 					if (c == singlequote) {
188 						this.inSingleQuote = true;
189 					}
190 					if (c == doublequote) {
191 						this.inDoubleQuote = true;
192 					}
193 					// fill in tag text
194 					if ((this.buffer.length() >= 3) && (this.buffer.charAt(1) == excl) && (this.buffer.charAt(2) == dash) && (c == dash)) {
195 						// this is the start of a comment
196 						this.inComment = true;
197 						this.buffer.append(c);
198 					} else if ((this.buffer.length() >= 6) && (this.buffer.charAt(1) == 's') && (this.buffer.charAt(2) == 'c')
199 							&& (this.buffer.charAt(3) == 'r') && (this.buffer.charAt(4) == 'i') && (this.buffer.charAt(5) == 'p') && (c == 't')) {
200 						// this is the start of a javascript
201 						this.inScript = true;
202 						this.buffer.append(c);
203 					} else if ((this.buffer.length() >= 5) && (this.buffer.charAt(1) == 's') && (this.buffer.charAt(2) == 't')
204 							&& (this.buffer.charAt(3) == 'y') && (this.buffer.charAt(4) == 'l') && (c == 'e')) {
205 						// this is the start of a css-style
206 						this.inStyle = true;
207 						this.buffer.append(c);
208 					} else if (c == rb) {
209 						this.buffer.append(c);
210 						// the tag ends here. after filtering: pass on
211 						filterSentence(this.buffer.getChars(), doublequote);
212 						// buffer = new serverByteBuffer();
213 						this.buffer.reset();
214 					} else if (c == lb) {
215 						// this is an error case
216 						// we consider that there is one rb missing
217 						if (this.buffer.length() > 0) {
218 							filterSentence(this.buffer.getChars(), doublequote);
219 						}
220 						// buffer = new serverByteBuffer();
221 						this.buffer.reset();
222 						this.buffer.append(c);
223 					} else {
224 						this.buffer.append(c);
225 					}
226 				} else {
227 					// fill in plain text
228 					if (c == lb) {
229 						// the text ends here
230 						if (this.buffer.length() > 0) {
231 							filterSentence(this.buffer.getChars(), doublequote);
232 						}
233 						// buffer = new serverByteBuffer();
234 						this.buffer.reset();
235 						this.buffer.append(c);
236 					} else {
237 						// simply append
238 						this.buffer.append(c);
239 					}
240 				}
241 			}
242 		}
243 	}
244 
245 	private void filterSentence(final char[] in, final char quotechar) {
246 		if (in.length == 0) {
247 			return;
248 		}
249 		// scan the string and parse structure
250 		if (in.length > 2 && in[0] == lb) {
251 
252 			// a tag
253 			String tag;
254 			int tagend;
255 			if (in[1] == '/') {
256 				// a closing tag
257 				tagend = tagEnd(in, 2);
258 				tag = new String(in, 2, tagend - 2);
259 				final char[] text = new char[in.length - tagend - 1];
260 				System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
261 				filterTag(tag, false, text, quotechar);
262 				return;
263 			}
264 
265 			// an opening tag
266 			tagend = tagEnd(in, 1);
267 			tag = new String(in, 1, tagend - 1);
268 			final char[] text = new char[in.length - tagend - 1];
269 			System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
270 			filterTag(tag, true, text, quotechar);
271 			return;
272 		}
273 
274 		// a text
275 		filterTag(null, true, in, quotechar);
276 		return;
277 	}
278 
279 	private void filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
280 		if (this.filterTag == null) {
281 			// we are not collection tag text
282 			if (tag == null) {
283 				return;
284 			}
285 
286 			// we have a new tag
287 			if (opening) {
288 				if ((this.scraper != null) && (this.scraper.isTagWithoutContent(tag))) {
289 					// this single tag is collected at once here
290 					final TagWriter charBuffer = new TagWriter(content);
291 					this.scraper.scrapeTagWithoutContent(tag, charBuffer.propParser());
292 					try {
293 						charBuffer.close();
294 					} catch (final IOException e) {
295 						LOG.warn("Failed to close tag writer", e);
296 					}
297 				}
298 				if (((this.scraper != null) && (this.scraper.isTagWithContent(tag)))) {
299 					final TagWriter scb = new TagWriter(content);
300 
301 					final Properties properties = scb.propParser();
302 					try {
303 						scb.close();
304 					} catch (final IOException e) {
305 						LOG.warn("Failed to close tag writer", e);
306 					}
307 
308 					if (content[content.length - 1] == '/') {
309 						// A simple empty tag! This single tag is collected at once here
310 						this.scraper.scrapeTagWithContent(tag, properties, null);
311 					} else {
312 						// ok, start collecting
313 						this.filterTag = tag;
314 						this.filterOpts = properties;
315 						this.filterCont = new TagWriter();
316 					}
317 
318 					return;
319 				} else {
320 					// we ignore that thing and return it again
321 					return;
322 				}
323 			}
324 
325 			// we ignore that thing and return it again
326 			return;
327 
328 		}
329 
330 		// we are collection tag text for the tag 'filterTag'
331 		if (tag == null) {
332 			this.filterCont.append(content);
333 			return;
334 		}
335 
336 		// it's a tag! which one?
337 		//		if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
338 		//			// this tag is not our concern. just add it
339 		//			this.filterCont.append(genTag0raw(tag, opening, content));
340 		//			return;
341 		//		}
342 
343 		// it's our closing tag! return complete result.
344 		if (this.scraper != null) {
345 			this.scraper.scrapeTagWithContent(this.filterTag, this.filterOpts, this.filterCont.getChars());
346 		}
347 		this.filterTag = null;
348 		this.filterOpts = null;
349 		this.filterCont = null;
350 		return;
351 	}
352 
353 	private static int tagEnd(final char[] tag, final int start) {
354 		char c;
355 		for (int i = start; i < tag.length; i++) {
356 			c = tag[i];
357 			if (c != '!' && c != '-' && (c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
358 				return i;
359 			}
360 		}
361 		return tag.length - 1;
362 	}
363 
364 	private void filterFinalize(final char quotechar) {
365 		// it's our closing tag! return complete result.
366 		if (this.scraper != null && this.filterCont != null) {
367 			this.scraper.scrapeTagWithContent(this.filterTag, this.filterOpts, this.filterCont.getChars());
368 		}
369 		this.filterTag = null;
370 		this.filterOpts = null;
371 		this.filterCont = null;
372 	}
373 
374 	@Override
375 	public void close() throws IOException {
376 		final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
377 		if (this.buffer != null) {
378 			if (this.buffer.length() > 0) {
379 				filterSentence(this.buffer.getChars(), quotechar);
380 			}
381 			this.buffer = null;
382 		}
383 		filterFinalize(quotechar);
384 		this.filterTag = null;
385 		this.filterOpts = null;
386 		this.filterCont = null;
387 	}
388 
389 	@Override
390 	public void write(final char b[]) throws IOException {
391 		write(b, 0, b.length);
392 	}
393 
394 	@Override
395 	public void write(final char b[], final int off, final int len) throws IOException {
396 		if ((off | len | (b.length - (len + off)) | (off + len)) < 0) {
397 			throw new IndexOutOfBoundsException();
398 		}
399 		for (int i = off; i < (len - off); i++) {
400 			this.write(b[i]);
401 		}
402 	}
403 
404 	@Override
405 	public void flush() throws IOException {
406 		// Nothing to do
407 	}
408 }