View Javadoc

1   /***
2    * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3    * Copyright (C) 2009  <berendona@users.sourceforge.net>
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  // serverCharBuffer.java 
19  // ---------------------------
20  // (C) by Michael Peter Christen; mc@yacy.net
21  // first published on http://www.anomic.de
22  // Frankfurt, Germany, 2004
23  //
24  // $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
25  // $LastChangedRevision: 1715 $
26  // $LastChangedBy: borg-0300 $
27  //
28  // This program is free software; you can redistribute it and/or modify
29  // it under the terms of the GNU General Public License as published by
30  // the Free Software Foundation; either version 2 of the License, or
31  // (at your option) any later version.
32  //
33  // This program is distributed in the hope that it will be useful,
34  // but WITHOUT ANY WARRANTY; without even the implied warranty of
35  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
36  // GNU General Public License for more details.
37  //
38  // You should have received a copy of the GNU General Public License
39  // along with this program; if not, write to the Free Software
40  // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
41  package simplespider.simplespider.bot.extractor.html.stream;
42  
import java.io.IOException;
import java.io.Writer;
import java.util.Locale;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
49  
50  final class TagWriter extends Writer {
51  
52  	private static final Log	LOG			= LogFactory.getLog(TagWriter.class);
53  
54  	private static final char	singlequote	= '\'';
55  	private static final char	doublequote	= '"';
56  	private static final char	equal		= '=';
57  
58  	private char[]				buffer;
59  	private int					offset;
60  	private int					length;
61  
62  	public TagWriter() {
63  		this.buffer = new char[10];
64  		this.length = 0;
65  		this.offset = 0;
66  	}
67  
68  	public TagWriter(final int initLength) {
69  		this.buffer = new char[initLength];
70  		this.length = 0;
71  		this.offset = 0;
72  	}
73  
74  	public TagWriter(final char[] bb) {
75  		this.buffer = bb;
76  		this.length = bb.length;
77  		this.offset = 0;
78  	}
79  
80  	//	public TagWriter(final char[] bb, final int initLength) {
81  	//		this.buffer = new char[initLength];
82  	//		System.arraycopy(bb, 0, this.buffer, 0, bb.length);
83  	//		this.length = bb.length;
84  	//		this.offset = 0;
85  	//	}
86  
87  	//	public TagWriter(final char[] bb, final int of, final int le) {
88  	//		if (of * 2 > bb.length) {
89  	//			this.buffer = new char[le];
90  	//			System.arraycopy(bb, of, this.buffer, 0, le);
91  	//			this.length = le;
92  	//			this.offset = 0;
93  	//		} else {
94  	//			this.buffer = bb;
95  	//			this.length = le;
96  	//			this.offset = of;
97  	//		}
98  	//	}
99  
100 	//	public TagWriter(final TagWriter bb) {
101 	//		this.buffer = bb.buffer;
102 	//		this.length = bb.length;
103 	//		this.offset = bb.offset;
104 	//	}
105 
106 	//	public TagWriter(final File f) throws IOException {
107 	//		// initially fill the buffer with the content of a file
108 	//		if (f.length() > Integer.MAX_VALUE) {
109 	//			throw new IOException("file is too large for buffering");
110 	//		}
111 	//
112 	//		this.length = (int) f.length();
113 	//		this.buffer = new char[this.length * 2];
114 	//		this.offset = 0;
115 	//
116 	//		FileReader fr = null;
117 	//		try {
118 	//			fr = new FileReader(f);
119 	//			final char[] temp = new char[256];
120 	//			int c;
121 	//			while ((c = fr.read(temp)) > 0) {
122 	//				this.append(temp, 0, c);
123 	//			}
124 	//		} catch (final FileNotFoundException e) {
125 	//			throw new IOException("File not found: " + f.toString() + "; " + e.getMessage());
126 	//		} finally {
127 	//			if (fr != null) {
128 	//				fr.close();
129 	//			}
130 	//		}
131 	//	}
132 
133 	public void clear() {
134 		this.buffer = new char[0];
135 		this.length = 0;
136 		this.offset = 0;
137 	}
138 
139 	public int length() {
140 		return this.length;
141 	}
142 
143 	private void grow() {
144 		int newsize = this.buffer.length * 2 + 1;
145 		if (newsize < 256) {
146 			newsize = 256;
147 		}
148 		if (LOG.isDebugEnabled()) {
149 			LOG.debug("Increase tag writer buffer: from " + this.buffer.length + " to " + newsize);
150 		}
151 		final char[] tmp = new char[newsize];
152 		System.arraycopy(this.buffer, this.offset, tmp, 0, this.length);
153 		this.buffer = tmp;
154 		this.offset = 0;
155 	}
156 
157 	@Override
158 	public void write(final int b) {
159 		write((char) b);
160 	}
161 
162 	public void write(final char b) {
163 		if (this.offset + this.length + 1 > this.buffer.length) {
164 			grow();
165 		}
166 		this.buffer[this.offset + this.length++] = b;
167 	}
168 
169 	@Override
170 	public void write(final char[] bb) {
171 		write(bb, 0, bb.length);
172 	}
173 
174 	@Override
175 	public void write(final char[] bb, final int of, final int le) {
176 		while (this.offset + this.length + le > this.buffer.length) {
177 			grow();
178 		}
179 		System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
180 		this.length += le;
181 	}
182 
183 	// do not use/implement the following method, a
184 	// "overridden method is a bridge method"
185 	// will occur
186 	//    public serverCharBuffer append(char b) {
187 	//        write(b);
188 	//        return this;
189 	//    }
190 
191 	public TagWriter append(final int i) {
192 		write((char) (i));
193 		return this;
194 	}
195 
196 	public TagWriter append(final char[] bb) {
197 		write(bb);
198 		return this;
199 	}
200 
201 	public TagWriter append(final char[] bb, final int of, final int le) {
202 		write(bb, of, le);
203 		return this;
204 	}
205 
206 	public TagWriter append(final String s) {
207 		return append(s, 0, s.length());
208 	}
209 
210 	public TagWriter append(final String s, final int off, final int len) {
211 		final char[] temp = new char[len];
212 		s.getChars(off, (off + len), temp, 0);
213 		return append(temp);
214 	}
215 
216 	public TagWriter append(final TagWriter bb) {
217 		return append(bb.buffer, bb.offset, bb.length);
218 	}
219 
220 	//    public serverCharBuffer append(Object o) {
221 	//        if (o instanceof String) return append((String) o);
222 	//        if (o instanceof char[]) return append((char[]) o);
223 	//        return null;
224 	//    }
225 
226 	public char charAt(final int pos) {
227 		if (pos < 0) {
228 			throw new IndexOutOfBoundsException();
229 		}
230 		if (pos > this.length) {
231 			throw new IndexOutOfBoundsException();
232 		}
233 		return this.buffer[this.offset + pos];
234 	}
235 
236 	public void deleteCharAt(final int pos) {
237 		if (pos < 0) {
238 			return;
239 		}
240 		if (pos >= this.length) {
241 			return;
242 		}
243 		if (pos == this.length - 1) {
244 			this.length--;
245 		} else {
246 			System.arraycopy(this.buffer, this.offset + pos + 1, this.buffer, this.offset + pos, this.length - pos - 1);
247 		}
248 	}
249 
250 	public int indexOf(final char b) {
251 		return indexOf(b, 0);
252 	}
253 
254 	public int indexOf(final char[] bs) {
255 		return indexOf(bs, 0);
256 	}
257 
258 	public int indexOf(final char b, final int start) {
259 		if (start >= this.length) {
260 			return -1;
261 		}
262 		for (int i = start; i < this.length; i++) {
263 			if (this.buffer[this.offset + i] == b) {
264 				return i;
265 			}
266 		}
267 		return -1;
268 	}
269 
270 	public int indexOf(final char[] bs, final int start) {
271 		if (start + bs.length > this.length) {
272 			return -1;
273 		}
274 		loop: for (int i = start; i <= this.length - bs.length; i++) {
275 			// first test only first char
276 			if (this.buffer[this.offset + i] != bs[0]) {
277 				continue loop;
278 			}
279 
280 			// then test all remaining char
281 			for (int j = 1; j < bs.length; j++) {
282 				if (this.buffer[this.offset + i + j] != bs[j]) {
283 					continue loop;
284 				}
285 			}
286 
287 			// found hit
288 			return i;
289 		}
290 		return -1;
291 	}
292 
293 	public int lastIndexOf(final char b) {
294 		for (int i = this.length - 1; i >= 0; i--) {
295 			if (this.buffer[this.offset + i] == b) {
296 				return i;
297 			}
298 		}
299 		return -1;
300 	}
301 
302 	public boolean startsWith(final char[] bs) {
303 		if (this.length < bs.length) {
304 			return false;
305 		}
306 		for (int i = 0; i < bs.length; i++) {
307 			if (this.buffer[this.offset + i] != bs[i]) {
308 				return false;
309 			}
310 		}
311 		return true;
312 	}
313 
314 	public char[] getChars() {
315 		return getChars(0);
316 	}
317 
318 	public char[] getChars(final int start) {
319 		return getChars(start, this.length);
320 	}
321 
322 	public char[] getChars(final int start, final int end) {
323 		// start is inclusive, end is exclusive
324 		if (end > this.length) {
325 			throw new IndexOutOfBoundsException("getBytes: end > length");
326 		}
327 		if (start > this.length) {
328 			throw new IndexOutOfBoundsException("getBytes: start > length");
329 		}
330 		final char[] tmp = new char[end - start];
331 		System.arraycopy(this.buffer, this.offset + start, tmp, 0, end - start);
332 		return tmp;
333 	}
334 
335 	public TagWriter trim(final int start) {
336 		// the end value is outside (+1) of the wanted target array
337 		if (start > this.length) {
338 			throw new IndexOutOfBoundsException("trim: start > length");
339 		}
340 		this.offset = this.offset + start;
341 		this.length = this.length - start;
342 		return this;
343 	}
344 
345 	public TagWriter trim(final int start, final int end) {
346 		// the end value is outside (+1) of the wanted target array
347 		if (start > this.length) {
348 			throw new IndexOutOfBoundsException("trim: start > length");
349 		}
350 		if (end > this.length) {
351 			throw new IndexOutOfBoundsException("trim: end > length");
352 		}
353 		if (start > end) {
354 			throw new IndexOutOfBoundsException("trim: start > end");
355 		}
356 		this.offset = this.offset + start;
357 		this.length = end - start;
358 		return this;
359 	}
360 
361 	public TagWriter trim() {
362 		int l = 0;
363 		while ((l < this.length) && (this.buffer[this.offset + l] <= ' ')) {
364 			l++;
365 		}
366 		int r = this.length;
367 		while ((r > 0) && (this.buffer[this.offset + r - 1] <= ' ')) {
368 			r--;
369 		}
370 		if (l > r) {
371 			r = l;
372 		}
373 		return trim(l, r);
374 	}
375 
376 	public boolean isWhitespace(final boolean includeNonLetterBytes) {
377 		// returns true, if trim() would result in an empty serverByteBuffer
378 		if (includeNonLetterBytes) {
379 			char b;
380 			for (int i = 0; i < this.length; i++) {
381 				b = this.buffer[this.offset + i];
382 				if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) {
383 					return false;
384 				}
385 			}
386 		} else {
387 			for (int i = 0; i < this.length; i++) {
388 				if (this.buffer[this.offset + i] > 32) {
389 					return false;
390 				}
391 			}
392 		}
393 		return true;
394 	}
395 
396 	public int whitespaceStart(final boolean includeNonLetterBytes) {
397 		// returns number of whitespace char at the beginning of text
398 		if (includeNonLetterBytes) {
399 			char b;
400 			for (int i = 0; i < this.length; i++) {
401 				b = this.buffer[this.offset + i];
402 				if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) {
403 					return i;
404 				}
405 			}
406 		} else {
407 			for (int i = 0; i < this.length; i++) {
408 				if (this.buffer[this.offset + i] > 32) {
409 					return i;
410 				}
411 			}
412 		}
413 		return this.length;
414 	}
415 
416 	public int whitespaceEnd(final boolean includeNonLetterBytes) {
417 		// returns position of whitespace at the end of text
418 		if (includeNonLetterBytes) {
419 			char b;
420 			for (int i = this.length - 1; i >= 0; i--) {
421 				b = this.buffer[this.offset + i];
422 				if (((b >= '0') && (b <= '9')) || ((b >= 'A') && (b <= 'Z')) || ((b >= 'a') && (b <= 'z'))) {
423 					return i + 1;
424 				}
425 			}
426 		} else {
427 			for (int i = this.length - 1; i >= 0; i--) {
428 				if (this.buffer[this.offset + i] > 32) {
429 					return i + 1;
430 				}
431 			}
432 		}
433 		return 0;
434 	}
435 
436 	@Override
437 	public String toString() {
438 		return new String(this.buffer, this.offset, this.length);
439 	}
440 
441 	public String toString(final int left, final int rightbound) {
442 		return new String(this.buffer, this.offset + left, rightbound - left);
443 	}
444 
445 	public Properties propParser() {
446 		// extract a=b or a="b" - relations from the buffer
447 		int pos = this.offset;
448 		int start;
449 		String key;
450 		final Properties p = new Properties();
451 		// eat up spaces at beginning
452 		while ((pos < this.length) && (this.buffer[pos] <= 32)) {
453 			pos++;
454 		}
455 		while (pos < this.length) {
456 			// pos is at start of next key
457 			start = pos;
458 			while ((pos < this.length) && (this.buffer[pos] != equal)) {
459 				pos++;
460 			}
461 			if (pos >= this.length) {
462 				break; // this is the case if we found no equal
463 			}
464 			key = new String(this.buffer, start, pos - start).trim().toLowerCase();
465 			// we have a key
466 			pos++;
467 			// find start of value
468 			while ((pos < this.length) && (this.buffer[pos] <= 32)) {
469 				pos++;
470 			}
471 			// doublequotes are obligatory. However, we want to be fuzzy if they
472 			// are ommittet
473 			if (pos >= this.length) {
474 				// error case: input ended too early
475 				break;
476 			} else if (this.buffer[pos] == doublequote) {
477 				// search next doublequote
478 				pos++;
479 				start = pos;
480 				while ((pos < this.length) && (this.buffer[pos] != doublequote)) {
481 					pos++;
482 				}
483 				if (pos >= this.length) {
484 					break; // this is the case if we found no parent doublequote
485 				}
486 				p.setProperty(key, new String(this.buffer, start, pos - start).trim());
487 				pos++;
488 			} else if (this.buffer[pos] == singlequote) {
489 				// search next singlequote
490 				pos++;
491 				start = pos;
492 				while ((pos < this.length) && (this.buffer[pos] != singlequote)) {
493 					pos++;
494 				}
495 				if (pos >= this.length) {
496 					break; // this is the case if we found no parent singlequote
497 				}
498 				p.setProperty(key, new String(this.buffer, start, pos - start).trim());
499 				pos++;
500 			} else {
501 				// search next whitespace
502 				start = pos;
503 				while ((pos < this.length) && (this.buffer[pos] > 32)) {
504 					pos++;
505 				}
506 				p.setProperty(key, new String(this.buffer, start, pos - start).trim());
507 			}
508 			// pos should point now to a whitespace: eat up spaces
509 			while ((pos < this.length) && (this.buffer[pos] <= 32)) {
510 				pos++;
511 				// go on with next loop
512 			}
513 		}
514 		return p;
515 	}
516 
517 	public static boolean equals(final char[] buffer, final char[] pattern) {
518 		return equals(buffer, 0, pattern);
519 	}
520 
521 	public static boolean equals(final char[] buffer, final int offset, final char[] pattern) {
522 		// compares two char arrays: true, if pattern appears completely at offset position
523 		if (buffer.length < offset + pattern.length) {
524 			return false;
525 		}
526 		for (int i = 0; i < pattern.length; i++) {
527 			if (buffer[offset + i] != pattern[i]) {
528 				return false;
529 			}
530 		}
531 		return true;
532 	}
533 
534 	public void reset() {
535 		this.length = 0;
536 		this.offset = 0;
537 	}
538 
539 	//	public void reset(final int newSize) {
540 	//		resize(newSize);
541 	//		this.reset();
542 	//	}
543 	//
544 	//	public void resize(final int newSize) {
545 	//		if (newSize < 0) {
546 	//			throw new IllegalArgumentException("Illegal array size: " + newSize);
547 	//		}
548 	//		if (LOG.isDebugEnabled()) {
549 	//			LOG.debug("Resize tag writer before: from " + this.buffer.length + " to " + newSize);
550 	//		}
551 	//		final char[] v = new char[newSize];
552 	//		System.arraycopy(this.buffer, 0, v, 0, newSize > this.buffer.length ? this.buffer.length : newSize);
553 	//		this.buffer = v;
554 	//	}
555 
556 	public char toCharArray()[] {
557 		final char[] newbuf = new char[this.length];
558 		System.arraycopy(this.buffer, 0, newbuf, 0, this.length);
559 		return newbuf;
560 	}
561 
562 	@Override
563 	public void close() throws IOException {
564 		// TODO Auto-generated method stub        
565 	}
566 
567 	@Override
568 	public void flush() throws IOException {
569 		// TODO Auto-generated method stub        
570 	}
571 
572 }