View Javadoc

1   /***
2    * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3    * Copyright (C) 2009  <berendona@users.sourceforge.net>
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  package simplespider.simplespider.util;
19  
20  import java.net.MalformedURLException;
21  import java.util.regex.Matcher;
22  import java.util.regex.Pattern;
23  
24  import org.apache.commons.logging.Log;
25  import org.apache.commons.logging.LogFactory;
26  
27  import simplespider.simplespider.util.Punycode.PunycodeException;
28  
/**
 * This class exists to provide a system-wide normal form representation of URLs, and to avoid the DNS queries that using java.net.URL can cause.
 * <p>
 * Based on de.anomic.yacy.yacyURL
 */
33  public class SimpleUrl {
34  	private static final Log		LOG				= LogFactory.getLog(SimpleUrl.class);
35  
36  	private final Pattern			backPathPattern	= Pattern.compile("(/[^/]+(?<!///.{1,2})/)[.]{2}(?=/|$)|///.(?=/)|/(?=/)");
37  
38  	private int						port;
39  
40  	// class variables
41  	private String					protocol;
42  	private String					host;
43  	private String					userInfo;
44  	private String					path;
45  	private String					quest;
46  	private String					ref;
47  
48  	private final static String[]	hex				= { "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", "%08", "%09", "%0A", "%0B", "%0C",
49  			"%0D", "%0E", "%0F", "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
50  			"%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F", "%30", "%31", "%32",
51  			"%33", "%34", "%35", "%36", "%37", "%38", "%39", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", "%40", "%41", "%42", "%43", "%44", "%45",
52  			"%46", "%47", "%48", "%49", "%4A", "%4B", "%4C", "%4D", "%4E", "%4F", "%50", "%51", "%52", "%53", "%54", "%55", "%56", "%57", "%58",
53  			"%59", "%5A", "%5B", "%5C", "%5D", "%5E", "%5F", "%60", "%61", "%62", "%63", "%64", "%65", "%66", "%67", "%68", "%69", "%6A", "%6B",
54  			"%6C", "%6D", "%6E", "%6F", "%70", "%71", "%72", "%73", "%74", "%75", "%76", "%77", "%78", "%79", "%7A", "%7B", "%7C", "%7D", "%7E",
55  			"%7F", "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", "%90", "%91",
56  			"%92", "%93", "%94", "%95", "%96", "%97", "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", "%A0", "%A1", "%A2", "%A3", "%A4",
57  			"%A5", "%A6", "%A7", "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
58  			"%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", "%C8", "%C9", "%CA",
59  			"%CB", "%CC", "%CD", "%CE", "%CF", "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", "%D8", "%D9", "%DA", "%DB", "%DC", "%DD",
60  			"%DE", "%DF", "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", "%F0",
61  			"%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF" };
62  
63  	public SimpleUrl(final SimpleUrl baseURL, String relPath) throws MalformedURLException {
64  		if (baseURL == null) {
65  			throw new MalformedURLException("base URL is null");
66  		}
67  		if (relPath == null) {
68  			throw new MalformedURLException("relPath is null");
69  		}
70  
71  		this.protocol = baseURL.protocol;
72  		this.host = baseURL.host;
73  		this.port = baseURL.port;
74  		this.userInfo = baseURL.userInfo;
75  
76  		if (relPath.startsWith("//")) {
77  			// a "network-path reference" as defined in rfc2396 denotes
78  			// a relative path that uses the protocol from the base url
79  			relPath = baseURL.protocol + ":" + relPath;
80  		}
81  
82  		// FIXME baseURL.path could be null
83  
84  		if (isAbsolute(relPath)) {
85  			this.path = baseURL.path;
86  		} else if (relPath.startsWith("/")) {
87  			this.path = relPath;
88  		} else if (baseURL.path.endsWith("/")) {
89  			//			if (relPath.startsWith("#") //
90  			//					|| relPath.startsWith("?")) {
91  			//				throw new MalformedURLException("relative path malformed: " + relPath);
92  			//			}
93  			this.path = baseURL.path + relPath;
94  		} else {
95  			if (relPath.startsWith("#") //
96  					|| relPath.startsWith("?")) {
97  				this.path = baseURL.path + relPath;
98  			} else {
99  				final int q = baseURL.path.lastIndexOf('/');
100 				if (q < 0) {
101 					this.path = relPath;
102 				} else {
103 					this.path = baseURL.path.substring(0, q + 1) + relPath;
104 				}
105 			}
106 		}
107 
108 		this.quest = baseURL.quest;
109 		this.ref = baseURL.ref;
110 
111 		this.path = resolveBackpath(this.path);
112 		identRef();
113 		identQuest();
114 		// escape();
115 	}
116 
117 	public SimpleUrl(final String url) throws MalformedURLException {
118 		checkNotEmpty("url", url);
119 
120 		parseURLString(url);
121 	}
122 
123 	public SimpleUrl(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
124 		checkNotEmpty("protocol", protocol);
125 		checkNotEmpty("host", host);
126 
127 		this.protocol = protocol;
128 		this.host = host;
129 		this.port = port;
130 		this.path = path;
131 		identRef();
132 		identQuest();
133 		// escape();
134 	}
135 
136 	private static void checkNotEmpty(final String name, final CharSequence value) throws MalformedURLException {
137 		ValidityHelper.checkNotEmpty("name", name);
138 
139 		if (value == null) {
140 			throw new MalformedURLException(name + " is null");
141 		}
142 		if (ValidityHelper.isEmpty(value)) {
143 			throw new MalformedURLException(name + " is empty");
144 		}
145 	}
146 
147 	/***
148 	 * Copy constructor
149 	 * 
150 	 * @param baseURL
151 	 *            must not be <code>null</code>
152 	 * @throws NullPointerException
153 	 *             if <code>baseUrl</code> is <code>null</code>
154 	 */
155 	public SimpleUrl(final SimpleUrl baseURL) {
156 		ValidityHelper.checkNotNull("baseURL", baseURL);
157 
158 		this.host = baseURL.host;
159 		this.path = baseURL.path;
160 		this.port = baseURL.port;
161 		this.protocol = baseURL.protocol;
162 		this.quest = baseURL.quest;
163 		this.ref = baseURL.ref;
164 		this.userInfo = baseURL.userInfo;
165 	}
166 
167 	/***
168 	 * Encode a string to the "x-www-form-urlencoded" form, enhanced with the UTF-8-in-URL proposal. This is what happens: <ul> <li>The ASCII
169 	 * characters 'a' through 'z', 'A' through 'Z', and '0' through '9' remain the same. <li>The unreserved characters - _ . ! ~ * ' ( ) remain the
170 	 * same. <li>All other ASCII characters are converted into the 3-character string "%xy", where xy is the two-digit hexadecimal representation of
171 	 * the character code <li>All non-ASCII characters are encoded in two steps: first to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
172 	 * secondly each of these bytes is encoded as "%xx". </ul>
173 	 * 
174 	 * @param s
175 	 *            The string to be encoded
176 	 * @return The encoded string
177 	 */
178 	// from: http://www.w3.org/International/URLUTF8Encoder.java
179 	public static String escape(final String s) {
180 		final StringBuilder sbuf = new StringBuilder();
181 		final int len = s.length();
182 		for (int i = 0; i < len; i++) {
183 			final int ch = s.charAt(i);
184 			if ('A' <= ch && ch <= 'Z') { // 'A'..'Z'
185 				sbuf.append((char) ch);
186 			} else if ('a' <= ch && ch <= 'z') { // 'a'..'z'
187 				sbuf.append((char) ch);
188 			} else if ('0' <= ch && ch <= '9') { // '0'..'9'
189 				sbuf.append((char) ch);
190 			} else if (ch == ' ') { // space
191 				sbuf.append("%20");
192 			} else if (ch == '&'
193 					|| ch == ':' // unreserved
194 					|| ch == '-' || ch == '_' || ch == '.' || ch == '!' || ch == '~' || ch == '*' || ch == '\'' || ch == '(' || ch == ')'
195 					|| ch == ';') {
196 				sbuf.append((char) ch);
197 			} else if (ch <= 0x007f) { // other ASCII
198 				sbuf.append(hex[ch]);
199 			} else if (ch <= 0x07FF) { // non-ASCII <= 0x7FF
200 				sbuf.append(hex[0xc0 | (ch >> 6)]);
201 				sbuf.append(hex[0x80 | (ch & 0x3F)]);
202 			} else { // 0x7FF < ch <= 0xFFFF
203 				sbuf.append(hex[0xe0 | (ch >> 12)]);
204 				sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
205 				sbuf.append(hex[0x80 | (ch & 0x3F)]);
206 			}
207 		}
208 		return sbuf.toString();
209 	}
210 
211 	public static void main(final String[] args) {
212 		final String[][] test = new String[][] { new String[] { null, "http://www.anomic.de/home/test?x=1#home" },
213 				new String[] { null, "http://www.anomic.de/home/test?x=1" }, new String[] { null, "http://www.anomic.de/home/test#home" },
214 				new String[] { null, "ftp://ftp.anomic.de/home/test#home" }, new String[] { null, "http://www.anomic.de/home/../abc/" },
215 				new String[] { null, "mailto:abcdefg@nomailnomail.com" }, new String[] { "http://www.anomic.de/home", "test" },
216 				new String[] { "http://www.anomic.de/home", "test/" }, new String[] { "http://www.anomic.de/home/", "test" },
217 				new String[] { "http://www.anomic.de/home/", "test/" }, new String[] { "http://www.anomic.de/home/index.html", "test.htm" },
218 				new String[] { "http://www.anomic.de/home/index.html", "http://www.yacy.net/test" },
219 				new String[] { "http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test" },
220 				new String[] { "http://www.anomic.de/home/index.html", "../test" },
221 				new String[] { "http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com" }, new String[] { null, "news:de.test" },
222 				new String[] { "http://www.anomic.de/home", "news:de.test" },
223 				new String[] { "http://www.anomic.de/home", "ftp://ftp.anomic.de/src" }, new String[] { null, "ftp://ftp.delegate.org/" },
224 				new String[] { "http://www.anomic.de/home", "ftp://ftp.delegate.org/" },
225 				new String[] { "http://www.anomic.de", "mailto:yacy@weltherrschaft.org" }, new String[] { "http://www.anomic.de", "javascipt:temp" },
226 				new String[] { null, "http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history" },
227 				new String[] { null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" },
228 				new String[] { null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&amp;showuser=23585" } };
229 		String environment, url;
230 		SimpleUrl aURL, aURL1;
231 		java.net.URL jURL;
232 		for (int i = 0; i < test.length; i++) {
233 			environment = test[i][0];
234 			url = test[i][1];
235 			try {
236 				aURL = SimpleUrl.newURL(environment, url);
237 			} catch (final MalformedURLException e) {
238 				aURL = null;
239 			}
240 			if (environment == null) {
241 				try {
242 					jURL = new java.net.URL(url);
243 				} catch (final MalformedURLException e) {
244 					jURL = null;
245 				}
246 			} else {
247 				try {
248 					jURL = new java.net.URL(new java.net.URL(environment), url);
249 				} catch (final MalformedURLException e) {
250 					jURL = null;
251 				}
252 			}
253 
254 			// check equality to java.net.URL
255 			if (((aURL == null) && (jURL != null)) || ((aURL != null) && (jURL == null))
256 					|| ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) {
257 				System.out.println("Difference for environment=" + environment + ", url=" + url + ":");
258 				System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString());
259 				System.out.println((aURL == null) ? "aURL rejected input" : "aURL=" + aURL.toString());
260 			}
261 
262 			// check stability: the normalform of the normalform must be equal to the normalform
263 			if (aURL != null) {
264 				try {
265 					aURL1 = new SimpleUrl(aURL.toNormalform(true, true));
266 					if (!(aURL1.toNormalform(true, true).equals(aURL.toNormalform(true, true)))) {
267 						System.out.println("no stability for url:");
268 						System.out.println("aURL0=" + aURL.toString());
269 						System.out.println("aURL1=" + aURL1.toString());
270 					}
271 				} catch (final MalformedURLException e) {
272 					System.out.println("no stability for url:");
273 					System.out.println("aURL0=" + aURL.toString());
274 					System.out.println("aURL1 cannot be computed:" + e.getMessage());
275 				}
276 			}
277 		}
278 	}
279 
280 	// TODO Replace this logic by public constructor
281 	public static SimpleUrl newURL(final SimpleUrl baseURL, final String relPath) throws MalformedURLException {
282 		if (baseURL == null //
283 				|| isAbsolute(relPath)) {
284 			return new SimpleUrl(relPath);
285 		}
286 
287 		if (ValidityHelper.isEmpty(relPath)) {
288 			return new SimpleUrl(baseURL);
289 		}
290 
291 		return new SimpleUrl(baseURL, relPath);
292 	}
293 
294 	// TODO Replace this logic by public constructor
295 	public static SimpleUrl newURL(final String baseURL, final String relPath) throws MalformedURLException {
296 		if (baseURL == null //
297 				|| isAbsolute(relPath)) {
298 			return new SimpleUrl(relPath);
299 		}
300 
301 		if (ValidityHelper.isEmpty(relPath)) {
302 			return new SimpleUrl(baseURL);
303 		}
304 
305 		return new SimpleUrl(new SimpleUrl(baseURL), relPath);
306 	}
307 
308 	private static boolean isAbsolute(final String path) {
309 		if (ValidityHelper.isEmpty(path)) {
310 			return false;
311 		}
312 
313 		// Use only find, so we have no need to define the whole complex URI RegExp
314 		final Pattern protocalPattern = Pattern.compile("[a-zA-Z]+:");
315 		final Matcher protocolMatcher = protocalPattern.matcher(path);
316 		// If the expression is found AND is found at the beginning of given string, so we assume that is an absolute URI according to URI definition
317 		return protocolMatcher.find() && protocolMatcher.start() == 0;
318 	}
319 
320 	// from: http://www.w3.org/International/unescape.java
321 	public static String unescape(final String s) {
322 		final StringBuilder sbuf = new StringBuilder();
323 		final int l = s.length();
324 		int ch = -1;
325 		int b, sumb = 0;
326 		for (int i = 0, more = -1; i < l; i++) {
327 			/* Get next byte b from URL segment s */
328 			switch (ch = s.charAt(i)) {
329 			case '%':
330 				ch = s.charAt(++i);
331 				final int hb = (Character.isDigit((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
332 				ch = s.charAt(++i);
333 				final int lb = (Character.isDigit((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
334 				b = (hb << 4) | lb;
335 				break;
336 			case '+':
337 				b = ' ';
338 				break;
339 			default:
340 				b = ch;
341 			}
342 			/* Decode byte b as UTF-8, sumb collects incomplete chars */
343 			if ((b & 0xc0) == 0x80) { // 10xxxxxx (continuation byte)
344 				sumb = (sumb << 6) | (b & 0x3f); // Add 6 bits to sumb
345 				if (--more == 0) {
346 					sbuf.append((char) sumb); // Add char to sbuf
347 				}
348 			} else if ((b & 0x80) == 0x00) { // 0xxxxxxx (yields 7 bits)
349 				sbuf.append((char) b); // Store in sbuf
350 			} else if ((b & 0xe0) == 0xc0) { // 110xxxxx (yields 5 bits)
351 				sumb = b & 0x1f;
352 				more = 1; // Expect 1 more byte
353 			} else if ((b & 0xf0) == 0xe0) { // 1110xxxx (yields 4 bits)
354 				sumb = b & 0x0f;
355 				more = 2; // Expect 2 more bytes
356 			} else if ((b & 0xf8) == 0xf0) { // 11110xxx (yields 3 bits)
357 				sumb = b & 0x07;
358 				more = 3; // Expect 3 more bytes
359 			} else if ((b & 0xfc) == 0xf8) { // 111110xx (yields 2 bits)
360 				sumb = b & 0x03;
361 				more = 4; // Expect 4 more bytes
362 			} else /* if ((b & 0xfe) == 0xfc) */{ // 1111110x (yields 1 bit)
363 				sumb = b & 0x01;
364 				more = 5; // Expect 5 more bytes
365 			}
366 			/* We don't test if the UTF-8 encoding is well-formed */
367 		}
368 		return sbuf.toString();
369 	}
370 
371 	public int compareTo(final Object h) {
372 		assert (h instanceof SimpleUrl);
373 		return toString().compareTo(((SimpleUrl) h).toString());
374 	}
375 
376 	/*
377 	 * (non-Javadoc)
378 	 * @see java.lang.Object#equals(java.lang.Object)
379 	 */
380 	@Override
381 	public boolean equals(final Object obj) {
382 		if (this == obj) {
383 			return true;
384 		}
385 		if (obj == null) {
386 			return false;
387 		}
388 		if (!(obj instanceof SimpleUrl)) {
389 			return false;
390 		}
391 		final SimpleUrl other = (SimpleUrl) obj;
392 		if (this.host == null) {
393 			if (other.host != null) {
394 				return false;
395 			}
396 		} else if (!this.host.equals(other.host)) {
397 			return false;
398 		}
399 		if (this.path == null) {
400 			if (other.path != null) {
401 				return false;
402 			}
403 		} else if (!this.path.equals(other.path)) {
404 			return false;
405 		}
406 		if (this.port != other.port) {
407 			return false;
408 		}
409 		if (this.protocol == null) {
410 			if (other.protocol != null) {
411 				return false;
412 			}
413 		} else if (!this.protocol.equals(other.protocol)) {
414 			return false;
415 		}
416 		if (this.quest == null) {
417 			if (other.quest != null) {
418 				return false;
419 			}
420 		} else if (!this.quest.equals(other.quest)) {
421 			return false;
422 		}
423 		if (this.ref == null) {
424 			if (other.ref != null) {
425 				return false;
426 			}
427 		} else if (!this.ref.equals(other.ref)) {
428 			return false;
429 		}
430 		if (this.userInfo == null) {
431 			if (other.userInfo != null) {
432 				return false;
433 			}
434 		} else if (!this.userInfo.equals(other.userInfo)) {
435 			return false;
436 		}
437 		return true;
438 	}
439 
	/*
	 * The disabled code below escaped the following parts of the url this object already contains: <ul> <li>path: see {@link #escape(String)}</li>
	 * <li>ref: same as above</li> <li>quest: same as above without the ampersand ("&amp;") and the equals symbol</li> </ul>
	 */
444 	// private void escape() {
445 	// if (this.path != null && this.path.indexOf('%') == -1) {
446 	// escapePath();
447 	// }
448 	// if (this.quest != null && this.quest.indexOf('%') == -1) {
449 	// escapeQuest();
450 	// }
451 	// if (this.ref != null && this.ref.indexOf('%') == -1) {
452 	// escapeRef();
453 	// }
454 	// }
455 	// private void escapePath() {
456 	// final String[] pathp = this.path.split("/", -1);
457 	// String ptmp = "";
458 	// for (int i = 0; i < pathp.length; i++) {
459 	// ptmp += "/" + escape(pathp[i]);
460 	// }
461 	// this.path = ptmp.substring((ptmp.length() > 0) ? 1 : 0);
462 	// }
463 	//
464 	// private void escapeQuest() {
465 	// final String[] questp = this.quest.split("&", -1);
466 	// String qtmp = "";
467 	// for (int i = 0; i < questp.length; i++) {
468 	// if (questp[i].indexOf('=') != -1) {
469 	// qtmp += "&" + escape(questp[i].substring(0, questp[i].indexOf('=')));
470 	// qtmp += "=" + escape(questp[i].substring(questp[i].indexOf('=') + 1));
471 	// } else {
472 	// qtmp += "&" + escape(questp[i]);
473 	// }
474 	// }
475 	// this.quest = qtmp.substring((qtmp.length() > 0) ? 1 : 0);
476 	// }
477 	//
478 	// private void escapeRef() {
479 	// this.ref = escape(this.ref);
480 	// }
481 	public String getAuthority() {
482 		return ((this.port >= 0) && (this.host != null)) ? this.host + ":" + this.port : ((this.host != null) ? this.host : "");
483 	}
484 
485 	public String getFile() {
486 		return getFile(true);
487 	}
488 
489 	public String getFile(final boolean includeReference) {
490 		// this is the path plus quest plus ref
491 		// if there is no quest and no ref the result is identical to getPath
492 		// this is defined according to http://java.sun.com/j2se/1.4.2/docs/api/java/net/URL.html#getFile()
493 		final StringBuilder sb = new StringBuilder();
494 		sb.append(this.path);
495 
496 		if (!ValidityHelper.isEmpty(this.quest)) {
497 			sb.append('?').append(this.quest);
498 		}
499 
500 		if (includeReference && !ValidityHelper.isEmpty(this.ref)) {
501 			sb.append('#').append(this.ref);
502 		}
503 
504 		return sb.toString();
505 	}
506 
507 	public String getFileName() {
508 		// this is a method not defined in any sun api
509 		// it returns the last portion of a path without any reference
510 		final int p = this.path.lastIndexOf('/');
511 		if (p < 0) {
512 			return this.path;
513 		}
514 		if (p == this.path.length() - 1) {
515 			return ""; // no file name, this is a path to a directory
516 		}
517 		return this.path.substring(p + 1); // the 'real' file name
518 	}
519 
520 	public String getHost() {
521 		return this.host;
522 	}
523 
524 	public String getPath() {
525 		return this.path;
526 	}
527 
528 	public int getPort() {
529 		return this.port;
530 	}
531 
532 	public String getProtocol() {
533 		return this.protocol;
534 	}
535 
536 	public String getQuery() {
537 		return this.quest;
538 	}
539 
540 	public String getRef() {
541 		return this.ref;
542 	}
543 
544 	public String getUserInfo() {
545 		return this.userInfo;
546 	}
547 
548 	/*
549 	 * (non-Javadoc)
550 	 * @see java.lang.Object#hashCode()
551 	 */
552 	@Override
553 	public int hashCode() {
554 		final int prime = 31;
555 		int result = 1;
556 		result = prime * result + ((this.host == null) ? 0 : this.host.hashCode());
557 		result = prime * result + ((this.path == null) ? 0 : this.path.hashCode());
558 		result = prime * result + this.port;
559 		result = prime * result + ((this.protocol == null) ? 0 : this.protocol.hashCode());
560 		result = prime * result + ((this.quest == null) ? 0 : this.quest.hashCode());
561 		result = prime * result + ((this.ref == null) ? 0 : this.ref.hashCode());
562 		result = prime * result + ((this.userInfo == null) ? 0 : this.userInfo.hashCode());
563 		return result;
564 	}
565 
566 	private void identPort(final String inputURL, final int dflt) throws MalformedURLException {
567 		// identify ref in file
568 		final int r = this.host.indexOf(':');
569 		if (r < 0) {
570 			this.port = dflt;
571 		} else {
572 			try {
573 				final String portStr = this.host.substring(r + 1);
574 				if (portStr.trim().length() > 0) {
575 					this.port = Integer.parseInt(portStr);
576 				} else {
577 					this.port = -1;
578 				}
579 				this.host = this.host.substring(0, r);
580 			} catch (final NumberFormatException e) {
581 				throw new MalformedURLException("wrong port in host fragment '" + this.host + "' of input url '" + inputURL + "'");
582 			}
583 		}
584 	}
585 
586 	private void identQuest() {
587 		// identify quest in file
588 		final int r = this.path.indexOf('?');
589 		if (r < 0) {
590 			this.quest = null;
591 		} else {
592 			this.quest = this.path.substring(r + 1);
593 			this.path = this.path.substring(0, r);
594 		}
595 	}
596 
597 	private void identRef() {
598 		// identify ref in file
599 		final int r = this.path.indexOf('#');
600 		if (r < 0) {
601 			this.ref = null;
602 		} else {
603 			this.ref = this.path.substring(r + 1);
604 			this.path = this.path.substring(0, r);
605 		}
606 	}
607 
608 	public boolean isCGI() {
609 		final String ls = this.path.toLowerCase();
610 		return ((ls.indexOf(".cgi") >= 0) || (ls.indexOf(".exe") >= 0) || (ls.indexOf(";jsessionid=") >= 0) || (ls.indexOf("sessionid/") >= 0)
611 				|| (ls.indexOf("phpsessid=") >= 0) || (ls.indexOf("search.php?sid=") >= 0) || (ls.indexOf("memberlist.php?sid=") >= 0));
612 	}
613 
614 	public boolean isPOST() {
615 		return !ValidityHelper.isEmpty(this.quest);
616 	}
617 
618 	// language calculation
619 	public String language() {
620 		String language = "en";
621 		final int pos = this.host.lastIndexOf(".");
622 		if ((pos > 0) && (this.host.length() - pos == 3)) {
623 			language = this.host.substring(pos + 1).toLowerCase();
624 		}
625 		return language;
626 	}
627 
628 	private void parseURLString(String url) throws MalformedURLException {
629 		// identify protocol
630 		assert (url != null);
631 		url = url.trim();
632 		int p = url.indexOf(':');
633 		if (p < 0) {
634 			if (url.startsWith("www.")) {
635 				url = "http://" + url;
636 				p = 4;
637 			} else {
638 				throw new MalformedURLException("protocol is not given in '" + url + "'");
639 			}
640 		}
641 		this.protocol = url.substring(0, p).toLowerCase().trim();
642 		if (url.length() < p + 4) {
643 			throw new MalformedURLException("URL not parseable: '" + url + "'");
644 		}
645 		if (url.substring(p + 1, p + 3).equals("//")) {
646 			// identify host, userInfo and file for http and ftp protocol
647 			final int q = url.indexOf('/', p + 3);
648 			int r;
649 			if (q < 0) {
650 				if ((r = url.indexOf('@', p + 3)) < 0) {
651 					this.host = url.substring(p + 3);
652 					this.userInfo = null;
653 				} else {
654 					this.host = url.substring(r + 1);
655 					this.userInfo = url.substring(p + 3, r);
656 				}
657 				this.path = "/";
658 			} else {
659 				this.host = url.substring(p + 3, q);
660 				if ((r = this.host.indexOf('@')) < 0) {
661 					this.userInfo = null;
662 				} else {
663 					this.userInfo = this.host.substring(0, r);
664 					this.host = this.host.substring(r + 1);
665 				}
666 				this.path = url.substring(q);
667 			}
668 
669 			this.path = resolveBackpath(this.path);
670 			identPort(url, (this.protocol.equals("http") ? 80 : ((this.protocol.equals("https")) ? 443 : ((this.protocol.equals("ftp")) ? 21 : -1))));
671 			identRef();
672 			identQuest();
673 			// escape();
674 		} else {
675 			// this is not a http or ftp url
676 			if (this.protocol.equals("mailto")) {
677 				// parse email url
678 				final int q = url.indexOf('@', p + 3);
679 				if (q < 0) {
680 					throw new MalformedURLException("wrong email address: " + url);
681 				}
682 				this.userInfo = url.substring(p + 1, q);
683 				this.host = url.substring(q + 1);
684 				this.path = null;
685 				this.port = -1;
686 				this.quest = null;
687 				this.ref = null;
688 			} else if (this.protocol.equals("javascript")) {
689 				// parse email url
690 				this.userInfo = null;
691 				this.host = null;
692 				this.path = url.substring(p + 1);
693 				this.port = -1;
694 				this.quest = null;
695 				this.ref = null;
696 			} else {
697 				throw new MalformedURLException("unknown protocol: " + url);
698 			}
699 		}
700 
701 		// handle international domains
702 		if (this.host != null // 
703 				&& !Punycode.isBasic(this.host)) {
704 			try {
705 				final int d1 = this.host.lastIndexOf('.');
706 				if (d1 >= 0) {
707 					final String tld = this.host.substring(d1 + 1);
708 					final String dom = this.host.substring(0, d1);
709 					final int d0 = dom.lastIndexOf('.');
710 					if (d0 >= 0) {
711 						this.host = dom.substring(0, d0) + ".xn--" + Punycode.encode(dom.substring(d0 + 1)) + "." + tld;
712 					} else {
713 						this.host = "xn--" + Punycode.encode(dom) + "." + tld;
714 					}
715 				}
716 			} catch (final PunycodeException e) {
717 				LOG.warn("Failed to handle international domain: \"" + this.host + "\"", e);
718 			}
719 		}
720 	}
721 
722 	/*** resolve '..' */
723 	private String resolveBackpath(String myPath) {
724 		if (myPath.length() == 0 || myPath.charAt(0) != '/') {
725 			myPath = "/" + myPath;
726 		}
727 
728 		final Matcher matcher = this.backPathPattern.matcher(myPath);
729 		while (matcher.find()) {
730 			myPath = matcher.replaceAll("");
731 			matcher.reset(myPath);
732 		}
733 
734 		return myPath.equals("") ? "/" : myPath;
735 	}
736 
737 	public String toNormalform(final boolean includeReference) {
738 		// generates a normal form of the URL
739 		if (this.protocol.equals("mailto")) {
740 			return this.protocol + ":" + this.userInfo + "@" + this.host;
741 		}
742 
743 		final String resolvedPath = resolveBackpath(this.getFile(includeReference));
744 
745 		return this.protocol + "://" //
746 				+ ((this.userInfo != null) ? (this.userInfo + "@") : ("")) //
747 				+ ((getHost() != null) ? getHost().toLowerCase() : ("")) //
748 				+ (hasDefaultPort() ? "" : ":" + this.port) //
749 				+ resolvedPath;
750 	}
751 
752 	public String toNormalform(final boolean includeReference, final boolean stripAmp) {
753 		String result = toNormalform(includeReference);
754 		if (stripAmp) {
755 			result = result.replaceAll("&amp;", "&");
756 		}
757 		return result;
758 	}
759 
760 	private boolean hasDefaultPort() {
761 		return this.port < 0 //
762 				|| (this.port == 21 && this.protocol.equals("ftp")) //
763 				|| (this.port == 80 && this.protocol.equals("http")) //
764 				|| (this.port == 443 && this.protocol.equals("https"));
765 	}
766 
767 	@Override
768 	public String toString() {
769 		return toNormalform(true, false);
770 	}
771 }