1 /***
2 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3 * Copyright (C) 2009 <berendona@users.sourceforge.net>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 package simplespider.simplespider.util;
19
20 import java.net.MalformedURLException;
21 import java.util.regex.Matcher;
22 import java.util.regex.Pattern;
23
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26
27 import simplespider.simplespider.util.Punycode.PunycodeException;
28
/**
 * This class exists to provide a system-wide normal-form representation of URLs, and to avoid the DNS queries that can be caused by using
 * java.net.URL. <p /> Based on de.anomic.yacy.yacyURL
 */
33 public class SimpleUrl {
34 private static final Log LOG = LogFactory.getLog(SimpleUrl.class);
35
36 private final Pattern backPathPattern = Pattern.compile("(/[^/]+(?<!///.{1,2})/)[.]{2}(?=/|$)|///.(?=/)|/(?=/)");
37
38 private int port;
39
40
41 private String protocol;
42 private String host;
43 private String userInfo;
44 private String path;
45 private String quest;
46 private String ref;
47
48 private final static String[] hex = { "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", "%08", "%09", "%0A", "%0B", "%0C",
49 "%0D", "%0E", "%0F", "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F",
50 "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", "%28", "%29", "%2A", "%2B", "%2C", "%2D", "%2E", "%2F", "%30", "%31", "%32",
51 "%33", "%34", "%35", "%36", "%37", "%38", "%39", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", "%40", "%41", "%42", "%43", "%44", "%45",
52 "%46", "%47", "%48", "%49", "%4A", "%4B", "%4C", "%4D", "%4E", "%4F", "%50", "%51", "%52", "%53", "%54", "%55", "%56", "%57", "%58",
53 "%59", "%5A", "%5B", "%5C", "%5D", "%5E", "%5F", "%60", "%61", "%62", "%63", "%64", "%65", "%66", "%67", "%68", "%69", "%6A", "%6B",
54 "%6C", "%6D", "%6E", "%6F", "%70", "%71", "%72", "%73", "%74", "%75", "%76", "%77", "%78", "%79", "%7A", "%7B", "%7C", "%7D", "%7E",
55 "%7F", "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", "%90", "%91",
56 "%92", "%93", "%94", "%95", "%96", "%97", "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", "%A0", "%A1", "%A2", "%A3", "%A4",
57 "%A5", "%A6", "%A7", "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7",
58 "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", "%C8", "%C9", "%CA",
59 "%CB", "%CC", "%CD", "%CE", "%CF", "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", "%D8", "%D9", "%DA", "%DB", "%DC", "%DD",
60 "%DE", "%DF", "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", "%F0",
61 "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF" };
62
/**
 * Creates a URL by resolving a reference against a base URL.
 * <ul>
 * <li>A reference starting with "//" takes over only the protocol of the base and is parsed as an absolute URL.</li>
 * <li>Any other absolute reference (one with its own protocol) is parsed on its own; the base is ignored.</li>
 * <li>A reference starting with "/" replaces the path of the base.</li>
 * <li>An empty reference, or one starting with "?" or "#", is appended to the complete base path; a reference consisting only of a fragment
 * (or nothing) also keeps the query of the base.</li>
 * <li>Everything else is resolved relative to the directory of the base path.</li>
 * </ul>
 *
 * @param baseURL
 *            the URL the reference is resolved against, must not be <code>null</code>
 * @param relPath
 *            the reference, must not be <code>null</code>
 * @throws MalformedURLException
 *             if an argument is <code>null</code>, if an absolute reference cannot be parsed, or if the base has no path a relative
 *             reference could be resolved against
 */
public SimpleUrl(final SimpleUrl baseURL, String relPath) throws MalformedURLException {
    if (baseURL == null) {
        throw new MalformedURLException("base URL is null");
    }
    if (relPath == null) {
        throw new MalformedURLException("relPath is null");
    }

    if (relPath.startsWith("//")) {
        // network-path reference: only the protocol is inherited from the base
        relPath = baseURL.protocol + ":" + relPath;
    }

    if (isAbsolute(relPath)) {
        // an absolute reference replaces the base completely instead of silently keeping the base path
        parseURLString(relPath);
        return;
    }

    this.protocol = baseURL.protocol;
    this.host = baseURL.host;
    this.port = baseURL.port;
    this.userInfo = baseURL.userInfo;

    final String basePath = baseURL.path;
    final boolean sameDocument = relPath.length() == 0 || relPath.startsWith("#");
    if (relPath.startsWith("/")) {
        this.path = relPath;
    } else if (basePath == null) {
        // e.g. a mailto base: there is nothing to resolve against
        throw new MalformedURLException("base URL has no path, cannot resolve '" + relPath + "'");
    } else if (basePath.endsWith("/") || sameDocument || relPath.startsWith("?")) {
        this.path = basePath + relPath;
    } else {
        final int q = basePath.lastIndexOf('/');
        this.path = (q < 0) ? relPath : basePath.substring(0, q + 1) + relPath;
    }

    // split off reference and query first, so that normalising the path cannot touch slashes inside of them
    identRef();
    identQuest();
    if (sameDocument) {
        // a fragment-only (or empty) reference addresses the same resource, so the query of the base is kept
        this.quest = baseURL.quest;
    }
    this.path = resolveBackpath(this.path);
}
116
/**
 * Creates a URL by parsing the given string.
 *
 * @param url
 *            the URL string to parse
 * @throws MalformedURLException
 *             if <code>url</code> is <code>null</code> or empty, or if it cannot be parsed
 */
public SimpleUrl(final String url) throws MalformedURLException {
    // reject null and empty input before any parsing starts
    checkNotEmpty("url", url);
    parseURLString(url);
}
122
/**
 * Creates a URL from its separate parts. A reference ("#...") and a query ("?...") contained in <code>path</code> are split off and stored
 * separately. The user info is left unset.
 *
 * @param protocol
 *            the protocol, must not be <code>null</code> or empty
 * @param host
 *            the host, must not be <code>null</code> or empty
 * @param port
 *            the port; stored as given
 * @param path
 *            the path, optionally followed by query and reference; must not be <code>null</code>
 * @throws MalformedURLException
 *             if <code>protocol</code> or <code>host</code> is <code>null</code> or empty, or if <code>path</code> is <code>null</code>
 */
public SimpleUrl(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
    checkNotEmpty("protocol", protocol);
    checkNotEmpty("host", host);
    if (path == null) {
        // report a missing path like the other arguments instead of failing with a NullPointerException below
        throw new MalformedURLException("path is null");
    }

    this.protocol = protocol;
    this.host = host;
    this.port = port;
    this.path = path;
    identRef();
    identQuest();
}
135
136 private static void checkNotEmpty(final String name, final CharSequence value) throws MalformedURLException {
137 ValidityHelper.checkNotEmpty("name", name);
138
139 if (value == null) {
140 throw new MalformedURLException(name + " is null");
141 }
142 if (ValidityHelper.isEmpty(value)) {
143 throw new MalformedURLException(name + " is empty");
144 }
145 }
146
/**
 * Copy constructor: takes over every part of the given URL unchanged.
 *
 * @param baseURL
 *            the URL to copy, must not be <code>null</code>
 * @throws NullPointerException
 *             if <code>baseURL</code> is <code>null</code>
 */
public SimpleUrl(final SimpleUrl baseURL) {
    ValidityHelper.checkNotNull("baseURL", baseURL);

    this.protocol = baseURL.protocol;
    this.userInfo = baseURL.userInfo;
    this.host = baseURL.host;
    this.port = baseURL.port;
    this.path = baseURL.path;
    this.quest = baseURL.quest;
    this.ref = baseURL.ref;
}
166
167 /***
168 * Encode a string to the "x-www-form-urlencoded" form, enhanced with the UTF-8-in-URL proposal. This is what happens: <ul> <li>The ASCII
169 * characters 'a' through 'z', 'A' through 'Z', and '0' through '9' remain the same. <li>The unreserved characters - _ . ! ~ * ' ( ) remain the
170 * same. <li>All other ASCII characters are converted into the 3-character string "%xy", where xy is the two-digit hexadecimal representation of
171 * the character code <li>All non-ASCII characters are encoded in two steps: first to a sequence of 2 or 3 bytes, using the UTF-8 algorithm;
172 * secondly each of these bytes is encoded as "%xx". </ul>
173 *
174 * @param s
175 * The string to be encoded
176 * @return The encoded string
177 */
178
179 public static String escape(final String s) {
180 final StringBuilder sbuf = new StringBuilder();
181 final int len = s.length();
182 for (int i = 0; i < len; i++) {
183 final int ch = s.charAt(i);
184 if ('A' <= ch && ch <= 'Z') {
185 sbuf.append((char) ch);
186 } else if ('a' <= ch && ch <= 'z') {
187 sbuf.append((char) ch);
188 } else if ('0' <= ch && ch <= '9') {
189 sbuf.append((char) ch);
190 } else if (ch == ' ') {
191 sbuf.append("%20");
192 } else if (ch == '&'
193 || ch == ':'
194 || ch == '-' || ch == '_' || ch == '.' || ch == '!' || ch == '~' || ch == '*' || ch == '\'' || ch == '(' || ch == ')'
195 || ch == ';') {
196 sbuf.append((char) ch);
197 } else if (ch <= 0x007f) {
198 sbuf.append(hex[ch]);
199 } else if (ch <= 0x07FF) {
200 sbuf.append(hex[0xc0 | (ch >> 6)]);
201 sbuf.append(hex[0x80 | (ch & 0x3F)]);
202 } else {
203 sbuf.append(hex[0xe0 | (ch >> 12)]);
204 sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
205 sbuf.append(hex[0x80 | (ch & 0x3F)]);
206 }
207 }
208 return sbuf.toString();
209 }
210
211 public static void main(final String[] args) {
212 final String[][] test = new String[][] { new String[] { null, "http://www.anomic.de/home/test?x=1#home" },
213 new String[] { null, "http://www.anomic.de/home/test?x=1" }, new String[] { null, "http://www.anomic.de/home/test#home" },
214 new String[] { null, "ftp://ftp.anomic.de/home/test#home" }, new String[] { null, "http://www.anomic.de/home/../abc/" },
215 new String[] { null, "mailto:abcdefg@nomailnomail.com" }, new String[] { "http://www.anomic.de/home", "test" },
216 new String[] { "http://www.anomic.de/home", "test/" }, new String[] { "http://www.anomic.de/home/", "test" },
217 new String[] { "http://www.anomic.de/home/", "test/" }, new String[] { "http://www.anomic.de/home/index.html", "test.htm" },
218 new String[] { "http://www.anomic.de/home/index.html", "http://www.yacy.net/test" },
219 new String[] { "http://www.anomic.de/home/index.html", "ftp://ftp.yacy.net/test" },
220 new String[] { "http://www.anomic.de/home/index.html", "../test" },
221 new String[] { "http://www.anomic.de/home/index.html", "mailto:abcdefg@nomailnomail.com" }, new String[] { null, "news:de.test" },
222 new String[] { "http://www.anomic.de/home", "news:de.test" },
223 new String[] { "http://www.anomic.de/home", "ftp://ftp.anomic.de/src" }, new String[] { null, "ftp://ftp.delegate.org/" },
224 new String[] { "http://www.anomic.de/home", "ftp://ftp.delegate.org/" },
225 new String[] { "http://www.anomic.de", "mailto:yacy@weltherrschaft.org" }, new String[] { "http://www.anomic.de", "javascipt:temp" },
226 new String[] { null, "http://yacy-websuche.de/wiki/index.php?title=De:IntroInformationFreedom&action=history" },
227 new String[] { null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" },
228 new String[] { null, "http://diskusjion.no/index.php?s=5bad5f431a106d9a8355429b81bb0ca5&showuser=23585" } };
229 String environment, url;
230 SimpleUrl aURL, aURL1;
231 java.net.URL jURL;
232 for (int i = 0; i < test.length; i++) {
233 environment = test[i][0];
234 url = test[i][1];
235 try {
236 aURL = SimpleUrl.newURL(environment, url);
237 } catch (final MalformedURLException e) {
238 aURL = null;
239 }
240 if (environment == null) {
241 try {
242 jURL = new java.net.URL(url);
243 } catch (final MalformedURLException e) {
244 jURL = null;
245 }
246 } else {
247 try {
248 jURL = new java.net.URL(new java.net.URL(environment), url);
249 } catch (final MalformedURLException e) {
250 jURL = null;
251 }
252 }
253
254
255 if (((aURL == null) && (jURL != null)) || ((aURL != null) && (jURL == null))
256 || ((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) {
257 System.out.println("Difference for environment=" + environment + ", url=" + url + ":");
258 System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString());
259 System.out.println((aURL == null) ? "aURL rejected input" : "aURL=" + aURL.toString());
260 }
261
262
263 if (aURL != null) {
264 try {
265 aURL1 = new SimpleUrl(aURL.toNormalform(true, true));
266 if (!(aURL1.toNormalform(true, true).equals(aURL.toNormalform(true, true)))) {
267 System.out.println("no stability for url:");
268 System.out.println("aURL0=" + aURL.toString());
269 System.out.println("aURL1=" + aURL1.toString());
270 }
271 } catch (final MalformedURLException e) {
272 System.out.println("no stability for url:");
273 System.out.println("aURL0=" + aURL.toString());
274 System.out.println("aURL1 cannot be computed:" + e.getMessage());
275 }
276 }
277 }
278 }
279
280
281 public static SimpleUrl newURL(final SimpleUrl baseURL, final String relPath) throws MalformedURLException {
282 if (baseURL == null
283 || isAbsolute(relPath)) {
284 return new SimpleUrl(relPath);
285 }
286
287 if (ValidityHelper.isEmpty(relPath)) {
288 return new SimpleUrl(baseURL);
289 }
290
291 return new SimpleUrl(baseURL, relPath);
292 }
293
294
295 public static SimpleUrl newURL(final String baseURL, final String relPath) throws MalformedURLException {
296 if (baseURL == null
297 || isAbsolute(relPath)) {
298 return new SimpleUrl(relPath);
299 }
300
301 if (ValidityHelper.isEmpty(relPath)) {
302 return new SimpleUrl(baseURL);
303 }
304
305 return new SimpleUrl(new SimpleUrl(baseURL), relPath);
306 }
307
308 private static boolean isAbsolute(final String path) {
309 if (ValidityHelper.isEmpty(path)) {
310 return false;
311 }
312
313
314 final Pattern protocalPattern = Pattern.compile("[a-zA-Z]+:");
315 final Matcher protocolMatcher = protocalPattern.matcher(path);
316
317 return protocolMatcher.find() && protocolMatcher.start() == 0;
318 }
319
320
321 public static String unescape(final String s) {
322 final StringBuilder sbuf = new StringBuilder();
323 final int l = s.length();
324 int ch = -1;
325 int b, sumb = 0;
326 for (int i = 0, more = -1; i < l; i++) {
327
328 switch (ch = s.charAt(i)) {
329 case '%':
330 ch = s.charAt(++i);
331 final int hb = (Character.isDigit((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
332 ch = s.charAt(++i);
333 final int lb = (Character.isDigit((char) ch) ? ch - '0' : 10 + Character.toLowerCase((char) ch) - 'a') & 0xF;
334 b = (hb << 4) | lb;
335 break;
336 case '+':
337 b = ' ';
338 break;
339 default:
340 b = ch;
341 }
342
343 if ((b & 0xc0) == 0x80) {
344 sumb = (sumb << 6) | (b & 0x3f);
345 if (--more == 0) {
346 sbuf.append((char) sumb);
347 }
348 } else if ((b & 0x80) == 0x00) {
349 sbuf.append((char) b);
350 } else if ((b & 0xe0) == 0xc0) {
351 sumb = b & 0x1f;
352 more = 1;
353 } else if ((b & 0xf0) == 0xe0) {
354 sumb = b & 0x0f;
355 more = 2;
356 } else if ((b & 0xf8) == 0xf0) {
357 sumb = b & 0x07;
358 more = 3;
359 } else if ((b & 0xfc) == 0xf8) {
360 sumb = b & 0x03;
361 more = 4;
362 } else
363 sumb = b & 0x01;
364 more = 5;
365 }
366
367 }
368 return sbuf.toString();
369 }
370
371 public int compareTo(final Object h) {
372 assert (h instanceof SimpleUrl);
373 return toString().compareTo(((SimpleUrl) h).toString());
374 }
375
376
377
378
379
380 @Override
381 public boolean equals(final Object obj) {
382 if (this == obj) {
383 return true;
384 }
385 if (obj == null) {
386 return false;
387 }
388 if (!(obj instanceof SimpleUrl)) {
389 return false;
390 }
391 final SimpleUrl other = (SimpleUrl) obj;
392 if (this.host == null) {
393 if (other.host != null) {
394 return false;
395 }
396 } else if (!this.host.equals(other.host)) {
397 return false;
398 }
399 if (this.path == null) {
400 if (other.path != null) {
401 return false;
402 }
403 } else if (!this.path.equals(other.path)) {
404 return false;
405 }
406 if (this.port != other.port) {
407 return false;
408 }
409 if (this.protocol == null) {
410 if (other.protocol != null) {
411 return false;
412 }
413 } else if (!this.protocol.equals(other.protocol)) {
414 return false;
415 }
416 if (this.quest == null) {
417 if (other.quest != null) {
418 return false;
419 }
420 } else if (!this.quest.equals(other.quest)) {
421 return false;
422 }
423 if (this.ref == null) {
424 if (other.ref != null) {
425 return false;
426 }
427 } else if (!this.ref.equals(other.ref)) {
428 return false;
429 }
430 if (this.userInfo == null) {
431 if (other.userInfo != null) {
432 return false;
433 }
434 } else if (!this.userInfo.equals(other.userInfo)) {
435 return false;
436 }
437 return true;
438 }
439
/*
 * NOTE: the method described by this comment does not follow it here. The description was: escapes the following parts of the URL that
 * this object already contains: <ul> <li>path: see {@link #escape(String)}</li> <li>ref: same as above</li> <li>quest: same as above, but
 * without escaping the ampersand ("&") and the equals symbol</li> </ul>
 */
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481 public String getAuthority() {
482 return ((this.port >= 0) && (this.host != null)) ? this.host + ":" + this.port : ((this.host != null) ? this.host : "");
483 }
484
485 public String getFile() {
486 return getFile(true);
487 }
488
489 public String getFile(final boolean includeReference) {
490
491
492
493 final StringBuilder sb = new StringBuilder();
494 sb.append(this.path);
495
496 if (!ValidityHelper.isEmpty(this.quest)) {
497 sb.append('?').append(this.quest);
498 }
499
500 if (includeReference && !ValidityHelper.isEmpty(this.ref)) {
501 sb.append('#').append(this.ref);
502 }
503
504 return sb.toString();
505 }
506
507 public String getFileName() {
508
509
510 final int p = this.path.lastIndexOf('/');
511 if (p < 0) {
512 return this.path;
513 }
514 if (p == this.path.length() - 1) {
515 return "";
516 }
517 return this.path.substring(p + 1);
518 }
519
520 public String getHost() {
521 return this.host;
522 }
523
524 public String getPath() {
525 return this.path;
526 }
527
528 public int getPort() {
529 return this.port;
530 }
531
532 public String getProtocol() {
533 return this.protocol;
534 }
535
536 public String getQuery() {
537 return this.quest;
538 }
539
540 public String getRef() {
541 return this.ref;
542 }
543
544 public String getUserInfo() {
545 return this.userInfo;
546 }
547
548
549
550
551
552 @Override
553 public int hashCode() {
554 final int prime = 31;
555 int result = 1;
556 result = prime * result + ((this.host == null) ? 0 : this.host.hashCode());
557 result = prime * result + ((this.path == null) ? 0 : this.path.hashCode());
558 result = prime * result + this.port;
559 result = prime * result + ((this.protocol == null) ? 0 : this.protocol.hashCode());
560 result = prime * result + ((this.quest == null) ? 0 : this.quest.hashCode());
561 result = prime * result + ((this.ref == null) ? 0 : this.ref.hashCode());
562 result = prime * result + ((this.userInfo == null) ? 0 : this.userInfo.hashCode());
563 return result;
564 }
565
566 private void identPort(final String inputURL, final int dflt) throws MalformedURLException {
567
568 final int r = this.host.indexOf(':');
569 if (r < 0) {
570 this.port = dflt;
571 } else {
572 try {
573 final String portStr = this.host.substring(r + 1);
574 if (portStr.trim().length() > 0) {
575 this.port = Integer.parseInt(portStr);
576 } else {
577 this.port = -1;
578 }
579 this.host = this.host.substring(0, r);
580 } catch (final NumberFormatException e) {
581 throw new MalformedURLException("wrong port in host fragment '" + this.host + "' of input url '" + inputURL + "'");
582 }
583 }
584 }
585
586 private void identQuest() {
587
588 final int r = this.path.indexOf('?');
589 if (r < 0) {
590 this.quest = null;
591 } else {
592 this.quest = this.path.substring(r + 1);
593 this.path = this.path.substring(0, r);
594 }
595 }
596
597 private void identRef() {
598
599 final int r = this.path.indexOf('#');
600 if (r < 0) {
601 this.ref = null;
602 } else {
603 this.ref = this.path.substring(r + 1);
604 this.path = this.path.substring(0, r);
605 }
606 }
607
608 public boolean isCGI() {
609 final String ls = this.path.toLowerCase();
610 return ((ls.indexOf(".cgi") >= 0) || (ls.indexOf(".exe") >= 0) || (ls.indexOf(";jsessionid=") >= 0) || (ls.indexOf("sessionid/") >= 0)
611 || (ls.indexOf("phpsessid=") >= 0) || (ls.indexOf("search.php?sid=") >= 0) || (ls.indexOf("memberlist.php?sid=") >= 0));
612 }
613
614 public boolean isPOST() {
615 return !ValidityHelper.isEmpty(this.quest);
616 }
617
618
619 public String language() {
620 String language = "en";
621 final int pos = this.host.lastIndexOf(".");
622 if ((pos > 0) && (this.host.length() - pos == 3)) {
623 language = this.host.substring(pos + 1).toLowerCase();
624 }
625 return language;
626 }
627
628 private void parseURLString(String url) throws MalformedURLException {
629
630 assert (url != null);
631 url = url.trim();
632 int p = url.indexOf(':');
633 if (p < 0) {
634 if (url.startsWith("www.")) {
635 url = "http://" + url;
636 p = 4;
637 } else {
638 throw new MalformedURLException("protocol is not given in '" + url + "'");
639 }
640 }
641 this.protocol = url.substring(0, p).toLowerCase().trim();
642 if (url.length() < p + 4) {
643 throw new MalformedURLException("URL not parseable: '" + url + "'");
644 }
645 if (url.substring(p + 1, p + 3).equals("//")) {
646
647 final int q = url.indexOf('/', p + 3);
648 int r;
649 if (q < 0) {
650 if ((r = url.indexOf('@', p + 3)) < 0) {
651 this.host = url.substring(p + 3);
652 this.userInfo = null;
653 } else {
654 this.host = url.substring(r + 1);
655 this.userInfo = url.substring(p + 3, r);
656 }
657 this.path = "/";
658 } else {
659 this.host = url.substring(p + 3, q);
660 if ((r = this.host.indexOf('@')) < 0) {
661 this.userInfo = null;
662 } else {
663 this.userInfo = this.host.substring(0, r);
664 this.host = this.host.substring(r + 1);
665 }
666 this.path = url.substring(q);
667 }
668
669 this.path = resolveBackpath(this.path);
670 identPort(url, (this.protocol.equals("http") ? 80 : ((this.protocol.equals("https")) ? 443 : ((this.protocol.equals("ftp")) ? 21 : -1))));
671 identRef();
672 identQuest();
673
674 } else {
675
676 if (this.protocol.equals("mailto")) {
677
678 final int q = url.indexOf('@', p + 3);
679 if (q < 0) {
680 throw new MalformedURLException("wrong email address: " + url);
681 }
682 this.userInfo = url.substring(p + 1, q);
683 this.host = url.substring(q + 1);
684 this.path = null;
685 this.port = -1;
686 this.quest = null;
687 this.ref = null;
688 } else if (this.protocol.equals("javascript")) {
689
690 this.userInfo = null;
691 this.host = null;
692 this.path = url.substring(p + 1);
693 this.port = -1;
694 this.quest = null;
695 this.ref = null;
696 } else {
697 throw new MalformedURLException("unknown protocol: " + url);
698 }
699 }
700
701
702 if (this.host != null
703 && !Punycode.isBasic(this.host)) {
704 try {
705 final int d1 = this.host.lastIndexOf('.');
706 if (d1 >= 0) {
707 final String tld = this.host.substring(d1 + 1);
708 final String dom = this.host.substring(0, d1);
709 final int d0 = dom.lastIndexOf('.');
710 if (d0 >= 0) {
711 this.host = dom.substring(0, d0) + ".xn--" + Punycode.encode(dom.substring(d0 + 1)) + "." + tld;
712 } else {
713 this.host = "xn--" + Punycode.encode(dom) + "." + tld;
714 }
715 }
716 } catch (final PunycodeException e) {
717 LOG.warn("Failed to handle international domain: \"" + this.host + "\"", e);
718 }
719 }
720 }
721
722 /*** resolve '..' */
723 private String resolveBackpath(String myPath) {
724 if (myPath.length() == 0 || myPath.charAt(0) != '/') {
725 myPath = "/" + myPath;
726 }
727
728 final Matcher matcher = this.backPathPattern.matcher(myPath);
729 while (matcher.find()) {
730 myPath = matcher.replaceAll("");
731 matcher.reset(myPath);
732 }
733
734 return myPath.equals("") ? "/" : myPath;
735 }
736
737 public String toNormalform(final boolean includeReference) {
738
739 if (this.protocol.equals("mailto")) {
740 return this.protocol + ":" + this.userInfo + "@" + this.host;
741 }
742
743 final String resolvedPath = resolveBackpath(this.getFile(includeReference));
744
745 return this.protocol + "://" //
746 + ((this.userInfo != null) ? (this.userInfo + "@") : (""))
747 + ((getHost() != null) ? getHost().toLowerCase() : (""))
748 + (hasDefaultPort() ? "" : ":" + this.port)
749 + resolvedPath;
750 }
751
752 public String toNormalform(final boolean includeReference, final boolean stripAmp) {
753 String result = toNormalform(includeReference);
754 if (stripAmp) {
755 result = result.replaceAll("&", "&");
756 }
757 return result;
758 }
759
760 private boolean hasDefaultPort() {
761 return this.port < 0
762 || (this.port == 21 && this.protocol.equals("ftp"))
763 || (this.port == 80 && this.protocol.equals("http"))
764 || (this.port == 443 && this.protocol.equals("https"));
765 }
766
767 @Override
768 public String toString() {
769 return toNormalform(true, false);
770 }
771 }