1 /***
2 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3 * Copyright (C) 2009 <berendona@users.sourceforge.net>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 package simplespider.simplespider.bot.extractor.html.stream;
19
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.io.Reader;
24 import java.net.MalformedURLException;
25 import java.util.ArrayList;
26 import java.util.List;
27
28 import org.apache.commons.logging.Log;
29 import org.apache.commons.logging.LogFactory;
30
31 import simplespider.simplespider.bot.extractor.LinkExtractor;
32 import simplespider.simplespider.util.SimpleUrl;
33 import simplespider.simplespider.util.StringUtils;
34 import simplespider.simplespider.util.ValidityHelper;
35
36 public class StreamExtractor implements LinkExtractor {
37 private static final Log LOG = LogFactory.getLog(StreamExtractor.class);
38
39
40 private static final int DEFAULT_BUFFER_SIZE = 4096;
41
42 private final int maxUrlLength;
43
44 public StreamExtractor(final int maxUrlLength) {
45 this.maxUrlLength = maxUrlLength;
46 }
47
48 public List<String> getUrls(final InputStream body, final String baseUrl) throws IOException {
49 ValidityHelper.checkNotNull("body", body);
50
51 final TagListenerImpl listener = new TagListenerImpl();
52 final HtmlWriter htmlWriter = new HtmlWriter(true, listener, DEFAULT_BUFFER_SIZE);
53
54 parse(body, htmlWriter, baseUrl);
55
56 final List<String> links = getLinks(baseUrl, listener.getLinks());
57
58 return links;
59 }
60
61 private List<String> getLinks(final String baseUrl, final List<String> extractedLinks) throws MalformedURLException {
62 final SimpleUrl url = new SimpleUrl(baseUrl);
63 final List<String> links = new ArrayList<String>(extractedLinks.size());
64 for (final String reference : extractedLinks) {
65 if (reference.contains("<") || reference.contains(">")) {
66 LOG.warn("Ignoring possible invalid reference based on URL \"" + baseUrl + "\":\n" + StringUtils.clipping(reference, 128));
67 continue;
68 }
69 try {
70 final SimpleUrl newUrl = SimpleUrl.newURL(url, reference);
71 if (newUrl == null) {
72 if (LOG.isDebugEnabled()) {
73 LOG.debug("Ignoring reference \"" + reference + "\" based on URL \"" + baseUrl + "\", because it contains nothing");
74 }
75 continue;
76 }
77 final String normalformedUrl = newUrl.toNormalform(false, true);
78
79 if (normalformedUrl.length() > this.maxUrlLength) {
80 if (LOG.isDebugEnabled()) {
81 LOG.debug("Ignoring reference \"" + reference + "\" based on URL \"" + baseUrl + "\", because its size is greater than "
82 + this.maxUrlLength);
83 }
84 continue;
85 }
86 links.add(normalformedUrl);
87 } catch (final Exception e) {
88 LOG.warn("Ignoring reference \"" + reference + "\" based on URL \"" + baseUrl + "\"", e);
89 }
90 }
91 return links;
92 }
93
94 private void parse(final InputStream sourceStream, final HtmlWriter target, final String baseUrl) throws IOException {
95 final Reader source = new InputStreamReader(sourceStream);
96 final char[] buffer = new char[DEFAULT_BUFFER_SIZE];
97 long count = 0;
98
99 for (int n = 0; -1 != (n = source.read(buffer));) {
100 target.write(buffer, 0, n);
101 count += n;
102
103 if (target.binarySuspect()) {
104 if (LOG.isInfoEnabled()) {
105 LOG.info("Skip binary content: \"" + baseUrl + "\"");
106 }
107 break;
108 }
109 }
110 target.flush();
111
112 if (LOG.isDebugEnabled()) {
113 LOG.debug("Loaded url \"" + baseUrl + "\": " + count + " bytes");
114 }
115
116 target.close();
117 }
118
119 }