View Javadoc

1   /***
2    * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3    * Copyright (C) 2009  <berendona@users.sourceforge.net>
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  package simplespider.simplespider.bot.extractor.html.stream;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.InputStreamReader;
23  import java.io.Reader;
24  import java.net.MalformedURLException;
25  import java.util.ArrayList;
26  import java.util.List;
27  
28  import org.apache.commons.logging.Log;
29  import org.apache.commons.logging.LogFactory;
30  
31  import simplespider.simplespider.bot.extractor.LinkExtractor;
32  import simplespider.simplespider.util.SimpleUrl;
33  import simplespider.simplespider.util.StringUtils;
34  import simplespider.simplespider.util.ValidityHelper;
35  
36  public class StreamExtractor implements LinkExtractor {
37  	private static final Log	LOG					= LogFactory.getLog(StreamExtractor.class);
38  
39  	// TODO Configure this
40  	private static final int	DEFAULT_BUFFER_SIZE	= 4096;
41  
42  	private final int			maxUrlLength;
43  
44  	public StreamExtractor(final int maxUrlLength) {
45  		this.maxUrlLength = maxUrlLength;
46  	}
47  
48  	public List<String> getUrls(final InputStream body, final String baseUrl) throws IOException {
49  		ValidityHelper.checkNotNull("body", body);
50  
51  		final TagListenerImpl listener = new TagListenerImpl();
52  		final HtmlWriter htmlWriter = new HtmlWriter(true, listener, DEFAULT_BUFFER_SIZE);
53  
54  		parse(body, htmlWriter, baseUrl);
55  
56  		final List<String> links = getLinks(baseUrl, listener.getLinks());
57  
58  		return links;
59  	}
60  
61  	private List<String> getLinks(final String baseUrl, final List<String> extractedLinks) throws MalformedURLException {
62  		final SimpleUrl url = new SimpleUrl(baseUrl);
63  		final List<String> links = new ArrayList<String>(extractedLinks.size());
64  		for (final String reference : extractedLinks) {
65  			if (reference.contains("<") || reference.contains(">")) {
66  				LOG.warn("Ignoring possible invalid reference based on URL \"" + baseUrl + "\":\n" + StringUtils.clipping(reference, 128));
67  				continue;
68  			}
69  			try {
70  				final SimpleUrl newUrl = SimpleUrl.newURL(url, reference);
71  				if (newUrl == null) {
72  					if (LOG.isDebugEnabled()) {
73  						LOG.debug("Ignoring reference \"" + reference + "\" based on URL \"" + baseUrl + "\", because it contains nothing");
74  					}
75  					continue;
76  				}
77  				final String normalformedUrl = newUrl.toNormalform(false, true);
78  
79  				if (normalformedUrl.length() > this.maxUrlLength) {
80  					if (LOG.isDebugEnabled()) {
81  						LOG.debug("Ignoring reference \"" + reference + "\" based on URL \"" + baseUrl + "\", because its size is greater than "
82  								+ this.maxUrlLength);
83  					}
84  					continue;
85  				}
86  				links.add(normalformedUrl);
87  			} catch (final Exception e) {
88  				LOG.warn("Ignoring reference \"" + reference + "\" based on URL \"" + baseUrl + "\"", e);
89  			}
90  		}
91  		return links;
92  	}
93  
94  	private void parse(final InputStream sourceStream, final HtmlWriter target, final String baseUrl) throws IOException {
95  		final Reader source = new InputStreamReader(sourceStream);
96  		final char[] buffer = new char[DEFAULT_BUFFER_SIZE];
97  		long count = 0;
98  
99  		for (int n = 0; -1 != (n = source.read(buffer));) {
100 			target.write(buffer, 0, n);
101 			count += n;
102 
103 			if (target.binarySuspect()) {
104 				if (LOG.isInfoEnabled()) {
105 					LOG.info("Skip binary content: \"" + baseUrl + "\"");
106 				}
107 				break;
108 			}
109 		}
110 		target.flush();
111 
112 		if (LOG.isDebugEnabled()) {
113 			LOG.debug("Loaded url \"" + baseUrl + "\": " + count + " bytes");
114 		}
115 
116 		target.close();
117 	}
118 
119 }