View Javadoc

1   /***
2    * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3    * Copyright (C) 2009  <berendona@users.sourceforge.net>
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  package simplespider.simplespider.bot;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.MalformedURLException;
23  import java.net.SocketTimeoutException;
24  import java.sql.SQLException;
25  import java.util.ArrayList;
26  import java.util.List;
27  import java.util.concurrent.TimeUnit;
28  
29  import org.apache.commons.httpclient.CircularRedirectException;
30  import org.apache.commons.httpclient.URIException;
31  import org.apache.commons.logging.Log;
32  import org.apache.commons.logging.LogFactory;
33  
34  import simplespider.simplespider.bot.extractor.LinkExtractor;
35  import simplespider.simplespider.bot.http.HttpClient;
36  import simplespider.simplespider.bot.http.HttpClientFactory;
37  import simplespider.simplespider.dao.DbHelper;
38  import simplespider.simplespider.dao.DbHelperFactory;
39  import simplespider.simplespider.dao.LinkDao;
40  import simplespider.simplespider.util.SimpleUrl;
41  import simplespider.simplespider.util.ValidityHelper;
42  
43  public class CrawlerImpl implements Crawler {
44  
45  	// TODO Sleeping on error should be solved not here
46  	// TODO Configure this
47  	private static final int		SLEEP_SECONDS_ON_ERROR	= 10;
48  
49  	private static final Log		LOG						= LogFactory.getLog(CrawlerImpl.class);
50  
51  	private final DbHelperFactory	dbHelperFactory;
52  	private final LinkExtractor		linkExtractor;
53  	private final HttpClientFactory	httpClientFactory;
54  
55  	public CrawlerImpl(final DbHelperFactory dbHelperFactory, final LinkExtractor linkExtractor, final HttpClientFactory httpClientFactory) {
56  		this.dbHelperFactory = dbHelperFactory;
57  		this.linkExtractor = linkExtractor;
58  		this.httpClientFactory = httpClientFactory;
59  	}
60  
61  	private HttpClient getHttpConnection(final String baseUrl) {
62  		final HttpClient httpClient = this.httpClientFactory.buildHttpClient();
63  
64  		try {
65  			httpClient.createConnection(baseUrl);
66  		} catch (final Exception e) {
67  			if (e instanceof SocketTimeoutException) {
68  				if (LOG.isInfoEnabled()) {
69  					LOG.info("Failed to load URL \"" + baseUrl + "\": " + e);
70  				}
71  			} else if (e instanceof CircularRedirectException) {
72  				if (LOG.isInfoEnabled()) {
73  					LOG.info("Failed to load URL \"" + baseUrl + "\": " + e);
74  				}
75  			} else {
76  				if (LOG.isInfoEnabled()) {
77  					LOG.info("Failed to load URL \"" + baseUrl + "\"", e);
78  				}
79  			}
80  			return null;
81  		}
82  
83  		final int statusCode = httpClient.getStatusCode();
84  		if (statusCode < 200 || statusCode >= 300) {
85  			if (LOG.isInfoEnabled()) {
86  				LOG.info("Failed to load URL \"" + baseUrl + "\":" + httpClient.getStatusLine());
87  			}
88  			httpClient.releaseConnection();
89  			return null;
90  		}
91  
92  		return httpClient;
93  	}
94  
95  	/*
96  	 * (non-Javadoc)
97  	 * @see simplespider.simplespider_core.bot.Crawler#crawl(java.lang.String)
98  	 */
99  	public void crawl(final String baseUrl) {
100 		ValidityHelper.checkNotEmpty("baseUrl", baseUrl);
101 
102 		try {
103 			final HttpClient httpClient = getHttpConnection(baseUrl);
104 			if (httpClient == null) {
105 				// Error occurs, try it later
106 				setLinkUndone(baseUrl);
107 				// Slow down thread
108 				sleepOnError();
109 				return;
110 			}
111 
112 			final List<String> urls;
113 			try {
114 				urls = getLinks(baseUrl, httpClient);
115 			} finally {
116 				// clean up the connection resources
117 				httpClient.releaseConnection();
118 			}
119 
120 			if (urls == null) {
121 				// Error occurs, try it later
122 				setLinkUndone(baseUrl);
123 				// Slow down thread
124 				sleepOnError();
125 			} else {
126 				saveLinks(urls);
127 			}
128 		} catch (final Exception e) {
129 			LOG.warn("Failed to crawl URL \"" + baseUrl + "\"", e);
130 		}
131 	}
132 
133 	private void sleepOnError() {
134 		try {
135 			TimeUnit.SECONDS.sleep(SLEEP_SECONDS_ON_ERROR);
136 		} catch (final InterruptedException e) {
137 			if (LOG.isDebugEnabled()) {
138 				LOG.debug("Sleep was interrupted", e);
139 			}
140 		}
141 	}
142 
143 	private boolean isProtocolSupported(String url) {
144 		url = url.trim();
145 		final int p = url.indexOf(':');
146 		if (p < 0) {
147 			if (url.startsWith("www.")) {
148 				return true;
149 			}
150 			if (LOG.isInfoEnabled()) {
151 				LOG.info("Protocol is not given: " + url);
152 			}
153 			return false;
154 		}
155 
156 		final String protocol = url.substring(0, p).trim().toLowerCase();
157 		return "http".equals(protocol) // 
158 				|| "https".equals(protocol);
159 	}
160 
161 	private void saveLinks(final List<String> urls) throws SQLException {
162 		final DbHelper dbHelper = this.dbHelperFactory.buildDbHelper();
163 		try {
164 			final LinkDao linkDao = dbHelper.getLinkDao();
165 
166 			for (final String url : urls) {
167 				if (!isProtocolSupported(url)) {
168 					if (LOG.isDebugEnabled()) {
169 						LOG.debug("Ignoring not supported protocol; url: " + url);
170 					}
171 					continue;
172 				}
173 
174 				SimpleUrl simpleUrl;
175 				try {
176 					simpleUrl = new SimpleUrl(url);
177 				} catch (final Exception e) {
178 					if (LOG.isInfoEnabled()) {
179 						LOG.info("Ignoring malformed URL \"" + url + "\"", e);
180 					}
181 					continue;
182 				}
183 
184 				final String cleanedUrl = simpleUrl.toNormalform(false, true);
185 				try {
186 					linkDao.saveAndCommit(cleanedUrl);
187 				} catch (final Exception e) {
188 					LOG.warn("Failed to save url: " + cleanedUrl, e);
189 					dbHelper.rollbackTransaction();
190 				}
191 			}
192 		} finally {
193 			try {
194 				dbHelper.close();
195 			} catch (final Exception e) {
196 				LOG.warn("Failed to close database connection", e);
197 			}
198 		}
199 	}
200 
201 	private List<String> getLinks(final String baseUrl, final HttpClient httpClient) throws SQLException, MalformedURLException {
202 		final String realBaseUrl;
203 		try {
204 			realBaseUrl = httpClient.getRedirectedUrl();
205 		} catch (final URIException e) {
206 			LOG.warn("Failed to get URI after redirection for URL \"" + baseUrl + "\"", e);
207 			return null;
208 		}
209 
210 		final String cleanedRealBaseUrl = new SimpleUrl(realBaseUrl).toNormalform(false, true);
211 
212 		final InputStream bodyAsStream;
213 		try {
214 			bodyAsStream = httpClient.getResponseBodyAsStream();
215 		} catch (final IOException e) {
216 			LOG.warn("Failed to get body for url \"" + cleanedRealBaseUrl + "\"", e);
217 			return null;
218 		}
219 
220 		if (bodyAsStream == null) {
221 			LOG.warn("Failed to get body for url \"" + cleanedRealBaseUrl + "\"");
222 			return null;
223 		}
224 
225 		final String mimeType = httpClient.getMimeType();
226 		// Only supporting HTTP and mime type plain and html
227 		// If not mime type is defined, so hope it will be plain or html ;-)
228 		if (ValidityHelper.isEmpty(mimeType) //
229 				|| isMimeSupported(mimeType) //
230 		) {
231 			try {
232 				return this.linkExtractor.getUrls(bodyAsStream, cleanedRealBaseUrl);
233 			} catch (final IOException e) {
234 				LOG.warn("Failed to extract links from body for url \"" + cleanedRealBaseUrl + "\"", e);
235 				return null;
236 			}
237 		} else {
238 			if (isMimeExcluded(mimeType)) {
239 				if (LOG.isDebugEnabled()) {
240 					LOG.debug("Excluded mime type \"" + mimeType + "\": Ignoring URL \"" + baseUrl + "\"");
241 				}
242 			} else {
243 				if (LOG.isInfoEnabled()) {
244 					LOG.info("Not supporting mime type \"" + mimeType + "\": Ignoring URL \"" + baseUrl + "\"");
245 				}
246 			}
247 			return new ArrayList<String>(0);
248 		}
249 	}
250 
251 	private boolean isMimeSupported(String mimeType) {
252 		if (ValidityHelper.isEmpty(mimeType)) {
253 			return false;
254 		}
255 
256 		mimeType = mimeType.toLowerCase();
257 		return "text/plain".equals(mimeType) //
258 				|| "text/html".equals(mimeType) //
259 				|| "text/xml".equals(mimeType) //
260 				|| "text/x-opml".equals(mimeType) //
261 				|| "text/x-opml+xml".equals(mimeType) //
262 				|| "application/atom+xml".equals(mimeType) //
263 				|| "application/atomcoll+xml".equals(mimeType) //
264 				|| "application/atomserv+xml".equals(mimeType) //
265 				|| "application/html+xml".equals(mimeType) //
266 				|| "application/rdf+xml".equals(mimeType) //
267 				|| "application/rss+xml".equals(mimeType) //
268 				|| "application/xml".equals(mimeType);
269 	}
270 
271 	private boolean isMimeExcluded(String mimeType) {
272 		if (ValidityHelper.isEmpty(mimeType)) {
273 			return false;
274 		}
275 
276 		mimeType = mimeType.toLowerCase();
277 		return mimeType.startsWith("image/") //
278 				|| "text/css".equals(mimeType);
279 	}
280 
281 	private void setLinkUndone(final String baseUrl) {
282 		try {
283 			final DbHelper dbHelper = this.dbHelperFactory.buildDbHelper();
284 			try {
285 				dbHelper.beginTransaction();
286 				try {
287 					final LinkDao linkDao = dbHelper.getLinkDao();
288 					linkDao.saveForced(baseUrl);
289 				} catch (final Exception e) {
290 					try {
291 						dbHelper.rollbackTransaction();
292 					} catch (final Exception e2) {
293 						LOG.warn("Failed to rollback connection", e2);
294 					}
295 					throw e;
296 				}
297 			} finally {
298 				try {
299 					dbHelper.close();
300 				} catch (final Exception e) {
301 					LOG.warn("Failed to close database connection", e);
302 				}
303 			}
304 		} catch (final Exception e) {
305 			LOG.warn("Failed to resave url: " + baseUrl, e);
306 		}
307 	}
308 
309 }