1 /***
2 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3 * Copyright (C) 2009 <berendona@users.sourceforge.net>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 package simplespider.simplespider.bot;
19
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.MalformedURLException;
23 import java.net.SocketTimeoutException;
24 import java.sql.SQLException;
25 import java.util.ArrayList;
26 import java.util.List;
27 import java.util.concurrent.TimeUnit;
28
29 import org.apache.commons.httpclient.CircularRedirectException;
30 import org.apache.commons.httpclient.URIException;
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33
34 import simplespider.simplespider.bot.extractor.LinkExtractor;
35 import simplespider.simplespider.bot.http.HttpClient;
36 import simplespider.simplespider.bot.http.HttpClientFactory;
37 import simplespider.simplespider.dao.DbHelper;
38 import simplespider.simplespider.dao.DbHelperFactory;
39 import simplespider.simplespider.dao.LinkDao;
40 import simplespider.simplespider.util.SimpleUrl;
41 import simplespider.simplespider.util.ValidityHelper;
42
43 public class CrawlerImpl implements Crawler {
44
45
46
47 private static final int SLEEP_SECONDS_ON_ERROR = 10;
48
49 private static final Log LOG = LogFactory.getLog(CrawlerImpl.class);
50
51 private final DbHelperFactory dbHelperFactory;
52 private final LinkExtractor linkExtractor;
53 private final HttpClientFactory httpClientFactory;
54
55 public CrawlerImpl(final DbHelperFactory dbHelperFactory, final LinkExtractor linkExtractor, final HttpClientFactory httpClientFactory) {
56 this.dbHelperFactory = dbHelperFactory;
57 this.linkExtractor = linkExtractor;
58 this.httpClientFactory = httpClientFactory;
59 }
60
61 private HttpClient getHttpConnection(final String baseUrl) {
62 final HttpClient httpClient = this.httpClientFactory.buildHttpClient();
63
64 try {
65 httpClient.createConnection(baseUrl);
66 } catch (final Exception e) {
67 if (e instanceof SocketTimeoutException) {
68 if (LOG.isInfoEnabled()) {
69 LOG.info("Failed to load URL \"" + baseUrl + "\": " + e);
70 }
71 } else if (e instanceof CircularRedirectException) {
72 if (LOG.isInfoEnabled()) {
73 LOG.info("Failed to load URL \"" + baseUrl + "\": " + e);
74 }
75 } else {
76 if (LOG.isInfoEnabled()) {
77 LOG.info("Failed to load URL \"" + baseUrl + "\"", e);
78 }
79 }
80 return null;
81 }
82
83 final int statusCode = httpClient.getStatusCode();
84 if (statusCode < 200 || statusCode >= 300) {
85 if (LOG.isInfoEnabled()) {
86 LOG.info("Failed to load URL \"" + baseUrl + "\":" + httpClient.getStatusLine());
87 }
88 httpClient.releaseConnection();
89 return null;
90 }
91
92 return httpClient;
93 }
94
95
96
97
98
99 public void crawl(final String baseUrl) {
100 ValidityHelper.checkNotEmpty("baseUrl", baseUrl);
101
102 try {
103 final HttpClient httpClient = getHttpConnection(baseUrl);
104 if (httpClient == null) {
105
106 setLinkUndone(baseUrl);
107
108 sleepOnError();
109 return;
110 }
111
112 final List<String> urls;
113 try {
114 urls = getLinks(baseUrl, httpClient);
115 } finally {
116
117 httpClient.releaseConnection();
118 }
119
120 if (urls == null) {
121
122 setLinkUndone(baseUrl);
123
124 sleepOnError();
125 } else {
126 saveLinks(urls);
127 }
128 } catch (final Exception e) {
129 LOG.warn("Failed to crawl URL \"" + baseUrl + "\"", e);
130 }
131 }
132
133 private void sleepOnError() {
134 try {
135 TimeUnit.SECONDS.sleep(SLEEP_SECONDS_ON_ERROR);
136 } catch (final InterruptedException e) {
137 if (LOG.isDebugEnabled()) {
138 LOG.debug("Sleep was interrupted", e);
139 }
140 }
141 }
142
143 private boolean isProtocolSupported(String url) {
144 url = url.trim();
145 final int p = url.indexOf(':');
146 if (p < 0) {
147 if (url.startsWith("www.")) {
148 return true;
149 }
150 if (LOG.isInfoEnabled()) {
151 LOG.info("Protocol is not given: " + url);
152 }
153 return false;
154 }
155
156 final String protocol = url.substring(0, p).trim().toLowerCase();
157 return "http".equals(protocol)
158 || "https".equals(protocol);
159 }
160
161 private void saveLinks(final List<String> urls) throws SQLException {
162 final DbHelper dbHelper = this.dbHelperFactory.buildDbHelper();
163 try {
164 final LinkDao linkDao = dbHelper.getLinkDao();
165
166 for (final String url : urls) {
167 if (!isProtocolSupported(url)) {
168 if (LOG.isDebugEnabled()) {
169 LOG.debug("Ignoring not supported protocol; url: " + url);
170 }
171 continue;
172 }
173
174 SimpleUrl simpleUrl;
175 try {
176 simpleUrl = new SimpleUrl(url);
177 } catch (final Exception e) {
178 if (LOG.isInfoEnabled()) {
179 LOG.info("Ignoring malformed URL \"" + url + "\"", e);
180 }
181 continue;
182 }
183
184 final String cleanedUrl = simpleUrl.toNormalform(false, true);
185 try {
186 linkDao.saveAndCommit(cleanedUrl);
187 } catch (final Exception e) {
188 LOG.warn("Failed to save url: " + cleanedUrl, e);
189 dbHelper.rollbackTransaction();
190 }
191 }
192 } finally {
193 try {
194 dbHelper.close();
195 } catch (final Exception e) {
196 LOG.warn("Failed to close database connection", e);
197 }
198 }
199 }
200
201 private List<String> getLinks(final String baseUrl, final HttpClient httpClient) throws SQLException, MalformedURLException {
202 final String realBaseUrl;
203 try {
204 realBaseUrl = httpClient.getRedirectedUrl();
205 } catch (final URIException e) {
206 LOG.warn("Failed to get URI after redirection for URL \"" + baseUrl + "\"", e);
207 return null;
208 }
209
210 final String cleanedRealBaseUrl = new SimpleUrl(realBaseUrl).toNormalform(false, true);
211
212 final InputStream bodyAsStream;
213 try {
214 bodyAsStream = httpClient.getResponseBodyAsStream();
215 } catch (final IOException e) {
216 LOG.warn("Failed to get body for url \"" + cleanedRealBaseUrl + "\"", e);
217 return null;
218 }
219
220 if (bodyAsStream == null) {
221 LOG.warn("Failed to get body for url \"" + cleanedRealBaseUrl + "\"");
222 return null;
223 }
224
225 final String mimeType = httpClient.getMimeType();
226
227
228 if (ValidityHelper.isEmpty(mimeType)
229 || isMimeSupported(mimeType)
230 ) {
231 try {
232 return this.linkExtractor.getUrls(bodyAsStream, cleanedRealBaseUrl);
233 } catch (final IOException e) {
234 LOG.warn("Failed to extract links from body for url \"" + cleanedRealBaseUrl + "\"", e);
235 return null;
236 }
237 } else {
238 if (isMimeExcluded(mimeType)) {
239 if (LOG.isDebugEnabled()) {
240 LOG.debug("Excluded mime type \"" + mimeType + "\": Ignoring URL \"" + baseUrl + "\"");
241 }
242 } else {
243 if (LOG.isInfoEnabled()) {
244 LOG.info("Not supporting mime type \"" + mimeType + "\": Ignoring URL \"" + baseUrl + "\"");
245 }
246 }
247 return new ArrayList<String>(0);
248 }
249 }
250
251 private boolean isMimeSupported(String mimeType) {
252 if (ValidityHelper.isEmpty(mimeType)) {
253 return false;
254 }
255
256 mimeType = mimeType.toLowerCase();
257 return "text/plain".equals(mimeType)
258 || "text/html".equals(mimeType)
259 || "text/xml".equals(mimeType)
260 || "text/x-opml".equals(mimeType)
261 || "text/x-opml+xml".equals(mimeType)
262 || "application/atom+xml".equals(mimeType)
263 || "application/atomcoll+xml".equals(mimeType)
264 || "application/atomserv+xml".equals(mimeType)
265 || "application/html+xml".equals(mimeType)
266 || "application/rdf+xml".equals(mimeType)
267 || "application/rss+xml".equals(mimeType)
268 || "application/xml".equals(mimeType);
269 }
270
271 private boolean isMimeExcluded(String mimeType) {
272 if (ValidityHelper.isEmpty(mimeType)) {
273 return false;
274 }
275
276 mimeType = mimeType.toLowerCase();
277 return mimeType.startsWith("image/")
278 || "text/css".equals(mimeType);
279 }
280
281 private void setLinkUndone(final String baseUrl) {
282 try {
283 final DbHelper dbHelper = this.dbHelperFactory.buildDbHelper();
284 try {
285 dbHelper.beginTransaction();
286 try {
287 final LinkDao linkDao = dbHelper.getLinkDao();
288 linkDao.saveForced(baseUrl);
289 } catch (final Exception e) {
290 try {
291 dbHelper.rollbackTransaction();
292 } catch (final Exception e2) {
293 LOG.warn("Failed to rollback connection", e2);
294 }
295 throw e;
296 }
297 } finally {
298 try {
299 dbHelper.close();
300 } catch (final Exception e) {
301 LOG.warn("Failed to close database connection", e);
302 }
303 }
304 } catch (final Exception e) {
305 LOG.warn("Failed to resave url: " + baseUrl, e);
306 }
307 }
308
309 }