/***
 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
 * Copyright (C) 2009 <berendona@users.sourceforge.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package simplespider.simplespider;

import java.io.File;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import simplespider.simplespider.bot.Crawler;
import simplespider.simplespider.bot.CrawlerImpl;
import simplespider.simplespider.bot.CrawlerRunner;
import simplespider.simplespider.bot.extractor.LinkExtractor;
import simplespider.simplespider.bot.extractor.html.stream.StreamExtractor;
import simplespider.simplespider.bot.http.HttpClientFactory;
import simplespider.simplespider.bot.http.apache.ApacheHttpClientFactory;
import simplespider.simplespider.dao.DbHelper;
import simplespider.simplespider.dao.DbHelperFactory;
import simplespider.simplespider.dao.LinkDao;
import simplespider.simplespider.dao.db4o.Db4oDbHelperFactory;
import simplespider.simplespider.importing.EntityImporter;
import simplespider.simplespider.importing.simplefile.SimpleFileImporter;

/***
 * Main entry point of the Simple Web Spider: sets up the database and HTTP
 * client factories, imports bootstrap links and runs the crawler thread pool
 * until no more links are available or a shutdown is requested.
 */
public class Main {
	private static final Log LOG = LogFactory.getLog(Main.class);

	private static final String PID_FILENAME_KEY = "sws.daemon.pidfile";
	private static final String PID_FILENAME_DEFAULT = "simple-web-spider.pid";

	private static final String LINK_IMPORT_FILENAME = "bootstrapping.txt";

	/** Maximum time in seconds to wait for running crawler threads. */
	private static final int WAIT_FOR_THREAD_ON_SHUTDOWN = 3 * 60;

	/** Maximum number of crawler threads running concurrently. */
	private static final int MAX_CONCURRENT_THREADS = 4;

	/** Maximum number of URLs handed to crawler threads per minute. */
	private static final int MAX_THREADS_PER_MINUTE = 10;

	/** URLs longer than this are ignored by the link extractor. */
	private static final int MAX_URL_LENGTH = 1024;

	private static Thread mainThread;

	private static Thread getMainDaemonThread() {
		return mainThread;
	}

	private volatile boolean cancelled = false;
	private final DbHelperFactory dbHelperFactory;
	private final HttpClientFactory httpClientFactory;
	private Thread listener;

	private Main(final DbHelperFactory dbHelperFactory, final HttpClientFactory httpClientFactory) {
		this.dbHelperFactory = dbHelperFactory;
		this.httpClientFactory = httpClientFactory;
	}

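	/**
	 * Prepares the process for running as a daemon: remembers the current
	 * thread as the main daemon thread and schedules the PID file for
	 * deletion on JVM exit. Note that this method never writes the PID file;
	 * it is presumably created by the wrapper script starting this process.
	 */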
	private static void daemonize() {
		mainThread = Thread.currentThread();
		getPidFile().deleteOnExit();
	}

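	/**
	 * Resolves the PID file location from the system property
	 * {@code sws.daemon.pidfile}, falling back to
	 * {@code simple-web-spider.pid} in the working directory.
	 */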
	private static File getPidFile() {
		return new File(System.getProperty(PID_FILENAME_KEY, PID_FILENAME_DEFAULT));
	}

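	/**
	 * Registers a JVM shutdown hook that requests a graceful stop: it sets
	 * the {@link #cancelled} flag, which makes the crawl loop exit, and then
	 * joins the main thread so the JVM keeps running until the crawler has
	 * finished its work.
	 */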
	private void startCancelListener() {
		this.listener = new Thread() {
			@Override
			public void run() {
				LOG.warn("Shutdown requested, stopping crawler...");
				Main.this.cancelled = true;

				try {
					getMainDaemonThread().join();
				} catch (final InterruptedException e) {
					LOG.error("Interrupted while waiting on main daemon thread to complete.");
				}
			}
		};

		if (LOG.isInfoEnabled()) {
			LOG.info("Add shutdown hook...");
		}
		this.listener.setDaemon(true);
		Runtime.getRuntime().addShutdownHook(this.listener);
	}

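	/**
	 * Deregisters the shutdown hook and interrupts it. Called before an
	 * abnormal exit, presumably so that {@code System.exit(1)} is not blocked
	 * by the hook joining the main thread.
	 */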
	private void interruptCancelListener() {
		if (LOG.isInfoEnabled()) {
			LOG.info("Interrupting cancel listener...");
		}
		Runtime.getRuntime().removeShutdownHook(this.listener);
		this.listener.interrupt();
	}

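	/**
	 * Runs one full crawler session: opens the database, imports bootstrap
	 * links, crawls until no links remain or a shutdown is requested, and
	 * finally shuts the thread pool and the database down.
	 */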
	private void runCrawler() throws SQLException {
		if (LOG.isInfoEnabled()) {
			LOG.info("Start crawler...");
			LOG.info("Open database connection... This may take some time...");
		}
		final DbHelper db = this.dbHelperFactory.buildDbHelper();
		try {
			if (LOG.isInfoEnabled()) {
				LOG.info("Importing bootstrap links...");
			}
			importLinks(LINK_IMPORT_FILENAME, this.dbHelperFactory);
			db.commitTransaction();

			final LimitThroughPut limitThroughPut = new LimitThroughPut(Main.MAX_THREADS_PER_MINUTE);
			if (LOG.isInfoEnabled()) {
				LOG.info("Crawl LINK entries...");
			}

			// Fixed-size pool with an unbounded queue: at most
			// MAX_CONCURRENT_THREADS crawlers run at the same time.
			final ThreadPoolExecutor threadPool = new ThreadPoolExecutor(MAX_CONCURRENT_THREADS, MAX_CONCURRENT_THREADS, 0L, TimeUnit.MILLISECONDS,
					new LinkedBlockingQueue<Runnable>());

			runCrawler(this.dbHelperFactory, this.httpClientFactory, threadPool, limitThroughPut, false);

			if (LOG.isInfoEnabled()) {
				LOG.info("Invoke shutting down threads...");
			}
			threadPool.shutdown();
			try {
				threadPool.awaitTermination(WAIT_FOR_THREAD_ON_SHUTDOWN, TimeUnit.SECONDS);
			} catch (final InterruptedException e) {
				LOG.warn("Failed to wait for termination of all threads", e);
			}
			threadPool.shutdownNow();
			if (LOG.isInfoEnabled()) {
				LOG.info("Crawler stops");
			}
		} finally {
			if (LOG.isInfoEnabled()) {
				LOG.info("Shutting down database");
			}
			db.shutdown();
		}
	}

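	/**
	 * Imports bootstrap links from the given file into the link database via
	 * {@link SimpleFileImporter} and logs how many links were imported.
	 */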
	private void importLinks(final String filename, final DbHelperFactory dbHelperFactory) {
		final EntityImporter importer = new SimpleFileImporter(filename);
		final long importedLinks = importer.importLink(dbHelperFactory);
		if (LOG.isInfoEnabled()) {
			LOG.info("Imported links from file \"" + filename + "\": " + importedLinks);
		}
	}

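	/**
	 * The main crawl loop: takes the next URL from the link queue (throttled
	 * by {@code limitThroughPut}), hands it to a {@link CrawlerRunner} on the
	 * thread pool and repeats until cancelled or no more links are available.
	 * In bootstrapping mode the loop simply ends once the queue runs dry.
	 */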
	private void runCrawler(final DbHelperFactory dbHelperFactory, final HttpClientFactory httpClientFactory, final ThreadPoolExecutor threadPool,
			final LimitThroughPut limitThroughPut, final boolean bootstrapping) throws SQLException {
		final DbHelper db = dbHelperFactory.buildDbHelper();
		try {
			final LinkDao linkDao = db.getLinkDao();

			int retryCountOnNoLinks = 0;
			while (!this.cancelled) {

				// Block until starting another crawl is allowed by the
				// per-minute limit.
				limitThroughPut.next();

				String next;
				try {
					next = linkDao.removeNextAndCommit();
				} catch (final RuntimeException e) {
					LOG.error("Failed to get next URL", e);
					try {
						db.rollbackTransaction();
					} catch (final Exception e2) {
						LOG.error("Failed to rollback database transaction", e2);
					}
					next = null;
				}

				if (next == null) {
					if (bootstrapping) {
						if (LOG.isInfoEnabled()) {
							LOG.info("Bootstrapping: No more links available...");
						}
						break;
					}

					// Give up if no crawler is running anymore (nothing can
					// produce new links) or after several retries.
					if (threadPool.getActiveCount() == 0 || retryCountOnNoLinks > 3) {
						LOG.fatal("No more links available...");
						break;
					}

					retryCountOnNoLinks++;
					if (LOG.isInfoEnabled()) {
						LOG.info("No more links available... Waiting for running threads and retrying... Count " + retryCountOnNoLinks);
					}
					try {
						// The pool is not shutting down here; awaitTermination
						// only serves as a bounded wait for running crawlers.
						threadPool.awaitTermination(WAIT_FOR_THREAD_ON_SHUTDOWN, TimeUnit.SECONDS);
					} catch (final InterruptedException e) {
						LOG.warn("Failed to wait for running threads", e);
					}
					continue;
				} else {
					retryCountOnNoLinks = 0;
				}

				if (LOG.isInfoEnabled()) {
					LOG.info("Start crawling URL: \"" + next + "\"");
				}

				final LinkExtractor extractor = new StreamExtractor(MAX_URL_LENGTH);
				final Crawler crawler = new CrawlerImpl(dbHelperFactory, extractor, httpClientFactory);
				threadPool.execute(new CrawlerRunner(crawler, next));
			}
		} finally {
			try {
				db.close();
			} catch (final Exception e) {
				LOG.warn("Failed to close database connection", e);
			}
		}
	}

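	/**
	 * Program entry point. With two arguments, they are passed to
	 * {@link ApacheHttpClientFactory} as host name and port number
	 * (presumably an HTTP proxy); without arguments a plain client factory
	 * is used.
	 */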
	public static void main(final String[] args) throws Exception {
		if (LOG.isInfoEnabled()) {
			LOG.info("Starting program...");
		}
		try {
			daemonize();
		} catch (final Throwable e) {
			LOG.fatal("Startup failed", e);
		}

		final DbHelperFactory dbHelperFactory = new Db4oDbHelperFactory("sws.db4o");

		final HttpClientFactory httpClientFactory;
		if (args.length == 2) {
			httpClientFactory = new ApacheHttpClientFactory(args[0], Integer.parseInt(args[1]));
		} else {
			httpClientFactory = new ApacheHttpClientFactory();
		}

		final Main main = new Main(dbHelperFactory, httpClientFactory);
		main.startCancelListener();
		try {
			main.runCrawler();
		} catch (final RuntimeException e) {
			LOG.error("Uncaught and unhandled error occurred. Please report this bug", e);
			main.interruptCancelListener();
			System.exit(1);
		}
		System.exit(0);
	}
}