
/**
 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
 * Copyright (C) 2009  <berendona@users.sourceforge.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package simplespider.simplespider;

import java.io.File;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import simplespider.simplespider.bot.Crawler;
import simplespider.simplespider.bot.CrawlerImpl;
import simplespider.simplespider.bot.CrawlerRunner;
import simplespider.simplespider.bot.extractor.LinkExtractor;
import simplespider.simplespider.bot.extractor.html.stream.StreamExtractor;
import simplespider.simplespider.bot.http.HttpClientFactory;
import simplespider.simplespider.bot.http.apache.ApacheHttpClientFactory;
import simplespider.simplespider.dao.DbHelper;
import simplespider.simplespider.dao.DbHelperFactory;
import simplespider.simplespider.dao.LinkDao;
import simplespider.simplespider.dao.db4o.Db4oDbHelperFactory;
import simplespider.simplespider.importing.EntityImporter;
import simplespider.simplespider.importing.simplefile.SimpleFileImporter;

/**
 * Main entry point of the Simple Web Spider: sets up the database and HTTP
 * client factories, installs a shutdown hook and runs the crawler.
 */
public class Main {
	private static final Log	LOG							= LogFactory.getLog(Main.class);

	private static final String	PID_FILENAME_KEY			= "sws.daemon.pidfile";
	private static final String	PID_FILENAME_DEFAULT		= "simple-web-spider.pid";
	// TODO Configure this
	private static final String	LINK_IMPORT_FILENAME		= "bootstrapping.txt";
	// TODO Configure this (in seconds)
	private static final int	WAIT_FOR_THREAD_ON_SHUTDOWN	= 3 * 60;
	// TODO Configure this
	private static final int	MAX_CURRENT_THREADS			= 4;
	// TODO Configure this
	private static final int	MAX_THREADS_PER_MINUTE		= 10;
	// TODO Configure this
	private static final int	MAX_URL_LENGTH				= 1024;

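	// The main thread, captured in daemonize(); the shutdown hook joins it so
	// that the JVM waits for the crawler to finish gracefully.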
	private static Thread		mainThread;

	private static Thread getMainDaemonThread() {
		return mainThread;
	}

	private volatile boolean		cancelled	= false;
	private final DbHelperFactory	dbHelperFactory;
	private final HttpClientFactory	httpClientFactory;
	private Thread					listener;

	private Main(final DbHelperFactory dbHelperFactory, final HttpClientFactory httpClientFactory) {
		this.dbHelperFactory = dbHelperFactory;
		this.httpClientFactory = httpClientFactory;
	}

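	/**
	 * Records the current thread as the main daemon thread and arranges for the
	 * PID file to be removed on JVM exit. The PID file itself is presumably
	 * written by the surrounding daemon start script; it is not created here.
	 */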
	private static void daemonize() {
		mainThread = Thread.currentThread();
		getPidFile().deleteOnExit();
	}

	private static File getPidFile() {
		return new File(System.getProperty(PID_FILENAME_KEY, PID_FILENAME_DEFAULT));
	}

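	/**
	 * Registers a JVM shutdown hook that sets the cancelled flag and then joins
	 * the main thread, so that e.g. a SIGTERM results in a graceful shutdown:
	 * the crawl loop sees the flag and stops submitting new work.
	 */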
	private void startCancelListener() {
		this.listener = new Thread() {
			@Override
			public void run() {
				LOG.warn("Stopping crawler...");
				Main.this.cancelled = true;

				try {
					getMainDaemonThread().join();
				} catch (final InterruptedException e) {
					LOG.error("Interrupted while waiting on main daemon thread to complete.");
				}
			}
		};

		if (LOG.isInfoEnabled()) {
			LOG.info("Adding shutdown hook...");
		}
		this.listener.setDaemon(true);
		Runtime.getRuntime().addShutdownHook(this.listener);
	}

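	/**
	 * Deregisters and interrupts the shutdown hook. Used on the error path in
	 * main(), where System.exit(1) follows and the hook should not delay the
	 * exit by joining the failed main thread.
	 */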
	private void interruptCancelListener() {
		if (LOG.isInfoEnabled()) {
			LOG.info("Interrupting cancel listener...");
		}
		Runtime.getRuntime().removeShutdownHook(this.listener);
		this.listener.interrupt();
	}

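	/**
	 * Crawler workflow: opens the database, imports bootstrap links from
	 * LINK_IMPORT_FILENAME, processes queued LINK entries on a fixed-size thread
	 * pool until no links remain or the crawler is cancelled, then shuts down
	 * the pool and the database.
	 */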
	private void runCrawler() throws SQLException {
		if (LOG.isInfoEnabled()) {
			LOG.info("Starting crawler...");
			LOG.info("Opening database connection... This may take some time...");
		}
		final DbHelper db = this.dbHelperFactory.buildDbHelper();
		try {
			if (LOG.isInfoEnabled()) {
				LOG.info("Importing bootstrap links...");
			}
			importLinks(LINK_IMPORT_FILENAME, this.dbHelperFactory);
			db.commitTransaction();

			final LimitThroughPut limitThroughPut = new LimitThroughPut(Main.MAX_THREADS_PER_MINUTE);
			if (LOG.isInfoEnabled()) {
				LOG.info("Crawling LINK entries...");
			}

			final ThreadPoolExecutor threadPool = new ThreadPoolExecutor(MAX_CURRENT_THREADS, MAX_CURRENT_THREADS, 0L, TimeUnit.MILLISECONDS,
					new LinkedBlockingQueue<Runnable>());

			runCrawler(this.dbHelperFactory, this.httpClientFactory, threadPool, limitThroughPut, false);

			if (LOG.isInfoEnabled()) {
				LOG.info("Shutting down threads...");
			}
			threadPool.shutdown();
			try {
				threadPool.awaitTermination(WAIT_FOR_THREAD_ON_SHUTDOWN, TimeUnit.SECONDS);
			} catch (final InterruptedException e) {
				LOG.warn("Failed to wait for ending of all threads", e);
			}
			threadPool.shutdownNow();
			if (LOG.isInfoEnabled()) {
				LOG.info("Crawler stopped");
			}
		} finally {
			if (LOG.isInfoEnabled()) {
				LOG.info("Shutting down database");
			}
			db.shutdown();
		}
	}

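	/**
	 * Imports seed URLs from the given file into the link queue and logs the
	 * number of imported links.
	 */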
	private void importLinks(final String filename, final DbHelperFactory dbHelperFactory) {
		final EntityImporter importer = new SimpleFileImporter(filename);
		final long importedLinks = importer.importLink(dbHelperFactory);
		if (LOG.isInfoEnabled()) {
			LOG.info("Imported links from file \"" + filename + "\": " + importedLinks);
		}
	}

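	/**
	 * The crawl loop: throttles via LimitThroughPut, takes the next URL from the
	 * queue (rolling back the transaction on failure) and hands it to a
	 * CrawlerRunner on the thread pool. When the queue runs dry, it waits for
	 * active workers to produce new links, retrying a few times before giving up.
	 */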
	private void runCrawler(final DbHelperFactory dbHelperFactory, final HttpClientFactory httpClientFactory, final ThreadPoolExecutor threadPool,
			final LimitThroughPut limitThroughPut, final boolean bootstrapping) throws SQLException {
		final DbHelper db = dbHelperFactory.buildDbHelper();
		try {
			final LinkDao linkDao = db.getLinkDao();

			int retryCountOnNoLinks = 0;
			while (!this.cancelled) {
				// Block while too many threads were started within the last minute
				limitThroughPut.next();

				// Fetch the next link; if there is none, wait and try again
				String next;
				try {
					next = linkDao.removeNextAndCommit();
				} catch (final RuntimeException e) {
					LOG.error("Failed to get next url", e);
					try {
						db.rollbackTransaction();
					} catch (final Exception e2) {
						LOG.error("Failed to rollback database transaction", e2);
					}
					next = null;
				}

				if (next == null) {
					// On bootstrapping, don't retry if no more links are available
					if (bootstrapping) {
						if (LOG.isInfoEnabled()) {
							LOG.info("Bootstrapping: No more links available...");
						}
						break;
					}
					// Give up if no threads are running or the retries are exhausted
					if (threadPool.getActiveCount() == 0 //
							|| retryCountOnNoLinks > 3) {
						LOG.fatal("No more links available...");
						break;
					}

					// Wait for all running threads; perhaps they create some new LINK entities
					retryCountOnNoLinks++;
					if (LOG.isInfoEnabled()) {
						LOG.info("No more links available... Waiting for running threads and retrying... Count " + retryCountOnNoLinks);
					}
					try {
						threadPool.awaitTermination(WAIT_FOR_THREAD_ON_SHUTDOWN, TimeUnit.SECONDS);
					} catch (final InterruptedException e) {
						LOG.warn("Failed to wait for ending of all threads", e);
					}
					continue;
				} else {
					retryCountOnNoLinks = 0;
				}

				if (LOG.isInfoEnabled()) {
					LOG.info("Start crawling URL: \"" + next + "\"");
				}

				final LinkExtractor extractor = new StreamExtractor(MAX_URL_LENGTH);
				final Crawler crawler = new CrawlerImpl(dbHelperFactory, extractor, httpClientFactory);
				threadPool.execute(new CrawlerRunner(crawler, next));
			}
		} finally {
			try {
				db.close();
			} catch (final Exception e) {
				LOG.warn("Failed to close database connection", e);
			}
		}
	}

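	/**
	 * Entry point. Two arguments are forwarded to
	 * ApacheHttpClientFactory(String, int), presumably an HTTP proxy host and
	 * port; without arguments a direct connection is used. Assumed invocation:
	 * java simplespider.simplespider.Main [proxyHost proxyPort]
	 */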
	public static void main(final String[] args) throws Exception {
		if (LOG.isInfoEnabled()) {
			LOG.info("Starting program...");
		}
		try {
			// Do sanity checks and startup actions
			daemonize();
		} catch (final Throwable e) {
			LOG.fatal("Startup failed", e);
		}

		final DbHelperFactory dbHelperFactory = new Db4oDbHelperFactory("sws.db4o");

		final HttpClientFactory httpClientFactory;
		if (args.length == 2) {
			httpClientFactory = new ApacheHttpClientFactory(args[0], Integer.parseInt(args[1]));
		} else {
			httpClientFactory = new ApacheHttpClientFactory();
		}

		final Main main = new Main(dbHelperFactory, httpClientFactory);
		main.startCancelListener();
		try {
			main.runCrawler();
		} catch (final RuntimeException e) {
			LOG.error("Uncaught and unhandled error occurred. Please report this bug", e);
			main.interruptCancelListener();
			System.exit(1);
		}
		System.exit(0);
	}
}