View Javadoc

1   /***
2    * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3    * Copyright (C) 2009  <berendona@users.sourceforge.net>
4    *
5    * This program is free software: you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation, either version 3 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  package simplespider.simplespider.bot.extractor.html.stream;
19  
20  import java.util.ArrayList;
21  import java.util.HashMap;
22  import java.util.HashSet;
23  import java.util.List;
24  import java.util.Map;
25  import java.util.Properties;
26  import java.util.Set;
27  
28  import simplespider.simplespider.util.ValidityHelper;
29  
30  class TagListenerImpl implements TagListener {
31  	private static final Map<String, Set<String>>	tagsWithoutContent	= new HashMap<String, Set<String>>(14);
32  	static {
33  		final Set<String> attrHref = new HashSet<String>(1);
34  		attrHref.add("href");
35  
36  		final Set<String> attrSrc = new HashSet<String>(1);
37  		attrSrc.add("src");
38  
39  		final Set<String> attrDomain = new HashSet<String>(1);
40  		attrDomain.add("domain");
41  
42  		final Set<String> attrRdfAbout = new HashSet<String>(1);
43  		attrRdfAbout.add("rdf:about");
44  
45  		final Set<String> attrRdfResource = new HashSet<String>(1);
46  		attrRdfResource.add("rdf:resource");
47  
48  		final Set<String> attrUrl = new HashSet<String>(1);
49  		attrUrl.add("url");
50  
51  		final Set<String> tagMember = new HashSet<String>(2);
52  		tagMember.add("href");
53  		tagMember.add("hrefreadonly");
54  
55  		final Set<String> tagOutline = new HashSet<String>(3);
56  		tagOutline.add("htmlUrl");
57  		tagOutline.add("url");
58  		tagOutline.add("xmlUrl");
59  
60  		// html
61  		tagsWithoutContent.put("a", attrHref);
62  		tagsWithoutContent.put("frame", attrSrc);
63  		tagsWithoutContent.put("iframe", attrSrc);
64  		tagsWithoutContent.put("ilayer", attrSrc);
65  
66  		// rss & rdf
67  		tagsWithoutContent.put("atom:link", attrHref);
68  		tagsWithoutContent.put("category", attrDomain);
69  		tagsWithoutContent.put("item", attrRdfAbout);
70  		tagsWithoutContent.put("rdf:li", attrRdfResource);
71  		tagsWithoutContent.put("textinput", attrRdfResource);
72  		tagsWithoutContent.put("source", attrUrl);
73  
74  		// atom
75  		tagsWithoutContent.put("link", attrHref);
76  		tagsWithoutContent.put("collection", attrHref);
77  		tagsWithoutContent.put("member", tagMember);
78  
79  		// opml
80  		tagsWithoutContent.put("outline", tagOutline);
81  	}
82  
83  	private static final Set<String>				tagsWithContent		= new HashSet<String>(8);
84  	static {
85  		// rss & rdf
86  		tagsWithContent.add("comments");
87  		tagsWithContent.add("docs");
88  		tagsWithContent.add("link");
89  		tagsWithContent.add("url");
90  		tagsWithContent.add("wfw:commentRss");
91  
92  		// atom
93  		tagsWithContent.add("id");
94  
95  		// opml
96  		tagsWithContent.add("docs");
97  		tagsWithContent.add("ownerId");
98  	}
99  
100 	private final Set<String>						links				= new HashSet<String>();
101 
102 	@Override
103 	public boolean isTagWithoutContent(final String tag) {
104 		return tagsWithoutContent.containsKey(tag);
105 	}
106 
107 	@Override
108 	public boolean isTagWithContent(final String tag) {
109 		return tagsWithContent.contains(tag);
110 	}
111 
112 	@Override
113 	public void scrapeTagWithoutContent(final String tagname, final Properties tagopts) {
114 		if (!ValidityHelper.isEmpty(tagopts)) {
115 			final Set<String> attributes = tagsWithoutContent.get(tagname);
116 			if (!ValidityHelper.isEmpty(attributes)) {
117 				for (final String attribute : attributes) {
118 					final String link = (String) tagopts.get(attribute);
119 					if (!ValidityHelper.isEmpty(link)) {
120 						final String trimmedLink = link.trim();
121 						if (!ValidityHelper.isEmpty(trimmedLink)) {
122 							this.links.add(trimmedLink);
123 						}
124 					}
125 				}
126 			}
127 		}
128 	}
129 
130 	@Override
131 	public void scrapeTagWithContent(final String tagname, final Properties tagopts, final char[] text) {
132 		if (!ValidityHelper.isEmpty(text)) {
133 			final String link = String.valueOf(text);
134 			if (!ValidityHelper.isEmpty(link)) {
135 				final String trimmedLink = link.trim();
136 				if (!ValidityHelper.isEmpty(trimmedLink)) {
137 					this.links.add(trimmedLink);
138 				}
139 			}
140 		}
141 	}
142 
143 	public List<String> getLinks() {
144 		final List<String> linkList = new ArrayList<String>(this.links.size());
145 		for (final String link : this.links) {
146 			linkList.add(link);
147 		}
148 
149 		return linkList;
150 	}
151 
152 }