1 /***
2 * Simple Web Spider - <http://simplewebspider.sourceforge.net/>
3 * Copyright (C) 2009 <berendona@users.sourceforge.net>
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
18 package simplespider.simplespider.bot.extractor.html.stream;
19
20 import java.util.ArrayList;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Properties;
26 import java.util.Set;
27
28 import simplespider.simplespider.util.ValidityHelper;
29
30 class TagListenerImpl implements TagListener {
31 private static final Map<String, Set<String>> tagsWithoutContent = new HashMap<String, Set<String>>(14);
32 static {
33 final Set<String> attrHref = new HashSet<String>(1);
34 attrHref.add("href");
35
36 final Set<String> attrSrc = new HashSet<String>(1);
37 attrSrc.add("src");
38
39 final Set<String> attrDomain = new HashSet<String>(1);
40 attrDomain.add("domain");
41
42 final Set<String> attrRdfAbout = new HashSet<String>(1);
43 attrRdfAbout.add("rdf:about");
44
45 final Set<String> attrRdfResource = new HashSet<String>(1);
46 attrRdfResource.add("rdf:resource");
47
48 final Set<String> attrUrl = new HashSet<String>(1);
49 attrUrl.add("url");
50
51 final Set<String> tagMember = new HashSet<String>(2);
52 tagMember.add("href");
53 tagMember.add("hrefreadonly");
54
55 final Set<String> tagOutline = new HashSet<String>(3);
56 tagOutline.add("htmlUrl");
57 tagOutline.add("url");
58 tagOutline.add("xmlUrl");
59
60
61 tagsWithoutContent.put("a", attrHref);
62 tagsWithoutContent.put("frame", attrSrc);
63 tagsWithoutContent.put("iframe", attrSrc);
64 tagsWithoutContent.put("ilayer", attrSrc);
65
66
67 tagsWithoutContent.put("atom:link", attrHref);
68 tagsWithoutContent.put("category", attrDomain);
69 tagsWithoutContent.put("item", attrRdfAbout);
70 tagsWithoutContent.put("rdf:li", attrRdfResource);
71 tagsWithoutContent.put("textinput", attrRdfResource);
72 tagsWithoutContent.put("source", attrUrl);
73
74
75 tagsWithoutContent.put("link", attrHref);
76 tagsWithoutContent.put("collection", attrHref);
77 tagsWithoutContent.put("member", tagMember);
78
79
80 tagsWithoutContent.put("outline", tagOutline);
81 }
82
83 private static final Set<String> tagsWithContent = new HashSet<String>(8);
84 static {
85
86 tagsWithContent.add("comments");
87 tagsWithContent.add("docs");
88 tagsWithContent.add("link");
89 tagsWithContent.add("url");
90 tagsWithContent.add("wfw:commentRss");
91
92
93 tagsWithContent.add("id");
94
95
96 tagsWithContent.add("docs");
97 tagsWithContent.add("ownerId");
98 }
99
100 private final Set<String> links = new HashSet<String>();
101
102 @Override
103 public boolean isTagWithoutContent(final String tag) {
104 return tagsWithoutContent.containsKey(tag);
105 }
106
107 @Override
108 public boolean isTagWithContent(final String tag) {
109 return tagsWithContent.contains(tag);
110 }
111
112 @Override
113 public void scrapeTagWithoutContent(final String tagname, final Properties tagopts) {
114 if (!ValidityHelper.isEmpty(tagopts)) {
115 final Set<String> attributes = tagsWithoutContent.get(tagname);
116 if (!ValidityHelper.isEmpty(attributes)) {
117 for (final String attribute : attributes) {
118 final String link = (String) tagopts.get(attribute);
119 if (!ValidityHelper.isEmpty(link)) {
120 final String trimmedLink = link.trim();
121 if (!ValidityHelper.isEmpty(trimmedLink)) {
122 this.links.add(trimmedLink);
123 }
124 }
125 }
126 }
127 }
128 }
129
130 @Override
131 public void scrapeTagWithContent(final String tagname, final Properties tagopts, final char[] text) {
132 if (!ValidityHelper.isEmpty(text)) {
133 final String link = String.valueOf(text);
134 if (!ValidityHelper.isEmpty(link)) {
135 final String trimmedLink = link.trim();
136 if (!ValidityHelper.isEmpty(trimmedLink)) {
137 this.links.add(trimmedLink);
138 }
139 }
140 }
141 }
142
143 public List<String> getLinks() {
144 final List<String> linkList = new ArrayList<String>(this.links.size());
145 for (final String link : this.links) {
146 linkList.add(link);
147 }
148
149 return linkList;
150 }
151
152 }