
Commit ec42cfb

Merge pull request #280 from smartive/NUTCH-2502
NUTCH-2502: Add Content-Type filter option to Any23 plugin
2 parents 27ff215 + 2e2cce0 commit ec42cfb

3 files changed, 45 insertions(+), 19 deletions(-)


conf/nutch-default.xml (+6)
@@ -1072,6 +1072,12 @@
   <description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description>
 </property>
 
+<property>
+  <name>any23.content_types</name>
+  <value>text/html,application/xhtml+xml</value>
+  <description>Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported.</description>
+</property>
+
 <!-- moreindexingfilter plugin properties -->
 
 <property>
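Note: besides overriding this property in conf/nutch-site.xml, it can be set programmatically on the job Configuration, just as the updated test below does. A minimal sketch, assuming a standard Nutch configuration object (the class name and the single-value override are illustrative, not part of this commit):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class Any23ContentTypesOverride {
  public static void main(String[] args) {
    // Create a Nutch configuration (loads nutch-default.xml and nutch-site.xml).
    Configuration conf = NutchConfiguration.create();

    // Restrict the Any23 parse filter to plain HTML only; equivalent to a
    // <property> override of any23.content_types in conf/nutch-site.xml.
    conf.set("any23.content_types", "text/html");

    // Print the effective value of the property.
    System.out.println(conf.get("any23.content_types"));
  }
}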

src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java (+14, -7)
@@ -25,6 +25,7 @@
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.Collections;
+import java.util.Arrays;
 
 import org.apache.any23.Any23;
 import org.apache.any23.extractor.ExtractionException;
@@ -77,15 +78,16 @@ public class Any23ParseFilter implements HtmlParseFilter {
   public final static String ANY23_TRIPLES = "Any23-Triples";
 
   public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";
+  public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";
 
   private static class Any23Parser {
 
     Set<String> triples = null;
 
-    Any23Parser(String url, String htmlContent, String... extractorNames) throws TripleHandlerException {
+    Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
       triples = new TreeSet<String>();
       try {
-        parse(url, htmlContent, extractorNames);
+        parse(url, htmlContent, contentType, extractorNames);
       } catch (URISyntaxException e) {
         throw new RuntimeException(e.getReason());
       } catch (IOException e) {
@@ -101,7 +103,7 @@ private Set<String> getTriples() {
       return triples;
     }
 
-    private void parse(String url, String htmlContent, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
+    private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
       Any23 any23 = new Any23(extractorNames);
       any23.setMIMETypeDetector(null);
@@ -118,7 +120,7 @@ private void parse(String url, String htmlContent, String... extractorNames) thr
       TripleHandler tHandler = new NTriplesWriter(baos);
       BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
       try {
-        any23.extract(input, url, "text/html","UTF-8", bHandler);
+        any23.extract(input, url, contentType, "UTF-8", bHandler);
       } catch (IOException e) {
         LOG.error("Error while reading the source", e);
       } catch (ExtractionException e) {
@@ -154,12 +156,18 @@ public void setConf(Configuration conf) {
    */
   @Override
   public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
-    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF,"html-head-meta");
+    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
+    String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
+    String contentType = content.getContentType();
+    if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
+      LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
+      return parseResult;
+    }
 
     Any23Parser parser;
     try {
       String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
-      parser = new Any23Parser(content.getUrl(), htmlContent, extractorNames);
+      parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
     } catch (TripleHandlerException e) {
       throw new RuntimeException("Error running Any23 parser: " + e.getMessage());
     }
@@ -175,4 +183,3 @@ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags
     return parseResult;
   }
 }
-
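The core of the change to filter() above is a small gate: read the allowed types (defaulting to text/html and application/xhtml+xml) and return early when the document's Content-Type is not listed. A standalone sketch of that logic, assuming only a Hadoop Configuration (class name, sample content type, and printouts are illustrative):

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;

public class ContentTypeGateSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // conf.set("any23.content_types", "text/html");  // optional override

    // getStrings() returns the vararg defaults when the property is unset,
    // and (in Hadoop's implementation) null when it is set to an empty value,
    // so an empty setting lets every content type through, matching the
    // property description "If empty, all content-types are supported."
    String[] supported = conf.getStrings("any23.content_types",
        "text/html", "application/xhtml+xml");

    String contentType = "application/pdf";  // e.g. Content.getContentType()
    if (supported != null && !Arrays.asList(supported).contains(contentType)) {
      System.out.println("Skipping Any23 extraction for " + contentType);
      return;
    }
    System.out.println("Running Any23 extractors for " + contentType);
  }
}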

src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java (+25, -12)
@@ -72,39 +72,40 @@ public void setUp() {
     conf.set("file.content.limit", "-1");
     conf.set("parser.timeout", "-1");
     conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
+    conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
   }
 
   @Test
   public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException {
-
-    String urlString = "file:" + sampleDir + fileSeparator + file1;
-
-    File file = new File(sampleDir + fileSeparator + file1);
-
-    String[] triplesArray = extract(urlString, file);
+    String[] triplesArray = getTriples(file1);
 
     Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter",
         EXPECTED_TRIPLES_1, triplesArray.length);
   }
 
   @Test
   public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + file2;
-
-    File file = new File(sampleDir + fileSeparator + file2);
-
-    String[] triplesArray = extract(urlString, file);
+    String[] triplesArray = getTriples(file2);
 
     Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter",
         EXPECTED_TRIPLES_2, triplesArray.length);
   }
+
+  @Test
+  public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException {
+    String[] triplesArray = getTriples(file1, "application/pdf");
+
+    Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored",
+        0, triplesArray.length);
+  }
 
-  public String[] extract(String urlString, File file) {
+  public String[] extract(String urlString, File file, String contentType) {
     try {
       System.out.println(urlString);
       Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
       Content content = protocol.getProtocolOutput(new Text(urlString),
           new CrawlDatum()).getContent();
+      content.setContentType(contentType);
       Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
       return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
     } catch (Exception e) {
@@ -113,4 +114,16 @@ public String[] extract(String urlString, File file) {
     }
     return null;
   }
+
+  private String[] getTriples(String fileName) {
+    return getTriples(fileName, "text/html");
+  }
+
+  private String[] getTriples(String fileName, String contentType) {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+
+    File file = new File(sampleDir + fileSeparator + fileName);
+
+    return extract(urlString, file, contentType);
+  }
 }
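With the new getTriples(...) helpers, further content-type cases can be covered in one line each. A hypothetical additional test for this class, reusing only identifiers that appear in the diff (the method name and the specific content type checked are illustrative, not part of this commit):

  // Hypothetical extra test reusing the helpers above: since setUp() restricts
  // any23.content_types to text/html, any other type should yield no triples.
  @Test
  public void ignoreTypeOutsideConfiguredList() throws ParserNotFound, IOException, ParseException {
    String[] triplesArray = getTriples(file1, "application/xhtml+xml");

    Assert.assertEquals("setUp() limits any23.content_types to text/html only",
        0, triplesArray.length);
  }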
