
Commit ec42cfb

Merge pull request #280 from smartive/NUTCH-2502
NUTCH-2502: Add Content-Type filter option to Any23 plugin
2 parents 27ff215 + 2e2cce0 commit ec42cfb

3 files changed, 45 insertions(+), 19 deletions(-)


conf/nutch-default.xml (+6)
@@ -1072,6 +1072,12 @@
   <description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description>
 </property>
 
+<property>
+  <name>any23.content_types</name>
+  <value>text/html,application/xhtml+xml</value>
+  <description>Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported.</description>
+</property>
+
 <!-- moreindexingfilter plugin properties -->
 
 <property>
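Note: besides overriding this property in conf/nutch-site.xml, it can be set programmatically on the job Configuration, just as the updated test below does. A minimal sketch, assuming a standard Nutch configuration object (the class name and the single-value override are illustrative, not part of this commit):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

public class Any23ContentTypesOverride {
  public static void main(String[] args) {
    // Create a Nutch configuration (loads nutch-default.xml and nutch-site.xml).
    Configuration conf = NutchConfiguration.create();

    // Restrict the Any23 parse filter to plain HTML only; equivalent to a
    // <property> override of any23.content_types in conf/nutch-site.xml.
    conf.set("any23.content_types", "text/html");

    // Print the effective value of the property.
    System.out.println(conf.get("any23.content_types"));
  }
}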

src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java (+14, -7)
@@ -25,6 +25,7 @@
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.Collections;
+import java.util.Arrays;
 
 import org.apache.any23.Any23;
 import org.apache.any23.extractor.ExtractionException;
@@ -77,15 +78,16 @@ public class Any23ParseFilter implements HtmlParseFilter {
   public final static String ANY23_TRIPLES = "Any23-Triples";
 
   public static final String ANY_23_EXTRACTORS_CONF = "any23.extractors";
+  public static final String ANY_23_CONTENT_TYPES_CONF = "any23.content_types";
 
   private static class Any23Parser {
 
     Set<String> triples = null;
 
-    Any23Parser(String url, String htmlContent, String... extractorNames) throws TripleHandlerException {
+    Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException {
       triples = new TreeSet<String>();
       try {
-        parse(url, htmlContent, extractorNames);
+        parse(url, htmlContent, contentType, extractorNames);
       } catch (URISyntaxException e) {
         throw new RuntimeException(e.getReason());
       } catch (IOException e) {
@@ -101,7 +103,7 @@ private Set<String> getTriples() {
       return triples;
     }
 
-    private void parse(String url, String htmlContent, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
+    private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
       Any23 any23 = new Any23(extractorNames);
       any23.setMIMETypeDetector(null);
@@ -118,7 +120,7 @@ private void parse(String url, String htmlContent, String... extractorNames) thr
       TripleHandler tHandler = new NTriplesWriter(baos);
       BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
       try {
-        any23.extract(input, url, "text/html","UTF-8", bHandler);
+        any23.extract(input, url, contentType, "UTF-8", bHandler);
       } catch (IOException e) {
         LOG.error("Error while reading the source", e);
       } catch (ExtractionException e) {
@@ -154,12 +156,18 @@ public void setConf(Configuration conf) {
    */
   @Override
   public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
-    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF,"html-head-meta");
+    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
+    String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
+    String contentType = content.getContentType();
+    if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
+      LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
+      return parseResult;
+    }
 
     Any23Parser parser;
     try {
       String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
-      parser = new Any23Parser(content.getUrl(), htmlContent, extractorNames);
+      parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
     } catch (TripleHandlerException e) {
       throw new RuntimeException("Error running Any23 parser: " + e.getMessage());
     }
@@ -175,4 +183,3 @@ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags
     return parseResult;
   }
 }
-
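The core of the change to filter() above is a small gate: read the allowed types (defaulting to text/html and application/xhtml+xml) and return early when the document's Content-Type is not listed. A standalone sketch of that logic, assuming only a Hadoop Configuration (class name, sample content type, and printouts are illustrative):

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;

public class ContentTypeGateSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // conf.set("any23.content_types", "text/html");  // optional override

    // getStrings() returns the vararg defaults when the property is unset,
    // and (in Hadoop's implementation) null when it is set to an empty value,
    // so an empty setting lets every content type through, matching the
    // property description "If empty, all content-types are supported."
    String[] supported = conf.getStrings("any23.content_types",
        "text/html", "application/xhtml+xml");

    String contentType = "application/pdf";  // e.g. Content.getContentType()
    if (supported != null && !Arrays.asList(supported).contains(contentType)) {
      System.out.println("Skipping Any23 extraction for " + contentType);
      return;
    }
    System.out.println("Running Any23 extractors for " + contentType);
  }
}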

src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java (+25, -12)
@@ -72,39 +72,40 @@ public void setUp() {
     conf.set("file.content.limit", "-1");
     conf.set("parser.timeout", "-1");
     conf.set(Any23ParseFilter.ANY_23_EXTRACTORS_CONF, "html-embedded-jsonld,html-head-icbm,html-head-links,html-head-meta,html-head-title,html-mf-adr,html-mf-geo,html-mf-hcalendar,html-mf-hcard,html-mf-hlisting,html-mf-hrecipe,html-mf-hresume,html-mf-hreview,html-mf-hreview-aggregate,html-mf-license,html-mf-species,html-mf-xfn,html-microdata,html-rdfa11,html-xpath");
+    conf.set(Any23ParseFilter.ANY_23_CONTENT_TYPES_CONF, "text/html");
   }
 
   @Test
   public void testExtractTriplesFromHTML() throws IOException, ParserNotFound, ParseException {
-
-    String urlString = "file:" + sampleDir + fileSeparator + file1;
-
-    File file = new File(sampleDir + fileSeparator + file1);
-
-    String[] triplesArray = extract(urlString, file);
+    String[] triplesArray = getTriples(file1);
 
     Assert.assertEquals("We expect 117 tab-separated triples extracted by the filter",
         EXPECTED_TRIPLES_1, triplesArray.length);
   }
 
   @Test
   public void extractMicroDataFromHTML() throws ParserNotFound, IOException, ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + file2;
-
-    File file = new File(sampleDir + fileSeparator + file2);
-
-    String[] triplesArray = extract(urlString, file);
+    String[] triplesArray = getTriples(file2);
 
     Assert.assertEquals("We expect 40 tab-separated triples extracted by the filter",
         EXPECTED_TRIPLES_2, triplesArray.length);
   }
+
+  @Test
+  public void ignoreUnsupported() throws ParserNotFound, IOException, ParseException {
+    String[] triplesArray = getTriples(file1, "application/pdf");
+
+    Assert.assertEquals("We expect no triples extracted by the filter since content-type should be ignored",
+        0, triplesArray.length);
+  }
 
-  public String[] extract(String urlString, File file) {
+  public String[] extract(String urlString, File file, String contentType) {
     try {
       System.out.println(urlString);
       Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
       Content content = protocol.getProtocolOutput(new Text(urlString),
           new CrawlDatum()).getContent();
+      content.setContentType(contentType);
       Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
       return parse.getData().getParseMeta().getValues(Any23ParseFilter.ANY23_TRIPLES);
     } catch (Exception e) {
@@ -113,4 +114,16 @@ public String[] extract(String urlString, File file) {
     }
     return null;
   }
+
+  private String[] getTriples(String fileName) {
+    return getTriples(fileName, "text/html");
+  }
+
+  private String[] getTriples(String fileName, String contentType) {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+
+    File file = new File(sampleDir + fileSeparator + fileName);
+
+    return extract(urlString, file, contentType);
+  }
 }
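With the new getTriples(...) helpers, further content-type cases can be covered in one line each. A hypothetical additional test for this class, reusing only identifiers that appear in the diff (the method name and the specific content type checked are illustrative, not part of this commit):

  // Hypothetical extra test reusing the helpers above: since setUp() restricts
  // any23.content_types to text/html, any other type should yield no triples.
  @Test
  public void ignoreTypeOutsideConfiguredList() throws ParserNotFound, IOException, ParseException {
    String[] triplesArray = getTriples(file1, "application/xhtml+xml");

    Assert.assertEquals("setUp() limits any23.content_types to text/html only",
        0, triplesArray.length);
  }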
