Merge branch 'master' into NUTCH-2455

apache · Jan 17, 2018 · d2451af · d2451af
2 parents 16f26f1 + f82959d
commit d2451af
Show file tree

Hide file tree

Showing 72 changed files with 6,641 additions and 780 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,15 +1,107 @@
 # Nutch Change Log
 
-Nutch 1.14 Release (dd/mm/yyyy)
+Nutch 1.15 Release (dd/mm/yyyy)
 
 Comments
 
-Fellow committers, Nutch 1.14 contains a breaking change NUTCH-2046. Please use the note below and
-in the release announcement and keep it on top in this CHANGES.txt for the Nutch 1.14 release.
-* the bin/crawl script now expects the path to the seed to be preceded by -s
+Breaking Changes
+
+
+Nutch 1.14 Release 18/12/2017 (dd/mm/yyyy)
+
+    - the bin/crawl script now expects the path to the seed to be preceded by -s  (NUTCH-2046)
+
+Bug
+
+    [NUTCH-2071] - A parser failure on a single document may fail crawling job
+    [NUTCH-2235] - Classpath discrepancy with protocol-selenium in deploy mode
+    [NUTCH-2269] - Clean not working after crawl
+    [NUTCH-2295] - Nutch master docker container broken
+    [NUTCH-2297] - CrawlDbReader -stats wrong values for earliest fetch time and shortest interval
+    [NUTCH-2316] - Library conflict with Parser-Tika Plugin and Lib Folder
+    [NUTCH-2317] - Plugin jars don't get added to classpath while running in local
+    [NUTCH-2322] - URL not available for Jexl operations
+    [NUTCH-2354] - Upgrade Hadoop dependencies to 2.7.4
+    [NUTCH-2365] - HTTP Redirects to SubDomains don't get crawled if db.ignore.external.links.mode == byDomain
+    [NUTCH-2371] - Injector to support noFilter and noNormalize
+    [NUTCH-2372] - Javadocs build failing.
+    [NUTCH-2386] - BasicURLNormalizer does not encode curly braces
+    [NUTCH-2391] - Spurious Duplications for MD5
+    [NUTCH-2394] - Possible bugs in the source code
+    [NUTCH-2398] - Fetcher saving redirected robots.txt under redirect target URL
+    [NUTCH-2399] - indexer-elastic does not index multi-value fields (only the first value is indexed)
+    [NUTCH-2401] - headings plugin does not trim values
+    [NUTCH-2403] - Nutch Selenium: Wrong documentation about PhantomJS
+    [NUTCH-2413] - Parsing fetcher to respect property "parse.filter.urls"
+    [NUTCH-2420] - Bug in variable generate.max.count and fetcher.server.delay
+    [NUTCH-2436] - Remove empty comment, and redundant semicolon from CommandRunner
+    [NUTCH-2442] - Injector to stop if job fails to avoid loss of CrawlDb
+    [NUTCH-2444] - HostDB CSV dumper to emit field header by default
+    [NUTCH-2446] - URLFiltersCheck fix
+    [NUTCH-2448] - Allow Sending an empty http.agent.version
+    [NUTCH-2451] - protocol-ftp to resolve relative URL when following redirects
+    [NUTCH-2452] - Problem retrieving encoded URLs via FTP?
+    [NUTCH-2456] - Allow to index pages/URLs not contained in CrawlDb
+    [NUTCH-2458] - TikaParser doesn't work with tika-config.xml set
+    [NUTCH-2464] - Plugin headings: Headers That Contain HTML Elements Are Not Parsed
+    [NUTCH-2465] - Broken Eclipse project. Classpaths and interactiveselenium should be fixed.
+    [NUTCH-2472] - Sitemap processor does not honour db.ignore.external.links
+    [NUTCH-2473] - Elasticsearch REST Indexer broken due to wrong dependency
+    [NUTCH-2474] - CrawlDbReader -stats fails with ClassCastException
+    [NUTCH-2478] - // is not a valid base URL
+    [NUTCH-2483] - Remove/replace indirect dependencies to org.json
+
+Improvement
+
+    [NUTCH-1763] - Improving comments on the Injector Class
+    [NUTCH-2034] - CrawlDB filtered documents counter.
+    [NUTCH-2035] - Regex filter using case sensitive rules.
+    [NUTCH-2046] - The crawl script should be able to skip an initial injection.
+    [NUTCH-2135] - Ant Eclipse build does not include protocol-interactiveselenium
+    [NUTCH-2193] - Upgrade feed parser plugin to use rome 1.5
+    [NUTCH-2216] - db.ignore.*.links to optionally follow internal redirects
+    [NUTCH-2281] - Support non-default FileSystem
+    [NUTCH-2296] - Elasticsearch Indexing Over Rest
+    [NUTCH-2320] - URLFilterChecker to run as TCP Telnet service
+    [NUTCH-2335] - Injector not to filter and normalize existing URLs in CrawlDb
+    [NUTCH-2362] - Upgrade MaxMind GeoIP version in index-geoip
+    [NUTCH-2368] - Variable generate.max.count and fetcher.server.delay
+    [NUTCH-2370] - FileDumper: save JSON mapping file -> URL
+    [NUTCH-2376] - Improve configurability of HTTP Accept* header fields
+    [NUTCH-2378] - ChildFirst plugin classloader
+    [NUTCH-2380] - indexer-elastic version upgrade to 5.3.0
+    [NUTCH-2397] - Parser to add paragraph line breaks
+    [NUTCH-2400] - Solr 6.6.0 compatibility
+    [NUTCH-2406] - Sum up constants, make minor changes
+    [NUTCH-2408] - CrawlDb: allow update from unparsed segments
+    [NUTCH-2409] - Injector: complete command-line help and counters
+    [NUTCH-2414] - Allow LanguageIndexingFilter to actually filter documents by language.
+    [NUTCH-2430] - Complete plugin build configuration
+    [NUTCH-2431] - URLFilterchecker to implement Tool-interface
+    [NUTCH-2439] - Upgrade to Apache Tika 1.17
+    [NUTCH-2443] - Extract links from the video tag with the parse-html plugin
+    [NUTCH-2445] - Fetcher following outlinks to keep track of already fetched items
+    [NUTCH-2463] - Enable sampling CrawlDB
+    [NUTCH-2468] - should filter out invalid URLs by default
+    [NUTCH-2470] - CrawlDbReader -stats to show quantiles of score
+    [NUTCH-2477] - Refactor *Checker classes to use base class for common code
+    [NUTCH-2480] - Upgrade crawler-commons dependency to 0.9
 
 New Feature
-    [NUTCH-2046] -  The crawl script should be able to skip an initial injection
+
+    [NUTCH-1465] - Support sitemaps in Nutch
+    [NUTCH-1932] - Automatically remove orphaned pages
+    [NUTCH-2333] - Indexer for RabbitMQ
+    [NUTCH-2338] - URLNormalizerChecker to run as TCP Telnet service
+    [NUTCH-2415] - Create a JEXL based IndexingFilter
+    [NUTCH-2433] - Html Parser: keep htmltag where the outlinks are found
+    [NUTCH-2435] - New configuration allowing to choose whether to store 'parse_text' directory or not.
+    [NUTCH-2484] - Extend indexer-elastic-rest to support languages
+
+Task
+
+    [NUTCH-2181] - Add Webpage for 3rd Party Connectors/Libraries to Apache Nutch
+
 
 Nutch 1.13 Release 28/03/2017 (dd/mm/yyyy)
 Release Report: https://s.apache.org/wq3x

diff --git a/NOTICE.txt b/NOTICE.txt
@@ -1,5 +1,5 @@
 Apache Nutch
-Copyright 2017 The Apache Software Foundation
+Copyright 2018 The Apache Software Foundation
 
 This product includes software developed by The Apache Software
 Foundation (http://www.apache.org/).

diff --git a/build.xml b/build.xml
@@ -173,12 +173,14 @@
       <arg value="${javadoc.proxy.port}"/>
 
       <packageset dir="${src.dir}"/>
+      <packageset dir="${plugins.dir}/any23/src/java/" />
       <packageset dir="${plugins.dir}/creativecommons/src/java"/>
       <packageset dir="${plugins.dir}/feed/src/java"/>
       <packageset dir="${plugins.dir}/headings/src/java"/>
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
+      <packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
       <packageset dir="${plugins.dir}/index-links/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
@@ -624,12 +626,14 @@
       <arg value="${javadoc.proxy.port}"/>
 
       <packageset dir="${src.dir}"/>
+      <packageset dir="${plugins.dir}/any23/src/java/" />
       <packageset dir="${plugins.dir}/creativecommons/src/java"/>
       <packageset dir="${plugins.dir}/feed/src/java"/>
       <packageset dir="${plugins.dir}/headings/src/java"/>
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
+      <packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
       <packageset dir="${plugins.dir}/index-links/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
@@ -1030,6 +1034,8 @@
         <source path="${basedir}/src/java/" />
         <source path="${basedir}/src/test/" output="build/test/classes" />
 
+        <source path="${plugins.dir}/any23/src/java/" />
+        <source path="${plugins.dir}/any23/src/test/" />
         <source path="${plugins.dir}/creativecommons/src/java/" />
         <source path="${plugins.dir}/creativecommons/src/test/" />
         <source path="${plugins.dir}/feed/src/java/" />
@@ -1040,6 +1046,8 @@
         <source path="${plugins.dir}/index-basic/src/java/" />
         <source path="${plugins.dir}/index-basic/src/test/" />
         <source path="${plugins.dir}/index-geoip/src/java/" />
+        <source path="${plugins.dir}/index-jexl-filter/src/java/" />
+        <source path="${plugins.dir}/index-jexl-filter/src/test/" />
         <source path="${plugins.dir}/index-links/src/java/" />
         <source path="${plugins.dir}/index-links/src/test/" />
         <source path="${plugins.dir}/index-metadata/src/java/" />

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
@@ -164,7 +164,7 @@
 
 <property>
   <name>http.agent.version</name>
-  <value>Nutch-1.14-SNAPSHOT</value>
+  <value>Nutch-1.15-SNAPSHOT</value>
   <description>A version string to advertise in the User-Agent 
    header.</description>
 </property>
@@ -572,7 +572,7 @@
   <value>false</value>
   <description>If true, outlinks leading from a page to internal hosts or domain
   will be ignored. This is an effective way to limit the crawl to include
-  only initially injected hosts, without creating complex URLFilters.
+  only initially injected hosts or domains, without creating complex URLFilters.
   See 'db.ignore.external.links.mode'.
   </description>
 </property>
@@ -582,11 +582,21 @@
   <value>false</value>
   <description>If true, outlinks leading from a page to external hosts or domain
   will be ignored. This is an effective way to limit the crawl to include
-  only initially injected hosts, without creating complex URLFilters.
+  only initially injected hosts or domains, without creating complex URLFilters.
   See 'db.ignore.external.links.mode'.
   </description>
 </property>
 
+<property>
+  <name>db.ignore.also.redirects</name>
+  <value>true</value>
+  <description>If true, the fetcher checks redirects the same way as
+  links when ignoring internal or external links. Set to false to
+  follow redirects despite the values for db.ignore.external.links and
+  db.ignore.internal.links.
+  </description>
+</property>
+
 <property>
   <name>db.ignore.external.links.mode</name>
   <value>byHost</value>
@@ -1054,6 +1064,14 @@
 	Publisher implementation specific properties</description>
 </property> 
 
+<!--  any23 plugin properties -->
+
+<property>
+    <name>any23.extractors</name>
+    <value>html-microdata</value>
+    <description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description>
+</property>
+
 <!-- moreindexingfilter plugin properties -->
 
 <property>
@@ -1225,7 +1243,7 @@
 
 <property>
   <name>plugin.includes</name>
-  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+  <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By
@@ -1406,6 +1424,12 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
 </property>
 -->
 
+<property>
+ <name>tika.config.file</name>
+ <value>tika-config.xml</value>
+ <description>Nutch-specific Tika config file</description>
+</property>
+
 <property>
   <name>tika.uppercase.element.names</name>
   <value>true</value>
@@ -1608,6 +1632,34 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
+<property>
+  <name>lang.index.languages</name>
+  <value></value>
+  <description>If not empty, a comma-separated list of language codes.
+  Only documents with one of these language codes will be indexed.
+  "unknown" is a valid language code and matches documents for which
+  language detection failed.
+  </description>
+</property>
+
+<!-- index-jexl-filter plugin properties -->
+
+<property>
+  <name>index.jexl.filter</name>
+  <value></value>
+  <description> A JEXL expression. If it evaluates to false,
+  the document will not be indexed.
+  Available primitives in the JEXL context:
+  * status, fetchTime, modifiedTime, retries, interval, score, signature, url, text, title
+  Available objects in the JEXL context:
+  * httpStatus - contains majorCode, minorCode, message
+  * documentMeta, contentMeta, parseMeta - contain all the Metadata properties.
+    Each property value is always an array of Strings (so if you expect one value, use [0]).
+  * doc - contains all the NutchFields from the NutchDocument.
+    Each property value is always an array of Objects.
+  </description>
+</property>
+
 <!-- index-static plugin properties -->
 
 <property>
@@ -2081,6 +2133,34 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
     <description>Default index to send documents to.</description>
 </property>
 
+<property>
+    <name>elastic.rest.index.languages</name>
+    <value></value>
+    <description>
+        A comma-separated list of the supported languages (e.g. `en,de,fr,it`).
+        If this value is empty, all documents will be sent to index ${elastic.rest.index}.
+        If not empty, the REST client will distribute documents to different indices based on their `lang` property.
+        Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.index.separator}${lang} (e.g. `nutch_de`).
+        Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink} (e.g. `nutch_others`).
+    </description>
+</property>
+
+<property>
+    <name>elastic.rest.index.separator</name>
+    <value>_</value>
+    <description>
+        Default value is `_`. Used only if `elastic.rest.index.languages` is defined, to build the index name (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${lang}).
+    </description>
+</property>
+
+<property>
+    <name>elastic.rest.index.sink</name>
+    <value>others</value>
+    <description>
+        Default value is `others`. Used only if `elastic.rest.index.languages` is defined, to build the name of the index that stores documents with unsupported languages (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink}).
+    </description>
+</property>
+
 <property>
     <name>elastic.rest.type</name>
     <value>doc</value>

diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template
@@ -27,7 +27,7 @@
 
 # skip image and other suffixes we can't yet parse
 # for a more extensive coverage use the urlfilter-suffix plugin
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
+-(?i)\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]

diff --git a/conf/tika-config.xml.template b/conf/tika-config.xml.template
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <service-loader initializableProblemHandler="ignore"/>
+</properties>
diff --git a/default.properties b/default.properties
@@ -14,9 +14,9 @@
 # limitations under the License.
 
 name=apache-nutch
-version=1.14-SNAPSHOT
+version=1.15-SNAPSHOT
 final.name=${name}-${version}
-year=2017
+year=2018
 
 basedir = ./
 src.dir = ./src/java
@@ -170,6 +170,7 @@ plugins.index=\
    org.apache.nutch.indexer.basic*:\
    org.apache.nutch.indexer.feed*:\
    org.apache.nutch.indexer.geoip*:\
+   org.apache.nutch.indexer.jexl*:\
    org.apache.nutch.indexer.filter*:\
    org.apache.nutch.indexer.links*:\
    org.apache.nutch.indexer.metadata*:\
@@ -202,5 +203,6 @@ plugins.misc=\
    org.apache.nutch.collection*:\
    org.apache.nutch.analysis.lang*:\
    org.creativecommons.nutch*:\
-   org.apache.nutch.microformats.reltag*
+   org.apache.nutch.microformats.reltag*:\
+   org.apache.nutch.any23*