From b52ec9025e40152b3a1dae7c78bb803c7ad298ce Mon Sep 17 00:00:00 2001 From: Markus Jelsma Date: Thu, 9 Jan 2025 13:50:17 +0200 Subject: [PATCH] NUTCH-3100 HostDB to support minimum records per host --- .../org/apache/nutch/hostdb/UpdateHostDb.java | 15 +++++++++++---- .../apache/nutch/hostdb/UpdateHostDbReducer.java | 10 ++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java index 5148a6be12..c8b8c43cfb 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java @@ -68,10 +68,11 @@ public class UpdateHostDb extends Configured implements Tool { public static final String HOSTDB_STRING_FIELDS = "hostdb.string.fields"; public static final String HOSTDB_PERCENTILES = "hostdb.percentiles"; public static final String HOSTDB_CRAWLDATUM_PROCESSORS = "hostdb.crawldatum.processors"; + public static final String HOSTDB_URL_LIMIT = "hostdb.url.limit"; private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew, boolean checkKnown, - boolean force, boolean filter, boolean normalize) throws Exception { + boolean force, boolean filter, boolean normalize, long urlLimit) throws Exception { StopWatch stopWatch = new StopWatch(); stopWatch.start(); @@ -126,6 +127,7 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, conf.setBoolean(HOSTDB_FORCE_CHECK, force); conf.setBoolean(HOSTDB_URL_FILTERING, filter); conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize); + conf.setLong(HOSTDB_URL_LIMIT, urlLimit); conf.setClassLoader(Thread.currentThread().getContextClassLoader()); try { @@ -163,7 +165,7 @@ public int run(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: UpdateHostDb -hostdb " + "[-tophosts ] [-crawldb ] [-checkAll] [-checkFailed]" + - " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]"); + " [-checkNew] [-checkKnown] [-force] [-filter] [-normalize] [-urlLimit ]"); return -1; } @@ -175,9 +177,9 @@ public int run(String[] args) throws Exception { boolean checkNew = false; boolean checkKnown = false; boolean force = false; - boolean filter = false; boolean normalize = false; + long urlLimit = -1l; for (int i = 0; i < args.length; i++) { if (args[i].equals("-hostdb")) { @@ -226,6 +228,11 @@ public int run(String[] args) throws Exception { LOG.info("UpdateHostDb: normalizing enabled"); normalize = true; } + if (args[i].equals("-urlLimit")) { + urlLimit = Long.valueOf(args[i + 1]); + LOG.info("UpdateHostDb: URL limit set to " + urlLimit); + i++; + } } if (hostDb == null) { @@ -235,7 +242,7 @@ public int run(String[] args) throws Exception { try { updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, - checkKnown, force, filter, normalize); + checkKnown, force, filter, normalize, urlLimit); return 0; } catch (Exception e) { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java index 1e41fb6df4..2c13756abc 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java @@ -60,6 +60,7 @@ public class UpdateHostDbReducer protected static boolean checkKnown = false; protected static boolean checkAny = false; protected static boolean force = false; + protected static long urlLimit = -1l; protected static long now = new Date().getTime(); protected static String[] numericFields; protected static String[] stringFields; @@ -85,6 +86,7 @@ public void setup(Reducer.Context context) checkKnown = conf.getBoolean(UpdateHostDb.HOSTDB_CHECK_KNOWN, false); checkAny = checkNew || checkKnown || checkFailed; force = conf.getBoolean(UpdateHostDb.HOSTDB_FORCE_CHECK, false); + urlLimit = conf.getLong(UpdateHostDb.HOSTDB_URL_LIMIT,-1l); numericFields = conf.getStrings(UpdateHostDb.HOSTDB_NUMERIC_FIELDS); stringFields = conf.getStrings(UpdateHostDb.HOSTDB_STRING_FIELDS); percentiles = conf.getInts(UpdateHostDb.HOSTDB_PERCENTILES); @@ -374,6 +376,14 @@ else if (value instanceof FloatWritable) { hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue())); } + // Impose limits on minimum number of URLs? + if (urlLimit > -1l) { + if (hostDatum.numRecords() < urlLimit) { + context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1); + return; + } + } + context.getCounter("UpdateHostDb", "total_hosts").increment(1); // See if this record is to be checked