Skip to content

Commit

Permalink
NUTCH-3100 HostDB to support minimum records per host
Browse files (browse the repository at this point in the history)
  • Loading branch information
Markus Jelsma committed Jan 9, 2025
1 parent 18e7aeb commit b52ec90
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
15 changes: 11 additions & 4 deletions src/java/org/apache/nutch/hostdb/UpdateHostDb.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,11 @@ public class UpdateHostDb extends Configured implements Tool {
public static final String HOSTDB_STRING_FIELDS = "hostdb.string.fields";
public static final String HOSTDB_PERCENTILES = "hostdb.percentiles";
public static final String HOSTDB_CRAWLDATUM_PROCESSORS = "hostdb.crawldatum.processors";
public static final String HOSTDB_URL_LIMIT = "hostdb.url.limit";

private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
boolean checkFailed, boolean checkNew, boolean checkKnown,
boolean force, boolean filter, boolean normalize) throws Exception {
boolean force, boolean filter, boolean normalize, long urlLimit) throws Exception {

StopWatch stopWatch = new StopWatch();
stopWatch.start();
Expand Down Expand Up @@ -126,6 +127,7 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts,
conf.setBoolean(HOSTDB_FORCE_CHECK, force);
conf.setBoolean(HOSTDB_URL_FILTERING, filter);
conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);
conf.setLong(HOSTDB_URL_LIMIT, urlLimit);
conf.setClassLoader(Thread.currentThread().getContextClassLoader());

try {
Expand Down Expand Up @@ -163,7 +165,7 @@ public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err.println("Usage: UpdateHostDb -hostdb <hostdb> " +
"[-tophosts <tophosts>] [-crawldb <crawldb>] [-checkAll] [-checkFailed]" +
" [-checkNew] [-checkKnown] [-force] [-filter] [-normalize]");
" [-checkNew] [-checkKnown] [-force] [-filter] [-normalize] [-urlLimit <N>]");
return -1;
}

Expand All @@ -175,9 +177,9 @@ public int run(String[] args) throws Exception {
boolean checkNew = false;
boolean checkKnown = false;
boolean force = false;

boolean filter = false;
boolean normalize = false;
long urlLimit = -1l;

for (int i = 0; i < args.length; i++) {
if (args[i].equals("-hostdb")) {
Expand Down Expand Up @@ -226,6 +228,11 @@ public int run(String[] args) throws Exception {
LOG.info("UpdateHostDb: normalizing enabled");
normalize = true;
}
if (args[i].equals("-urlLimit")) {
urlLimit = Long.valueOf(args[i + 1]);
LOG.info("UpdateHostDb: URL limit set to " + urlLimit);
i++;
}
}

if (hostDb == null) {
Expand All @@ -235,7 +242,7 @@ public int run(String[] args) throws Exception {

try {
updateHostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew,
checkKnown, force, filter, normalize);
checkKnown, force, filter, normalize, urlLimit);

return 0;
} catch (Exception e) {
Expand Down
10 changes: 10 additions & 0 deletions src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class UpdateHostDbReducer
protected static boolean checkKnown = false;
protected static boolean checkAny = false;
protected static boolean force = false;
protected static long urlLimit = -1l;
protected static long now = new Date().getTime();
protected static String[] numericFields;
protected static String[] stringFields;
Expand All @@ -85,6 +86,7 @@ public void setup(Reducer<Text, NutchWritable, Text, HostDatum>.Context context)
checkKnown = conf.getBoolean(UpdateHostDb.HOSTDB_CHECK_KNOWN, false);
checkAny = checkNew || checkKnown || checkFailed;
force = conf.getBoolean(UpdateHostDb.HOSTDB_FORCE_CHECK, false);
urlLimit = conf.getLong(UpdateHostDb.HOSTDB_URL_LIMIT,-1l);
numericFields = conf.getStrings(UpdateHostDb.HOSTDB_NUMERIC_FIELDS);
stringFields = conf.getStrings(UpdateHostDb.HOSTDB_STRING_FIELDS);
percentiles = conf.getInts(UpdateHostDb.HOSTDB_PERCENTILES);
Expand Down Expand Up @@ -374,6 +376,14 @@ else if (value instanceof FloatWritable) {
hostDatum.getMetaData().put(new Text("min." + entry.getKey()), new FloatWritable(entry.getValue()));
}

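// Minimum-records filter: when urlLimit is set (> -1), stop processing hosts
// that have fewer than urlLimit records and count them as url_limit_not_reached.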
if (urlLimit > -1l) {
if (hostDatum.numRecords() < urlLimit) {
context.getCounter("UpdateHostDb", "url_limit_not_reached").increment(1);
return;
}
}

context.getCounter("UpdateHostDb", "total_hosts").increment(1);

// See if this record is to be checked
Expand Down

0 comments on commit b52ec90

Please sign in to comment.