Skip to content

Commit b02340d

Browse files
Merge pull request #827 from sebastian-nagel/NUTCH-3067
NUTCH-3067 Improve performance of FetchItemQueues if error state is preserved
2 parents a99bd8e + 633fa10 commit b02340d

File tree

4 files changed

+103
-40
lines changed

4 files changed

+103
-40
lines changed

conf/nutch-default.xml

+16-1
Original file line numberDiff line numberDiff line change
@@ -1173,6 +1173,17 @@
11731173
</description>
11741174
</property>
11751175

1176+
<property>
1177+
<name>fetcher.exceptions.per.queue.clear.after</name>
1178+
<value>1800</value>
1179+
<description>Time in seconds after which exception counters in
1180+
queues can be cleared. This happens only if no items are queued in
1181+
this queue and the configured time span has elapsed in addition to
1182+
the crawl delay including the exponential backoff time, see
1183+
fetcher.exceptions.per.queue.delay.
1184+
</description>
1185+
</property>
1186+
11761187
<property>
11771188
<name>fetcher.throughput.threshold.pages</name>
11781189
<value>-1</value>
@@ -1187,7 +1198,11 @@
11871198
<name>fetcher.throughput.threshold.retries</name>
11881199
<value>5</value>
11891200
<description>The number of times the fetcher.throughput.threshold.pages is allowed to be exceeded.
1190-
This settings prevents accidental slow downs from immediately killing the fetcher thread.
1201+
This settings prevents accidental slow downs from immediately shutting down the fetcher threads.
1202+
The throughput is checked approx. every second. Note: the number of retries should be lower
1203+
than the timeout defined by mapreduce.task.timeout and fetcher.threads.timeout.divisor (see there).
1204+
For the default values, the timeout is 300 seconds. Consequently, the number of retries should be
1205+
significantly lower.
11911206
</description>
11921207
</property>
11931208

src/java/org/apache/nutch/fetcher/FetchItemQueue.java

+6-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ public class FetchItemQueue {
4141
private static final Logger LOG = LoggerFactory
4242
.getLogger(MethodHandles.lookup().lookupClass());
4343

44+
private static Text variableFetchDelayKey = new Text("_variableFetchDelay_");
45+
4446
List<FetchItem> queue = Collections
4547
.synchronizedList(new LinkedList<FetchItem>());
4648
AtomicInteger inProgress = new AtomicInteger();
@@ -50,18 +52,20 @@ public class FetchItemQueue {
5052
long minCrawlDelay;
5153
int maxThreads;
5254
Text cookie;
53-
Text variableFetchDelayKey = new Text("_variableFetchDelay_");
5455
boolean variableFetchDelaySet = false;
5556
// keep track of duplicates if fetcher.follow.outlinks.depth > 0. Some urls may
5657
// not get followed due to hash collisions. Hashing is used to reduce memory
5758
// usage.
58-
Set<Integer> alreadyFetched = new HashSet<>();
59+
Set<Integer> alreadyFetched;
5960

6061
public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay,
6162
long minCrawlDelay) {
6263
this.maxThreads = maxThreads;
6364
this.crawlDelay = crawlDelay;
6465
this.minCrawlDelay = minCrawlDelay;
66+
if (conf.getInt("fetcher.follow.outlinks.depth", -1) > 0) {
67+
alreadyFetched = new HashSet<>();
68+
}
6569
// ready to start
6670
setEndTime(System.currentTimeMillis() - crawlDelay);
6771
}

src/java/org/apache/nutch/fetcher/FetchItemQueues.java

+70-30
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ public class FetchItemQueues {
5757
long timelimit = -1;
5858
int maxExceptionsPerQueue = -1;
5959
long exceptionsPerQueueDelay = -1;
60+
long exceptionsPerQueueClearAfter = 1800 * 1000L;
6061
boolean feederAlive = true;
6162
Configuration conf;
6263

@@ -88,6 +89,8 @@ public FetchItemQueues(Configuration conf) {
8889
"fetcher.max.exceptions.per.queue", -1);
8990
this.exceptionsPerQueueDelay = (long) (conf
9091
.getFloat("fetcher.exceptions.per.queue.delay", .0f) * 1000);
92+
this.exceptionsPerQueueClearAfter = (long) (conf
93+
.getFloat("fetcher.exceptions.per.queue.clear.after", 1800.0f) * 1000);
9194

9295
int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
9396
-1);
@@ -179,25 +182,41 @@ public synchronized FetchItem getFetchItem() {
179182
it = queues.entrySet().iterator();
180183
}
181184

185+
boolean keepExceptionState = (maxExceptionsPerQueue > -1
186+
|| exceptionsPerQueueDelay > 0);
187+
182188
while (it.hasNext()) {
183189
FetchItemQueue fiq = it.next().getValue();
184190

185191
// reap empty queues which do not hold state required to ensure politeness
186192
if (fiq.getQueueSize() == 0 && fiq.getInProgressSize() == 0) {
187193
if (!feederAlive) {
188-
// no more fetch items added
194+
// no more fetch items added: queue can be safely removed
189195
it.remove();
190-
} else if ((maxExceptionsPerQueue > -1 || exceptionsPerQueueDelay > 0)
191-
&& fiq.exceptionCounter.get() > 0) {
192-
// keep queue because the exceptions counter is bound to it
193-
// and is required to skip or delay items on this queue
194-
} else if (fiq.nextFetchTime.get() > System.currentTimeMillis()) {
196+
continue;
197+
}
198+
199+
if (fiq.nextFetchTime.get() > System.currentTimeMillis()) {
195200
// keep queue to have it blocked in case new fetch items of this queue
196201
// are added by the QueueFeeder
197-
} else {
198-
// empty queue without state
199-
it.remove();
202+
continue;
200203
}
204+
205+
if (keepExceptionState && fiq.exceptionCounter.get() > 0) {
206+
if ((fiq.nextFetchTime.get() + exceptionsPerQueueClearAfter) < System
207+
.currentTimeMillis()) {
208+
/*
209+
* the time configured by fetcher.exceptions.per.queue.clear.after
210+
* has passed in addition to the delay defined by the exponential
211+
* backoff
212+
*/
213+
it.remove();
214+
}
215+
continue;
216+
}
217+
218+
// queue is empty and does not hold state required to ensure politeness
219+
it.remove();
201220
continue;
202221
}
203222

@@ -239,9 +258,9 @@ public synchronized int checkTimelimit() {
239258
return count;
240259
}
241260

242-
// empties the queues (used by timebomb and throughput threshold)
261+
// empties the queues (used by fetcher timelimit and throughput threshold)
243262
public synchronized int emptyQueues() {
244-
int count = 0;
263+
int count = 0, queuesDropped = 0;
245264

246265
for (String id : queues.keySet()) {
247266
FetchItemQueue fiq = queues.get(id);
@@ -251,8 +270,12 @@ public synchronized int emptyQueues() {
251270
int deleted = fiq.emptyQueue();
252271
totalSize.addAndGet(-deleted);
253272
count += deleted;
273+
queuesDropped++;
254274
}
255275

276+
LOG.info("Emptied all queues: {} queues with {} items",
277+
queuesDropped, count);
278+
256279
return count;
257280
}
258281

@@ -282,10 +305,18 @@ public synchronized int checkExceptionThreshold(String queueid,
282305
if (fiq == null) {
283306
return 0;
284307
}
308+
285309
int excCount = fiq.incrementExceptionCounter();
310+
if (maxExceptions != -1 && excCount >= maxExceptions) {
311+
// too many exceptions for items in this queue - purge it
312+
return purgeAndBlockQueue(queueid, fiq, excCount);
313+
}
314+
315+
long nexFetchTime = 0;
286316
if (delay > 0) {
287-
fiq.nextFetchTime.getAndAdd(delay);
317+
nexFetchTime = fiq.nextFetchTime.addAndGet(delay);
288318
LOG.info("* queue: {} >> delayed next fetch by {} ms", queueid, delay);
319+
289320
} else if (exceptionsPerQueueDelay > 0) {
290321
/*
291322
* Delay the next fetch by a time span growing exponentially with the
@@ -298,32 +329,41 @@ public synchronized int checkExceptionThreshold(String queueid,
298329
// double the initial delay with every observed exception
299330
exceptionDelay *= 2L << Math.min((excCount - 2), 31);
300331
}
301-
fiq.nextFetchTime.getAndAdd(exceptionDelay);
332+
nexFetchTime = fiq.nextFetchTime.addAndGet(exceptionDelay);
302333
LOG.info(
303334
"* queue: {} >> delayed next fetch by {} ms after {} exceptions in queue",
304335
queueid, exceptionDelay, excCount);
305336
}
306-
if (maxExceptions != -1 && excCount >= maxExceptions) {
307-
// too many exceptions for items in this queue - purge it
308-
int deleted = fiq.emptyQueue();
309-
if (deleted > 0) {
310-
LOG.info(
311-
"* queue: {} >> removed {} URLs from queue because {} exceptions occurred",
312-
queueid, deleted, excCount);
313-
totalSize.getAndAdd(-deleted);
314-
}
315-
if (feederAlive) {
316-
LOG.info("* queue: {} >> blocked after {} exceptions", queueid,
317-
excCount);
318-
// keep queue IDs to ensure that these queues aren't created and filled
319-
// again, see addFetchItem(FetchItem)
320-
queuesMaxExceptions.add(queueid);
321-
}
322-
return deleted;
337+
338+
if (timelimit > 0 && nexFetchTime > timelimit) {
339+
// the next fetch would happen after the fetcher timelimit
340+
LOG.info(
341+
"* queue: {} >> purging queue because next fetch scheduled after fetcher timelimit",
342+
queueid);
343+
return purgeAndBlockQueue(queueid, fiq, excCount);
323344
}
345+
324346
return 0;
325347
}
326348

349+
private int purgeAndBlockQueue(String queueid, FetchItemQueue fiq,
350+
int excCount) {
351+
int deleted = fiq.emptyQueue();
352+
if (deleted > 0) {
353+
LOG.info(
354+
"* queue: {} >> removed {} URLs from queue after {} exceptions occurred",
355+
queueid, deleted, excCount);
356+
totalSize.getAndAdd(-deleted);
357+
}
358+
if (feederAlive) {
359+
LOG.info("* queue: {} >> blocked after {} exceptions", queueid, excCount);
360+
// keep queue IDs to ensure that these queues aren't created and filled
361+
// again, see addFetchItem(FetchItem)
362+
queuesMaxExceptions.add(queueid);
363+
}
364+
return deleted;
365+
}
366+
327367
/**
328368
* Increment the exception counter of a queue in case of an exception e.g.
329369
* timeout; when higher than a given threshold simply empty the queue.

src/java/org/apache/nutch/fetcher/Fetcher.java

+11-7
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ private void reportStatus(Context context, FetchItemQueues fetchQueues, int page
163163
StringBuilder status = new StringBuilder();
164164
Long elapsed = Long.valueOf((System.currentTimeMillis() - start) / 1000);
165165

166-
float avgPagesSec = (float) pages.get() / elapsed.floatValue();
166+
float avgPagesSec = pages.get() / elapsed.floatValue();
167167
long avgBytesSec = (bytes.get() / 128l) / elapsed.longValue();
168168

169169
status.append(activeThreads).append(" threads (")
@@ -212,11 +212,11 @@ public void run(Context innerContext)
212212
int timeoutDivisor = conf.getInt("fetcher.threads.timeout.divisor", 2);
213213
LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor);
214214

215-
int queueDepthMuliplier = conf.getInt("fetcher.queue.depth.multiplier",
215+
int queueDepthMultiplier = conf.getInt("fetcher.queue.depth.multiplier",
216216
50);
217217

218218
feeder = new QueueFeeder(innerContext, fetchQueues,
219-
threadCount * queueDepthMuliplier);
219+
threadCount * queueDepthMultiplier);
220220

221221
// the value of the time limit is either -1 or the time where it should
222222
// finish
@@ -317,12 +317,15 @@ public void run(Context innerContext)
317317
if (pagesLastSec < throughputThresholdPages) {
318318
throughputThresholdNumRetries++;
319319
LOG.warn(
320-
"{}: dropping below configured threshold of {} pages per second",
321-
throughputThresholdNumRetries, throughputThresholdPages);
320+
"{}: dropping below configured threshold of {} pages per second (current throughput: {} pages/sec.)",
321+
throughputThresholdNumRetries, throughputThresholdPages,
322+
pagesLastSec);
322323

323324
// Quit if we dropped below threshold too many times
324325
if (throughputThresholdNumRetries == throughputThresholdMaxRetries) {
325-
LOG.warn("Dropped below threshold too many times, killing!");
326+
LOG.warn(
327+
"Dropped below threshold {} times, dropping fetch queues to shut down",
328+
throughputThresholdNumRetries);
326329

327330
// Disable the threshold checker
328331
throughputThresholdPages = -1;
@@ -427,7 +430,8 @@ else if (bandwidthTargetCheckCounter == bandwidthTargetCheckEveryNSecs) {
427430
* we stop the fetching now.
428431
*/
429432
if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
430-
LOG.warn("Aborting with {} hung threads.", activeThreads);
433+
LOG.warn("Aborting with {} hung threads{}.", activeThreads,
434+
feeder.isAlive() ? " (queue feeder still alive)" : "");
431435
innerContext.getCounter("FetcherStatus", "hungThreads")
432436
.increment(activeThreads.get());
433437
for (int i = 0; i < fetcherThreads.size(); i++) {

0 commit comments

Comments (0)