org.apache.nutch.crawl
Class Generator

java.lang.Object
  extended by org.apache.hadoop.util.ToolBase
      extended by org.apache.nutch.crawl.Generator
All Implemented Interfaces:
Configurable, Tool

public class Generator
extends ToolBase

Generates a subset of a crawl db to fetch.


Nested Class Summary
static class Generator.CrawlDbUpdater
          Updates the CrawlDB so that the next generate run won't include the same URLs.
static class Generator.DecreasingFloatComparator
           
static class Generator.HashComparator
          Sorts fetch lists by the hash of the URL.
static class Generator.Selector
          Selects entries due for fetch.
static class Generator.SelectorEntry
           
static class Generator.SelectorInverseMapper
           
 
Field Summary
static String CRAWL_GEN_CUR_TIME
           
static String CRAWL_GEN_DELAY
           
static String CRAWL_GENERATE_FILTER
           
static String CRAWL_TOP_N
           
static String GENERATE_MAX_PER_HOST
           
static String GENERATE_MAX_PER_HOST_BY_IP
           
static String GENERATE_UPDATE_CRAWLDB
           
static org.apache.commons.logging.Log LOG
           
 
Fields inherited from class org.apache.hadoop.util.ToolBase
conf
 
Constructor Summary
Generator()
           
Generator(Configuration conf)
           
 
Method Summary
 Path generate(Path dbDir, Path segments)
          Generate fetchlists in a segment.
 Path generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean force)
          Generate fetchlists in a segment.
static String generateSegmentName()
           
static void main(String[] args)
          Generate a fetchlist from the crawldb.
 int run(String[] args)
           
 
Methods inherited from class org.apache.hadoop.util.ToolBase
doMain, getConf, setConf
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

CRAWL_GENERATE_FILTER

public static final String CRAWL_GENERATE_FILTER
See Also:
Constant Field Values

GENERATE_MAX_PER_HOST_BY_IP

public static final String GENERATE_MAX_PER_HOST_BY_IP
See Also:
Constant Field Values

GENERATE_MAX_PER_HOST

public static final String GENERATE_MAX_PER_HOST
See Also:
Constant Field Values

GENERATE_UPDATE_CRAWLDB

public static final String GENERATE_UPDATE_CRAWLDB
See Also:
Constant Field Values

CRAWL_TOP_N

public static final String CRAWL_TOP_N
See Also:
Constant Field Values

CRAWL_GEN_CUR_TIME

public static final String CRAWL_GEN_CUR_TIME
See Also:
Constant Field Values

CRAWL_GEN_DELAY

public static final String CRAWL_GEN_DELAY
See Also:
Constant Field Values

LOG

public static final org.apache.commons.logging.Log LOG
Constructor Detail

Generator

public Generator()

Generator

public Generator(Configuration conf)
Method Detail

generate

public Path generate(Path dbDir,
                     Path segments)
              throws IOException
Generate fetchlists in a segment.

Throws:
IOException

generate

public Path generate(Path dbDir,
                     Path segments,
                     int numLists,
                     long topN,
                     long curTime,
                     boolean filter,
                     boolean force)
              throws IOException
Generate fetchlists in a segment.

Returns:
Path to the generated segment, or null if no entries were selected.
Throws:
IOException

generateSegmentName

public static String generateSegmentName()

main

public static void main(String[] args)
                 throws Exception
Generate a fetchlist from the crawldb.

Throws:
Exception

run

public int run(String[] args)
        throws Exception
Throws:
Exception


Copyright © 2006 The Apache Software Foundation