public class RuntimeConfiguration
extends java.lang.Object
All BUbiNG components must share a certain number of global variables, such as filters and pools of objects. A single instance of this class is created at agent construction time: it is used to pass around a single reference to global data.
All fields in this class are either final or volatile, depending on whether they can be modified at runtime (usually by means of JMX methods in Agent).
Modifier and Type | Field | Description |
---|---|---|
boolean |
acceptAllCertificates |
|
IntOpenHashSet |
blackListedHostHashes |
The set of hashes of hosts that should be blacklisted.
|
java.util.concurrent.locks.ReadWriteLock |
blackListedHostHashesLock |
A lock used to access
blackListedHostHashes . |
IntOpenHashSet |
blackListedIPv4Addresses |
|
java.util.concurrent.locks.ReadWriteLock |
blackListedIPv4Lock |
A lock used to access
blackListedIPv4Addresses . |
double |
bloomFilterPrecision |
|
int |
connectionTimeout |
|
int |
cookieMaxByteSize |
|
java.lang.String |
cookiePolicy |
|
boolean |
crawlIsNew |
|
java.lang.String |
digestAlgorithm |
|
int |
dnsCacheMaxSize |
|
long |
dnsNegativeTtl |
|
long |
dnsPositiveTtl |
|
org.apache.http.conn.DnsResolver |
dnsResolver |
The DNS resolver used throughout the crawler.
|
int |
dnsThreads |
|
static java.util.regex.Pattern |
DOTTED_ADDRESS |
A pattern used to identify hosts specified directly via their address in dotted notation.
|
static boolean |
FETCH_ROBOTS |
Whether to fetch and use
robots.txt . |
int |
fetchDataBufferByteSize |
|
Filter<java.net.URI> |
fetchFilter |
|
int |
fetchingThreads |
|
Filter<URIResponse> |
followFilter |
|
java.io.File |
frontierDir |
|
java.lang.String |
group |
|
long |
ipDelay |
|
double |
ipDelayFactor |
|
long |
keepAliveTime |
|
long |
maxUrls |
|
int |
maxUrlsPerSchemeAuthority |
|
java.lang.String |
name |
|
Filter<URIResponse> |
parseFilter |
|
java.util.ArrayList<Parser<?>> |
parsers |
The parsers, instantiated.
|
int |
parsingThreads |
|
boolean |
paused |
Whether the crawler is currently paused.
|
java.lang.String |
proxyHost |
|
int |
proxyPort |
|
int |
responseBodyMaxByteSize |
|
java.io.File |
responseCacheDir |
|
long |
robotsExpiration |
|
java.io.File |
rootDir |
|
Filter<Link> |
scheduleFilter |
|
long |
schemeAuthorityDelay |
|
java.util.Iterator<java.net.URI> |
seed |
An iterator returning URIs that are then used as a seed; this iterator may return
null (when
invalid or relative URLs are specified). |
int |
sieveAuxFileIOBufferByteSize |
|
java.io.File |
sieveDir |
|
int |
sieveSize |
|
int |
sieveStoreIOBufferByteSize |
|
int |
socketTimeout |
|
int |
spamDetectionPeriodicity |
|
int |
spamDetectionThreshold |
|
SpamDetector<?> |
spamDetector |
|
boolean |
startPaused |
|
boolean |
stopping |
Whether the crawler is currently being stopped.
|
java.lang.Class<? extends Store> |
storeClass |
|
java.io.File |
storeDir |
|
Filter<URIResponse> |
storeFilter |
|
long |
urlCacheMaxByteSize |
|
java.lang.String |
userAgent |
|
java.lang.String |
userAgentFrom |
|
long |
virtualizerMaxByteSize |
|
int |
weight |
|
long |
workbenchMaxByteSize |
Constructor | Description |
---|---|
RuntimeConfiguration(StartupConfiguration startupConfiguration) |
Modifier and Type | Method | Description |
---|---|---|
void |
addBlackListedHost(java.lang.String spec) |
Adds a new host (or a set of new hosts) to the black list; the host can be specified directly or it can be a file (prefixed by
file: ). |
void |
addBlackListedIPv4(java.lang.String spec) |
Adds a new IPv4 address (or a set of new IPv4 addresses) to the black list; the IPv4 address can be specified directly or it can be a file (prefixed by
file: ). |
void |
ensureNotPaused() |
|
static java.util.ArrayList<Parser<?>> |
parsersFromSpecs(java.lang.String[] specs) |
Given an array of parser specifications, returns the corresponding list of parsers (only
the correct specifications are put in the list).
|
java.lang.String |
toString() |
public static final boolean FETCH_ROBOTS
Whether to fetch and use robots.txt. This value cannot be configured: changing it
requires recompilation from the sources.
You had better know what you are doing if you change this to false.
public final java.lang.String name
StartupConfiguration.name
public final java.lang.String group
StartupConfiguration.group
public final int weight
StartupConfiguration.weight
public final int maxUrlsPerSchemeAuthority
public volatile int fetchingThreads
StartupConfiguration.fetchingThreads
public volatile int parsingThreads
StartupConfiguration.parsingThreads
public volatile int dnsThreads
StartupConfiguration.dnsThreads
public volatile Filter<java.net.URI> fetchFilter
StartupConfiguration.fetchFilter
public volatile Filter<Link> scheduleFilter
StartupConfiguration.scheduleFilter
public volatile Filter<URIResponse> parseFilter
StartupConfiguration.parseFilter
public volatile Filter<URIResponse> followFilter
StartupConfiguration.followFilter
public volatile Filter<URIResponse> storeFilter
StartupConfiguration.storeFilter
public volatile long keepAliveTime
StartupConfiguration.keepAliveTime
public volatile long schemeAuthorityDelay
public volatile long ipDelay
StartupConfiguration.ipDelay
public volatile double ipDelayFactor
StartupConfiguration.ipDelayFactor
public volatile long maxUrls
StartupConfiguration.maxUrls
public final double bloomFilterPrecision
public final java.util.Iterator<java.net.URI> seed
An iterator returning URIs that are then used as a seed; this iterator may return null (when invalid or relative URLs are specified).
StartupConfiguration.seed
public final IntOpenHashSet blackListedIPv4Addresses
StartupConfiguration.seed
public final java.util.concurrent.locks.ReadWriteLock blackListedIPv4Lock
A lock used to access blackListedIPv4Addresses.
public final IntOpenHashSet blackListedHostHashes
StartupConfiguration.blackListedHosts
public final java.util.concurrent.locks.ReadWriteLock blackListedHostHashesLock
A lock used to access blackListedHostHashes.
public volatile int socketTimeout
StartupConfiguration.socketTimeout
public volatile int connectionTimeout
StartupConfiguration.connectionTimeout
public final int fetchDataBufferByteSize
public final java.lang.String proxyHost
StartupConfiguration.proxyHost
public final int proxyPort
StartupConfiguration.proxyPort
public final java.lang.String cookiePolicy
StartupConfiguration.cookiePolicy
public final int cookieMaxByteSize
StartupConfiguration.cookieMaxByteSize
public final java.lang.String userAgent
StartupConfiguration.userAgent
public final java.lang.String userAgentFrom
StartupConfiguration.userAgentFrom
public volatile long robotsExpiration
StartupConfiguration.robotsExpiration
public volatile boolean acceptAllCertificates
public final java.io.File rootDir
StartupConfiguration.rootDir
public final java.io.File storeDir
StartupConfiguration.storeDir
public final java.io.File responseCacheDir
StartupConfiguration.responseCacheDir
public final java.io.File sieveDir
StartupConfiguration.sieveDir
public final java.io.File frontierDir
StartupConfiguration.frontierDir
public volatile int responseBodyMaxByteSize
public final java.lang.String digestAlgorithm
StartupConfiguration.digestAlgorithm
public final boolean startPaused
StartupConfiguration.startPaused
public final java.lang.Class<? extends Store> storeClass
StartupConfiguration.storeClass
public volatile long workbenchMaxByteSize
public final long virtualizerMaxByteSize
public volatile long urlCacheMaxByteSize
StartupConfiguration.urlCacheMaxByteSize
public final int sieveSize
StartupConfiguration.sieveSize
public final int sieveStoreIOBufferByteSize
public final int sieveAuxFileIOBufferByteSize
public final int dnsCacheMaxSize
StartupConfiguration.dnsCacheMaxSize
public final long dnsPositiveTtl
StartupConfiguration.dnsPositiveTtl
public final long dnsNegativeTtl
StartupConfiguration.dnsNegativeTtl
public final boolean crawlIsNew
StartupConfiguration.crawlIsNew
public final SpamDetector<?> spamDetector
StartupConfiguration.spamDetectorUri
public final int spamDetectionThreshold
public final int spamDetectionPeriodicity
public final java.util.ArrayList<Parser<?>> parsers
ParsingThread
instances are obtained by copying these parsers.
public volatile boolean paused
notifyAll()
is issued on this runtime configuration.
public volatile boolean stopping
public final org.apache.http.conn.DnsResolver dnsResolver
StartupConfiguration.dnsResolverClass
public static final java.util.regex.Pattern DOTTED_ADDRESS
public RuntimeConfiguration(StartupConfiguration startupConfiguration) throws ConfigurationException, java.io.IOException
ConfigurationException
java.io.IOException
public void addBlackListedIPv4(java.lang.String spec) throws ConfigurationException, java.io.FileNotFoundException
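Adds a new IPv4 address (or a set of new IPv4 addresses) to the black list; the IPv4 address can be specified directly or it can be a file (prefixed by file:).
spec - the specification (an IP address, or a file prefixed by file:).
ConfigurationException
java.io.FileNotFoundException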
public void addBlackListedHost(java.lang.String spec) throws ConfigurationException, java.io.FileNotFoundException
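Adds a new host (or a set of new hosts) to the black list; the host can be specified directly or it can be a file (prefixed by file:).
spec - the specification (a host, or a file prefixed by file:).
ConfigurationException
java.io.FileNotFoundException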
public void ensureNotPaused() throws java.lang.InterruptedException
java.lang.InterruptedException
public java.lang.String toString()
toString
in class java.lang.Object
public static java.util.ArrayList<Parser<?>> parsersFromSpecs(java.lang.String[] specs) throws java.lang.IllegalArgumentException, java.lang.ClassNotFoundException, java.lang.IllegalAccessException, java.lang.reflect.InvocationTargetException, java.lang.InstantiationException, java.lang.NoSuchMethodException, java.io.IOException
specs - the parser specifications (they will be parsed using ObjectParser).
java.lang.IllegalArgumentException
java.lang.ClassNotFoundException
java.lang.IllegalAccessException
java.lang.reflect.InvocationTargetException
java.lang.InstantiationException
java.lang.NoSuchMethodException
java.io.IOException