protected class WebcrawlerConnector.DocumentURLFilter extends Object
Modifier and Type | Field and Description |
---|---|
protected WebcrawlerConnector.CanonicalizationPolicies |
canonicalizationPolicies
Canonicalization policies
|
protected List<Pattern> |
excludeContentIndexPatterns
List of content exclusion patterns
|
protected List<Pattern> |
excludeIndexPatterns
The arraylist of index exclude patterns
|
protected List<Pattern> |
excludePatterns
The arraylist of exclude patterns
|
protected List<Pattern> |
includeIndexPatterns
The arraylist of index include patterns
|
protected List<Pattern> |
includePatterns
The arraylist of include patterns
|
protected WebcrawlerConnector.MappingRules |
mappings
Mapping rules
|
protected Set<String> |
seedHosts
The set of seed hosts, to limit URLs by, if non-null
|
protected String |
versionString
The version string
|
Constructor and Description |
---|
WebcrawlerConnector.DocumentURLFilter(Specification spec)
Process a document specification to produce a filter.
|
Modifier and Type | Method and Description |
---|---|
protected String |
findSpecifiedContent(String currentURI,
List<Pattern> patterns) |
WebcrawlerConnector.CanonicalizationPolicies |
getCanonicalizationPolicies()
Get canonicalization policies
|
String |
getVersionString()
Get whatever contribution to the version string should come from this data.
|
boolean |
isDocumentAndHostLegal(String url)
Check if both a document and host are legal.
|
boolean |
isDocumentContentIndexable(String documentIdentifier) |
String |
isDocumentIndexable(String url)
Check if the document identifier is indexable, and return the indexing URL if found.
|
boolean |
isDocumentLegal(String url)
Check if the document identifier is legal.
|
boolean |
isHostLegal(String host)
Check if a host is legal.
|
protected String versionString
protected final WebcrawlerConnector.MappingRules mappings
protected final List<Pattern> includeIndexPatterns
protected final List<Pattern> excludeIndexPatterns
protected final List<Pattern> excludeContentIndexPatterns
protected final WebcrawlerConnector.CanonicalizationPolicies canonicalizationPolicies
public WebcrawlerConnector.DocumentURLFilter(Specification spec) throws ManifoldCFException
Throws: ManifoldCFException
public String getVersionString()
public boolean isDocumentAndHostLegal(String url)
public boolean isHostLegal(String host)
public boolean isDocumentLegal(String url)
public String isDocumentIndexable(String url) throws ManifoldCFException
Throws: ManifoldCFException
public WebcrawlerConnector.CanonicalizationPolicies getCanonicalizationPolicies()
public boolean isDocumentContentIndexable(String documentIdentifier) throws ManifoldCFException
Throws: ManifoldCFException
protected String findSpecifiedContent(String currentURI, List<Pattern> patterns) throws ManifoldCFException
Throws: ManifoldCFException