blob: 1692907da8e8a2e1e51040a7376cf13845c330f2 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.importing.crawler.web;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
 * Shared constants of the web crawler and its subcomponents: names of record attributes and attachments, names of
 * task parameters and their default values. This class only holds constants and cannot be instantiated.
 */
public final class WebCrawlerConstants {

  // ---- record attributes and attachments ----

  /** attribute name: URL of the web resource. */
  public static final String ATTRIBUTE_URL = "httpUrl";

  /** attribute name: last-modified header as reported by the web server (if any). */
  public static final String ATTRIBUTE_LASTMODIFIED = "httpLastModified";

  /** attribute name: content-type of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_CONTENTTYPE = "httpContenttype";

  /** attribute name: mimetype of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_MIMETYPE = "httpMimetype";

  /** attribute name: charset of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_CHARSET = "httpCharset";

  /** attribute name: content-length of the web resource as reported by the web server (if any). */
  public static final String ATTRIBUTE_SIZE = "httpSize";

  /** attachment name: content of a web resource. */
  public static final String ATTACHMENT_CONTENT = "httpContent";

  /** internal attribute name, used to apply the max crawl depth. */
  public static final String ATTRIBUTE_CRAWL_DEPTH = "crawlDepth";

  // ---- task parameters and defaults ----

  /** task parameter name: start URL for crawling. */
  public static final String TASK_PARAM_START_URL = "startUrl";

  /** task parameter name: long value in milliseconds on how long to wait between http requests. */
  public static final String TASK_PARAM_WAIT_BETWEEN_REQUESTS = "waitBetweenRequests";

  /** task parameter name: number of links to write to one bulk object. */
  public static final String TASK_PARAM_LINKS_PER_BULK = "linksPerBulk";

  /** value used for the 'linksPerBulk' parameter by default. */
  public static final int DEFAULT_LINKS_PER_BULK = 10;

  /** user agent used by default, if nothing valid is defined in webcrawler.properties. */
  public static final String DEFAULT_USERAGENT =
    "SMILA (http://wiki.eclipse.org/SMILA/UserAgent; smila-dev@eclipse.org)";

  /** task parameter name: how to handle links that cannot be fetched. */
  public static final String TASK_PARAM_LINK_ERROR_HANDLING = "linkErrorHandling";

  /** Possible reactions to IO errors that occur when fetching links. */
  public enum ErrorHandling {
    /** finish the task as recoverable so that it can be retried. */
    RETRY,
    /** drop the record. */
    DROP,
    /** ignore errors during content fetching and keep the record metadata. */
    IGNORE
  }

  /**
   * The property names the web ETL workers should support for mapping. The set is unmodifiable and contains the
   * content attachment name plus the URL, last-modified, content-type, mimetype, charset and size attribute names;
   * {@link #ATTRIBUTE_CRAWL_DEPTH} is not part of it.
   */
  public static final Set<String> PROPERTY_NAMES = createPropertyNames();

  /** no instances: every attempt to call the constructor fails. */
  private WebCrawlerConstants() {
    throw new UnsupportedOperationException();
  }

  /** @return unmodifiable set of all attribute/attachment names that can be mapped. */
  private static Set<String> createPropertyNames() {
    final String[] mappableNames =
      { ATTRIBUTE_URL, ATTRIBUTE_SIZE, ATTRIBUTE_MIMETYPE, ATTRIBUTE_LASTMODIFIED, ATTRIBUTE_CONTENTTYPE,
        ATTRIBUTE_CHARSET, ATTACHMENT_CONTENT };
    final Set<String> names = new HashSet<String>();
    for (final String name : mappableNames) {
      names.add(name);
    }
    return Collections.unmodifiableSet(names);
  }
}