| /******************************************************************************* |
| * Copyright (c) 2008, 2011 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Juergen Schumacher (Attensity Europe GmbH) - initial API and implementation |
| *******************************************************************************/ |
| |
| package org.eclipse.smila.importing.crawler.web; |
| |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| /** constants used by web crawler and subcomponents: attribute and attachment names, task parameters. */ |
/**
 * Shared constants for the web crawler and its subcomponents: record attribute names, the content attachment name,
 * task parameter names and their defaults. This class only holds static members and cannot be instantiated.
 */
public final class WebCrawlerConstants {

  // ----- record attributes and attachments -----

  /** Attribute name: URL of the web resource. */
  public static final String ATTRIBUTE_URL = "httpUrl";

  /** Attribute name: last-modified header as reported by the web server, if it sent one. */
  public static final String ATTRIBUTE_LASTMODIFIED = "httpLastModified";

  /** Attribute name: content-type of the web resource as reported by the web server, if it sent one. */
  public static final String ATTRIBUTE_CONTENTTYPE = "httpContenttype";

  /** Attribute name: mimetype of the web resource as reported by the web server, if it sent one. */
  public static final String ATTRIBUTE_MIMETYPE = "httpMimetype";

  /** Attribute name: charset of the web resource as reported by the web server, if it sent one. */
  public static final String ATTRIBUTE_CHARSET = "httpCharset";

  /** Attribute name: content-length of the web resource as reported by the web server, if it sent one. */
  public static final String ATTRIBUTE_SIZE = "httpSize";

  /** Attachment name: content of a web resource. */
  public static final String ATTACHMENT_CONTENT = "httpContent";

  /** Internal attribute name used for applying the maximum crawl depth. */
  public static final String ATTRIBUTE_CRAWL_DEPTH = "crawlDepth";

  // ----- task parameters and defaults -----

  /** Task parameter name: the start URL for crawling. */
  public static final String TASK_PARAM_START_URL = "startUrl";

  /** Task parameter name: a long value giving the time in milliseconds to wait between http requests. */
  public static final String TASK_PARAM_WAIT_BETWEEN_REQUESTS = "waitBetweenRequests";

  /** Task parameter name: the number of links to write to one bulk object. */
  public static final String TASK_PARAM_LINKS_PER_BULK = "linksPerBulk";

  /** Value used for the 'linksPerBulk' parameter by default. */
  public static final int DEFAULT_LINKS_PER_BULK = 10;

  /** User agent used by default if webcrawler.properties does not define a valid one. */
  public static final String DEFAULT_USERAGENT =
    "SMILA (http://wiki.eclipse.org/SMILA/UserAgent; smila-dev@eclipse.org)";

  /** Task parameter name: how to handle links that cannot be fetched. */
  public static final String TASK_PARAM_LINK_ERROR_HANDLING = "linkErrorHandling";

  /** Strategies for dealing with IO errors that occur while fetching links. */
  public enum ErrorHandling {
    /** Finish the task as recoverable so that it can be retried. */
    RETRY,
    /** Drop the record. */
    DROP,
    /** Ignore errors during content fetching and keep the record metadata. */
    IGNORE
  }

  // ----- mapping support -----

  /** Property names the web ETL workers should support for mapping. The set is unmodifiable. */
  public static final Set<String> PROPERTY_NAMES = createPropertyNames();

  /** Not instantiable: always throws {@link UnsupportedOperationException}. */
  private WebCrawlerConstants() {
    throw new UnsupportedOperationException();
  }

  /**
   * Collects the content attachment name and the HTTP attribute names (everything except the internal crawl depth
   * attribute) into a read-only set.
   *
   * @return unmodifiable set of the mappable property names.
   */
  private static Set<String> createPropertyNames() {
    final Set<String> names = new HashSet<String>();
    Collections.addAll(names, //
      ATTACHMENT_CONTENT, //
      ATTRIBUTE_CHARSET, //
      ATTRIBUTE_CONTENTTYPE, //
      ATTRIBUTE_LASTMODIFIED, //
      ATTRIBUTE_MIMETYPE, //
      ATTRIBUTE_SIZE, //
      ATTRIBUTE_URL);
    return Collections.unmodifiableSet(names);
  }
}