blob: 1fa796b39ad4f181d4fa361ae0f0e9bcae2d4cf1 [file] [log] [blame]
//------------------------------------------------------------------------------
// Copyright (c) 2005, 2006 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.publishing.services.search;
import java.io.File;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
 * Base class that builds a Lucene document for indexing and searching.
 * A document is a unit for indexing and searching. It consists of a
 * list of fields that can be indexed and searched. Each field has
 * a name and a text value. <br>
 *
 * The fields that are maintained by this class are: <br>
 * <UL>
 * <LI>document title</LI>
 * <LI>document URL</LI>
 * <LI>summary (usually the first few lines of text)</LI>
 * <LI>text of document</LI>
 * <LI>any extra name/value pairs placed in <code>additionalFields</code></LI>
 * </UL>
 * <br>
 * All types of document, such as HTML and PDF documents, must
 * subclass AbstractDocument and implement
 * <code>compileDocument(File)</code>, which is expected to populate the
 * fields through the protected setters. <br>
 *
 */
abstract class AbstractDocument
{
    static final String FIELD_URL = "url"; //$NON-NLS-1$
    static final String FIELD_CONTENTS = "contents"; //$NON-NLS-1$
    static final String FIELD_SUMMARY = "summary"; //$NON-NLS-1$
    static final String FIELD_TITLE = "title"; //$NON-NLS-1$

    /** Value stored for a field that the subclass did not populate. */
    static final String FIELD_VALUE_UNDEFINED = ""; //$NON-NLS-1$

    protected String _docTitle = null;
    protected String _docUrl = null;
    protected String _summary = null;

    // The document text is held either as a string or as a reader; the
    // two setters below keep at most one of them non-null.
    protected String _contents = null;
    protected Reader _contentReader = null;

    // Extra fields added to the Lucene document as Text fields.
    // Both keys and values are cast to String when the document is built.
    protected Map additionalFields = new HashMap();

    /**
     * Default constructor.
     */
    AbstractDocument()
    {
    }

    /**
     * Parses the given file via <code>compileDocument(File)</code> and
     * assembles the resulting fields into a Lucene document.
     * <p>
     * Title, URL and summary that are still <code>null</code> after
     * compilation are replaced with {@link #FIELD_VALUE_UNDEFINED}. If
     * neither a content string nor a content reader was set, the contents
     * field is likewise added with {@link #FIELD_VALUE_UNDEFINED}.
     *
     * @param file the file to parse
     * @return the Lucene document holding the url, contents, summary,
     *         title and any additional fields
     */
    Document document( File file )
    {
        // first compile the document fields
        compileDocument( file );

        if( _docTitle == null )
        {
            _docTitle = FIELD_VALUE_UNDEFINED;
        }
        if( _docUrl == null )
        {
            _docUrl = FIELD_VALUE_UNDEFINED;
        }
        if( _summary == null )
        {
            _summary = FIELD_VALUE_UNDEFINED;
        }

        // create a new Lucene document
        Document luceneDocument = new Document();

        // add the url as a field named "url". Use an UnIndexed field, so
        // that the url is just stored with the document, but is not searchable.
        luceneDocument.add( Field.UnIndexed( FIELD_URL, _docUrl ) );

        // add the contents so it will get tokenized and indexed.
        if( null != _contents )
        {
            luceneDocument.add( Field.Text( FIELD_CONTENTS, _contents ) );
        }
        else if( null != _contentReader )
        {
            luceneDocument.add( Field.Text( FIELD_CONTENTS, _contentReader ) );
        }
        else
        {
            // The subclass supplied no content at all: treat it like the
            // other unset fields instead of passing a null reader to Lucene.
            luceneDocument.add( Field.Text( FIELD_CONTENTS, FIELD_VALUE_UNDEFINED ) );
        }

        // add the summary as an UnIndexed field, so that it is stored and returned
        // with hit documents for display.
        luceneDocument.add( Field.UnIndexed( FIELD_SUMMARY, _summary ) );

        // Add the title as a separate Text field, so that it can be searched
        // separately.
        luceneDocument.add( Field.Text( FIELD_TITLE, _docTitle ) );

        // Add every additional field as a Text field. An empty map simply
        // yields no iterations, so no size check is needed.
        for ( Iterator it = additionalFields.entrySet().iterator(); it.hasNext(); )
        {
            Map.Entry entry = (Map.Entry) it.next();
            luceneDocument.add( Field.Text( (String) entry.getKey(), (String) entry.getValue() ) );
        }

        return luceneDocument;
    }

    /**
     * Sets the document title.
     *
     * @param title the title to store in the "title" field
     */
    protected void setDocTitle( String title )
    {
        _docTitle = title;
    }

    /**
     * Sets the document url.
     *
     * @param url the url to store in the "url" field
     */
    protected void setDocUrl( String url )
    {
        _docUrl = url;
    }

    /**
     * Sets the document summary.
     *
     * @param summary the summary to store in the "summary" field
     */
    protected void setSummary( String summary )
    {
        _summary = summary;
    }

    /**
     * Sets the document content with the given string.
     * Mutually exclusive with setting the document content
     * with a reader: any previously set reader is discarded.
     *
     * @param contents the full text of the document
     * @see #setContentReader(Reader)
     */
    protected void setContentString( String contents )
    {
        _contents = contents;
        _contentReader = null;
    }

    /**
     * Sets the document content with the given reader.
     * Mutually exclusive with setting the document content
     * with a string: any previously set string is discarded.
     *
     * @param contentReader a reader over the full text of the document
     * @see #setContentString(String)
     */
    protected void setContentReader( Reader contentReader )
    {
        _contentReader = contentReader;
        _contents = null;
    }

    /**
     * Parses and compiles the document fields from the given file.
     * Called by {@link #document(File)} before the Lucene document is
     * assembled.
     *
     * @param file the file to parse
     */
    protected abstract void compileDocument( File file );
}