blob: 5f789a07b0cdce7bfa4e1bdcc504a5bae2ffe359 [file] [log] [blame]
//------------------------------------------------------------------------------
// Copyright (c) 2005, 2007 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.search;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Date;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.eclipse.epf.search.analysis.TextAnalyzer;
import org.eclipse.epf.search.utils.JarCreator;
import org.eclipse.epf.search.utils.LHTMLParser;
import org.eclipse.epf.search.utils.UNCUtil;
/**
* This class is the main class that creates the Index from the file
* associations in the process layout.
*/
public class IndexBuilder {
// Name of the version file that createIndex writes into the index folder.
static final String VERSION_FILE_NAME = "version.txt"; //$NON-NLS-1$
// Separator between the parts of the version line written by createIndex
// (product name, timestamp and, for a jarred index, the jar size).
static final String VERSION_DELIMITER = "*"; //$NON-NLS-1$
/**
* Document fields.
*/
public static final String BRIEF_DESCRIPTION_FIELD = "briefDescription"; //$NON-NLS-1$
public static final String CONTENT_FIELD = "contents"; //$NON-NLS-1$
public static final String ID_FIELD = "id"; //$NON-NLS-1$
public static final String MODIFIED_FIELD = "modified"; //$NON-NLS-1$
public static final String NAME_FIELD = "name"; //$NON-NLS-1$
public static final String ROLE_FIELD = "role"; //$NON-NLS-1$
public static final String SUMMARY_FIELD = "summary"; //$NON-NLS-1$
public static final String TYPE_FIELD = "type"; //$NON-NLS-1$
public static final String URL_FIELD = "url"; //$NON-NLS-1$
public static final String TITLE_FIELD = "title"; //$NON-NLS-1$
public static final String UMA_ELEMENT_TYPE_FIELD = "uma.type"; //$NON-NLS-1$
// Value stored in the "uma.type" field when a page carries no such meta tag.
public static final String GENERAL_CONTENT = "general_content"; //$NON-NLS-1$
// UMA element types (values of the "uma.type" meta tag) whose pages are
// EXCLUDED from the search index; consulted by isNoSearchableDocument.
private static List NO_SEARCHEABLE_UMA_ELEMENTS = new ArrayList();
static {
NO_SEARCHEABLE_UMA_ELEMENTS.add("summary"); //$NON-NLS-1$
NO_SEARCHEABLE_UMA_ELEMENTS.add("workproductdescriptor"); //$NON-NLS-1$
NO_SEARCHEABLE_UMA_ELEMENTS.add("taskdescriptor"); //$NON-NLS-1$
NO_SEARCHEABLE_UMA_ELEMENTS.add("roledescriptor"); //$NON-NLS-1$
}
// A list of top level directories that should be excluded from the search
// index. indexDocs compares each entry against File.getAbsolutePath().
// NOTE: this list is static and shared; every IndexBuilder constructed with
// a non-null publish directory adds its entries to it.
public static List dirsToSkip = new ArrayList();
// Root of the published site, as returned by UNCUtil.convertFilename in the
// constructor. NOTE: static, so the most recently constructed instance
// determines the value seen by all instances.
public static String pDirectory = null;
// Path of the "<publish dir>/search/index" folder; stays null when the
// constructor was given a null publish directory.
private StringBuffer indexFolder = null;
// Last path segment of the publish directory; used as the URL prefix of
// indexed documents and as the first part of the version line.
private String productName = null;
// File-name fragments; any file whose name contains one of them is skipped
// by indexDocs.
private List filesToSkip = new ArrayList();
// The publish directory as a File; used to build site-relative URLs.
private File parentFolder = null;
/**
 * Creates an index builder for a published site.
 * <p>
 * If <code>publishDir</code> contains a "&lt;separator&gt;applet" segment, the
 * part of the path in front of that segment is taken as the site root;
 * otherwise the whole path is. The product name is the last path segment
 * of the site root, and the index is placed in
 * <code>&lt;site root&gt;/search/index</code>.
 * <p>
 * When <code>publishDir</code> is <code>null</code> nothing is initialized
 * and a later call to {@link #createIndex(boolean)} throws
 * <code>IllegalStateException</code>.
 *
 * @param publishDir
 *            the publish directory, or a path inside its "applet" folder;
 *            may be <code>null</code>.
 */
public IndexBuilder(String publishDir) {
    if (publishDir == null) {
        return;
    }
    int appletIndex = publishDir.indexOf(File.separator + "applet"); //$NON-NLS-1$
    pDirectory = UNCUtil.convertFilename((appletIndex > -1) ? publishDir
            .substring(0, appletIndex + 1) : publishDir);
    parentFolder = new File(pDirectory);

    // The product name is the last segment of the site root, ignoring a
    // trailing slash.
    String siteName = pDirectory.replace(File.separatorChar, '/');
    int end = siteName.length();
    if (siteName.endsWith("/")) { //$NON-NLS-1$
        end = end - 1;
    }
    int start = siteName.lastIndexOf("/", end - 1); //$NON-NLS-1$
    productName = siteName.substring(start + 1, end);

    // <site root>/search/index, adding the separator only when missing.
    StringBuffer searchFolder = new StringBuffer(pDirectory);
    if (!pDirectory.endsWith(File.separator)) {
        searchFolder.append(File.separator);
    }
    searchFolder.append("search"); //$NON-NLS-1$
    indexFolder = new StringBuffer(searchFolder.toString());
    indexFolder.append(File.separator).append("index"); //$NON-NLS-1$

    // Top level directories that must not be indexed. The paths are built
    // the same way indexDocs builds them (child of the root File, absolute
    // path) so that the comparison also matches when pDirectory has no
    // trailing separator. Entries already present in the shared static
    // list are not added a second time.
    String[] skippedDirNames = {
            "applet", //$NON-NLS-1$
            "css", //$NON-NLS-1$
            "ext_help", //$NON-NLS-1$
            "icons", //$NON-NLS-1$
            "images", //$NON-NLS-1$
            "index", //$NON-NLS-1$
            "logs", //$NON-NLS-1$
            "manuals", //$NON-NLS-1$
            "noapplet", //$NON-NLS-1$
            "pages_not_installed", //$NON-NLS-1$
            "process", //$NON-NLS-1$
            "scripts", //$NON-NLS-1$
            "stylesheets", //$NON-NLS-1$
            "xml", //$NON-NLS-1$
            "search" //$NON-NLS-1$
    };
    for (int i = 0; i < skippedDirNames.length; i++) {
        String skippedPath = new File(parentFolder, skippedDirNames[i])
                .getAbsolutePath();
        if (!dirsToSkip.contains(skippedPath)) {
            dirsToSkip.add(skippedPath);
        }
    }

    // Files whose name contains one of these fragments are not indexed.
    filesToSkip.add("_desc.htm"); //$NON-NLS-1$
    filesToSkip.add("_wbs.htm"); //$NON-NLS-1$
    filesToSkip.add("_tbs.htm"); //$NON-NLS-1$
    filesToSkip.add("_wpbs.htm"); //$NON-NLS-1$
}
/**
 * Builds the search index for the published site and writes the version
 * file next to it.
 * <p>
 * The index is (re)created in the index folder from all documents found
 * under the publish directory. The CJK analyzer is used when the default
 * locale's language is Japanese or Korean, the text analyzer otherwise.
 * When <code>jarIt</code> is <code>true</code> the index folder is jarred,
 * every file in it except the jar is deleted, and the jar size is appended
 * to the version line.
 * <p>
 * The whole operation is serialized on the <code>IndexBuilder</code> class.
 *
 * @param jarIt
 *            <code>true</code> to package the index into a jar.
 * @return <code>true</code> when the index and version file were written.
 * @throws SearchServiceException
 *             if the index cannot be built, the jar is missing after
 *             jarring, or the version file cannot be written.
 * @throws IllegalStateException
 *             if this builder was constructed without a publish directory.
 */
public boolean createIndex(boolean jarIt) throws SearchServiceException {
    synchronized (IndexBuilder.class) {
        if (indexFolder == null || pDirectory == null) {
            throw new IllegalStateException(
                    "Invalid indexFolder or pDirectory"); //$NON-NLS-1$
        }

        String lang = Locale.getDefault().getLanguage();
        boolean jako = lang.equals(Locale.JAPANESE.getLanguage())
                || lang.equals(Locale.KOREA.getLanguage());
        Analyzer analyzer = jako ? new CJKAnalyzer() : new TextAnalyzer();

        IndexWriter fsWriter = null;
        try {
            fsWriter = new IndexWriter(FSDirectory.getDirectory(indexFolder
                    .toString(), true), analyzer, true);
            fsWriter.setMaxFieldLength(1000000);
            indexDocs(new File(pDirectory), fsWriter);
            fsWriter.optimize();
            fsWriter.close();
            // Closed normally; nothing left for the finally block to do.
            fsWriter = null;
        } catch (Exception e) {
            // A failure here means there is no usable index: report it
            // instead of writing a version file and returning success.
            e.printStackTrace();
            throw new SearchServiceException(
                    SearchResources.createSearchIndexError);
        } finally {
            if (fsWriter != null) {
                try {
                    // Release the writer after a failure.
                    fsWriter.close();
                } catch (Exception ignored) {
                    // The original failure is already being reported.
                }
            }
        }

        // create the version file.
        long milliseconds = new Date().getTime();
        if (!jarIt) {
            try {
                writeVersionFile(analyzer, productName + VERSION_DELIMITER
                        + milliseconds);
            } catch (IOException ioe) {
                throw new SearchServiceException(
                        SearchResources.createSearchIndexError);
            }
            return true;
        }

        // jar up the created index.
        JarCreator.jarFolder(indexFolder.toString());
        System.out.println("index Jarred successfully"); //$NON-NLS-1$
        try {
            // delete the files now that they've been jarred.
            File indexDir = new File(indexFolder.toString());
            File[] files = indexDir.listFiles();
            // listFiles() returns null when the folder cannot be read.
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    if (!files[i].getName().equals(JarCreator.INDEX_JAR)) {
                        files[i].delete();
                    }
                }
            }
            File newIndexJar = new File(indexFolder + File.separator
                    + JarCreator.INDEX_JAR);
            if (!newIndexJar.exists()) {
                throw new SearchServiceException(
                        SearchResources.createSearchIndexError);
            }
            writeVersionFile(analyzer, productName + VERSION_DELIMITER
                    + milliseconds + VERSION_DELIMITER
                    + newIndexJar.length());
        } catch (IOException ioe) {
            throw new SearchServiceException(
                    SearchResources.createSearchIndexError);
        }
        return true;
    }
}

/**
 * Writes the version file into the index folder: the given version line,
 * followed by a "CJKAnalyzer" line when the CJK analyzer was used. The
 * file is closed even when writing fails.
 */
private void writeVersionFile(Analyzer analyzer, String versionLine)
        throws IOException {
    BufferedWriter bw = new BufferedWriter(new FileWriter(indexFolder
            + File.separator + VERSION_FILE_NAME));
    try {
        bw.write(versionLine + "\n"); //$NON-NLS-1$
        if (analyzer instanceof CJKAnalyzer) {
            bw.write("CJKAnalyzer" + "\n"); //$NON-NLS-1$ //$NON-NLS-2$
        }
    } finally {
        bw.close();
    }
}
/**
 * Indexes the given file, or, for a directory, recursively everything
 * below it.
 * <p>
 * Nothing is indexed for paths listed in <code>dirsToSkip</code>, files
 * whose name contains an entry of <code>filesToSkip</code>, files that are
 * not HTML documents, and files rejected by
 * <code>shouldBeExcluded</code>. A directory whose content cannot be
 * listed is skipped. A failure while indexing a single document is
 * printed and does not stop the traversal.
 *
 * @param file
 *            the file or directory to index.
 * @param writer
 *            the index writer the documents are added to.
 */
private void indexDocs(File file, IndexWriter writer) throws Exception {
    if (dirsToSkip.contains(file.getAbsolutePath())) {
        return;
    }

    if (file.isDirectory()) {
        String[] children = file.list();
        if (children == null) {
            // File.list() returns null on an I/O error or an unreadable
            // directory; skip it rather than abort the whole index.
            return;
        }
        for (int i = 0; i < children.length; i++) {
            indexDocs(new File(file, children[i]), writer);
        }
        return;
    }

    if (file.isFile()) {
        String fileName = file.getName();
        for (Iterator iter = filesToSkip.iterator(); iter.hasNext();) {
            String fileToSkip = (String) iter.next();
            if (fileName.indexOf(fileToSkip) > -1) {
                return;
            }
        }
    }

    if (!isHtmlDoc(file) || shouldBeExcluded(file)) {
        return;
    }

    try {
        Document doc = getHTMLDocument(file);
        if (doc != null) {
            writer.addDocument(doc);
        }
    } catch (Exception e1) {
        // Best effort: report the document and continue with the next.
        System.out.println(file.getName());
        System.out.println("indexDocs"); //$NON-NLS-1$
        e1.printStackTrace();
    }
}
/**
 * Tells whether the given file must be left out of the search index.
 * <p>
 * A file is excluded when the absolute path of its parent directory is a
 * prefix of the publish directory path, which is the case for files
 * located directly in the publish directory.
 *
 * @param file
 *            The file to be verified.
 * @return <code>true</code> if the given file should be excluded from the
 *         search index.
 */
private boolean shouldBeExcluded(File file) {
    String parentPath = file.getParentFile().getAbsolutePath();
    return pDirectory.startsWith(parentPath);
}
/**
 * Tells whether the path of the given file ends with ".html" or ".htm".
 * The comparison is case sensitive.
 */
private static boolean isHtmlDoc(File file) {
    String filePath = file.getPath();
    if (filePath.endsWith(".html")) { //$NON-NLS-1$
        return true;
    }
    return filePath.endsWith(".htm"); //$NON-NLS-1$
}
/**
 * Tells whether a page with the given meta tags must be kept out of the
 * index because its "uma.type" value is one of the non-searchable UMA
 * element types. A page without that meta tag is general content and is
 * searchable.
 */
private boolean isNoSearchableDocument(Properties metaTags) {
    String umaType = metaTags.getProperty(UMA_ELEMENT_TYPE_FIELD);
    if (umaType == null) {
        return false;
    }
    return NO_SEARCHEABLE_UMA_ELEMENTS.contains(umaType);
}
// Scratch buffer; within this file it is referenced only by the
// commented-out reader-draining loop in getHTMLDocument.
char[] cbuf = new char[1024];
// Counter of skipped files; within this file it is referenced only by a
// commented-out debug statement in getHTMLDocument.
int skipCount = 0;
/**
 * Builds the Lucene document for an HTML file.
 * <p>
 * The file is read as UTF-8 through the HTML parser. The resulting
 * document holds the site-relative URL, the text content, the title, the
 * summary and one field per meta tag. The role field defaults to
 * "NORUPROLE" and the "uma.type" field to general content when the page
 * does not define them.
 *
 * @param file
 *            the HTML file to read.
 * @return the document, or <code>null</code> when the parser provides no
 *         reader, the page is a non-searchable UMA element, the page has
 *         no title, or reading the file fails (the failure is logged).
 */
private Document getHTMLDocument(File file) {
    InputStreamReader source = null;
    BufferedReader textReader = null;
    try {
        source = new InputStreamReader(new FileInputStream(file), "UTF-8"); //$NON-NLS-1$
        LHTMLParser parser = new LHTMLParser(source);
        Reader parsedText = parser.getReader();
        if (parsedText == null) {
            return null;
        }

        // The parsed text is always read to the end, also for pages that
        // turn out to be non-searchable; an earlier note in this method
        // says the parser thread does not end otherwise.
        textReader = new BufferedReader(parsedText);
        StringBuffer content = new StringBuffer();
        for (String line = textReader.readLine(); line != null; line = textReader
                .readLine()) {
            content.append(line).append("\n");
        }

        Properties metaTags = parser.getMetaTags();
        if (isNoSearchableDocument(metaTags)) {
            return null;
        }

        Document doc = new Document();
        String relativePath = file.getPath().substring(
                parentFolder.getPath().length()).replace(
                File.separatorChar, '/');
        doc.add(Field.UnIndexed(URL_FIELD, productName + relativePath));
        doc.add(Field.UnStored(CONTENT_FIELD, content.toString()));

        String title = parser.getTitle();
        if (title == null || title.length() == 0) {
            // Pages without a title are not indexed.
            return null;
        }
        // Workaround a Linux specific issue.
        title = title.replaceAll("\\xa0", " "); //$NON-NLS-1$ //$NON-NLS-2$
        doc.add(Field.Keyword(TITLE_FIELD, title));

        // Strip the title (and the character following it) from a summary
        // that starts with it.
        String summary = parser.getSummary();
        boolean startsWithTitle = summary.startsWith(title)
                && summary.length() > title.length();
        String storedSummary = startsWithTitle ? summary.substring(title
                .length() + 1) : summary;
        doc.add(Field.Keyword(SUMMARY_FIELD, storedSummary));

        // Every meta tag with a value becomes a text field of its own.
        Enumeration tagNames = metaTags.propertyNames();
        while (tagNames.hasMoreElements()) {
            String tagName = (String) tagNames.nextElement();
            if (tagName == null) {
                continue;
            }
            String tagValue = metaTags.getProperty(tagName);
            if (tagValue != null) {
                doc.add(Field.Text(tagName, tagValue));
            }
        }

        if (doc.getField(ROLE_FIELD) == null) {
            // Default role value to support searching for files without
            // role meta tags.
            doc.add(Field.Text(ROLE_FIELD, "NORUPROLE")); //$NON-NLS-1$
        }
        if (doc.getField(UMA_ELEMENT_TYPE_FIELD) == null) {
            // Default to general content.
            doc.add(Field.Text(UMA_ELEMENT_TYPE_FIELD, GENERAL_CONTENT));
        }
        return doc;
    } catch (Exception e) {
        SearchPlugin.getDefault().getLogger().logError(e);
        return null;
    } finally {
        if (textReader != null) {
            try {
                textReader.close();
            } catch (Exception e) {
            }
        }
        if (source != null) {
            try {
                source.close();
            } catch (Exception e) {
            }
        }
    }
}
}