| //------------------------------------------------------------------------------ |
| //Copyright (c) 2005, 2007 IBM Corporation and others. |
| //All rights reserved. This program and the accompanying materials |
| //are made available under the terms of the Eclipse Public License v1.0 |
| //which accompanies this distribution, and is available at |
| //http://www.eclipse.org/legal/epl-v10.html |
| // |
| //Contributors: |
| //IBM Corporation - initial implementation |
| //------------------------------------------------------------------------------ |
| package org.eclipse.epf.search; |
| |
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileWriter; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.util.ArrayList; |
| import java.util.Date; |
| import java.util.Enumeration; |
| import java.util.List; |
| import java.util.Properties; |
| |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FSDirectory; |
| import org.apache.lucene.store.RAMDirectory; |
| import org.eclipse.epf.search.analysis.TextAnalyzer; |
| import org.eclipse.epf.search.utils.LHTMLParser; |
| import org.eclipse.epf.search.utils.JarCreator; |
| import org.eclipse.epf.search.utils.UNCUtil; |
| |
| /** |
| * This class is the main class that creates the Index from the file |
| * associations in the process layout. |
| */ |
| public class IndexBuilder { |
| static final String VERSION_FILE_NAME = "version.txt"; //$NON-NLS-1$ |
| static final String VERSION_DELIMITER = "*"; //$NON-NLS-1$ |
| |
| /** |
| * Document fields. |
| */ |
| public static final String BRIEF_DESCRIPTION_FIELD = "briefDescription"; //$NON-NLS-1$ |
| public static final String CONTENT_FIELD = "contents"; //$NON-NLS-1$ |
| public static final String ID_FIELD = "id"; //$NON-NLS-1$ |
| public static final String MODIFIED_FIELD = "modified"; //$NON-NLS-1$ |
| public static final String NAME_FIELD = "name"; //$NON-NLS-1$ |
| public static final String ROLE_FIELD = "role"; //$NON-NLS-1$ |
| public static final String SUMMARY_FIELD = "summary"; //$NON-NLS-1$ |
| public static final String TYPE_FIELD = "type"; //$NON-NLS-1$ |
| public static final String URL_FIELD = "url"; //$NON-NLS-1$ |
| private static final String TITLE_FIELD = "title"; //$NON-NLS-1$ |
| private static final String UMA_ELEMENT_TYPE_FIELD = "uma.type"; //$NON-NLS-1$ |
| public static final String GENERAL_CONTENT = "general_content"; //$NON-NLS-1$ |
| |
| // List of UMA elements that should be included in the search index. |
| private static List NO_SEARCHEABLE_UMA_ELEMENTS = new ArrayList(); |
| static { |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("summary"); |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("workproductdescriptor"); |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("taskdescriptor"); |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("roledescriptor"); |
| } |
| |
| // A list of top level directories that should be excluded from the search |
| // index. |
| private java.util.List dirsToSkip = new ArrayList(); |
| private String pDirectory = null; |
| private StringBuffer indexFolder = null; |
| private String productName = null; |
| |
| public IndexBuilder(String publishDir) { |
| int appletIndex = -1; |
| if (publishDir == null) |
| return; |
| |
| appletIndex = publishDir.indexOf(File.separator + "applet"); |
| |
| pDirectory = UNCUtil.convertFilename((appletIndex > -1) ? publishDir |
| .substring(0, appletIndex + 1) : publishDir); |
| String siteName = pDirectory.replace(File.separatorChar, '/'); |
| int index = siteName.length(); |
| if (siteName.endsWith("/")) //$NON-NLS-1$ |
| { |
| index = index - 1; |
| } |
| |
| int index2 = siteName.lastIndexOf("/", index - 1); //$NON-NLS-1$ |
| |
| productName = siteName.substring(index2 + 1, index); |
| |
| // create the index |
| StringBuffer searchFolder = new StringBuffer(pDirectory); |
| if (!searchFolder.toString().endsWith(File.separator)) { |
| searchFolder.append(File.separator); |
| } |
| searchFolder.append("search"); //$NON-NLS-2$ |
| |
| indexFolder = new StringBuffer(searchFolder.toString()); |
| indexFolder.append(File.separator).append("index"); //$NON-NLS-1$ |
| |
| dirsToSkip.add(pDirectory + "applet"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "css"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "ext_help"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "icons"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "images"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "index"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "logs"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "manuals"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "noapplet"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "pages_not_installed"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "process"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "scripts"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "stylesheets"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "xml"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "search"); //$NON-NLS-1$ |
| } |
| |
| public boolean createIndex() throws SearchServiceException { |
| synchronized (IndexBuilder.class) { |
| |
| if (indexFolder == null || pDirectory == null) { |
| throw new IllegalStateException( |
| "Invalid indexFolder or pDirectory"); //$NON-NLS-1$ |
| } |
| |
| try { |
| RAMDirectory ramDir = new RAMDirectory(); |
| |
| IndexWriter fsWriter = new IndexWriter(FSDirectory |
| .getDirectory(indexFolder.toString(), true), |
| new TextAnalyzer(), true); |
| |
| IndexWriter ramWriter = new IndexWriter(ramDir, |
| new TextAnalyzer(), true); |
| |
| if ((fsWriter != null)) { |
| // fsWriter.mergeFactor = 1000; |
| // fsWriter.maxMergeDocs = 10000; |
| fsWriter.maxFieldLength = 1000000; |
| |
| indexDocs(new File(pDirectory), ramWriter); |
| |
| fsWriter.addIndexes(new Directory[] { ramDir }); |
| fsWriter.optimize(); |
| ramWriter.close(); |
| fsWriter.close(); |
| } |
| } catch (Exception e) { |
| System.out.println(e.getMessage()); |
| System.out.println("createIndex"); |
| e.printStackTrace(); |
| } |
| |
| System.out.println("index created successfully"); //$NON-NLS-1$ |
| |
| // jar up the created index. |
| JarCreator.jarFolder(indexFolder.toString()); |
| |
| System.out.println("index Jarred successfully"); //$NON-NLS-1$ |
| |
| try { |
| // delete the files now that they've been jarred. |
| File indexDir = new File(indexFolder.toString()); |
| File[] files = indexDir.listFiles(); |
| for (int i = 0; i < files.length; i++) { |
| File tempFile = files[i]; |
| if (!tempFile.getName().equals(JarCreator.INDEX_JAR)) { |
| tempFile.delete(); |
| } |
| } |
| |
| // create the version file. |
| Date today = new Date(); |
| long milliseconds = today.getTime(); |
| |
| // String rupName = publishDir.substring(index); |
| File newIndexJar = new File(indexFolder + File.separator |
| + JarCreator.INDEX_JAR); |
| if (newIndexJar.exists()) { |
| String fileSize = "" + newIndexJar.length(); //$NON-NLS-1$ |
| FileWriter fw = new FileWriter(indexFolder + File.separator |
| + VERSION_FILE_NAME); |
| BufferedWriter bw = new BufferedWriter(fw); |
| bw.write(productName + VERSION_DELIMITER + milliseconds |
| + VERSION_DELIMITER + fileSize + "\n"); //$NON-NLS-1$ |
| bw.close(); |
| fw.close(); |
| } else { |
| throw new SearchServiceException( |
| SearchResources.createSearchIndexError); |
| } |
| } catch (IOException ioe) { |
| throw new SearchServiceException( |
| SearchResources.createSearchIndexError); |
| } |
| |
| return true; |
| } |
| } |
| |
| /** |
| * Index the actual documents specified by the files and recursively get all |
| * file in the specified folder file |
| * |
| */ |
| private void indexDocs(File file, IndexWriter writer) throws Exception { |
| if (dirsToSkip.contains(file.getAbsolutePath())) { |
| return; |
| } |
| if (file.isDirectory()) { |
| String[] files = file.list(); |
| for (int i = 0; i < files.length; i++) { |
| indexDocs(new File(file, files[i]), writer); |
| } |
| } else if (isHtmlDoc(file)) { |
| if (shouldBeExcluded(file)) { |
| return; |
| } |
| try { |
| Document doc = getHTMLDocument(file); |
| |
| if (doc != null) { |
| writer.addDocument(doc); |
| } |
| } catch (Exception e1) { |
| System.out.println(file.getName()); |
| System.out.println("indexDocs"); |
| e1.printStackTrace(); |
| } |
| } |
| } |
| |
| /** |
| * Checks whether the given file should be excluded from the search index. |
| * |
| * @param file |
| * The file to be verified. |
| * @return <code>true</code> if the given file should be excluded from the |
| * search index. |
| */ |
| private boolean shouldBeExcluded(File file) { |
| String path = file.getParentFile().getAbsolutePath(); |
| if (pDirectory.startsWith(path)) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| private static boolean isHtmlDoc(File file) { |
| String path = file.getPath(); |
| return path.endsWith(".html") || path.endsWith(".htm"); //$NON-NLS-2$ //$NON-NLS-3$ |
| } |
| |
| private Document getHTMLDocument(File file) { |
| Document luceneDocument = new Document(); |
| |
| String url = file.getPath().replace(File.pathSeparatorChar, '/'); |
| luceneDocument.add(Field.UnIndexed(URL_FIELD, url)); |
| |
| try { |
| LHTMLParser parser = new LHTMLParser(new InputStreamReader( |
| new FileInputStream(file), "UTF-8")); //$NON-NLS-1$ |
| |
| Reader reader = parser.getReader(); |
| if (reader != null) { |
| luceneDocument.add(Field.Text(CONTENT_FIELD, reader)); |
| } else { |
| return null; |
| } |
| |
| String title = parser.getTitle(); |
| if (title != null && title.length() > 0) { |
| // Workaround a Linux specific issue. |
| title = title.replaceAll("\\xa0", " "); //$NON-NLS-1$ //$NON-NLS-2$ |
| luceneDocument.add(Field.Text(TITLE_FIELD, title)); |
| } else { |
| return null; |
| } |
| |
| String summary = parser.getSummary(); |
| if (summary.startsWith(title)) { |
| luceneDocument.add(Field.UnIndexed(SUMMARY_FIELD, summary |
| .substring(title.length() + 1))); |
| } else |
| luceneDocument.add(Field.UnIndexed(SUMMARY_FIELD, parser |
| .getSummary())); |
| |
| Properties metaTags = parser.getMetaTags(); |
| for (Enumeration names = metaTags.propertyNames(); names |
| .hasMoreElements();) { |
| String tagName = (String) names.nextElement(); |
| if (tagName != null) { |
| if (tagName.equals(ROLE_FIELD)) { |
| String roleName = metaTags.getProperty(tagName); |
| if (roleName != null) { |
| luceneDocument.add(Field.Text(tagName, roleName)); |
| } |
| } else { |
| String tagValue = metaTags.getProperty(tagName); |
| if (tagValue != null) { |
| luceneDocument.add(Field.Text(tagName, tagValue)); |
| } |
| } |
| } |
| } |
| |
| if (luceneDocument.getField(ROLE_FIELD) == null) { |
| // Default to "na" to support searching for files without |
| // role meta tags. |
| luceneDocument.add(Field.Text(ROLE_FIELD, "NORUPROLE")); //$NON-NLS-1$ |
| } |
| |
| Field umaTypeField = luceneDocument |
| .getField(UMA_ELEMENT_TYPE_FIELD); |
| if (umaTypeField == null) { |
| // Default to general content. |
| luceneDocument.add(Field.Text(UMA_ELEMENT_TYPE_FIELD, |
| GENERAL_CONTENT)); |
| } else if (NO_SEARCHEABLE_UMA_ELEMENTS.contains(umaTypeField |
| .stringValue())) { |
| // Exclude non-searcheable elements from the search |
| // index. |
| return null; |
| } |
| |
| parser = null; |
| |
| } catch (Exception e) { |
| luceneDocument = null; |
| System.out.println("getHTMLDocument"); |
| e.printStackTrace(); |
| } |
| |
| return luceneDocument; |
| } |
| |
| public static void main(String[] args) { |
| // TODO Auto-generated method stub |
| |
| } |
| } |