| //------------------------------------------------------------------------------ |
| // Copyright (c) 2005, 2007 IBM Corporation and others. |
| // All rights reserved. This program and the accompanying materials |
| // are made available under the terms of the Eclipse Public License v1.0 |
| // which accompanies this distribution, and is available at |
| // http://www.eclipse.org/legal/epl-v10.html |
| // |
| // Contributors: |
| // IBM Corporation - initial implementation |
| //------------------------------------------------------------------------------ |
| package org.eclipse.epf.search; |
| |
| import java.io.BufferedReader; |
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileWriter; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.io.Reader; |
| import java.util.ArrayList; |
| import java.util.Date; |
| import java.util.Enumeration; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Properties; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.cjk.CJKAnalyzer; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.index.IndexWriter; |
| import org.apache.lucene.store.FSDirectory; |
| import org.eclipse.epf.search.analysis.TextAnalyzer; |
| import org.eclipse.epf.search.utils.JarCreator; |
| import org.eclipse.epf.search.utils.LHTMLParser; |
| import org.eclipse.epf.search.utils.UNCUtil; |
| |
| /** |
| * This class is the main class that creates the Index from the file |
| * associations in the process layout. |
| */ |
| public class IndexBuilder { |
| |
| static final String VERSION_FILE_NAME = "version.txt"; //$NON-NLS-1$ |
| static final String VERSION_DELIMITER = "*"; //$NON-NLS-1$ |
| |
| /** |
| * Document fields. |
| */ |
| public static final String BRIEF_DESCRIPTION_FIELD = "briefDescription"; //$NON-NLS-1$ |
| public static final String CONTENT_FIELD = "contents"; //$NON-NLS-1$ |
| public static final String ID_FIELD = "id"; //$NON-NLS-1$ |
| public static final String MODIFIED_FIELD = "modified"; //$NON-NLS-1$ |
| public static final String NAME_FIELD = "name"; //$NON-NLS-1$ |
| public static final String ROLE_FIELD = "role"; //$NON-NLS-1$ |
| public static final String SUMMARY_FIELD = "summary"; //$NON-NLS-1$ |
| public static final String TYPE_FIELD = "type"; //$NON-NLS-1$ |
| public static final String URL_FIELD = "url"; //$NON-NLS-1$ |
| public static final String TITLE_FIELD = "title"; //$NON-NLS-1$ |
| public static final String UMA_ELEMENT_TYPE_FIELD = "uma.type"; //$NON-NLS-1$ |
| public static final String GENERAL_CONTENT = "general_content"; //$NON-NLS-1$ |
| |
| // List of UMA elements that should be included in the search index. |
| private static List NO_SEARCHEABLE_UMA_ELEMENTS = new ArrayList(); |
| static { |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("summary"); //$NON-NLS-1$ |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("workproductdescriptor"); //$NON-NLS-1$ |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("taskdescriptor"); //$NON-NLS-1$ |
| NO_SEARCHEABLE_UMA_ELEMENTS.add("roledescriptor"); //$NON-NLS-1$ |
| } |
| |
| // A list of top level directories that should be excluded from the search |
| // index. |
| public static List dirsToSkip = new ArrayList(); |
| public static String pDirectory = null; |
| private StringBuffer indexFolder = null; |
| private String productName = null; |
| private List filesToSkip = new ArrayList(); |
| private File parentFolder = null; |
| |
| public IndexBuilder(String publishDir) { |
| int appletIndex = -1; |
| if (publishDir == null) |
| return; |
| |
| appletIndex = publishDir.indexOf(File.separator + "applet"); //$NON-NLS-1$ |
| |
| pDirectory = UNCUtil.convertFilename((appletIndex > -1) ? publishDir |
| .substring(0, appletIndex + 1) : publishDir); |
| String siteName = pDirectory.replace(File.separatorChar, '/'); |
| parentFolder = new File(pDirectory); |
| int index = siteName.length(); |
| if (siteName.endsWith("/")) { //$NON-NLS-1$ |
| index = index - 1; |
| } |
| |
| int index2 = siteName.lastIndexOf("/", index - 1); //$NON-NLS-1$ |
| |
| productName = siteName.substring(index2 + 1, index); |
| |
| // create the index |
| StringBuffer searchFolder = new StringBuffer(pDirectory); |
| if (!searchFolder.toString().endsWith(File.separator)) { |
| searchFolder.append(File.separator); |
| } |
| searchFolder.append("search"); //$NON-NLS-1$ |
| |
| indexFolder = new StringBuffer(searchFolder.toString()); |
| indexFolder.append(File.separator).append("index"); //$NON-NLS-1$ |
| |
| dirsToSkip.add(pDirectory + "applet"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "css"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "ext_help"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "icons"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "images"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "index"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "logs"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "manuals"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "noapplet"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "pages_not_installed"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "process"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "scripts"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "stylesheets"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "xml"); //$NON-NLS-1$ |
| dirsToSkip.add(pDirectory + "search"); //$NON-NLS-1$ |
| |
| filesToSkip.add("_desc.htm"); //$NON-NLS-1$ |
| filesToSkip.add("_wbs.htm"); //$NON-NLS-1$ |
| filesToSkip.add("_tbs.htm"); //$NON-NLS-1$ |
| filesToSkip.add("_wpbs.htm"); //$NON-NLS-1$ |
| } |
| |
| public boolean createIndex(boolean jarIt) throws SearchServiceException { |
| synchronized (IndexBuilder.class) { |
| |
| if (indexFolder == null || pDirectory == null) { |
| throw new IllegalStateException( |
| "Invalid indexFolder or pDirectory"); //$NON-NLS-1$ |
| } |
| |
| |
| boolean jako = false; |
| Locale locale = Locale.getDefault(); |
| String lang = locale.getLanguage(); |
| if (lang.equals(Locale.JAPANESE.getLanguage()) || |
| lang.equals(Locale.KOREA.getLanguage())) { |
| jako = true; |
| } |
| Analyzer analyzer = jako ? new CJKAnalyzer() : new TextAnalyzer(); |
| |
| try { |
| // RAMDirectory ramDir = new RAMDirectory(); |
| IndexWriter fsWriter = new IndexWriter(FSDirectory |
| .getDirectory(indexFolder.toString(), true), |
| analyzer, true); |
| |
| // IndexWriter ramWriter = new IndexWriter(ramDir, |
| // new TextAnalyzer(), true); |
| |
| if ((fsWriter != null)) { |
| // fsWriter.mergeFactor = 1000; |
| // fsWriter.maxMergeDocs = 10000; |
| fsWriter.setMaxFieldLength(1000000); |
| |
| indexDocs(new File(pDirectory), fsWriter); |
| |
| // fsWriter.addIndexes(new Directory[] { ramDir }); |
| fsWriter.optimize(); |
| // ramWriter.close(); |
| fsWriter.close(); |
| } |
| } catch (Exception e) { |
| e.printStackTrace(); |
| } |
| |
| // create the version file. |
| Date today = new Date(); |
| long milliseconds = today.getTime(); |
| |
| if (!jarIt) { |
| try { |
| FileWriter fw = new FileWriter(indexFolder + File.separator |
| + VERSION_FILE_NAME); |
| BufferedWriter bw = new BufferedWriter(fw); |
| bw.write(productName + VERSION_DELIMITER + milliseconds |
| + "\n"); //$NON-NLS-1$ |
| if (analyzer instanceof CJKAnalyzer) { |
| bw.write("CJKAnalyzer" + "\n"); //$NON-NLS-1$ //$NON-NLS-2$ |
| } |
| bw.close(); |
| fw.close(); |
| } catch (IOException ioe) { |
| throw new SearchServiceException( |
| SearchResources.createSearchIndexError); |
| } |
| |
| return true; |
| } |
| |
| // jar up the created index. |
| JarCreator.jarFolder(indexFolder.toString()); |
| |
| System.out.println("index Jarred successfully"); //$NON-NLS-1$ |
| |
| try { |
| // delete the files now that they've been jarred. |
| File indexDir = new File(indexFolder.toString()); |
| File[] files = indexDir.listFiles(); |
| for (int i = 0; i < files.length; i++) { |
| File tempFile = files[i]; |
| if (!tempFile.getName().equals(JarCreator.INDEX_JAR)) { |
| tempFile.delete(); |
| } |
| } |
| |
| // String rupName = publishDir.substring(index); |
| File newIndexJar = new File(indexFolder + File.separator |
| + JarCreator.INDEX_JAR); |
| if (newIndexJar.exists()) { |
| String fileSize = "" + newIndexJar.length(); //$NON-NLS-1$ |
| FileWriter fw = new FileWriter(indexFolder + File.separator |
| + VERSION_FILE_NAME); |
| BufferedWriter bw = new BufferedWriter(fw); |
| bw.write(productName + VERSION_DELIMITER + milliseconds |
| + VERSION_DELIMITER + fileSize + "\n"); //$NON-NLS-1$ |
| if (analyzer instanceof CJKAnalyzer) { |
| bw.write("CJKAnalyzer" + "\n"); //$NON-NLS-1$ //$NON-NLS-2$ |
| } |
| bw.close(); |
| fw.close(); |
| } else { |
| throw new SearchServiceException( |
| SearchResources.createSearchIndexError); |
| } |
| } catch (IOException ioe) { |
| throw new SearchServiceException( |
| SearchResources.createSearchIndexError); |
| } |
| |
| return true; |
| } |
| } |
| |
| /** |
| * Index the actual documents specified by the files and recursively get all |
| * file in the specified folder file |
| * |
| */ |
| private void indexDocs(File file, IndexWriter writer) throws Exception { |
| if (dirsToSkip.contains(file.getAbsolutePath())) { |
| return; |
| } |
| if (file.isFile()) { |
| for (Iterator iter = filesToSkip.iterator(); iter.hasNext();) { |
| String fileToSkip = (String) iter.next(); |
| if (file.getName().indexOf(fileToSkip) > -1) { |
| return; |
| } |
| } |
| } |
| |
| if (file.isDirectory()) { |
| String[] files = file.list(); |
| for (int i = 0; i < files.length; i++) { |
| indexDocs(new File(file, files[i]), writer); |
| } |
| } else if (isHtmlDoc(file)) { |
| if (shouldBeExcluded(file)) { |
| return; |
| } |
| try { |
| |
| Document doc = getHTMLDocument(file); |
| |
| if (doc != null) { |
| writer.addDocument(doc); |
| } |
| |
| } catch (Exception e1) { |
| System.out.println(file.getName()); |
| System.out.println("indexDocs"); //$NON-NLS-1$ |
| e1.printStackTrace(); |
| } |
| } |
| } |
| |
| /** |
| * Checks whether the given file should be excluded from the search index. |
| * |
| * @param file |
| * The file to be verified. |
| * @return <code>true</code> if the given file should be excluded from the |
| * search index. |
| */ |
| private boolean shouldBeExcluded(File file) { |
| String path = file.getParentFile().getAbsolutePath(); |
| if (pDirectory.startsWith(path)) { |
| return true; |
| } |
| |
| return false; |
| } |
| |
| private static boolean isHtmlDoc(File file) { |
| String path = file.getPath(); |
| return path.endsWith(".html") || path.endsWith(".htm"); //$NON-NLS-1$ //$NON-NLS-2$ |
| } |
| |
| private boolean isNoSearchableDocument(Properties metaTags) { |
| String value = metaTags.getProperty(UMA_ELEMENT_TYPE_FIELD); |
| |
| // value == null is treated as general document |
| return (value != null) && NO_SEARCHEABLE_UMA_ELEMENTS.contains(value); |
| } |
| |
| char[] cbuf = new char[1024]; |
| int skipCount = 0; |
| |
| private Document getHTMLDocument(File file) { |
| Document luceneDocument = null; |
| InputStreamReader input = null; |
| Reader reader = null; |
| BufferedReader bufferedReader = null; |
| try { |
| |
| input = new InputStreamReader(new FileInputStream(file), "UTF-8"); //$NON-NLS-1$ |
| |
| LHTMLParser parser = new LHTMLParser(input); |
| |
| reader = parser.getReader(); |
| if ( reader == null ) { |
| return null; |
| } |
| |
| StringBuffer htmlContent = new StringBuffer(""); |
| String line = ""; |
| bufferedReader = new BufferedReader(reader); |
| while((line = bufferedReader.readLine()) != null) |
| { |
| htmlContent.append(line + "\n"); |
| } |
| |
| Properties metaTags = parser.getMetaTags(); |
| if ( isNoSearchableDocument(metaTags) ) { |
| |
| // the LHTMLParser thread will not end if the reader is not processed |
| // causing major resource leak |
| // while ( reader.read(cbuf) > 0 ) { |
| // ; |
| // } |
| //System.out.println( ++skipCount + " file skipped: " + file.getAbsolutePath()); |
| parser = null; |
| return null; |
| } |
| |
| luceneDocument = new Document(); |
| |
| String url = productName |
| + file.getPath().substring(parentFolder.getPath().length()) |
| .replace(File.separatorChar, '/'); //$NON-NLS-1$ |
| luceneDocument.add(Field.UnIndexed(URL_FIELD, url)); |
| |
| // luceneDocument.add(Field.Text(CONTENT_FIELD, reader)); |
| luceneDocument.add(Field.UnStored(CONTENT_FIELD, htmlContent.toString())); |
| |
| String title = parser.getTitle(); |
| if (title != null && title.length() > 0) { |
| // Workaround a Linux specific issue. |
| title = title.replaceAll("\\xa0", " "); //$NON-NLS-1$ //$NON-NLS-2$ |
| luceneDocument.add(Field.Keyword(TITLE_FIELD, title)); |
| } else { |
| return null; |
| } |
| |
| String summary = parser.getSummary(); |
| if (summary.startsWith(title) && summary.length() > title.length()) { |
| luceneDocument.add(Field.Keyword(SUMMARY_FIELD, summary |
| .substring(title.length() + 1))); |
| } else |
| luceneDocument.add(Field.Keyword(SUMMARY_FIELD, parser |
| .getSummary())); |
| |
| for (Enumeration names = metaTags.propertyNames(); names |
| .hasMoreElements();) { |
| String tagName = (String) names.nextElement(); |
| if (tagName != null) { |
| if (tagName.equals(ROLE_FIELD)) { |
| String roleName = metaTags.getProperty(tagName); |
| if (roleName != null) { |
| luceneDocument.add(Field.Text(tagName, roleName)); |
| } |
| } else { |
| String tagValue = metaTags.getProperty(tagName); |
| if (tagValue != null) { |
| luceneDocument.add(Field.Text(tagName, tagValue)); |
| } |
| } |
| } |
| } |
| |
| if (luceneDocument.getField(ROLE_FIELD) == null) { |
| // Default to "na" to support searching for files without |
| // role meta tags. |
| luceneDocument.add(Field.Text(ROLE_FIELD, "NORUPROLE")); //$NON-NLS-1$ |
| } |
| |
| Field umaTypeField = luceneDocument |
| .getField(UMA_ELEMENT_TYPE_FIELD); |
| if (umaTypeField == null) { |
| // Default to general content. |
| luceneDocument.add(Field.Text(UMA_ELEMENT_TYPE_FIELD, |
| GENERAL_CONTENT)); |
| } |
| |
| parser = null; |
| } catch (Exception e) { |
| luceneDocument = null; |
| SearchPlugin.getDefault().getLogger().logError(e); |
| } finally { |
| if (bufferedReader != null) { |
| try { |
| bufferedReader.close(); |
| } catch (Exception e) { |
| } |
| } |
| if (input != null) { |
| try { |
| input.close(); |
| } catch (Exception e) { |
| } |
| } |
| } |
| |
| return luceneDocument; |
| } |
| |
| } |