core/org.eclipse.smila.tika.test/code/src/org/eclipse/smila/tika/test/TestTikaPipelet.java - gerrit/smila/org.eclipse.smila.core - Git at Google

 /*******************************************************************************
  * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
  * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
  * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
  *******************************************************************************/
 package org.eclipse.smila.tika.test;

 import org.eclipse.smila.datamodel.AnyMap;
 import org.eclipse.smila.datamodel.DataFactory;
 import org.eclipse.smila.tika.TikaPipelet;

 /**
  * Tests {@link TikaPipelet} conversions on a set of sample documents (OpenDocument, MS Office, RTF, XML, RSS).
  *
  * <p>
  * Every test runs the same four extraction variants for one sample file: text extraction with an explicit content
  * type, text extraction with the file name attribute, text extraction without extra parameters, and HTML export. All
  * result checks expect the converted content to contain the string "SMILA".
  */
 public class TestTikaPipelet extends ConverterPipelineTestBase {

   /**
    * Extract HTML or text content from the given file without additional record parameters.
    *
    * @param fileName
    *          name of the sample file to convert
    * @return the converted content as returned by the inherited {@code executeTest(String, AnyMap)}
    * @throws Exception
    *           propagated from the inherited {@code executeTest(String, AnyMap)}
    */
   protected String executeTest(final String fileName) throws Exception {
     final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
     return executeTest(fileName, additionalRecordParams);
   }

   /**
    * Test HTML extraction: sets {@link TikaPipelet#PROP_EXPORT_AS_HTML} and checks the result with
    * {@link #checkHtmlResult(String)}.
    *
    * @param fileName
    *          name of the sample file to convert
    * @throws Exception
    *           propagated from the conversion
    */
   protected void doHtmlExtraction(final String fileName) throws Exception {
     final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
     additionalParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, true);
     checkHtmlResult(executeTest(fileName, additionalParams));
   }

   /**
    * Test text extraction with the content type provided as an additional parameter.
    *
    * @param fileName
    *          name of the sample file to convert
    * @param contentType
    *          MIME type stored under {@code CONTENT_TYPE_PARAM}
    * @throws Exception
    *           propagated from the conversion
    */
   protected void doTextExtractionWithContentType(final String fileName, final String contentType) throws Exception {
     final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
     additionalParams.put(TikaPipelet.PROP_ATTACHMENT_CONTENT_TYPE_ATTRIBUTE, CONTENT_TYPE_ATTRIBUTE);
     additionalParams.put(CONTENT_TYPE_PARAM, contentType);
     checkTextResult(executeTest(fileName, additionalParams));
   }

   /**
    * Test text extraction with the file name attribute configured.
    *
    * @param fileName
    *          name of the sample file to convert
    * @throws Exception
    *           propagated from the conversion
    */
   protected void doTextExtractionWithFileName(final String fileName) throws Exception {
     final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
     additionalParams.put(TikaPipelet.PROP_FILE_NAME_ATTRIBUTE, FILENAME_ATTRIBUTE);
     checkTextResult(executeTest(fileName, additionalParams));
   }

   /**
    * Test simple text extraction without parameters.
    *
    * @param fileName
    *          name of the sample file to convert
    * @throws Exception
    *           propagated from the conversion
    */
   protected void doTextExtraction(final String fileName) throws Exception {
     checkTextResult(executeTest(fileName));
   }

   /**
    * Assert that a plain-text conversion result contains "SMILA" and no {@code "<"} character.
    *
    * @param result
    *          converted content to check
    */
   protected void checkTextResult(final String result) {
     // Fail with a readable assertion instead of a NullPointerException when no content was produced.
     assertTrue("Converted content is null", result != null);
     assertTrue("SMILA not contained in text content of converted content: " + result, result.contains("SMILA"));
     assertFalse("Unexpected HTML tags in converted content: " + result, result.contains("<"));
   }

   /**
    * Assert that an HTML conversion result contains "SMILA" and a {@code <body>} tag.
    *
    * @param result
    *          converted content to check
    */
   protected void checkHtmlResult(final String result) {
     // Fail with a readable assertion instead of a NullPointerException when no content was produced.
     assertTrue("Converted content is null", result != null);
     assertTrue("SMILA not contained in content of converted content: " + result, result.contains("SMILA"));
     // The check looks for <body>, so the failure message names <body> as well (it used to say <html>).
     assertTrue("<body> not contained in content of converted content: " + result, result.contains("<body>"));
   }

   /**
    * Run all four extraction variants for one sample file, in the order the individual tests always used: content
    * type, file name, plain text, HTML.
    *
    * @param fileName
    *          name of the sample file to convert
    * @param contentType
    *          MIME type passed to {@link #doTextExtractionWithContentType(String, String)}
    * @throws Exception
    *           propagated from the conversion
    */
   private void doAllExtractions(final String fileName, final String contentType) throws Exception {
     doTextExtractionWithContentType(fileName, contentType);
     doTextExtractionWithFileName(fileName);
     doTextExtraction(fileName);
     doHtmlExtraction(fileName);
   }

   /** Test OpenOffice 2.4 presentation (odp). */
   public void testOPENOFFICE24ODP() throws Exception {
     doAllExtractions("OpenOffice.2.4.odp", "application/vnd.oasis.opendocument.presentation");
   }

   /** Test OpenOffice 2.4 spreadsheet (ods). */
   public void testOPENOFFICE24ODS() throws Exception {
     doAllExtractions("OpenOffice.2.4.ods", "application/vnd.oasis.opendocument.spreadsheet");
   }

   /** Test OpenOffice 2.4 text document (odt). */
   public void testOPENOFFICE24ODT() throws Exception {
     doAllExtractions("OpenOffice.2.4.odt", "application/vnd.oasis.opendocument.text");
   }

   /** Test OpenOffice 3.2 presentation (odp). */
   public void testOPENOFFICE32ODP() throws Exception {
     doAllExtractions("OpenOffice.3.2.odp", "application/vnd.oasis.opendocument.presentation");
   }

   /** Test OpenOffice 3.2 spreadsheet (ods). */
   public void testOPENOFFICE32ODS() throws Exception {
     doAllExtractions("OpenOffice.3.2.ods", "application/vnd.oasis.opendocument.spreadsheet");
   }

   /** Test OpenOffice 3.2 text document (odt). */
   public void testOPENOFFICE32ODT() throws Exception {
     doAllExtractions("OpenOffice.3.2.odt", "application/vnd.oasis.opendocument.text");
   }

   /** Test MS Office 97-2003 doc. */
   public void testMSOFFICE2003DOC() throws Exception {
     doAllExtractions("MSWORD_97_2003.doc", "application/msword");
   }

   /** Test MS Office 2010 pptx. */
   public void testMSOFFICE2010PPTX() throws Exception {
     doAllExtractions("SMILA_PPTX_2010.pptx",
       "application/vnd.openxmlformats-officedocument.presentationml.presentation");
   }

   /** Test a more complex MS Office 2010 pptx. */
   public void testMSOFFICE2010PPTX_needs_jempbox() throws Exception {
     doAllExtractions("SMILA_PPTX_2010_needs-jempbox.pptx",
       "application/vnd.openxmlformats-officedocument.presentationml.presentation");
   }

   /** Test MS Office 2010 xlsx. */
   public void testMSOFFICE2010XLSX() throws Exception {
     doAllExtractions("SMILA_XLS_2010.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
   }

   /** Test MS Office 2007 docx. */
   public void testMSOFFICE2007DOCX() throws Exception {
     doAllExtractions("MSWORD_2007.docx",
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
   }

   /** Test MS Office 2010 docx. */
   public void testMSOFFICE2010DOCX() throws Exception {
     doAllExtractions("SMILA_DOCX_2010.docx",
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
   }

   /** Test MS Office 2010 docx stored with a wrong file extension (.ppt). */
   public void testMSOFFICE2010DOCXWithWrongExtension() throws Exception {
     doAllExtractions("SMILA_DOCX_2010.ppt",
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
   }

   /** Test MS Office 2010 docx stored without a file extension. */
   public void testMSOFFICE2010DOCXWithoutExtension() throws Exception {
     doAllExtractions("SMILA_DOCX_2010",
       "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
   }

   /** Test rtf. */
   public void testRTF() throws Exception {
     doAllExtractions("test.rtf", "application/rtf");
   }

   /** Test xml. */
   public void testXML() throws Exception {
     doAllExtractions("test.xml", "text/xml");
   }

   /** Test MS Office 97-2003 xls. */
   public void testMSOFFICE2003XLS() throws Exception {
     doAllExtractions("MSEXCEL_97_2003.xls", "application/vnd.ms-excel");
   }

   /** Test MS Office 97-2003 ppt. */
   public void testMSOFFICE2003PPT() throws Exception {
     doAllExtractions("MSPPT_97_2000_XP.ppt", "application/vnd.ms-powerpoint");
   }

   /** Test RSS feed. */
   public void testRssFeed() throws Exception {
     doAllExtractions("test-feed.rss", "text/xml");
   }
 }
	/*******************************************************************************
	* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
	* program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
	* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
	*******************************************************************************/
	package org.eclipse.smila.tika.test;

	import org.eclipse.smila.datamodel.AnyMap;
	import org.eclipse.smila.datamodel.DataFactory;
	import org.eclipse.smila.tika.TikaPipelet;

	/**
	 * Tests {@link TikaPipelet} conversions on a set of sample documents (OpenDocument, MS Office, RTF, XML, RSS).
	 *
	 * <p>
	 * Every test runs the same four extraction variants for one sample file: text extraction with an explicit content
	 * type, text extraction with the file name attribute, text extraction without extra parameters, and HTML export. All
	 * result checks expect the converted content to contain the string "SMILA".
	 */
	public class TestTikaPipelet extends ConverterPipelineTestBase {

	  /**
	   * Extract HTML or text content from the given file without additional record parameters.
	   *
	   * @param fileName
	   *          name of the sample file to convert
	   * @return the converted content as returned by the inherited {@code executeTest(String, AnyMap)}
	   * @throws Exception
	   *           propagated from the inherited {@code executeTest(String, AnyMap)}
	   */
	  protected String executeTest(final String fileName) throws Exception {
	    final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
	    return executeTest(fileName, additionalRecordParams);
	  }

	  /**
	   * Test HTML extraction: sets {@link TikaPipelet#PROP_EXPORT_AS_HTML} and checks the result with
	   * {@link #checkHtmlResult(String)}.
	   *
	   * @param fileName
	   *          name of the sample file to convert
	   * @throws Exception
	   *           propagated from the conversion
	   */
	  protected void doHtmlExtraction(final String fileName) throws Exception {
	    final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
	    additionalParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, true);
	    checkHtmlResult(executeTest(fileName, additionalParams));
	  }

	  /**
	   * Test text extraction with the content type provided as an additional parameter.
	   *
	   * @param fileName
	   *          name of the sample file to convert
	   * @param contentType
	   *          MIME type stored under {@code CONTENT_TYPE_PARAM}
	   * @throws Exception
	   *           propagated from the conversion
	   */
	  protected void doTextExtractionWithContentType(final String fileName, final String contentType) throws Exception {
	    final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
	    additionalParams.put(TikaPipelet.PROP_ATTACHMENT_CONTENT_TYPE_ATTRIBUTE, CONTENT_TYPE_ATTRIBUTE);
	    additionalParams.put(CONTENT_TYPE_PARAM, contentType);
	    checkTextResult(executeTest(fileName, additionalParams));
	  }

	  /**
	   * Test text extraction with the file name attribute configured.
	   *
	   * @param fileName
	   *          name of the sample file to convert
	   * @throws Exception
	   *           propagated from the conversion
	   */
	  protected void doTextExtractionWithFileName(final String fileName) throws Exception {
	    final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
	    additionalParams.put(TikaPipelet.PROP_FILE_NAME_ATTRIBUTE, FILENAME_ATTRIBUTE);
	    checkTextResult(executeTest(fileName, additionalParams));
	  }

	  /**
	   * Test simple text extraction without parameters.
	   *
	   * @param fileName
	   *          name of the sample file to convert
	   * @throws Exception
	   *           propagated from the conversion
	   */
	  protected void doTextExtraction(final String fileName) throws Exception {
	    checkTextResult(executeTest(fileName));
	  }

	  /**
	   * Assert that a plain-text conversion result contains "SMILA" and no {@code "<"} character.
	   *
	   * @param result
	   *          converted content to check
	   */
	  protected void checkTextResult(final String result) {
	    // Fail with a readable assertion instead of a NullPointerException when no content was produced.
	    assertTrue("Converted content is null", result != null);
	    assertTrue("SMILA not contained in text content of converted content: " + result, result.contains("SMILA"));
	    assertFalse("Unexpected HTML tags in converted content: " + result, result.contains("<"));
	  }

	  /**
	   * Assert that an HTML conversion result contains "SMILA" and a {@code <body>} tag.
	   *
	   * @param result
	   *          converted content to check
	   */
	  protected void checkHtmlResult(final String result) {
	    // Fail with a readable assertion instead of a NullPointerException when no content was produced.
	    assertTrue("Converted content is null", result != null);
	    assertTrue("SMILA not contained in content of converted content: " + result, result.contains("SMILA"));
	    // The check looks for <body>, so the failure message names <body> as well (it used to say <html>).
	    assertTrue("<body> not contained in content of converted content: " + result, result.contains("<body>"));
	  }

	  /**
	   * Run all four extraction variants for one sample file, in the order the individual tests always used: content
	   * type, file name, plain text, HTML.
	   *
	   * @param fileName
	   *          name of the sample file to convert
	   * @param contentType
	   *          MIME type passed to {@link #doTextExtractionWithContentType(String, String)}
	   * @throws Exception
	   *           propagated from the conversion
	   */
	  private void doAllExtractions(final String fileName, final String contentType) throws Exception {
	    doTextExtractionWithContentType(fileName, contentType);
	    doTextExtractionWithFileName(fileName);
	    doTextExtraction(fileName);
	    doHtmlExtraction(fileName);
	  }

	  /** Test OpenOffice 2.4 presentation (odp). */
	  public void testOPENOFFICE24ODP() throws Exception {
	    doAllExtractions("OpenOffice.2.4.odp", "application/vnd.oasis.opendocument.presentation");
	  }

	  /** Test OpenOffice 2.4 spreadsheet (ods). */
	  public void testOPENOFFICE24ODS() throws Exception {
	    doAllExtractions("OpenOffice.2.4.ods", "application/vnd.oasis.opendocument.spreadsheet");
	  }

	  /** Test OpenOffice 2.4 text document (odt). */
	  public void testOPENOFFICE24ODT() throws Exception {
	    doAllExtractions("OpenOffice.2.4.odt", "application/vnd.oasis.opendocument.text");
	  }

	  /** Test OpenOffice 3.2 presentation (odp). */
	  public void testOPENOFFICE32ODP() throws Exception {
	    doAllExtractions("OpenOffice.3.2.odp", "application/vnd.oasis.opendocument.presentation");
	  }

	  /** Test OpenOffice 3.2 spreadsheet (ods). */
	  public void testOPENOFFICE32ODS() throws Exception {
	    doAllExtractions("OpenOffice.3.2.ods", "application/vnd.oasis.opendocument.spreadsheet");
	  }

	  /** Test OpenOffice 3.2 text document (odt). */
	  public void testOPENOFFICE32ODT() throws Exception {
	    doAllExtractions("OpenOffice.3.2.odt", "application/vnd.oasis.opendocument.text");
	  }

	  /** Test MS Office 97-2003 doc. */
	  public void testMSOFFICE2003DOC() throws Exception {
	    doAllExtractions("MSWORD_97_2003.doc", "application/msword");
	  }

	  /** Test MS Office 2010 pptx. */
	  public void testMSOFFICE2010PPTX() throws Exception {
	    doAllExtractions("SMILA_PPTX_2010.pptx",
	      "application/vnd.openxmlformats-officedocument.presentationml.presentation");
	  }

	  /** Test a more complex MS Office 2010 pptx. */
	  public void testMSOFFICE2010PPTX_needs_jempbox() throws Exception {
	    doAllExtractions("SMILA_PPTX_2010_needs-jempbox.pptx",
	      "application/vnd.openxmlformats-officedocument.presentationml.presentation");
	  }

	  /** Test MS Office 2010 xlsx. */
	  public void testMSOFFICE2010XLSX() throws Exception {
	    doAllExtractions("SMILA_XLS_2010.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
	  }

	  /** Test MS Office 2007 docx. */
	  public void testMSOFFICE2007DOCX() throws Exception {
	    doAllExtractions("MSWORD_2007.docx",
	      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
	  }

	  /** Test MS Office 2010 docx. */
	  public void testMSOFFICE2010DOCX() throws Exception {
	    doAllExtractions("SMILA_DOCX_2010.docx",
	      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
	  }

	  /** Test MS Office 2010 docx stored with a wrong file extension (.ppt). */
	  public void testMSOFFICE2010DOCXWithWrongExtension() throws Exception {
	    doAllExtractions("SMILA_DOCX_2010.ppt",
	      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
	  }

	  /** Test MS Office 2010 docx stored without a file extension. */
	  public void testMSOFFICE2010DOCXWithoutExtension() throws Exception {
	    doAllExtractions("SMILA_DOCX_2010",
	      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
	  }

	  /** Test rtf. */
	  public void testRTF() throws Exception {
	    doAllExtractions("test.rtf", "application/rtf");
	  }

	  /** Test xml. */
	  public void testXML() throws Exception {
	    doAllExtractions("test.xml", "text/xml");
	  }

	  /** Test MS Office 97-2003 xls. */
	  public void testMSOFFICE2003XLS() throws Exception {
	    doAllExtractions("MSEXCEL_97_2003.xls", "application/vnd.ms-excel");
	  }

	  /** Test MS Office 97-2003 ppt. */
	  public void testMSOFFICE2003PPT() throws Exception {
	    doAllExtractions("MSPPT_97_2000_XP.ppt", "application/vnd.ms-powerpoint");
	  }

	  /** Test RSS feed. */
	  public void testRssFeed() throws Exception {
	    doAllExtractions("test-feed.rss", "text/xml");
	  }
	}