blob: a8f8e1ac489338cf3772cb20bc4f1f54ce94b1fc [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
* program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.tika.test;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.tika.TikaPipelet;
/**
 * Tests for {@link TikaPipelet}: every test method converts one sample document and checks the converted content.
 * <p>
 * Each format test runs the same four extraction variants (content type given, file name given, no hints, HTML
 * export) and expects the marker text "SMILA" in every result. The two-argument {@code executeTest} and the
 * {@code CONTENT_TYPE_ATTRIBUTE}, {@code CONTENT_TYPE_PARAM} and {@code FILENAME_ATTRIBUTE} names are not declared in
 * this class; presumably they are inherited from {@link ConverterPipelineTestBase}.
 */
public class TestTikaPipelet extends ConverterPipelineTestBase {

  /**
   * extract HTML or text content from given file, without any additional parameters.
   *
   * @param fileName
   *          name of the sample file to convert
   * @return the converted content as returned by the two-argument {@code executeTest}
   */
  protected String executeTest(final String fileName) throws Exception {
    final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
    return executeTest(fileName, additionalRecordParams);
  }

  /**
   * test html extraction: sets {@link TikaPipelet#PROP_EXPORT_AS_HTML} and checks the result as HTML.
   *
   * @param fileName
   *          name of the sample file to convert
   */
  protected void doHtmlExtraction(final String fileName) throws Exception {
    final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
    additionalParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, true);
    checkHtmlResult(executeTest(fileName, additionalParams));
  }

  /**
   * test text extraction with providing content type.
   *
   * @param fileName
   *          name of the sample file to convert
   * @param contentType
   *          MIME type passed along as the {@code CONTENT_TYPE_PARAM} value
   */
  protected void doTextExtractionWithContentType(final String fileName, final String contentType) throws Exception {
    final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
    additionalParams.put(TikaPipelet.PROP_ATTACHMENT_CONTENT_TYPE_ATTRIBUTE, CONTENT_TYPE_ATTRIBUTE);
    additionalParams.put(CONTENT_TYPE_PARAM, contentType);
    checkTextResult(executeTest(fileName, additionalParams));
  }

  /**
   * test text extraction with providing file name.
   *
   * @param fileName
   *          name of the sample file to convert
   */
  protected void doTextExtractionWithFileName(final String fileName) throws Exception {
    final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap();
    additionalParams.put(TikaPipelet.PROP_FILE_NAME_ATTRIBUTE, FILENAME_ATTRIBUTE);
    checkTextResult(executeTest(fileName, additionalParams));
  }

  /**
   * test simple text extraction without parameters.
   *
   * @param fileName
   *          name of the sample file to convert
   */
  protected void doTextExtraction(final String fileName) throws Exception {
    checkTextResult(executeTest(fileName));
  }

  /**
   * Checks a plain text conversion result: it must contain "SMILA" and must not contain any "&lt;" character.
   *
   * @param result
   *          converted content to check
   */
  protected void checkTextResult(final String result) {
    // Report a missing result as an assertion failure instead of a NullPointerException in contains().
    assertTrue("No converted content returned", result != null);
    assertTrue("SMILA not contained in text content of converted content: " + result, result.contains("SMILA"));
    assertFalse("Unexpected HTML tags in converted content: " + result, result.contains("<"));
  }

  /**
   * Checks an HTML conversion result: it must contain "SMILA" and a {@code <body>} tag.
   *
   * @param result
   *          converted content to check
   */
  protected void checkHtmlResult(final String result) {
    // Report a missing result as an assertion failure instead of a NullPointerException in contains().
    assertTrue("No converted content returned", result != null);
    assertTrue("SMILA not contained in content of converted content: " + result, result.contains("SMILA"));
    // The message names the tag that is actually checked.
    assertTrue("<body> not contained in content of converted content: " + result, result.contains("<body>"));
  }

  /**
   * Runs all four extraction variants for one sample file, in the fixed order used by every test of this class: text
   * with content type, text with file name, plain text, HTML.
   *
   * @param fileName
   *          name of the sample file to convert
   * @param contentType
   *          MIME type used for the content type variant
   */
  private void runAllExtractionVariants(final String fileName, final String contentType) throws Exception {
    doTextExtractionWithContentType(fileName, contentType);
    doTextExtractionWithFileName(fileName);
    doTextExtraction(fileName);
    doHtmlExtraction(fileName);
  }

  /** Test OpenOffice 2.4 odp. */
  public void testOPENOFFICE24ODP() throws Exception {
    runAllExtractionVariants("OpenOffice.2.4.odp", "application/vnd.oasis.opendocument.presentation");
  }

  /** Test OpenOffice 2.4 ods. */
  public void testOPENOFFICE24ODS() throws Exception {
    runAllExtractionVariants("OpenOffice.2.4.ods", "application/vnd.oasis.opendocument.spreadsheet");
  }

  /** Test OpenOffice 2.4 odt. */
  public void testOPENOFFICE24ODT() throws Exception {
    runAllExtractionVariants("OpenOffice.2.4.odt", "application/vnd.oasis.opendocument.text");
  }

  /** Test OpenOffice 3.2 odp. */
  public void testOPENOFFICE32ODP() throws Exception {
    runAllExtractionVariants("OpenOffice.3.2.odp", "application/vnd.oasis.opendocument.presentation");
  }

  /** Test OpenOffice 3.2 ods. */
  public void testOPENOFFICE32ODS() throws Exception {
    runAllExtractionVariants("OpenOffice.3.2.ods", "application/vnd.oasis.opendocument.spreadsheet");
  }

  /** Test OpenOffice 3.2 odt. */
  public void testOPENOFFICE32ODT() throws Exception {
    runAllExtractionVariants("OpenOffice.3.2.odt", "application/vnd.oasis.opendocument.text");
  }

  /** Test MS Office 97-2003 doc. */
  public void testMSOFFICE2003DOC() throws Exception {
    runAllExtractionVariants("MSWORD_97_2003.doc", "application/msword");
  }

  /** Test MS Office 2010 pptx. */
  public void testMSOFFICE2010PPTX() throws Exception {
    runAllExtractionVariants("SMILA_PPTX_2010.pptx",
      "application/vnd.openxmlformats-officedocument.presentationml.presentation");
  }

  /** Test a more complex MS Office 2010 pptx. */
  public void testMSOFFICE2010PPTX_needs_jempbox() throws Exception {
    runAllExtractionVariants("SMILA_PPTX_2010_needs-jempbox.pptx",
      "application/vnd.openxmlformats-officedocument.presentationml.presentation");
  }

  /** Test MS Office 2010 xlsx. */
  public void testMSOFFICE2010XLSX() throws Exception {
    runAllExtractionVariants("SMILA_XLS_2010.xlsx",
      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
  }

  /** Test MS Office 2007 docx. */
  public void testMSOFFICE2007DOCX() throws Exception {
    runAllExtractionVariants("MSWORD_2007.docx",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
  }

  /** Test MS Office 2010 docx. */
  public void testMSOFFICE2010DOCX() throws Exception {
    runAllExtractionVariants("SMILA_DOCX_2010.docx",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
  }

  /** Test MS Office 2010 docx with wrong extension (.ppt file name, docx content type). */
  public void testMSOFFICE2010DOCXWithWrongExtension() throws Exception {
    runAllExtractionVariants("SMILA_DOCX_2010.ppt",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
  }

  /** Test MS Office 2010 docx without extension. */
  public void testMSOFFICE2010DOCXWithoutExtension() throws Exception {
    runAllExtractionVariants("SMILA_DOCX_2010",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
  }

  /** Test rtf. */
  public void testRTF() throws Exception {
    runAllExtractionVariants("test.rtf", "application/rtf");
  }

  /** Test xml. */
  public void testXML() throws Exception {
    runAllExtractionVariants("test.xml", "text/xml");
  }

  /** Test MS Office 97-2003 xls. */
  public void testMSOFFICE2003XLS() throws Exception {
    runAllExtractionVariants("MSEXCEL_97_2003.xls", "application/vnd.ms-excel");
  }

  /** Test MS Office 97-2000-XP ppt. */
  public void testMSOFFICE2003PPT() throws Exception {
    runAllExtractionVariants("MSPPT_97_2000_XP.ppt", "application/vnd.ms-powerpoint");
  }

  /** Test rss feed. */
  public void testRssFeed() throws Exception {
    runAllExtractionVariants("test-feed.rss", "text/xml");
  }
}