| /******************************************************************************* |
| * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This |
| * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which |
| * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.smila.tika.test; |
| |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.DataFactory; |
| import org.eclipse.smila.tika.TikaPipelet; |
| |
| public class TestTikaPipelet extends ConverterPipelineTestBase { |
| |
| /** extract HTML or text content from given file. */ |
| protected String executeTest(final String fileName) throws Exception { |
| final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap(); |
| return executeTest(fileName, additionalRecordParams); |
| } |
| |
| /** test html extraction. */ |
| protected void doHtmlExtraction(final String fileName) throws Exception { |
| final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap(); |
| additionalParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, true); |
| checkHtmlResult(executeTest(fileName, additionalParams)); |
| } |
| |
| /** test text extraction with providing content type. */ |
| protected void doTextExtractionWithContentType(final String fileName, final String contentType) throws Exception { |
| final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap(); |
| additionalParams.put(TikaPipelet.PROP_ATTACHMENT_CONTENT_TYPE_ATTRIBUTE, CONTENT_TYPE_ATTRIBUTE); |
| additionalParams.put(CONTENT_TYPE_PARAM, contentType); |
| checkTextResult(executeTest(fileName, additionalParams)); |
| } |
| |
| /** test text extraction with providing file name. */ |
| protected void doTextExtractionWithFileName(final String fileName) throws Exception { |
| final AnyMap additionalParams = DataFactory.DEFAULT.createAnyMap(); |
| additionalParams.put(TikaPipelet.PROP_FILE_NAME_ATTRIBUTE, FILENAME_ATTRIBUTE); |
| checkTextResult(executeTest(fileName, additionalParams)); |
| } |
| |
| /** test simple text extraction without parameters. */ |
| protected void doTextExtraction(final String fileName) throws Exception { |
| checkTextResult(executeTest(fileName)); |
| } |
| |
| protected void checkTextResult(final String result) { |
| assertTrue("SMILA not contained in text content of converted content: " + result, result.contains("SMILA")); |
| assertFalse("Unexpected HTML tags in converted content: " + result, result.contains("<")); |
| } |
| |
| protected void checkHtmlResult(final String result) { |
| assertTrue("SMILA not contained in content of converted content: " + result, result.contains("SMILA")); |
| assertTrue("<html> not contained in content of converted content: " + result, result.contains("<body>")); |
| } |
| |
| /** Test openoffice 24 odp. */ |
| public void testOPENOFFICE24ODP() throws Exception { |
| final String fileName = "OpenOffice.2.4.odp"; |
| doTextExtractionWithContentType(fileName, "application/vnd.oasis.opendocument.presentation"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test openoffice 24 ods. */ |
| public void testOPENOFFICE24ODS() throws Exception { |
| final String fileName = "OpenOffice.2.4.ods"; |
| doTextExtractionWithContentType(fileName, "application/vnd.oasis.opendocument.spreadsheet"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test openoffice 24 odt. */ |
| public void testOPENOFFICE24ODT() throws Exception { |
| final String fileName = "OpenOffice.2.4.odt"; |
| doTextExtractionWithContentType(fileName, "application/vnd.oasis.opendocument.text"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test openoffice 32 odp. */ |
| public void testOPENOFFICE32ODP() throws Exception { |
| final String fileName = "OpenOffice.3.2.odp"; |
| doTextExtractionWithContentType(fileName, "application/vnd.oasis.opendocument.presentation"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test openoffice 32 ods. */ |
| public void testOPENOFFICE32ODS() throws Exception { |
| final String fileName = "OpenOffice.3.2.ods"; |
| doTextExtractionWithContentType(fileName, "application/vnd.oasis.opendocument.spreadsheet"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test openoffice 32 odt. */ |
| public void testOPENOFFICE32ODT() throws Exception { |
| final String fileName = "OpenOffice.3.2.odt"; |
| doTextExtractionWithContentType(fileName, "application/vnd.oasis.opendocument.text"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test msoffic e2003 doc. */ |
| public void testMSOFFICE2003DOC() throws Exception { |
| final String fileName = "MSWORD_97_2003.doc"; |
| doTextExtractionWithContentType(fileName, "application/msword"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** tests ppt 2010. */ |
| public void testMSOFFICE2010PPTX() throws Exception { |
| final String fileName = "SMILA_PPTX_2010.pptx"; |
| doTextExtractionWithContentType(fileName, |
| "application/vnd.openxmlformats-officedocument.presentationml.presentation"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** tests more complex ppt 2010. */ |
| public void testMSOFFICE2010PPTX_needs_jempbox() throws Exception { |
| final String fileName = "SMILA_PPTX_2010_needs-jempbox.pptx"; |
| doTextExtractionWithContentType(fileName, |
| "application/vnd.openxmlformats-officedocument.presentationml.presentation"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** tests excel 2010. */ |
| public void testMSOFFICE2010XLSX() throws Exception { |
| final String fileName = "SMILA_XLS_2010.xlsx"; |
| doTextExtractionWithContentType(fileName, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test msoffic e2007 docx. */ |
| public void testMSOFFICE2007DOCX() throws Exception { |
| final String fileName = "MSWORD_2007.docx"; |
| doTextExtractionWithContentType(fileName, |
| "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** tests docx 2010. */ |
| public void testMSOFFICE2010DOCX() throws Exception { |
| final String fileName = "SMILA_DOCX_2010.docx"; |
| doTextExtractionWithContentType(fileName, |
| "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** tests docx 2010 with wrong extension. */ |
| public void testMSOFFICE2010DOCXWithWrongExtension() throws Exception { |
| final String fileName = "SMILA_DOCX_2010.ppt"; |
| doTextExtractionWithContentType(fileName, |
| "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** tests docx 2010 without extension. */ |
| public void testMSOFFICE2010DOCXWithoutExtension() throws Exception { |
| final String fileName = "SMILA_DOCX_2010"; |
| doTextExtractionWithContentType(fileName, |
| "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test rtf. */ |
| public void testRTF() throws Exception { |
| final String fileName = "test.rtf"; |
| doTextExtractionWithContentType(fileName, "application/rtf"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test xml. */ |
| public void testXML() throws Exception { |
| final String fileName = "test.xml"; |
| doTextExtractionWithContentType(fileName, "text/xml"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test msoffic e2003 xls. */ |
| public void testMSOFFICE2003XLS() throws Exception { |
| final String fileName = "MSEXCEL_97_2003.xls"; |
| doTextExtractionWithContentType(fileName, "application/vnd.ms-excel"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test msoffic e2003 ppt. */ |
| public void testMSOFFICE2003PPT() throws Exception { |
| final String fileName = "MSPPT_97_2000_XP.ppt"; |
| doTextExtractionWithContentType(fileName, "application/vnd.ms-powerpoint"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| |
| /** Test rss feed. */ |
| public void testRssFeed() throws Exception { |
| final String fileName = "test-feed.rss"; |
| doTextExtractionWithContentType(fileName, "text/xml"); |
| doTextExtractionWithFileName(fileName); |
| doTextExtraction(fileName); |
| doHtmlExtraction(fileName); |
| } |
| } |