| /******************************************************************************* |
| * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. |
| * This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 |
| * which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.smila.tika.test; |
| |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.DataFactory; |
| import org.eclipse.smila.tika.TikaPipelet; |
| |
| /** Test with parameter 'keepHypens' for keepiing/removing hyphens in extracted test. */ |
| public class TestKeepHyphens extends ConverterPipelineTestBase { |
| |
| /** second hyphen should have been removed cause it's followed by newline. */ |
| protected void checkHyphensRemoved(final String result) { |
| assertTrue("result was: " + result, result.contains("first-hyphen")); |
| assertTrue("result was: " + result, result.contains("secondhyphen")); |
| assertTrue("result was: " + result, result.contains("automatische Silbentrennung")); |
| System.out.println(result); |
| } |
| |
| /** second hyphen should have been kept. */ |
| protected void checkHyphensKept(final String result) { |
| assertTrue("result was: " + result, result.contains("first-hyphen")); |
| assertTrue("result was: " + result, result.contains("second-")); |
| System.out.println(result); |
| } |
| |
| /** test text extraction with removing hyphens (default). */ |
| protected void doTextExtractionRemoveHyphens(final String fileName) throws Exception { |
| final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap(); |
| checkHyphensRemoved(executeTest(fileName, additionalRecordParams)); |
| } |
| |
| /** test text extraction with keeping hyphens. */ |
| protected void doTextExtractionKeepHyphens(final String fileName) throws Exception { |
| final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap(); |
| additionalRecordParams.put(TikaPipelet.PROP_KEEP_HYPHENS, "true"); |
| checkHyphensKept(executeTest(fileName, additionalRecordParams)); |
| } |
| |
| /** Test rtf created from 2010 doc. */ |
| public void testRTF() throws Exception { |
| final String fileName = "keepHyphens/MSWORD_2010.rtf"; |
| doTextExtractionRemoveHyphens(fileName); |
| doTextExtractionKeepHyphens(fileName); |
| } |
| |
| /** Test msoffice 2010 doc. */ |
| public void testMSOFFICE2010DOCX() throws Exception { |
| final String fileName = "keepHyphens/MSWORD_2010.docx"; |
| doTextExtractionRemoveHyphens(fileName); |
| doTextExtractionKeepHyphens(fileName); |
| } |
| |
| } |