| /******************************************************************************* |
| * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This |
| * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which |
| * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation |
| *******************************************************************************/ |
| package org.eclipse.smila.tika.test; |
| |
| import org.eclipse.smila.datamodel.AnyMap; |
| import org.eclipse.smila.datamodel.DataFactory; |
| import org.eclipse.smila.tika.TikaPipelet; |
| |
| /** Test with parameter 'maxLength' for limiting extracted text length. */ |
| public class TestMaxLength extends ConverterPipelineTestBase { |
| |
| /** test text extraction with maxLength parameter. */ |
| protected String doTextExtraction(final String fileName, final Integer maxLength, final Boolean asHtml) |
| throws Exception { |
| final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap(); |
| if (maxLength != null) { |
| additionalRecordParams.put(TikaPipelet.PROP_MAX_LENGTH, maxLength); |
| } |
| if (asHtml != null && asHtml) { |
| additionalRecordParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, asHtml); |
| } |
| return executeTest(fileName, additionalRecordParams); |
| } |
| |
| /** Test docx 2010. */ |
| public void testMSOFFICE2010DOCX() throws Exception { |
| final String fileName = "SMILA_DOCX_2010.docx"; |
| String result = doTextExtraction(fileName, null, null); |
| assertTrue("was: " + result, result.contains("SMILA")); |
| result = doTextExtraction(fileName, 6, null); |
| // we need maxLength=6 cause Tika will also extract 3 leading whitespaces |
| assertFalse("was: " + result, result.contains("SMILA")); |
| assertTrue("was: " + result, result.contains("SMI")); |
| final String htmlResult = doTextExtraction(fileName, 11, true); |
| // html tags are not taken into account for maxLength parameter, |
| // nevertheless we have to set a higher maxLength parameter here. |
| assertFalse("was: " + htmlResult, htmlResult.contains("SMILA")); |
| assertTrue("was: " + htmlResult, htmlResult.contains("SMI")); |
| assertTrue("was: " + htmlResult, htmlResult.contains("<html")); |
| assertTrue(htmlResult.length() > result.length()); |
| } |
| } |