blob: d2e89e142876f0288f47cd80ac939825ca4049df [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
* program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.tika.test;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.tika.TikaPipelet;
/**
 * Exercises the 'maxLength' parameter of the {@link TikaPipelet}, which limits the length of the extracted text.
 */
public class TestMaxLength extends ConverterPipelineTestBase {

  /**
   * Runs a text extraction for one file, optionally with a length limit and/or HTML export switched on.
   *
   * @param fileName
   *          name of the file that is handed on to {@code executeTest}
   * @param maxLength
   *          value stored under {@link TikaPipelet#PROP_MAX_LENGTH}; the parameter is not set at all when this is
   *          {@code null}
   * @param asHtml
   *          when {@code true}, it is stored under {@link TikaPipelet#PROP_EXPORT_AS_HTML}; for {@code null} or
   *          {@code false} the parameter is not set at all
   * @return the string returned by {@code executeTest}
   * @throws Exception
   *           propagated from {@code executeTest}
   */
  protected String doTextExtraction(final String fileName, final Integer maxLength, final Boolean asHtml)
    throws Exception {
    final AnyMap recordParams = DataFactory.DEFAULT.createAnyMap();
    if (maxLength != null) {
      recordParams.put(TikaPipelet.PROP_MAX_LENGTH, maxLength);
    }
    // null-safe check: only an explicit TRUE enables the HTML export parameter
    if (Boolean.TRUE.equals(asHtml)) {
      recordParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, asHtml);
    }
    return executeTest(fileName, recordParams);
  }

  /**
   * Extracts a Word 2010 docx three times: without a limit, with a limit as plain text, and with a limit as HTML.
   *
   * @throws Exception
   *           propagated from {@link #doTextExtraction(String, Integer, Boolean)}
   */
  public void testMSOFFICE2010DOCX() throws Exception {
    final String docxFile = "SMILA_DOCX_2010.docx";

    // without a limit the complete word has to show up
    final String unlimitedText = doTextExtraction(docxFile, null, null);
    assertTrue("was: " + unlimitedText, unlimitedText.contains("SMILA"));

    // maxLength has to be 6 here because Tika also extracts 3 leading whitespaces
    final String limitedText = doTextExtraction(docxFile, 6, null);
    assertFalse("was: " + limitedText, limitedText.contains("SMILA"));
    assertTrue("was: " + limitedText, limitedText.contains("SMI"));

    // HTML tags do not count towards maxLength; nevertheless a higher maxLength value is needed for the HTML export
    final String limitedHtml = doTextExtraction(docxFile, 11, true);
    assertFalse("was: " + limitedHtml, limitedHtml.contains("SMILA"));
    assertTrue("was: " + limitedHtml, limitedHtml.contains("SMI"));
    assertTrue("was: " + limitedHtml, limitedHtml.contains("<html"));

    // the markup makes the HTML output longer than the limited plain text output
    assertTrue(limitedText.length() < limitedHtml.length());
  }
}