core/org.eclipse.smila.tika.test/code/src/org/eclipse/smila/tika/test/TestMaxLength.java - gerrit/smila/org.eclipse.smila.core - Git at Google

 /*******************************************************************************
  * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
  * program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
  * accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
  *
  * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
  *******************************************************************************/
 package org.eclipse.smila.tika.test;

 import org.eclipse.smila.datamodel.AnyMap;
 import org.eclipse.smila.datamodel.DataFactory;
 import org.eclipse.smila.tika.TikaPipelet;

 /** Test with parameter 'maxLength' for limiting extracted text length. */
 public class TestMaxLength extends ConverterPipelineTestBase {

   /**
    * Runs {@code executeTest} for the given file, passing the optional 'maxLength' and 'export as HTML' settings as
    * additional record parameters.
    *
    * @param fileName
    *          name of the test document handed to {@code executeTest}.
    * @param maxLength
    *          value for {@link TikaPipelet#PROP_MAX_LENGTH}; if null, the parameter is not set at all.
    * @param asHtml
    *          if true, {@link TikaPipelet#PROP_EXPORT_AS_HTML} is set; if null or false, the parameter is not set.
    * @return the result of {@code executeTest} (presumably the extracted text - see the base class).
    * @throws Exception
    *           anything thrown while creating the parameters or by {@code executeTest}.
    */
   protected String doTextExtraction(final String fileName, final Integer maxLength, final Boolean asHtml)
     throws Exception {
     final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
     if (maxLength != null) {
       additionalRecordParams.put(TikaPipelet.PROP_MAX_LENGTH, maxLength);
     }
     // null-safe check: a null flag is treated like false
     if (Boolean.TRUE.equals(asHtml)) {
       additionalRecordParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, asHtml);
     }
     return executeTest(fileName, additionalRecordParams);
   }

   /** Test docx 2010: extraction without limit, with 'maxLength', and with 'maxLength' plus HTML export. */
   public void testMSOFFICE2010DOCX() throws Exception {
     final String fileName = "SMILA_DOCX_2010.docx";

     // no maxLength parameter set: the complete word must be part of the result
     final String fullText = doTextExtraction(fileName, null, null);
     assertTrue("was: " + fullText, fullText.contains("SMILA"));

     // we need maxLength=6 because Tika will also extract 3 leading whitespaces
     final String limitedText = doTextExtraction(fileName, 6, null);
     assertFalse("was: " + limitedText, limitedText.contains("SMILA"));
     assertTrue("was: " + limitedText, limitedText.contains("SMI"));

     // html tags are not taken into account for maxLength parameter,
     // nevertheless we have to set a higher maxLength parameter here.
     final String htmlResult = doTextExtraction(fileName, 11, true);
     assertFalse("was: " + htmlResult, htmlResult.contains("SMILA"));
     assertTrue("was: " + htmlResult, htmlResult.contains("SMI"));
     assertTrue("was: " + htmlResult, htmlResult.contains("<html"));

     // report both lengths on failure, like the other assertions report their input
     assertTrue("html result length was: " + htmlResult.length() + ", plain result length was: "
       + limitedText.length(), htmlResult.length() > limitedText.length());
   }
 }
	/*******************************************************************************
	* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved. This
	* program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0 which
	* accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
	*
	* Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
	*******************************************************************************/
	package org.eclipse.smila.tika.test;

	import org.eclipse.smila.datamodel.AnyMap;
	import org.eclipse.smila.datamodel.DataFactory;
	import org.eclipse.smila.tika.TikaPipelet;

	/** Test with parameter 'maxLength' for limiting extracted text length. */
	public class TestMaxLength extends ConverterPipelineTestBase {

		/**
		 * Runs {@code executeTest} for the given file, passing the optional 'maxLength' and 'export as HTML' settings as
		 * additional record parameters.
		 *
		 * @param fileName
		 *          name of the test document handed to {@code executeTest}.
		 * @param maxLength
		 *          value for {@link TikaPipelet#PROP_MAX_LENGTH}; if null, the parameter is not set at all.
		 * @param asHtml
		 *          if true, {@link TikaPipelet#PROP_EXPORT_AS_HTML} is set; if null or false, the parameter is not set.
		 * @return the result of {@code executeTest} (presumably the extracted text - see the base class).
		 * @throws Exception
		 *           anything thrown while creating the parameters or by {@code executeTest}.
		 */
		protected String doTextExtraction(final String fileName, final Integer maxLength, final Boolean asHtml)
			throws Exception {
			final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
			if (maxLength != null) {
				additionalRecordParams.put(TikaPipelet.PROP_MAX_LENGTH, maxLength);
			}
			// null-safe check: a null flag is treated like false
			if (Boolean.TRUE.equals(asHtml)) {
				additionalRecordParams.put(TikaPipelet.PROP_EXPORT_AS_HTML, asHtml);
			}
			return executeTest(fileName, additionalRecordParams);
		}

		/** Test docx 2010: extraction without limit, with 'maxLength', and with 'maxLength' plus HTML export. */
		public void testMSOFFICE2010DOCX() throws Exception {
			final String fileName = "SMILA_DOCX_2010.docx";

			// no maxLength parameter set: the complete word must be part of the result
			final String fullText = doTextExtraction(fileName, null, null);
			assertTrue("was: " + fullText, fullText.contains("SMILA"));

			// we need maxLength=6 because Tika will also extract 3 leading whitespaces
			final String limitedText = doTextExtraction(fileName, 6, null);
			assertFalse("was: " + limitedText, limitedText.contains("SMILA"));
			assertTrue("was: " + limitedText, limitedText.contains("SMI"));

			// html tags are not taken into account for maxLength parameter,
			// nevertheless we have to set a higher maxLength parameter here.
			final String htmlResult = doTextExtraction(fileName, 11, true);
			assertFalse("was: " + htmlResult, htmlResult.contains("SMILA"));
			assertTrue("was: " + htmlResult, htmlResult.contains("SMI"));
			assertTrue("was: " + htmlResult, htmlResult.contains("<html"));

			// report both lengths on failure, like the other assertions report their input
			assertTrue("html result length was: " + htmlResult.length() + ", plain result length was: "
				+ limitedText.length(), htmlResult.length() > limitedText.length());
		}
	}