blob: fa79cfa1e27d77d55d308160b9f681fbe63428c4 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved.
* This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.tika.test;
import java.util.Map;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnyMap;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.tika.TikaPipelet;
/** test for extracting metadata. */
public class TestExtractProperties extends ConverterPipelineTestBase {
protected AnyMap _expectedMetadata;
/** Tests for Open Office metadata extraction. */
public void testOpenOfficeProps() throws Exception {
final String fileName = "OpenOffice.3.2.odp";
final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
final AnySeq extractProperties = DataFactory.DEFAULT.createAnySeq();
additionalRecordParams.put(TikaPipelet.PROP_EXTRACT_PROPERTIES, extractProperties);
final AnyMap author = DataFactory.DEFAULT.createAnyMap();
author.put("metadataName", "creator");
author.put("targetAttribute", "ATT_Creator");
extractProperties.add(author);
final AnyMap language = DataFactory.DEFAULT.createAnyMap();
language.put("metadataName", "language");
language.put("targetAttribute", "ATT_Language");
extractProperties.add(language);
final AnyMap title = DataFactory.DEFAULT.createAnyMap();
title.put("metadataName", "title");
title.put("targetAttribute", "ATT_Title");
extractProperties.add(title);
_expectedMetadata.put("ATT_Creator", "Juergen Schuhmacher");
_expectedMetadata.put("ATT_Language", "en-US");
_expectedMetadata.put("ATT_Title", "Introducing a New Product");
executeTest(fileName, additionalRecordParams);
}
public void testMsOfficeSimple() throws Exception {
final String fileName = "MSWORD_2010.docx";
final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
final AnySeq extractProperties = DataFactory.DEFAULT.createAnySeq();
additionalRecordParams.put(TikaPipelet.PROP_EXTRACT_PROPERTIES, extractProperties);
final AnyMap title = DataFactory.DEFAULT.createAnyMap();
title.put("metadataName", "title");
title.put("targetAttribute", "ATT_Titel");
extractProperties.add(title);
_expectedMetadata.put("ATT_Titel", "Mein Titel");
executeTest(fileName, additionalRecordParams);
}
/** Tests for MS Office specific properties. */
public void testMsOfficeProps() throws Exception {
final String fileName = "MSWORD_2010.docx";
final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
final AnySeq extractProperties = DataFactory.DEFAULT.createAnySeq();
additionalRecordParams.put(TikaPipelet.PROP_EXTRACT_PROPERTIES, extractProperties);
final AnyMap author = DataFactory.DEFAULT.createAnyMap();
author.put("metadataName", "Author");
author.put("targetAttribute", "ATT_Author");
extractProperties.add(author);
final AnyMap category = DataFactory.DEFAULT.createAnyMap();
category.put("metadataName", "Category");
category.put("targetAttribute", "ATT_Category");
extractProperties.add(category);
final AnyMap status = DataFactory.DEFAULT.createAnyMap();
status.put("metadataName", "Content-Status");
status.put("targetAttribute", "ATT_Status");
extractProperties.add(status);
final AnyMap manager = DataFactory.DEFAULT.createAnyMap();
manager.put("metadataName", "Manager");
manager.put("targetAttribute", "ATT_Manager");
extractProperties.add(manager);
_expectedMetadata.put("ATT_Author", "Andreas;Homer");
_expectedMetadata.put("ATT_Category", "Meine Kategorie");
_expectedMetadata.put("ATT_Status", "Mein Status");
_expectedMetadata.put("ATT_Manager", "Mein Manager");
executeTest(fileName, additionalRecordParams);
}
/** Tests for parameter 'storeMode'. */
public void testStoreMode() throws Exception {
final String fileName = "MSWORD_2010.docx";
final String testParameter = "storeMode";
final AnyMap additionalRecordParams = DataFactory.DEFAULT.createAnyMap();
final AnySeq extractProperties = DataFactory.DEFAULT.createAnySeq();
additionalRecordParams.put(TikaPipelet.PROP_EXTRACT_PROPERTIES, extractProperties);
final AnyMap manager = DataFactory.DEFAULT.createAnyMap();
manager.put("metadataName", "Manager");
manager.put("targetAttribute", FILENAME_ATTRIBUTE); // we use an attribute that already has a value
extractProperties.add(manager);
// add
manager.put(testParameter, TikaPipelet.StoreMode.add.name());
_expectedMetadata.add(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createStringValue("MSWORD_2010.docx"));
_expectedMetadata.add(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createStringValue("Mein Manager"));
executeTest(fileName, additionalRecordParams);
// leave
manager.put(testParameter, TikaPipelet.StoreMode.leave.name());
_expectedMetadata.put(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createStringValue("MSWORD_2010.docx"));
executeTest(fileName, additionalRecordParams);
// overwrite
manager.put(testParameter, TikaPipelet.StoreMode.overwrite.name());
_expectedMetadata.put(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createStringValue("Mein Manager"));
executeTest(fileName, additionalRecordParams);
// default: add
manager.remove(testParameter);
_expectedMetadata.put(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createAnySeq());
_expectedMetadata.add(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createStringValue("MSWORD_2010.docx"));
_expectedMetadata.add(FILENAME_ATTRIBUTE, DataFactory.DEFAULT.createStringValue("Mein Manager"));
executeTest(fileName, additionalRecordParams);
}
@Override
protected void setUp() throws Exception {
super.setUp();
_expectedMetadata = DataFactory.DEFAULT.createAnyMap();
}
@Override
protected void assertResultMetadata(final AnyMap metadata) {
for (final Map.Entry<String, Any> expected : _expectedMetadata.entrySet()) {
assertTrue("Expected attribute: " + expected.getKey(), metadata.containsKey(expected.getKey()));
assertEquals(expected.getValue(), metadata.get(expected.getKey()));
}
}
}