// blob: f8c1a92347585eedfef92bb250a67f90e5eeb46c [file] [log] [blame]
/***********************************************************************************************************************
* Copyright (c) 2008,2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation
**********************************************************************************************************************/
package org.eclipse.smila.importing.compounds.simple.test;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import org.eclipse.smila.datamodel.Any;
import org.eclipse.smila.datamodel.AnySeq;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.compounds.CompoundExtractorException;
import org.eclipse.smila.importing.compounds.simple.SimpleCompoundExtractorService;
import org.eclipse.smila.test.DeclarativeServiceTestCase;
import org.eclipse.smila.utils.config.ConfigUtils;
/**
 * Tests for the {@link SimpleCompoundExtractorService}, which is looked up through the {@link CompoundExtractor}
 * service interface. The tests cover the {@code canExtract} decisions and the records produced by {@code extract}
 * for the zip and gzip test archives shipped with the test bundle.
 *
 * @author scank01
 */
public class TestSimpleCompoundExtractorService extends DeclarativeServiceTestCase {
  /** name of the bundle whose configuration area contains the test archives. */
  private static final String BUNDLE_NAME = "org.eclipse.smila.importing.compounds.simple.test";

  /** attachment name handed to the extractor; non-compound records are expected to carry this attachment. */
  private static final String CONTENT_ATTACHMENT = "test-content";

  /** the compound extractor under test. */
  private CompoundExtractor _compoundExtractor;

  /** {@inheritDoc} */
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _compoundExtractor = getService(CompoundExtractor.class);
    assertNotNull(_compoundExtractor);
    // all expectations below are specific to the simple implementation, so make sure it is the one registered.
    assertTrue(_compoundExtractor instanceof SimpleCompoundExtractorService);
  }

  /** {@inheritDoc} */
  @Override
  protected void tearDown() throws Exception {
    super.tearDown();
  }

  /**
   * Test method for
   * {@link org.eclipse.smila.importing.compounds.simple.SimpleCompoundExtractorService#canExtract(java.io.File)}.
   * Checks that null and files without a known extension are rejected and that the zip and gz extensions are
   * accepted regardless of case.
   */
  public void testCanExtractFile() {
    assertFalse(_compoundExtractor.canExtract(null));
    assertFalse(_compoundExtractor.canExtract(new File("null")));
    assertTrue(_compoundExtractor.canExtract(new File("a.zip")));
    assertTrue(_compoundExtractor.canExtract(new File("a.ZIP")));
    assertTrue(_compoundExtractor.canExtract(new File("a.zIp")));
    assertTrue(_compoundExtractor.canExtract(new File("a.gz")));
    assertTrue(_compoundExtractor.canExtract(new File("a.GZ")));
    assertTrue(_compoundExtractor.canExtract(new File("a.gZ")));
  }

  /**
   * Test method for
   * {@link org.eclipse.smila.importing.compounds.simple.SimpleCompoundExtractorService#canExtract(java.net.URL, java.lang.String)}
   * .
   *
   * @throws MalformedURLException
   *           if one of the test URLs cannot be parsed.
   */
  public void testCanExtractURLString() throws MalformedURLException {
    final URL noExtension = new URL("http://x.y.z:8080/a/b/c/d");
    final URL gzLowerCase = new URL("http://x.y.z:8080/a/b/c/d.gz");
    final URL gzUpperCase = new URL("http://x.y.z:8080/a/b/c/d.GZ");
    final URL zipLowerCase = new URL("http://x.y.z:8080/a/b/c/d.zip");
    final URL zipUpperCase = new URL("http://x.y.z:8080/a/b/c/d.ZIP");

    // nothing to decide on: no location at all.
    assertFalse(_compoundExtractor.canExtract((URL) null, null));
    assertFalse(_compoundExtractor.canExtract((String) null, null));

    // no extension: only the mime type can qualify the resource.
    assertFalse(_compoundExtractor.canExtract(noExtension, null));
    assertFalse(_compoundExtractor.canExtract(noExtension, ""));
    assertTrue(_compoundExtractor.canExtract(noExtension, "application/x-gunzip"));
    assertTrue(_compoundExtractor.canExtract(noExtension, "application/x-gzip"));
    assertTrue(_compoundExtractor.canExtract(noExtension, "application/zip"));

    // extension and mime type agree.
    assertTrue(_compoundExtractor.canExtract(gzLowerCase, "application/x-gunzip"));
    assertTrue(_compoundExtractor.canExtract(gzUpperCase, "application/x-gzip"));
    assertTrue(_compoundExtractor.canExtract(zipLowerCase, "application/zip"));
    assertTrue(_compoundExtractor.canExtract(zipUpperCase, "application/zip"));

    // known extension with a generic, empty or missing mime type.
    assertTrue(_compoundExtractor.canExtract(zipUpperCase, "application/octet-stream"));
    assertTrue(_compoundExtractor.canExtract(gzLowerCase, "application/octet-stream"));
    assertTrue(_compoundExtractor.canExtract(gzLowerCase, ""));
    assertTrue(_compoundExtractor.canExtract(zipLowerCase, null));

    // known extension, but the mime type names a different, non-compound format.
    assertFalse(_compoundExtractor.canExtract(gzLowerCase, "application/pdf"));
    assertFalse(_compoundExtractor.canExtract(zipLowerCase, "text/plain"));
  }

  /**
   * Test method for {@link CompoundExtractor#extract(java.io.InputStream, String, String)} using test.zip.
   *
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  public void testExtractGzipInZip() throws CompoundExtractorException {
    doTestTestZipEntries("test.zip");
  }

  /**
   * Test method for {@link CompoundExtractor#extract(java.io.InputStream, String, String)} using a zip created by
   * WinZip.
   *
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  public void testExtractGzipInWinzipZip() throws CompoundExtractorException {
    doTestTestZipEntries("test-winzip.zip");
  }

  /**
   * test the test.zip variants: extracts the given archive and checks ids, entry file names, root compound flag,
   * compound flag, attachments and compound path of every record.
   *
   * @param baseZipName
   *          name of the archive in the test bundle; also used as id of the root compound record.
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  private void doTestTestZipEntries(final String baseZipName) throws CompoundExtractorException {
    final Collection<String> compounds =
      Arrays.asList(baseZipName, baseZipName + "/\u00e4/\u00b3zip.zip", baseZipName + "/doc.txt.gz");
    final Collection<String> files =
      Arrays.asList(baseZipName, baseZipName + "/\u00e4/\u00b3zip.zip", baseZipName + "/doc.txt.gz", baseZipName
        + "/doc.txt.gz/doc.txt", baseZipName + "/doc2.txt", baseZipName + "/\u00e4/\u00fc.txt", baseZipName
        + "/\u00e4/\u00b3zip.zip/\u00b3zip/\u00fc.txt");
    final Collection<String> internalFileNames =
      Arrays.asList(baseZipName, "\u00e4/\u00b3zip.zip", "doc.txt.gz", "doc.txt", "doc2.txt", "\u00e4/\u00fc.txt",
        "\u00b3zip/\u00fc.txt");
    final Iterator<Record> iter = extractTestArchive(baseZipName);
    final Collection<String> foundFiles = new HashSet<String>();
    while (iter.hasNext()) {
      final Record record = iter.next();
      final String fileName = record.getId();
      foundFiles.add(fileName);
      assertTrue(fileName + " not expected.", files.contains(fileName));
      final String internalFileName = record.getMetadata().getStringValue(CompoundExtractor.KEY_FILE_NAME);
      assertTrue(fileName + " not correct.", fileName.endsWith(internalFileName));
      assertTrue(internalFileName + " is no correct entry file name", internalFileNames.contains(internalFileName));
      // only the record of the archive itself may carry the root compound flag.
      if (fileName.equals(baseZipName)) {
        assertTrue(fileName + " has no root compound flag",
          record.getMetadata().containsKey(CompoundExtractor.KEY_IS_ROOT_COMPOUND_RECORD));
        assertTrue(fileName + " root compound flag not set to true",
          record.getMetadata().getBooleanValue(CompoundExtractor.KEY_IS_ROOT_COMPOUND_RECORD));
      } else {
        assertFalse(fileName + " has root compound flag",
          record.getMetadata().containsKey(CompoundExtractor.KEY_IS_ROOT_COMPOUND_RECORD));
      }
      assertCompoundState(record, compounds.contains(fileName));
      assertIdMatchesCompoundPath(record);
    }
    assertAllFilesFound(files, foundFiles);
  }

  /**
   * Test method for {@link CompoundExtractor#extract(java.io.InputStream, String, String)} using a gzipped zip that
   * itself contains a gzipped file.
   *
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  public void testExtractGzipInZipInGzip() throws CompoundExtractorException {
    final Collection<String> compounds =
      Arrays.asList("1.zip.gz", "1.zip.gz/1.zip", "1.zip.gz/1.zip/1/2/3/4.txt.gz");
    final Collection<String> files =
      Arrays.asList("1.zip.gz", "1.zip.gz/1.zip", "1.zip.gz/1.zip/1/2/3/4.txt.gz", "1.zip.gz/1.zip/1/2/3/4.txt",
        "1.zip.gz/1.zip/1/2/3/4.txt.gz/4.txt");
    final Collection<String> internalFileNames =
      Arrays.asList("1.zip.gz", "1.zip", "1/2/3/4.txt.gz", "1/2/3/4.txt", "4.txt");
    // no entry for the root record "1.zip.gz": it is expected to carry no size value.
    final Map<String, Long> sizes = new HashMap<String, Long>();
    sizes.put("1.zip.gz/1.zip/1/2/3/4.txt.gz", 68L);
    sizes.put("1.zip.gz/1.zip/1/2/3/4.txt.gz/4.txt", 1018L);
    sizes.put("1.zip.gz/1.zip/1/2/3/4.txt", 1018L);
    sizes.put("1.zip.gz/1.zip", 768L);
    checkResultingRecords(compounds, files, internalFileNames, sizes, extractTestArchive("1.zip.gz"));
  }

  /**
   * Test method for {@link CompoundExtractor#extract(java.io.InputStream, String, String)} using a zip file created
   * with linux.
   *
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  public void testLinuxZip() throws CompoundExtractorException {
    final String archiveName = "\u00e4_linux.zip";
    final Collection<String> compounds = Arrays.asList(archiveName);
    final Collection<String> files =
      Arrays.asList(archiveName, archiveName + "/\u00e4/\u00b3.txt", archiveName + "/\u00e4/\u00e4.txt");
    final Collection<String> internalFileNames = Arrays.asList(archiveName, "\u00e4/\u00b3.txt", "\u00e4/\u00e4.txt");
    final Map<String, Long> sizes = new HashMap<String, Long>();
    sizes.put(archiveName + "/\u00e4/\u00b3.txt", 11L);
    sizes.put(archiveName + "/\u00e4/\u00e4.txt", 11L);
    checkResultingRecords(compounds, files, internalFileNames, sizes, extractTestArchive(archiveName));
  }

  /**
   * Test method for {@link CompoundExtractor#extract(java.io.InputStream, String, String)} extracting a gz file.
   *
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  public void testGz() throws CompoundExtractorException {
    final Collection<String> compounds = Arrays.asList("doc.txt.gz");
    final Collection<String> files = Arrays.asList("doc.txt.gz", "doc.txt.gz/doc.txt");
    final Collection<String> internalFileNames = Arrays.asList("doc.txt.gz", "doc.txt");
    final Map<String, Long> sizes = new HashMap<String, Long>();
    sizes.put("doc.txt.gz/doc.txt", 16L);
    checkResultingRecords(compounds, files, internalFileNames, sizes, extractTestArchive("doc.txt.gz"));
  }

  /**
   * check the records: ids, sizes, entry file names, compound flag, attachments and compound path.
   *
   * @param compounds
   *          ids of the records that must be flagged as compounds.
   * @param files
   *          ids of all records that must be produced, without duplicates.
   * @param internalFileNames
   *          allowed values of the file name metadata of the records.
   * @param sizes
   *          expected size value by record id; ids without an entry are expected to have no size value.
   * @param iter
   *          the records produced by the extractor.
   */
  private void checkResultingRecords(final Collection<String> compounds, final Collection<String> files,
    final Collection<String> internalFileNames, final Map<String, Long> sizes, final Iterator<Record> iter) {
    final Collection<String> foundFiles = new HashSet<String>();
    while (iter.hasNext()) {
      final Record record = iter.next();
      final String fileName = record.getId();
      foundFiles.add(fileName);
      assertTrue(fileName + " not expected.", files.contains(fileName));
      assertEquals("Uncompressed size does not match for " + fileName, sizes.get(fileName), record.getMetadata()
        .getLongValue(CompoundExtractor.KEY_SIZE));
      final String internalFileName = record.getMetadata().getStringValue(CompoundExtractor.KEY_FILE_NAME);
      assertTrue(internalFileName + " is no correct entry file name", internalFileNames.contains(internalFileName));
      assertCompoundState(record, compounds.contains(fileName));
      assertIdMatchesCompoundPath(record);
    }
    assertAllFilesFound(files, foundFiles);
  }

  /**
   * extract one of the archives from the test bundle, using the archive name as root id and
   * {@link #CONTENT_ATTACHMENT} as attachment name.
   *
   * @param archiveName
   *          name of the archive in the test bundle.
   * @return the records produced by the extractor.
   * @throws CompoundExtractorException
   *           if the extractor reports an error.
   */
  private Iterator<Record> extractTestArchive(final String archiveName) throws CompoundExtractorException {
    return _compoundExtractor.extract(ConfigUtils.getConfigStream(BUNDLE_NAME, archiveName), archiveName,
      CONTENT_ATTACHMENT);
  }

  /**
   * check compound flag and attachments of a record: compounds must be flagged and have no attachments, all other
   * records must not be flagged and must have the content attachment.
   *
   * @param record
   *          the record to check.
   * @param expectCompound
   *          true if the record is expected to describe a compound.
   */
  private void assertCompoundState(final Record record, final boolean expectCompound) {
    if (expectCompound) {
      // check presence first so that a missing flag is reported as failure instead of an unboxing error.
      assertTrue("CompoundExtractor.KEY_IS_COMPOUND not present",
        record.getMetadata().containsKey(CompoundExtractor.KEY_IS_COMPOUND));
      assertTrue("CompoundExtractor.KEY_IS_COMPOUND not set to true",
        record.getMetadata().getBooleanValue(CompoundExtractor.KEY_IS_COMPOUND));
      assertFalse(record.getId() + " has attachments", record.hasAttachments());
    } else {
      assertFalse(record.getId() + " has compound flag",
        record.getMetadata().containsKey(CompoundExtractor.KEY_IS_COMPOUND));
      assertTrue(record.getId() + " has no content attachment", record.hasAttachment(CONTENT_ATTACHMENT));
    }
  }

  /**
   * check that the record id equals the values of the compounds sequence (if present) followed by the file name,
   * all joined by "/".
   *
   * @param record
   *          the record to check.
   */
  private void assertIdMatchesCompoundPath(final Record record) {
    final AnySeq compoundsSeq;
    if (record.getMetadata().containsKey(CompoundExtractor.KEY_COMPOUNDS)) {
      compoundsSeq = record.getMetadata().getSeq(CompoundExtractor.KEY_COMPOUNDS);
    } else {
      // records without a compounds sequence are treated like records with an empty one.
      compoundsSeq = DataFactory.DEFAULT.createAnySeq();
    }
    final StringBuilder computedId = new StringBuilder();
    for (final Any compound : compoundsSeq) {
      computedId.append(compound.asValue().asString()).append('/');
    }
    computedId.append(record.getMetadata().getStringValue(CompoundExtractor.KEY_FILE_NAME));
    assertEquals("ids do not match.", computedId.toString(), record.getId());
  }

  /**
   * check that every expected file was produced. The ids are compared as sets so that a failure names the ids
   * instead of only reporting differing counts.
   *
   * @param expectedFiles
   *          ids of all records that must be produced, without duplicates.
   * @param foundFiles
   *          set of ids of the records that were produced.
   */
  private void assertAllFilesFound(final Collection<String> expectedFiles, final Collection<String> foundFiles) {
    assertEquals("extracted record ids do not match the expected files.", new HashSet<String>(expectedFiles),
      foundFiles);
  }
}