blob: f07ee2f107b782868ee7782c838819124d3b5c72 [file] [log] [blame]
/***********************************************************************************************************************
* Copyright (c) 2008,2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the
* accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this
* distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation
**********************************************************************************************************************/
package org.eclipse.smila.importing.compounds.compress;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.eclipse.smila.common.mimetype.MimeTypeIdentifier;
import org.eclipse.smila.common.mimetype.MimeTypeParseException;
import org.eclipse.smila.datamodel.DataFactory;
import org.eclipse.smila.datamodel.Record;
import org.eclipse.smila.datamodel.util.AnyUtil;
import org.eclipse.smila.importing.compounds.CompoundExtractor;
import org.eclipse.smila.importing.compounds.CompoundExtractorException;
import org.eclipse.smila.utils.config.ConfigUtils;
/**
* Extractor service using commons compress.
*/
/**
 * {@link CompoundExtractor} service based on Apache Commons Compress.
 *
 * <p>
 * Supports archive formats (ZIP, TAR, CPIO, JAR) and single-file compression formats (GZIP, BZIP2), including nested
 * combinations (e.g. a ".tgz" containing further archives). Extracted entry content is written to temporary files
 * below a configurable root directory; the temporary file path is stored in the record under
 * {@link #KEY_TMP_FILE_NAME}.
 * </p>
 */
public class CommonsCompressCompoundExtractorService implements CompoundExtractor {
  /** the default encoding for ZIP entry names. */
  protected static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;

  /** mime type that can mean almost anything and therefore will be omitted. */
  protected static final String APPLICATION_OCTETSTREAM = "application/octet-stream";

  /** constant for "tar" suffix. */
  protected static final String SUFFIX_TAR = ".tar";

  /** constant for "tgz" suffix. */
  protected static final String SUFFIX_TGZ = ".tgz";

  /** mime type for ZIP. ZIP must be handled specially because of entry-name encoding issues. */
  protected static final String APPLICATION_ZIP = "application/zip";

  /** map from mime type to compressor stream factory identifier. */
  protected static final Map<String, String> COMPRESSION_IDENTIFIERS = new HashMap<String, String>();

  /** map from mime type to archive stream factory identifier. */
  protected static final Map<String, String> ARCHIVE_IDENTIFIERS = new HashMap<String, String>();

  /** key for temporary file name. */
  protected static final String KEY_TMP_FILE_NAME = "tmpFileName";

  /** bundle ID for configuration area access. */
  private static final String BUNDLE_ID = "org.eclipse.smila.importing.compounds.compress";

  static {
    // compressors: single-file compression formats handled by CompressorStreamFactory.
    COMPRESSION_IDENTIFIERS.put("application/x-bzip", CompressorStreamFactory.BZIP2);
    COMPRESSION_IDENTIFIERS.put("application/bzip2", CompressorStreamFactory.BZIP2);
    COMPRESSION_IDENTIFIERS.put("application/x-gtar", CompressorStreamFactory.GZIP);
    COMPRESSION_IDENTIFIERS.put("application/x-gzip", CompressorStreamFactory.GZIP);
    COMPRESSION_IDENTIFIERS.put("application/x-gunzip", CompressorStreamFactory.GZIP);
    COMPRESSION_IDENTIFIERS.put("application/gzip", CompressorStreamFactory.GZIP);
    // archivers: multi-entry container formats handled by ArchiveStreamFactory.
    ARCHIVE_IDENTIFIERS.put(APPLICATION_ZIP, ArchiveStreamFactory.ZIP);
    ARCHIVE_IDENTIFIERS.put("application/x-tar", ArchiveStreamFactory.TAR);
    ARCHIVE_IDENTIFIERS.put("application/tar", ArchiveStreamFactory.TAR);
    ARCHIVE_IDENTIFIERS.put("application/us-tar", ArchiveStreamFactory.TAR);
    ARCHIVE_IDENTIFIERS.put("application/cpio", ArchiveStreamFactory.CPIO);
    ARCHIVE_IDENTIFIERS.put("application/x-cpio", ArchiveStreamFactory.CPIO);
    ARCHIVE_IDENTIFIERS.put("application/x-bcpio", ArchiveStreamFactory.CPIO);
    ARCHIVE_IDENTIFIERS.put("application/x-sv4cpio", ArchiveStreamFactory.CPIO);
    ARCHIVE_IDENTIFIERS.put("application/java-archive", ArchiveStreamFactory.JAR);
  }

  /** encoding for ZIP entry names, configurable via property "zip.encoding". */
  protected Charset _charset = DEFAULT_CHARSET;

  /** log. */
  private final Log _log = LogFactory.getLog(getClass());

  /** the mime type identifier service. */
  private MimeTypeIdentifier _mimeTypeIdentifier;

  /** the directory where to store the temporary uncompressed files. */
  private Path _rootTmpDir;

  /**
   * service activation: reads "extractor.properties" (optional) for the ZIP entry-name encoding and the temp
   * directory, then removes temp files left over from a previous run.
   */
  protected void activate() {
    Properties props;
    try {
      props = ConfigUtils.getConfigProperties(BUNDLE_ID, "extractor.properties");
    } catch (final Exception ex) {
      // fixed: message previously omitted the properties file name.
      _log.info("No configuration " + BUNDLE_ID + "/extractor.properties found, using default settings.");
      props = new Properties();
    }
    if (props.containsKey("zip.encoding")) {
      _charset = Charset.forName(props.getProperty("zip.encoding"));
    }
    final String rootTmpDirName = props.getProperty("tmp.dir", null);
    if (rootTmpDirName != null) {
      _rootTmpDir = Paths.get(rootTmpDirName);
    } else {
      _rootTmpDir = Paths.get(FileUtils.getTempDirectoryPath()).resolve(BUNDLE_ID);
    }
    // check if there are files left from any prior run:
    try {
      FileUtils.deleteDirectory(_rootTmpDir.toFile());
    } catch (final IOException e) {
      _log.warn("Could not delete old temporary files from previous invocation");
    }
  }

  /** service deactivation: removes the temporary extraction directory, if it was ever created. */
  protected void deactivate() {
    // _rootTmpDir can be null if activate() failed before assigning it (e.g. invalid "zip.encoding").
    if (_rootTmpDir != null && Files.exists(_rootTmpDir)) {
      try {
        FileUtils.deleteDirectory(_rootTmpDir.toFile());
      } catch (final IOException e) {
        _log.warn("Could not clean up temp extraction directory.", e);
      }
    }
  }

  /** {@inheritDoc} */
  @Override
  public boolean canExtract(final File file) {
    if (file == null) {
      return false;
    }
    return canExtract(file.getName(), null);
  }

  /** {@inheritDoc} */
  @Override
  public boolean canExtract(final URL url, final String mimeType) {
    if (url == null) {
      return false;
    }
    return canExtract(url.getFile(), mimeType);
  }

  /** {@inheritDoc} */
  @Override
  public boolean canExtract(final String fileName, final String mimeType) {
    final String detectedMimeType = determineMimeType(fileName, mimeType);
    return isCompressedFile(detectedMimeType) || isArchive(detectedMimeType);
  }

  /** {@inheritDoc} */
  @Override
  public Iterator<Record> extract(final InputStream compoundInputStream, final String fileName,
    final String contentAttachmentName) throws CompoundExtractorException {
    return extract(compoundInputStream, fileName, null, contentAttachmentName);
  }

  /** {@inheritDoc} */
  @Override
  public Iterator<Record> extract(final InputStream compoundInputStream, final String fileName,
    final String mimeType, final String contentAttachmentName) throws CompoundExtractorException {
    final String extractedMimeType = determineMimeType(fileName, mimeType);
    if (!isCompressedFile(extractedMimeType) && !isArchive(extractedMimeType)) {
      // not a supported compound format: nothing to extract.
      return new ArrayList<Record>().iterator();
    }
    // each extraction run gets its own temp subdirectory, removed by the iterator when exhausted.
    final Path tmpDir = _rootTmpDir.resolve(UUID.randomUUID().toString());
    final Record newRecord = DataFactory.DEFAULT.createRecord(fileName);
    newRecord.getMetadata().put(KEY_FILE_NAME, fileName);
    newRecord.getMetadata().put(KEY_IS_ROOT_COMPOUND_RECORD, true);
    final List<Record> records;
    try {
      records =
        extractCompoundStream(compoundInputStream, newRecord, fileName, extractedMimeType, tmpDir,
          new ArrayList<String>());
    } finally {
      IOUtils.closeQuietly(compoundInputStream);
    }
    return new AttachmentSettingIterator(records, contentAttachmentName, tmpDir);
  }

  /** @return <code>true</code> if mimetype is that of a supported archive format. */
  private boolean isArchive(final String detectedMimeType) {
    return ARCHIVE_IDENTIFIERS.containsKey(detectedMimeType);
  }

  /** @return <code>true</code> if mimetype is that of a supported compression format. */
  private boolean isCompressedFile(final String detectedMimeType) {
    return COMPRESSION_IDENTIFIERS.containsKey(detectedMimeType);
  }

  /**
   * determines the mime type. If a usable mime type is given it wins; otherwise the file name extension (or the full
   * name, if there is no extension) is resolved via the {@link MimeTypeIdentifier} service.
   *
   * @param fileName
   *          the file name, may be <code>null</code>.
   * @param mimeType
   *          the given mime type, may be <code>null</code>, empty or "application/octet-stream" (all treated as
   *          "unknown").
   * @return the detected mime type, or the given mime type if detection was not possible.
   */
  private String determineMimeType(final String fileName, final String mimeType) {
    if (fileName != null) {
      if (mimeType == null || mimeType.isEmpty() || APPLICATION_OCTETSTREAM.equalsIgnoreCase(mimeType)) {
        try {
          // NOTE: lastIndexOf() never returns >= length(), so no extra bounds check is needed.
          final int indexOfPeriod = fileName.lastIndexOf('.');
          if (indexOfPeriod >= 0) {
            return _mimeTypeIdentifier.identify(fileName.substring(indexOfPeriod + 1));
          } else {
            return _mimeTypeIdentifier.identify(fileName);
          }
        } catch (final MimeTypeParseException e) {
          ; // ignore: fall through and return the given mime type unchanged.
        }
      }
    }
    return mimeType;
  }

  /**
   * extracts a compound stream recursively. Archives are iterated entry by entry, compressed files are uncompressed
   * to a temp file and re-examined (the content may itself be an archive), anything else is stored as leaf content.
   * Extraction errors are logged, not thrown, so partially extracted records are still returned.
   *
   * @param inputStream
   *          the stream to extract; not closed by this method.
   * @param record
   *          the record describing the stream; always the first element of the result.
   * @param fileName
   *          the name of the (virtual) file behind the stream.
   * @param mimeType
   *          the already determined mime type of the stream.
   * @param tmpDir
   *          directory for temporary content files.
   * @param compoundList
   *          names of the enclosing compounds, outermost first; not modified.
   * @return the given record plus one record per extracted (nested) entry.
   * @throws CompoundExtractorException
   *           on unrecoverable extraction errors.
   */
  private List<Record> extractCompoundStream(final InputStream inputStream, final Record record,
    final String fileName, final String mimeType, final Path tmpDir, final List<String> compoundList)
    throws CompoundExtractorException {
    final List<Record> records = new ArrayList<Record>();
    records.add(record);
    final List<String> newCompoundList = new ArrayList<String>(compoundList);
    if (isArchive(mimeType)) {
      record.getMetadata().put(CompoundExtractor.KEY_IS_COMPOUND, true);
      newCompoundList.add(fileName);
      try {
        if (APPLICATION_ZIP.equalsIgnoreCase(mimeType)) {
          extractZipEntries(inputStream, record, records, tmpDir, newCompoundList);
        } else {
          extractArchiveEntries(inputStream, record, records, mimeType, tmpDir, newCompoundList);
        }
      } catch (final IOException | ArchiveException e) {
        _log.warn("Cannot extract archive '" + fileName + "'.", e);
      }
    } else if (isCompressedFile(mimeType)) {
      record.getMetadata().put(CompoundExtractor.KEY_IS_COMPOUND, true);
      newCompoundList.add(fileName);
      try {
        extractCompressedFile(inputStream, record, records, fileName, mimeType, tmpDir, newCompoundList);
      } catch (final IOException | CompressorException e) {
        _log.warn("Cannot decompress compressed file '" + fileName + "'.", e);
      }
    } else {
      storeLeafContent(inputStream, record, fileName, tmpDir);
    }
    return records;
  }

  /**
   * iterates a ZIP stream via {@link ZipInputStream} and extracts each non-directory entry recursively.
   *
   * TODO: switch to commons-compress ZipArchiveInputStream (and _charset back to String) when
   * https://issues.apache.org/jira/browse/COMPRESS-219 is resolved.
   */
  private void extractZipEntries(final InputStream inputStream, final Record record, final List<Record> records,
    final Path tmpDir, final List<String> compoundList) throws IOException, CompoundExtractorException {
    final ZipInputStream archiveInputStream = new ZipInputStream(inputStream, _charset);
    ZipEntry entry = archiveInputStream.getNextEntry();
    while (entry != null) {
      if (!entry.isDirectory()) {
        final long lastModified = entry.getTime(); // -1 means "unknown"
        final Record entryRecord =
          createEntryRecord(record, entry.getName(), entry.getSize(), lastModified == -1 ? null : new Date(
            lastModified), compoundList);
        records.addAll(extractCompoundStream(new BufferedInputStream(archiveInputStream), entryRecord,
          entry.getName(), determineMimeType(entry.getName(), null), tmpDir, compoundList));
      }
      archiveInputStream.closeEntry();
      entry = archiveInputStream.getNextEntry();
    }
  }

  /**
   * iterates a non-ZIP archive stream (TAR, CPIO, JAR) and extracts each non-directory entry recursively.
   * (fixed: removed an unreachable ZipArchiveInputStream branch - ZIP is always handled by extractZipEntries.)
   */
  private void extractArchiveEntries(final InputStream inputStream, final Record record, final List<Record> records,
    final String mimeType, final Path tmpDir, final List<String> compoundList) throws IOException, ArchiveException,
    CompoundExtractorException {
    final ArchiveInputStream archiveInputStream =
      new ArchiveStreamFactory().createArchiveInputStream(ARCHIVE_IDENTIFIERS.get(mimeType), inputStream);
    ArchiveEntry entry = archiveInputStream.getNextEntry();
    while (entry != null) {
      if (!entry.isDirectory()) {
        final Record entryRecord =
          createEntryRecord(record, entry.getName(), entry.getSize(), entry.getLastModifiedDate(), compoundList);
        records.addAll(extractCompoundStream(archiveInputStream, entryRecord, entry.getName(),
          determineMimeType(entry.getName(), null), tmpDir, compoundList));
      }
      entry = archiveInputStream.getNextEntry();
    }
  }

  /**
   * creates a record for a single archive entry with file name, size, optional timestamp and compound path.
   *
   * @param lastModified
   *          the entry timestamp; <code>null</code> or epoch -1 means "unknown" and is not stored.
   */
  private Record createEntryRecord(final Record parent, final String entryName, final long size,
    final Date lastModified, final List<String> compoundList) {
    final String id = computeId(compoundList, entryName);
    final Record entryRecord = parent.getFactory().createRecord(id);
    entryRecord.getMetadata().put(CompoundExtractor.KEY_FILE_NAME, entryName);
    entryRecord.getMetadata().put(CompoundExtractor.KEY_SIZE, size);
    if (lastModified != null && lastModified.getTime() != -1) {
      entryRecord.getMetadata().put(CompoundExtractor.KEY_TIME,
        parent.getFactory().createDateTimeValue(lastModified));
    }
    entryRecord.getMetadata().put(CompoundExtractor.KEY_COMPOUNDS, AnyUtil.objectToAny(compoundList));
    return entryRecord;
  }

  /**
   * uncompresses a single-file compression stream (GZIP, BZIP2) to a temp file and extracts the result recursively,
   * since the contained file may itself be an archive or compressed file. The temp file is deleted afterwards.
   */
  private void extractCompressedFile(final InputStream inputStream, final Record record, final List<Record> records,
    final String fileName, final String mimeType, final Path tmpDir, final List<String> compoundList)
    throws IOException, CompressorException, CompoundExtractorException {
    // do not close this stream here: that would close the underlying stream, which the caller owns.
    final CompressorInputStream compressorInputStream =
      new CompressorStreamFactory().createCompressorInputStream(COMPRESSION_IDENTIFIERS.get(mimeType), inputStream);
    // derive the contained file's name by stripping the compression suffix.
    final String tmpName = Paths.get(fileName).getFileName().toString();
    final int lastIndexOfPeriod = tmpName.lastIndexOf('.');
    String newFileName;
    String suffix;
    if (lastIndexOfPeriod >= 0) {
      newFileName = tmpName.substring(0, lastIndexOfPeriod);
      suffix = tmpName.substring(lastIndexOfPeriod);
    } else {
      newFileName = "";
      suffix = tmpName;
    }
    if (SUFFIX_TGZ.equalsIgnoreCase(suffix)) {
      // ".tgz" is shorthand for a gzipped TAR: the contained file is a ".tar".
      newFileName += SUFFIX_TAR;
    }
    final Path tmpFile = writeTempFile(compressorInputStream, newFileName, tmpDir);
    final String id = computeId(compoundList, newFileName);
    final Record extractedCompoundRecord = record.getFactory().createRecord(id);
    extractedCompoundRecord.getMetadata().put(KEY_SIZE, Files.size(tmpFile));
    extractedCompoundRecord.getMetadata().put(KEY_FILE_NAME, newFileName);
    // copy creation time.
    if (record.getMetadata().containsKey(KEY_TIME)) {
      extractedCompoundRecord.getMetadata().put(KEY_TIME, record.getMetadata().get(KEY_TIME));
    }
    extractedCompoundRecord.getMetadata().put(CompoundExtractor.KEY_COMPOUNDS, AnyUtil.objectToAny(compoundList));
    try (final InputStream fis = Files.newInputStream(tmpFile)) {
      // and go on, there might be some other compressed files inside of this one.
      records.addAll(extractCompoundStream(fis, extractedCompoundRecord, newFileName,
        determineMimeType(newFileName, null), tmpDir, compoundList));
    } finally {
      Files.deleteIfExists(tmpFile);
    }
  }

  /**
   * stores non-compound content in a temp file and records its path and size. Errors are logged, not thrown.
   */
  private void storeLeafContent(final InputStream inputStream, final Record record, final String fileName,
    final Path tmpDir) {
    try {
      final Path tmpFile = writeTempFile(inputStream, fileName, tmpDir);
      record.getMetadata().put(KEY_TMP_FILE_NAME, tmpFile.toRealPath().toString());
      record.getMetadata().put(KEY_SIZE, Files.size(tmpFile));
    } catch (final IOException e) {
      _log.warn("Cannot store compound content '" + fileName + "'.", e);
    }
  }

  /**
   * copies a stream to a uniquely named temporary file below the given directory, creating the directory if needed.
   */
  private Path writeTempFile(final InputStream inputStream, final String newFileName, final Path tmpDir)
    throws IOException {
    final Path tmpFile = tmpDir.resolve(UUID.randomUUID().toString() + newFileName);
    if (!Files.exists(tmpFile.getParent())) {
      Files.createDirectories(tmpFile.getParent());
    }
    Files.copy(inputStream, tmpFile);
    return tmpFile;
  }

  /** computes the record id by joining the enclosing compound names and the entry name with '/'. */
  private String computeId(final List<String> newCompoundList, final String name) {
    final StringBuilder idBuilder = new StringBuilder();
    for (final String compound : newCompoundList) {
      idBuilder.append(compound).append('/');
    }
    idBuilder.append(name);
    return idBuilder.toString();
  }

  /** DS service reference injection method. */
  public void setMimeTypeIdentifier(final MimeTypeIdentifier mimeTypeIdentifier) {
    this._mimeTypeIdentifier = mimeTypeIdentifier;
  }

  /** DS service reference injection method. */
  public void unsetMimeTypeIdentifier(final MimeTypeIdentifier mimeTypeIdentifier) {
    if (this._mimeTypeIdentifier == mimeTypeIdentifier) {
      this._mimeTypeIdentifier = null;
    }
  }
}