| /*********************************************************************************************************************** |
| * Copyright (c) 2008,2012 Attensity Europe GmbH and brox IT Solutions GmbH. All rights reserved. This program and the |
| * accompanying materials are made available under the terms of the Eclipse Public License v1.0 which accompanies this |
| * distribution, and is available at http://www.eclipse.org/legal/epl-v10.html |
| * |
| * Contributors: Andreas Schank (Attensity Europe GmbH) - initial API and implementation |
| **********************************************************************************************************************/ |
| package org.eclipse.smila.importing.compounds.compress; |
| |
| import java.io.BufferedInputStream; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.net.URL; |
| import java.nio.charset.Charset; |
| import java.nio.charset.StandardCharsets; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| import java.util.ArrayList; |
| import java.util.Date; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Properties; |
| import java.util.UUID; |
| import java.util.zip.ZipEntry; |
| import java.util.zip.ZipInputStream; |
| |
| import org.apache.commons.compress.archivers.ArchiveEntry; |
| import org.apache.commons.compress.archivers.ArchiveException; |
| import org.apache.commons.compress.archivers.ArchiveInputStream; |
| import org.apache.commons.compress.archivers.ArchiveStreamFactory; |
| import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; |
| import org.apache.commons.compress.compressors.CompressorException; |
| import org.apache.commons.compress.compressors.CompressorInputStream; |
| import org.apache.commons.compress.compressors.CompressorStreamFactory; |
| import org.apache.commons.io.FileUtils; |
| import org.apache.commons.io.IOUtils; |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.eclipse.smila.common.mimetype.MimeTypeIdentifier; |
| import org.eclipse.smila.common.mimetype.MimeTypeParseException; |
| import org.eclipse.smila.datamodel.DataFactory; |
| import org.eclipse.smila.datamodel.Record; |
| import org.eclipse.smila.datamodel.util.AnyUtil; |
| import org.eclipse.smila.importing.compounds.CompoundExtractor; |
| import org.eclipse.smila.importing.compounds.CompoundExtractorException; |
| import org.eclipse.smila.utils.config.ConfigUtils; |
| |
| /** |
| * Extractor service using commons compress. |
| */ |
| public class CommonsCompressCompoundExtractorService implements CompoundExtractor { |
| |
| /** the default encoding. */ |
| protected static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8; |
| |
| /** mime type that can mean almost anything and therefore will be ommitted. */ |
| protected static final String APPLICATION_OCTETSTREAM = "application/octet-stream"; |
| |
| /** constant for "tar" suffix. */ |
| protected static final String SUFFIX_TAR = ".tar"; |
| |
| /** constant for "tgz" suffix. */ |
| protected static final String SUFFIX_TGZ = ".tgz"; |
| |
| /** mime type for ZIP. Zips must be handles specifically because of encoding issues. */ |
| protected static final String APPLICATION_ZIP = "application/zip"; |
| |
| /** map from mime type to compressor stream factory identifier. */ |
| protected static final Map<String, String> COMPRESSION_IDENTIFIERS = new HashMap<String, String>(); |
| |
| /** map from mime type to archive stream factory identifier. */ |
| protected static final Map<String, String> ARCHIVE_IDENTIFIERS = new HashMap<String, String>(); |
| |
| /** key for temporary file name. */ |
| protected static final String KEY_TMP_FILE_NAME = "tmpFileName"; |
| |
| /** bundle ID for configuration area access. */ |
| private static final String BUNDLE_ID = "org.eclipse.smila.importing.compounds.compress"; |
| |
| /** encoding. */ |
| protected Charset _charset = DEFAULT_CHARSET; |
| |
| /** log. */ |
| private final Log _log = LogFactory.getLog(getClass()); |
| |
| /** the mime type identifier service . */ |
| private MimeTypeIdentifier _mimeTypeIdentifier; |
| |
| /** the directory where to store the temporary uncompressed files. */ |
| private Path _rootTmpDir; |
| |
| /** service activation. */ |
| protected void activate() { |
| Properties props; |
| try { |
| props = ConfigUtils.getConfigProperties(BUNDLE_ID, "extractor.properties"); |
| } catch (final Exception ex) { |
| _log.info("No configuration " + BUNDLE_ID + "/" + " found, using default settings."); |
| props = new Properties(); |
| } |
| if (props.containsKey("zip.encoding")) { |
| _charset = Charset.forName(props.getProperty("zip.encoding")); |
| } |
| final String rootTmpDirName = props.getProperty("tmp.dir", null); |
| if (rootTmpDirName != null) { |
| _rootTmpDir = Paths.get(rootTmpDirName); |
| } else { |
| _rootTmpDir = Paths.get(FileUtils.getTempDirectoryPath()).resolve(BUNDLE_ID); |
| } |
| // check if there are files left from any prior run: |
| try { |
| FileUtils.deleteDirectory(_rootTmpDir.toFile()); |
| } catch (final IOException e) { |
| _log.warn("Could not delete old temporary files from previous invocation"); |
| } |
| } |
| |
| /** service deactivation. */ |
| protected void deactivate() { |
| if (Files.exists(_rootTmpDir)) { |
| try { |
| FileUtils.deleteDirectory(_rootTmpDir.toFile()); |
| } catch (final IOException e) { |
| _log.warn("Could not clean up temp extraction directory.", e); |
| } |
| } |
| } |
| |
// Registers the supported mime types with their commons-compress stream factory identifiers.
static {
  // compressors
  for (final String bzip2Type : new String[] { "application/x-bzip", "application/bzip2" }) {
    COMPRESSION_IDENTIFIERS.put(bzip2Type, CompressorStreamFactory.BZIP2);
  }
  for (final String gzipType : new String[] { "application/x-gtar", "application/x-gzip",
    "application/x-gunzip", "application/gzip" }) {
    COMPRESSION_IDENTIFIERS.put(gzipType, CompressorStreamFactory.GZIP);
  }

  // archivers
  ARCHIVE_IDENTIFIERS.put(APPLICATION_ZIP, ArchiveStreamFactory.ZIP);
  for (final String tarType : new String[] { "application/x-tar", "application/tar", "application/us-tar" }) {
    ARCHIVE_IDENTIFIERS.put(tarType, ArchiveStreamFactory.TAR);
  }
  for (final String cpioType : new String[] { "application/cpio", "application/x-cpio", "application/x-bcpio",
    "application/x-sv4cpio" }) {
    ARCHIVE_IDENTIFIERS.put(cpioType, ArchiveStreamFactory.CPIO);
  }
  ARCHIVE_IDENTIFIERS.put("application/java-archive", ArchiveStreamFactory.JAR);
}
| |
| /** {@inheritDoc} */ |
| @Override |
| public boolean canExtract(final File file) { |
| if (file == null) { |
| return false; |
| } |
| return canExtract(file.getName(), null); |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| public boolean canExtract(final URL url, final String mimeType) { |
| if (url == null) { |
| return false; |
| } |
| return canExtract(url.getFile(), mimeType); |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| public boolean canExtract(final String fileName, final String mimeType) { |
| final String detectedMimeType = determineMimeType(fileName, mimeType); |
| return isCompressedFile(detectedMimeType) || isArchive(detectedMimeType); |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| public Iterator<Record> extract(final InputStream compoundInputStream, final String fileName, |
| final String contentAttachmentName) throws CompoundExtractorException { |
| return extract(compoundInputStream, fileName, null, contentAttachmentName); |
| } |
| |
| /** {@inheritDoc} */ |
| @Override |
| public Iterator<Record> extract(final InputStream compoundInputStream, final String fileName, |
| final String mimeType, final String contentAttachmentName) throws CompoundExtractorException { |
| final String extractedMimeType = determineMimeType(fileName, mimeType); |
| if (!canExtract((String) null, extractedMimeType)) { |
| return new ArrayList<Record>().iterator(); |
| } |
| |
| final Path tmpDir = _rootTmpDir.resolve(UUID.randomUUID().toString()); |
| final Record newRecord = DataFactory.DEFAULT.createRecord(fileName); |
| newRecord.getMetadata().put(KEY_FILE_NAME, fileName); |
| newRecord.getMetadata().put(KEY_IS_ROOT_COMPOUND_RECORD, true); |
| final List<Record> records; |
| try { |
| records = |
| extractCompoundStream(compoundInputStream, newRecord, fileName, extractedMimeType, tmpDir, |
| new ArrayList<String>()); |
| } finally { |
| IOUtils.closeQuietly(compoundInputStream); |
| } |
| return new AttachmentSettingIterator(records, contentAttachmentName, tmpDir); |
| } |
| |
| /** @return <code>true</code> if mimetype is that of a supported archive format. */ |
| private boolean isArchive(final String detectedMimeType) { |
| return ARCHIVE_IDENTIFIERS.containsKey(detectedMimeType); |
| } |
| |
| /** @return <code>true</code> if mimetype is that of a supported compression format. */ |
| private boolean isCompressedFile(final String detectedMimeType) { |
| return COMPRESSION_IDENTIFIERS.containsKey(detectedMimeType); |
| } |
| |
| /** determines the mime type. */ |
| private String determineMimeType(final String fileName, final String mimeType) { |
| if (fileName != null) { |
| if (mimeType == null || mimeType.isEmpty() || APPLICATION_OCTETSTREAM.equalsIgnoreCase(mimeType)) { |
| try { |
| final int indexOfPeriod = fileName.lastIndexOf('.'); |
| if (indexOfPeriod >= 0 && fileName.length() > indexOfPeriod) { |
| return _mimeTypeIdentifier.identify(fileName.substring(indexOfPeriod + 1)); |
| } else { |
| return _mimeTypeIdentifier.identify(fileName); |
| } |
| } catch (final MimeTypeParseException e) { |
| ; // ignore |
| } |
| } |
| } |
| return mimeType; |
| } |
| |
| /** |
| * extracts a compressed stream. |
| * |
| * @throws CompoundExtractorException |
| */ |
| private List<Record> extractCompoundStream(final InputStream inputStream, final Record record, |
| final String fileName, final String mimeType, final Path tmpDir, final List<String> compoundList) |
| throws CompoundExtractorException { |
| final List<Record> records = new ArrayList<Record>(); |
| records.add(record); |
| final List<String> newCompoundList = new ArrayList<String>(compoundList); |
| if (isArchive(mimeType)) { |
| record.getMetadata().put(CompoundExtractor.KEY_IS_COMPOUND, true); |
| newCompoundList.add(fileName); |
| try { |
| // TODO: remove this if block when https://issues.apache.org/jira/browse/COMPRESS-219 is resolved |
| // switch _charset back to string, if you remove this block |
| if (APPLICATION_ZIP.equalsIgnoreCase(mimeType)) { |
| final ZipInputStream archiveInputStream = new ZipInputStream(inputStream, _charset); |
| ZipEntry entry = archiveInputStream.getNextEntry(); |
| while (entry != null) { |
| if (!entry.isDirectory()) { |
| final String id = computeId(newCompoundList, entry.getName()); |
| final Record entryRecord = record.getFactory().createRecord(id); |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_FILE_NAME, entry.getName()); |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_SIZE, entry.getSize()); |
| final long lastModified = entry.getTime(); |
| if (lastModified != -1) { |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_TIME, |
| record.getFactory().createDateTimeValue(new Date(lastModified))); |
| } |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_COMPOUNDS, AnyUtil.objectToAny(newCompoundList)); |
| records.addAll(extractCompoundStream(new BufferedInputStream(archiveInputStream), entryRecord, |
| entry.getName(), determineMimeType(entry.getName(), null), tmpDir, newCompoundList)); |
| } |
| archiveInputStream.closeEntry(); |
| entry = archiveInputStream.getNextEntry(); |
| } |
| } else { |
| final ArchiveInputStream archiveInputStream; |
| if (APPLICATION_ZIP.equalsIgnoreCase(mimeType)) { |
| archiveInputStream = new ZipArchiveInputStream(inputStream, _charset.name(), true, true); |
| } else { |
| archiveInputStream = |
| new ArchiveStreamFactory().createArchiveInputStream(ARCHIVE_IDENTIFIERS.get(mimeType), inputStream); |
| } |
| ArchiveEntry entry = archiveInputStream.getNextEntry(); |
| while (entry != null) { |
| if (!entry.isDirectory()) { |
| final String id = computeId(newCompoundList, entry.getName()); |
| final Record entryRecord = record.getFactory().createRecord(id); |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_FILE_NAME, entry.getName()); |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_SIZE, entry.getSize()); |
| final Date lastModified = entry.getLastModifiedDate(); |
| if (lastModified != null && lastModified.getTime() != -1) { |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_TIME, |
| record.getFactory().createDateTimeValue(lastModified)); |
| } |
| entryRecord.getMetadata().put(CompoundExtractor.KEY_COMPOUNDS, AnyUtil.objectToAny(newCompoundList)); |
| records.addAll(extractCompoundStream(archiveInputStream, entryRecord, entry.getName(), |
| determineMimeType(entry.getName(), null), tmpDir, newCompoundList)); |
| } |
| entry = archiveInputStream.getNextEntry(); |
| } |
| } |
| } catch (final IOException e) { |
| _log.warn("Cannot extract archive '" + fileName + "'.", e); |
| } catch (final ArchiveException e) { |
| _log.warn("Cannot extract archive '" + fileName + "'.", e); |
| } |
| } else if (isCompressedFile(mimeType)) { |
| record.getMetadata().put(CompoundExtractor.KEY_IS_COMPOUND, true); |
| newCompoundList.add(fileName); |
| try { |
| final CompressorInputStream compressorInputStream = |
| new CompressorStreamFactory().createCompressorInputStream(COMPRESSION_IDENTIFIERS.get(mimeType), |
| inputStream); |
| final Path tmp = Paths.get(fileName); |
| final String tmpName = tmp.getFileName().toString(); |
| final int lastIndexOfPeriod = tmpName.lastIndexOf('.'); |
| String newFileName; |
| String suffix; |
| if (lastIndexOfPeriod >= 0 && tmpName.length() > lastIndexOfPeriod) { |
| newFileName = tmpName.substring(0, lastIndexOfPeriod); |
| suffix = tmpName.substring(lastIndexOfPeriod); |
| } else { |
| newFileName = ""; |
| suffix = tmpName; |
| } |
| if (SUFFIX_TGZ.equalsIgnoreCase(suffix)) { |
| newFileName += SUFFIX_TAR; |
| } |
| final Path tmpFile = writeTempFile(compressorInputStream, newFileName, tmpDir); |
| |
| final String id = computeId(newCompoundList, newFileName); |
| final Record extractedCompoundRecord = record.getFactory().createRecord(id); |
| extractedCompoundRecord.getMetadata().put(KEY_SIZE, Files.size(tmpFile)); |
| extractedCompoundRecord.getMetadata().put(KEY_FILE_NAME, newFileName); |
| // copy creation time. |
| if (record.getMetadata().containsKey(KEY_TIME)) { |
| extractedCompoundRecord.getMetadata().put(KEY_TIME, record.getMetadata().get(KEY_TIME)); |
| } |
| extractedCompoundRecord.getMetadata().put(CompoundExtractor.KEY_COMPOUNDS, |
| AnyUtil.objectToAny(newCompoundList)); |
| try (final InputStream fis = Files.newInputStream(tmpFile);) { |
| // and go on, there might be some other compressed files inside of this one. |
| records.addAll(extractCompoundStream(fis, extractedCompoundRecord, newFileName, |
| determineMimeType(newFileName, null), tmpDir, newCompoundList)); |
| } finally { |
| Files.deleteIfExists(tmpFile); |
| } |
| } catch (final IOException e) { |
| _log.warn("Cannot decompress compressed file '" + fileName + "'.", e); |
| } catch (final CompressorException e) { |
| _log.warn("Cannot decompress compressed file '" + fileName + "'.", e); |
| } |
| } else { |
| try { |
| final Path tmpFile = writeTempFile(inputStream, fileName, tmpDir); |
| record.getMetadata().put(KEY_TMP_FILE_NAME, tmpFile.toRealPath().toString()); |
| record.getMetadata().put(KEY_SIZE, Files.size(tmpFile)); |
| } catch (final IOException e) { |
| _log.warn("Cannot store compound content '" + fileName + "'.", e); |
| } |
| } |
| return records; |
| } |
| |
| /** Uncompresses a stream to a temporary file. */ |
| private Path writeTempFile(final InputStream inputStream, final String newFileName, final Path tmpDir) |
| throws IOException { |
| final Path tmpFile = tmpDir.resolve(UUID.randomUUID().toString() + newFileName); |
| if (!Files.exists(tmpFile.getParent())) { |
| Files.createDirectories(tmpFile.getParent()); |
| } |
| Files.copy(inputStream, tmpFile); |
| return tmpFile; |
| } |
| |
| /** computes the id from the compoundlist and the name. */ |
| private String computeId(final List<String> newCompoundList, final String name) { |
| final StringBuilder idBuilder = new StringBuilder(); |
| for (final String compound : newCompoundList) { |
| idBuilder.append(compound).append('/'); |
| } |
| idBuilder.append(name); |
| return idBuilder.toString(); |
| } |
| |
| /** DS service reference injection method. */ |
| public void setMimeTypeIdentifier(final MimeTypeIdentifier mimeTypeIdentifier) { |
| this._mimeTypeIdentifier = mimeTypeIdentifier; |
| } |
| |
| /** DS service reference injection method. */ |
| public void unsetMimeTypeIdentifier(final MimeTypeIdentifier mimeTypeIdentifier) { |
| if (this._mimeTypeIdentifier == mimeTypeIdentifier) { |
| this._mimeTypeIdentifier = null; |
| } |
| } |
| } |