blob: 37aeee5ab9ac63abf0aae6f35ba563aefa295e11 [file] [log] [blame]
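/*****************************************************************************
 * Copyright (c) 2010 Atos Origin.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *  Anne Haugommard (Atos Origin) - Initial API and implementation
 *  Antonio Campesino Robles (Ericsson) - Bug 478883
 *****************************************************************************/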
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Pattern;
import javax.xml.namespace.NamespaceContext;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.eclipse.gendoc.document.parser.documents.Document;
import org.eclipse.gendoc.document.parser.documents.Document.CONFIGURATION;
import org.eclipse.gendoc.document.parser.documents.docx.DocxDocument;
import org.eclipse.gendoc.document.parser.documents.docx.DocxNamespaceContext;
import org.eclipse.gendoc.documents.IAdditionalResourceService;
import org.eclipse.gendoc.documents.IMimeHtmlService;
import org.eclipse.gendoc.documents.ITableService;
import org.eclipse.gendoc.documents.XMLDocumentService;
import org.eclipse.gendoc.documents.metadata.IDocumentMetadataService;
import org.eclipse.gendoc.tags.handlers.impl.RegisteredTags;
import org.w3c.dom.Attr;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/** Specific service for DOCX document management. */
public class DOCXDocumentService extends XMLDocumentService implements IDocumentMetadataService
/** Service for additional resources; exposed through getAdditionalResourceService(). */
IAdditionalResourceService additionalResourceService;
/** MIME HTML service; exposed through getMimeHtmlService(). */
IMimeHtmlService mimehtmlservice;
/** Label of the table element; returned by getTableLabel(). */
private final String TAG_TABLE = "w:tbl";
/** Identifier of this service; read and written through getServiceId() / setServiceId(). */
private String serviceId;
/** Shared factory from which saveMetadata() obtains its XML transformer. */
private static final TransformerFactory TRANSFORMER_FACTORY = TransformerFactory.newInstance();
/** Table service; exposed through getTableService(). */
private ITableService tableService;
/**
 * Creates the service and instantiates its DOCX-specific additional-resource,
 * MIME HTML and table services.
 */
public DOCXDocumentService()
additionalResourceService = new DOCXAdditionalResourceService();
mimehtmlservice = new DOCXMimeHtmlService();
tableService = new DOCXTableService();
/**
 * Creates the service for a given document and instantiates the same three
 * DOCX-specific helper services as the no-argument constructor.
 *
 * NOTE(review): no visible statement uses {@code document}; a call forwarding it
 * to the superclass may be missing from this copy - confirm.
 *
 * @param document the document this service is created for
 */
public DOCXDocumentService(Document document)
additionalResourceService = new DOCXAdditionalResourceService();
mimehtmlservice = new DOCXMimeHtmlService();
tableService = new DOCXTableService();
/**
 * Returns the label of the list element.
 *
 * @return always {@code null} in this implementation
 */
public String getListLabel()
return null;
/**
 * Tells whether the given label designates a list.
 *
 * @param label element label to test (ignored)
 * @return always {@code false} in this implementation
 */
public boolean isList(String label)
return false;
/**
 * Tells whether the given label designates a list item.
 *
 * @param label element label to test (ignored)
 * @return always {@code false} in this implementation
 */
public boolean isListItem(String label)
return false;
/**
 * Tells whether the given label is treated as a paragraph.
 *
 * @param label element label to test; may be {@code null}
 * @return {@code true} when the label is "w:p" or "w:bookmarkEnd"
 */
public boolean isPara(String label)
// Literal-first equals() keeps a null label from raising an exception.
return "w:p".equals(label) || "w:bookmarkEnd".equals(label);
/**
 * Tells whether the given label designates a table.
 *
 * @param label element label to test; may be {@code null}
 * @return {@code true} when the label is "w:tbl"
 */
public boolean isTable(String label)
return "w:tbl".equals(label);
/**
 * Tells whether the given label designates a table row.
 *
 * @param label element label to test; may be {@code null}
 * @return {@code true} when the label is "w:tr"
 */
public boolean isRow(String label)
return "w:tr".equals(label);
/**
 * Returns the label of the table row element.
 *
 * @return the constant "w:tr"
 */
public String getRowLabel() {
return "w:tr";
/**
 * Returns the label of the table cell element.
 *
 * @return the constant "w:tc"
 */
public String getCellLabel()
return "w:tc";
/**
 * Tells whether the given label designates a table cell.
 *
 * @param label element label to test; may be {@code null}
 * @return {@code true} when the label is "w:tc"
 */
public boolean isCell(String label)
return "w:tc".equals(label);
/**
 * Returns the label this service reports as its text style.
 *
 * @return the constant "w:t"
 */
public String getTextStyle()
return "w:t";
/**
 * Returns the labels this service reports as text tags.
 *
 * @return a newly allocated array holding "w:p" and "w:tbl"
 */
public String[] getTextTagLabels()
return new String[] {"w:p", "w:tbl"};
/**
 * Returns the namespace declarations of this document type as a string of
 * {@code xmlns:} attributes.
 *
 * NOTE(review): in this copy the {@code w} and {@code r} prefixes are bound to
 * empty URIs; the real namespace URIs appear to have been lost - confirm
 * against the upstream file.
 *
 * @return the declaration string for the {@code w} and {@code r} prefixes
 */
public String getNamingSpaceURL()
// Add name space for validating Office 2010 embedded images
return "xmlns:w=\"\" xmlns:r=\"\"";
/**
 * Returns the namespace context of this document type.
 *
 * @return a new {@link DocxNamespaceContext} on every call
 */
public NamespaceContext getNameSpaceContext()
return new DocxNamespaceContext();
/**
 * Saves a DOCX document: writes its XML content back through
 * insertDocumentInFile(), then zips it to the given location.
 *
 * @param document document to save; must be a {@link DocxDocument}
 * @param path location handed to {@code zipToLocation}
 * @throws DocumentServiceException if {@code document} is not a {@link DocxDocument}
 */
public void saveDocument(Document document, String path) throws DocumentServiceException
if (!(document instanceof DocxDocument))
throw new DocumentServiceException("Document is not a valid DOCX document.");
// The instanceof check above makes both casts safe.
insertDocumentInFile((DocxDocument) document);
((DocxDocument) document).zipToLocation(path);
/**
 * Returns the additional-resource service created by the constructor.
 *
 * @return the {@code additionalResourceService} field
 */
public IAdditionalResourceService getAdditionalResourceService()
return additionalResourceService;
/**
 * Returns the MIME HTML service created by the constructor.
 *
 * @return the {@code mimehtmlservice} field
 */
public IMimeHtmlService getMimeHtmlService()
return mimehtmlservice;
/**
 * @throws TransformerFactoryConfigurationError
 */
// Serializes the DOM of the document's current XML part to a file under
// "<unzip location>/word/", using an identity transformer with UTF-8 output.
// The trailing while (document.jumpToNextFile()) suggests this is repeated for
// each part; the enclosing try/do keywords are not visible in this copy.
// NOTE(review): a new TransformerFactory is created here although the class
// already holds the static TRANSFORMER_FACTORY.
// NOTE(review): no statement is visible in either catch clause - confirm the
// exceptions are not silently swallowed.
private void insertDocumentInFile(DocxDocument document)
// back to the beginning
DOMSource domSource = new DOMSource(document.getXMLParser().getDocument());
// Destination: the file named like the parsed XML file, inside the "word" folder of the unzipped document.
StreamResult fluxDestination = new StreamResult(new File(document.getUnzipLocationDocumentFile().getAbsolutePath() + "/word/" + document.getXMLParser().getXmlFile().getName()));
TransformerFactory fabrique = TransformerFactory.newInstance();
// A transformer created without a stylesheet copies the source to the result unchanged.
Transformer transformationIdentite = fabrique.newTransformer();
// DO NOT INDENT => it causes extra spaces
// transformationIdentite.setOutputProperty(OutputKeys.INDENT, "yes");
transformationIdentite.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
transformationIdentite.transform(domSource, fluxDestination);
while (document.jumpToNextFile());
catch (TransformerConfigurationException e1)
catch (TransformerException e2)
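/**
 * @param currentNode subtree in which the clean is done.
 * @param tagLabels list of known tag labels
 * @param baseNode Node on which to start
 * @return the node obtained by injecting the cleaned content in place of the base node,
 *         or null when no node containing the start of a known tag is found
 * @throws InvalidContentException
 */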
// Finds, starting at baseNode, the first node whose text contains one of the
// known tag labels, gathers following nodes while the tag is not complete,
// then replaces the base node with the cleaned, re-assembled content.
// Returns null when baseNode is null or no matching node is found.
// NOTE(review): several statements seem to be missing from this copy (see the
// notes below); this walkthrough covers only what is visible.
protected Node cleanTags(Node currentNode, List<String> tagLabels, Node baseNode) throws InvalidContentException
if (baseNode == null)
return null;
// 2. Check that this node contains the start of a valid tag label
StringBuffer newNodeContent = new StringBuffer(extractNodeTextValue(baseNode));
// Advance through nodes until one whose text contains a known tag label is found.
while (baseNode != null && !containsOneOf(tagLabels, newNodeContent.toString()))
baseNode = findNodeWithStartTag(baseNode, currentNode);
if (baseNode != null)
newNodeContent = new StringBuffer(extractNodeTextValue(baseNode));
if (baseNode == null)
return null;
// 3. Base node is found AND matches a valid tag => Check tag closure
boolean isCompleteTag = containsFullTags(newNodeContent.toString(), tagLabels);
// 4. If tag not closed :
if (!isCompleteTag)
// Find all nodes matching the base node label
NodeList followingNodes = getNextNodes(baseNode, baseNode.getNodeName());
List<Node> nodesToRemove = new ArrayList<Node>();
if (followingNodes != null)
// Append text values of all these nodes until tag closure is found
for (int i = 0; i < followingNodes.getLength(); i++)
Node followingNode = followingNodes.item(i);
String textValue = extractNodeTextValue(followingNode);
//Node nodeToRemove = getBestAscendantUntil(currentNode, followingNodes.item(i));
Node nodeToRemove = getBestAscendantUntil(currentNode, followingNode);
// Fall back to the following node itself when no ascendant is returned, but never select the base node.
if (nodeToRemove == null && followingNode != baseNode) {
nodeToRemove = followingNode;
// NOTE(review): the statement guarded by this test is not visible - presumably it adds
// nodeToRemove to nodesToRemove; the use of textValue is not visible either. Confirm.
if (!nodesToRemove.contains(nodeToRemove))
if (containsFullTags(newNodeContent.toString(), tagLabels))
isCompleteTag = true;
// Remove all nodes that are not useful anymore from initial current Node
// NOTE(review): the removal statement itself is not visible in this copy.
for (Node nodeToRemove : nodesToRemove)
if (nodeToRemove != null)
// Replace content of base node with the text stored in "textContent" variable
String[] separated = asText(baseNode).split(XML_TAG_START + "|" + XML_TAG_END);
if (separated != null && separated.length > 1)
// Wrap the buffer between separated[1] and the last piece, each enclosed in
// XML_TAG_START / XML_TAG_END - presumably the base node's own opening and
// closing tags; TODO confirm.
newNodeContent.insert(0, XML_TAG_START + separated[1] + XML_TAG_END);
newNodeContent.append(XML_TAG_START + separated[separated.length - 1] + XML_TAG_END);
// Replace invalid characters
String nodeContent = cleanXMLContent(newNodeContent.toString());
// Replace base node by the value of the buffer
Node result = injectNode(baseNode, nodeContent);
return result;
/**
 * No similar tags in docx => always return false
 *
 * @see XMLDocumentService#areSimilarTags(java.lang.String, java.lang.String)
 */
// Tells whether two tag names are considered similar.
// Both arguments are ignored: this implementation always returns false.
protected boolean areSimilarTags(String tagName1, String tagName2) {
return false;
/**
 * Looks in a tag stack for a tag similar to the given one.
 *
 * @param tagStack stack of tag names (ignored)
 * @param tagName tag name to look for (ignored)
 * @return always {@code null} in this implementation
 */
protected String containsSimilarTag (Stack<String> tagStack, String tagName)
return null;
/**
 * Returns the label of the table element.
 *
 * @return the value of {@code TAG_TABLE} ("w:tbl")
 */
public String getTableLabel()
return TAG_TABLE;
/**
 * Returns the identifier of this service.
 *
 * @return the value last given to setServiceId(), or {@code null} if none was set
 */
public String getServiceId ()
return this.serviceId;
/**
 * Sets the identifier of this service.
 *
 * @param serviceId identifier to store
 */
public void setServiceId (String serviceId)
this.serviceId = serviceId;
/**
 * Pattern matching, in order: an escaped self-closing tag named
 * {@code RegisteredTags.NOBR} (written with {@code &lt;} and {@code &gt;}),
 * the shortest run of text up to the next {@code </w:t>}, and the shortest run
 * of text up to the next opening {@code <w:t ...>} tag. DOTALL lets the lazy
 * runs span line breaks.
 */
protected static Pattern NOBR_REPLACE_PATTERN = Pattern.compile("(?:&lt;\\s*" + RegisteredTags.NOBR + "\\s*/\\s*&gt;)(?:.*?)(?:</w:t>)(?:.*?)(?:<w:t[^>]*>)", Pattern.DOTALL | Pattern.MULTILINE);
/**
 * Returns the pattern used for the "nobr" replacement.
 *
 * NOTE(review): the body is not visible in this copy - presumably it returns
 * {@code NOBR_REPLACE_PATTERN}; confirm.
 */
public Pattern getNobrReplacePattern()
/**
 * Returns the identifier of the list a node belongs to.
 *
 * @param n node to inspect (ignored)
 * @return always {@code null} in this implementation
 */
public String getListId(Node n)
return null;
/**
 * Returns the content needed to continue a list.
 *
 * @param currentNode current node (ignored)
 * @param idList list identifier (ignored)
 * @return always {@code null} in this implementation
 * @throws InvalidContentException declared by the signature; never thrown by the visible body
 */
public String getContinueList(Node currentNode, String idList) throws InvalidContentException
return null;
/* (non-Javadoc)
 * @see org.eclipse.gendoc.documents.IDocumentService#format(java.lang.String)
 */
// Converts control characters of the input into WordprocessingML fragments and
// returns the result. Each fragment closes the current <w:t> text element,
// inserts a break or tab element, then opens a new <w:t>.
// NOTE(review): only the "\r" replacement is visible in this copy. The
// receiver of the chain (presumably `input`) and the replacements announced by
// the other comments (CR+LF, line feed, tabulation) are missing - confirm
// against the upstream file.
public String format(String input) {
// Do not use paragraph break : styles are lost
// (PARAGRAPH_BREAK is declared but unused in the visible statements.)
String PARAGRAPH_BREAK = "</w:t></w:r></w:p><w:p><w:r><w:t>";
String LINE_BREAK = "</w:t><w:br/><w:t>";
String TAB = "</w:t><w:tab/><w:t>";
String formatted=
// handle carriage return mixed with line feed as carriage returns
// handle carriage return
.replace("\r", LINE_BREAK)
//handle line feed
//handle tabulation
return formatted;
/**
 * Collects metadata property identifiers from a metadata XML part.
 * Two parts are handled, selected by file name: "custom.xml" and "core.xml".
 * For any other file name the returned list is empty.
 *
 * NOTE(review): the XPath expressions, the namespace URIs, the statements that
 * fill {@code res} and the body of the {@code getDocument() == null} test are
 * not visible in this copy - confirm against the upstream file.
 *
 * @param doc document whose current XML file is inspected
 * @return the list of collected property identifiers
 */
public List<String> getMetadataProperties(Document doc) {
if (getDocument() == null) {
// Dispatch on the name of the XML file currently parsed.
String docName = doc.getXMLParser().getXmlFile().getName();
ArrayList<String> res = new ArrayList<String>();
if (docName.equals("custom.xml")) {
NodeList nl = doc.getXMLParser().getNodesFromXPathExpression(
new DocxNamespaceContextEx(new String[] {
"def", "",
for (int i=0; i<nl.getLength(); i++) {
Element e = (Element)nl.item(i);
} else if (docName.equals("core.xml")) {
NodeList nl = doc.getXMLParser().getNodesFromXPathExpression(
new DocxNamespaceContextEx(new String[] {
"cp", "",
"dc", "",
"dcterms", "",
for (int i=0; i<nl.getLength(); i++) {
Element e = (Element)nl.item(i);
return res;
/**
 * Reads the value of a metadata property.
 *
 * @param doc document holding the metadata part
 * @param propertyId identifier passed on to getPropertyNode()
 * @return the text content when the property node is an element, the attribute
 *         value when it is an attribute, {@code null} otherwise (including
 *         when no node is found)
 */
public String getMetadataValue(Document doc, String propertyId) {
Node n = getPropertyNode(doc, propertyId);
if (n instanceof Element)
return n.getTextContent();
else if (n instanceof Attr) {
return ((Attr) n).getValue();
} else {
return null;
/**
 * Writes the value of a metadata property on the node found by getPropertyNode().
 * Nothing visible happens when no node is found.
 *
 * NOTE(review): the statement of the {@code Element} branch is not visible in
 * this copy - presumably it sets the element's text content; confirm.
 *
 * @param doc document holding the metadata part
 * @param propertyId identifier passed on to getPropertyNode()
 * @param value new value
 */
public void setMetadataValue(Document doc, String propertyId, String value) {
Node n = getPropertyNode(doc, propertyId);
if (n instanceof Element)
else if (n instanceof Attr) {
((Attr) n).setValue(value);
/**
 * Writes the DOM of a metadata part back to the XML file it was parsed from,
 * using a transformer obtained from {@code TRANSFORMER_FACTORY}.
 *
 * NOTE(review): the statement guarded by the {@code CONFIGURATION.metadata}
 * test is not visible in this copy - presumably an early return for
 * non-metadata documents; confirm.
 *
 * @param doc document whose current XML part is saved
 * @throws RuntimeException wrapping any {@link TransformerException}
 */
public void saveMetadata(Document doc) {
try {
if (doc.getXMLParser().getKind() != CONFIGURATION.metadata)
File f = doc.getXMLParser().getXmlFile();
Transformer t = TRANSFORMER_FACTORY.newTransformer();
// Request an XML declaration in the output, with standalone set to "no".
t.setOutputProperty(OutputKeys.STANDALONE, "no");
t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
t.transform(new DOMSource(doc.getXMLParser().getDocument()), new StreamResult(f));
} catch (TransformerException e) {
// The cause is preserved so callers can still see the original failure.
throw new RuntimeException(e);
/**
 * Locates the DOM node of a metadata property in "custom.xml" or "core.xml",
 * selected by the name of the document's current XML file.
 *
 * NOTE(review): the XPath expressions and namespace URIs are not visible in
 * this copy, so how {@code propertyId} enters the query cannot be told from
 * here - confirm against the upstream file.
 *
 * @param doc document whose current XML file is searched
 * @param propertyId identifier of the property to locate
 * @return the first node of the XPath result, or {@code null} when the result
 *         is empty or the file is neither of the two handled parts
 */
private Node getPropertyNode(Document doc, String propertyId) {
String docName = doc.getXMLParser().getXmlFile().getName();
if (docName.equals("custom.xml")) {
NodeList nl = doc.getXMLParser().getNodesFromXPathExpression(
new DocxNamespaceContextEx(new String[] {
"def", "",
if (nl.getLength() > 0) {
return nl.item(0);
} else if (docName.equals("core.xml")) {
NodeList nl = doc.getXMLParser().getNodesFromXPathExpression(
new DocxNamespaceContextEx(new String[] {
"cp", "",
"dc", "",
"dcterms", "",
if (nl.getLength() > 0)
return nl.item(0);
return null;
private static class DocxNamespaceContextEx implements NamespaceContext {
public DocxNamespaceContextEx(String[] mapping) {
this.mapping = new HashMap<String,String>(2);
for (int i=0; i<mapping.length; i+=2) {
this.mapping.put(mapping[i], mapping[i+1]);
public String getNamespaceURI(String prefix) {
String uri = mapping.get(prefix);
if (uri == null)
uri = "";
return uri;
public String getPrefix(String uri)
throw new UnsupportedOperationException();
public Iterator getPrefixes(String uri)
throw new UnsupportedOperationException();
private Map<String, String> mapping;
/**
 * Returns the table service created by the constructor.
 *
 * @return the {@code tableService} field
 */
public ITableService getTableService() {
return tableService;