// blob: 08556340f5a2645f10e80d9c3e3f13831adf33b8 [file] [log] [blame]
//------------------------------------------------------------------------------
// Copyright (c) 2005, 2006 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.common.html;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import org.eclipse.epf.common.utils.FileUtil;
import org.w3c.tidy.Configuration;
import org.w3c.tidy.Tidy;
/**
 * Pretty-formats HTML source and makes it XHTML compliant.
 * <p>
 * The formatting itself is delegated to JTidy. When the input is an HTML
 * fragment (it does not start with <code>&lt;!DOCTYPE</code> or
 * <code>&lt;html</code>), only the content of the generated
 * <code>&lt;body&gt;</code> element is returned.
 *
 * @author Kelvin Low
 * @since 1.0
 */
public class HTMLFormatter {

    private static final String HTML_BODY_START_TAG = "<body>"; //$NON-NLS-1$

    private static final String HTML_BODY_END_TAG = "</body>"; //$NON-NLS-1$

    private static final int HTML_BODY_START_TAG_LENGTH = HTML_BODY_START_TAG
            .length();

    private static final String HTML_COPY = "&copy;";//$NON-NLS-1$

    private static final String HTML_EURO = "&euro;";//$NON-NLS-1$

    private static final String HTML_REG = "&reg;";//$NON-NLS-1$

    private static final String HTML_TRADEMARK = "&trade;";//$NON-NLS-1$

    /** The attribute-less start tag of a preformatted section. */
    public static final String PRE_TAG_START = "<pre>"; //$NON-NLS-1$

    /** The end tag of a preformatted section. */
    public static final String PRE_TAG_END = "</pre>"; //$NON-NLS-1$

    /** The length of {@link #PRE_TAG_END}. */
    public static final int PRE_TAG_END_LENGTH = PRE_TAG_END.length();

    // Prefix shared by "<pre>" and "<pre attr=...>" start tags.
    private static final String PRE_TAG_OPEN = "<pre"; //$NON-NLS-1$

    private int lineWidth;

    private boolean indent;

    private int indentSize;

    /**
     * Creates a new instance with a line width of 132 characters and tag
     * indenting enabled with an indent size of 4 characters.
     */
    public HTMLFormatter() {
        this(132, true, 4);
    }

    /**
     * Creates a new instance.
     *
     * @param lineWidth
     *            The line width (in number of characters).
     * @param indent
     *            If true, indent the tags.
     * @param indentSize
     *            The indent size (in number of characters).
     */
    public HTMLFormatter(int lineWidth, boolean indent, int indentSize) {
        this.lineWidth = lineWidth;
        this.indent = indent;
        this.indentSize = indentSize;
    }

    /**
     * Sets the maximum character width of a line.
     *
     * @param lineWidth
     *            The line width (in number of characters).
     */
    public void setLineWidth(int lineWidth) {
        this.lineWidth = lineWidth;
    }

    /**
     * Enables or disables tags indent.
     *
     * @param indent
     *            If true, indent the tags.
     */
    public void setIndent(boolean indent) {
        this.indent = indent;
    }

    /**
     * Sets the indent size.
     *
     * @param indentSize
     *            The indent size (in number of characters).
     */
    public void setIndentSize(int indentSize) {
        this.indentSize = indentSize;
    }

    /**
     * Formats the given HTML source.
     * <p>
     * If the source starts with <code>&lt;!DOCTYPE</code> or
     * <code>&lt;html</code>, the complete formatted document is returned.
     * Otherwise the source is treated as a fragment and only the formatted
     * content of the <code>&lt;body&gt;</code> element is returned (an empty
     * string if no body content can be located).
     *
     * @param html
     *            The HTML source.
     * @return The pretty-formatted HTML source. A <code>null</code> or empty
     *         source is returned unchanged.
     * @throws Exception
     *             if the error report written by JTidy starts with "line" and
     *             mentions "column"; the report is used as the exception
     *             message.
     */
    public String formatHTML(String html) throws Exception {
        if (html == null || html.length() == 0) {
            return html;
        }

        Tidy tidy = new Tidy();
        tidy.setXHTML(true);
        tidy.setDropEmptyParas(false);
        tidy.setDropFontTags(false);
        tidy.setQuiet(true);
        tidy.setShowWarnings(false);
        tidy.setSmartIndent(false);
        tidy.setTidyMark(false);
        tidy.setWraplen(lineWidth);
        tidy.setIndentAttributes(false);
        tidy.setIndentContent(indent);
        tidy.setSpaces(indentSize);
        tidy.setCharEncoding(Configuration.UTF8);

        ByteArrayInputStream input = new ByteArrayInputStream(html
                .getBytes("UTF-8")); //$NON-NLS-1$
        ByteArrayOutputStream output = new ByteArrayOutputStream();

        // Capture the error report in memory so it can be inspected below.
        StringWriter sw = new StringWriter();
        PrintWriter pw = new PrintWriter(sw);
        tidy.setErrout(pw);
        tidy.parse(input, output);
        pw.flush();

        String error = sw.getBuffer().toString();
        if (error.startsWith("line") && error.indexOf("column") > 0) { //$NON-NLS-1$ //$NON-NLS-2$
            throw new Exception(error);
        }

        String formattedHTML = new String(output.toByteArray(), "UTF-8"); //$NON-NLS-1$
        formattedHTML = escapeHTML(formattedHTML);

        if (html.startsWith("<!DOCTYPE") || html.startsWith("<html")) { //$NON-NLS-1$ //$NON-NLS-2$
            // The caller supplied a complete document; return all of it.
            return formattedHTML;
        }
        return extractBodyContent(formattedHTML);
    }

    /**
     * Extracts the content of the <code>&lt;body&gt;</code> element from a
     * formatted document and, when indenting is enabled, removes the
     * indentation added for the enclosing html/body elements.
     *
     * @param formattedHTML
     *            The formatted HTML document.
     * @return The body content, or an empty string if the body element is
     *         missing or has no content.
     */
    private String extractBodyContent(String formattedHTML) {
        int start = formattedHTML.indexOf(HTML_BODY_START_TAG);
        int end = formattedHTML.indexOf(HTML_BODY_END_TAG);
        if (start == -1 || end == -1) {
            return ""; //$NON-NLS-1$
        }

        start += HTML_BODY_START_TAG_LENGTH;
        if (start >= end) {
            return ""; //$NON-NLS-1$
        }

        // Skip the line separator that follows <body> and the one that
        // precedes </body>.
        start += FileUtil.LINE_SEP_LENGTH;
        end -= FileUtil.LINE_SEP_LENGTH;

        boolean indented = indent && indentSize > 0;
        if (indented) {
            // Also skip the indent in front of </body>.
            end -= indentSize;
        }
        if (start >= end) {
            return ""; //$NON-NLS-1$
        }

        String result = formattedHTML.substring(start, end);
        if (indented) {
            // The body content is nested two levels deep (html, body).
            result = fixIndentation(result, getIndentStr(indentSize * 2));
        }
        // Return the body content regardless of the indent setting; a
        // fragment must never come back wrapped in a complete document.
        return result;
    }

    /**
     * Returns the indent string.
     *
     * @param indentLength
     *            The number of space characters in the indent string.
     * @return A string made of <code>indentLength</code> spaces; an empty
     *         string if the length is zero or negative.
     */
    private static String getIndentStr(int indentLength) {
        if (indentLength <= 0) {
            return ""; //$NON-NLS-1$
        }
        StringBuffer indentStr = new StringBuffer(indentLength);
        for (int i = 0; i < indentLength; i++) {
            indentStr.append(' ');
        }
        return indentStr.toString();
    }

    /**
     * Escapes HTML special characters that are not handled correctly by JTidy.
     * <p>
     * The copyright, registered, euro and trademark characters are replaced
     * with their named entities; every other character is copied unchanged.
     *
     * @param html
     *            The HTML source.
     * @return The HTML source with HTML special characters preserved in escaped
     *         form; an empty string if the source is <code>null</code> or
     *         empty.
     */
    private static String escapeHTML(String html) {
        if (html == null || html.length() == 0) {
            return ""; //$NON-NLS-1$
        }
        int len = html.length();
        StringBuffer sb = new StringBuffer(len);
        for (int i = 0; i < len; i++) {
            char ch = html.charAt(i);
            switch (ch) {
            case '\u00a9':
                sb.append(HTML_COPY);
                break;
            case '\u00ae':
                sb.append(HTML_REG);
                break;
            case '\u20ac':
                sb.append(HTML_EURO);
                break;
            case '\u2122':
                sb.append(HTML_TRADEMARK);
                break;
            default:
                sb.append(ch);
                break;
            }
        }
        return sb.toString();
    }

    /**
     * Undoes the JTidy indent, but leaves the content of &lt;pre&gt; elements
     * untouched.
     *
     * @param html
     *            The HTML source to process.
     * @param indentStr
     *            The indent to remove from the start of each line.
     * @return The HTML source with the indent removed outside of &lt;pre&gt;
     *         elements.
     */
    private static String fixIndentation(String html, String indentStr) {
        if (html.startsWith(indentStr)) {
            html = html.substring(indentStr.length());
        }

        StringBuffer strBuf = new StringBuffer(html.length());
        int copyFrom = 0;
        int preStart;
        while ((preStart = indexOfPreStart(html, copyFrom)) != -1) {
            // Markup in front of the <pre> element gets its indent removed.
            strBuf.append(removeIndent(html.substring(copyFrom, preStart),
                    indentStr));

            int preEnd = html.indexOf(PRE_TAG_END, preStart);
            if (preEnd == -1) {
                // found <pre>, but no ending </pre> - shouldn't ever get here
                // append rest of string and return it
                strBuf.append(html.substring(preStart));
                return strBuf.toString();
            }

            // The <pre> element itself is copied verbatim.
            copyFrom = preEnd + PRE_TAG_END_LENGTH;
            strBuf.append(html.substring(preStart, copyFrom));
        }
        strBuf.append(removeIndent(html.substring(copyFrom), indentStr));
        return strBuf.toString();
    }

    /**
     * Finds the next &lt;pre&gt; start tag, with or without attributes.
     *
     * @param html
     *            The HTML source to search.
     * @param fromIndex
     *            The index to start the search from.
     * @return The index of the start tag's '&lt;' character, or -1 if there
     *         is none.
     */
    private static int indexOfPreStart(String html, int fromIndex) {
        int length = html.length();
        int index = html.indexOf(PRE_TAG_OPEN, fromIndex);
        while (index != -1) {
            int next = index + PRE_TAG_OPEN.length();
            if (next >= length) {
                return -1;
            }
            // Accept "<pre>" and "<pre attr=...>", but not e.g. "<preview>".
            char ch = html.charAt(next);
            if (ch == '>' || Character.isWhitespace(ch)) {
                return index;
            }
            index = html.indexOf(PRE_TAG_OPEN, next);
        }
        return -1;
    }

    /**
     * Removes one occurrence of the indent that directly follows each line
     * feed. Because only the line feed is used as the anchor, both "\r\n" and
     * "\n" line endings are handled.
     *
     * @param text
     *            The text to process.
     * @param indentStr
     *            The indent to remove.
     * @return The text without the leading indent on each line.
     */
    private static String removeIndent(String text, String indentStr) {
        int indentLength = indentStr.length();
        if (indentLength == 0) {
            return text;
        }
        int len = text.length();
        StringBuffer sb = new StringBuffer(len);
        int i = 0;
        while (i < len) {
            char ch = text.charAt(i++);
            sb.append(ch);
            if (ch == '\n' && text.startsWith(indentStr, i)) {
                i += indentLength;
            }
        }
        return sb.toString();
    }

}