| //------------------------------------------------------------------------------ |
| // Copyright (c) 2005, 2006 IBM Corporation and others. |
| // All rights reserved. This program and the accompanying materials |
| // are made available under the terms of the Eclipse Public License v1.0 |
| // which accompanies this distribution, and is available at |
| // http://www.eclipse.org/legal/epl-v10.html |
| // |
| // Contributors: |
| // IBM Corporation - initial implementation |
| //------------------------------------------------------------------------------ |
| package org.eclipse.epf.common.html; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.PrintWriter; |
| import java.io.StringWriter; |
| import java.io.UnsupportedEncodingException; |
| import java.util.regex.Pattern; |
| |
| import org.eclipse.epf.common.utils.FileUtil; |
| import org.w3c.tidy.Tidy; |
| |
| /** |
| * Pretty-formats HTML source and makes it XHTML compliant. |
| * |
| * @author Kelvin Low |
| * @since 1.0 |
| */ |
| public class HTMLFormatter { |
| |
| private static final String HTML_BODY_START_TAG = "<body>"; //$NON-NLS-1$ |
| |
| private static final String HTML_BODY_END_TAG = "</body>"; //$NON-NLS-1$ |
| |
| private static final int HTML_BODY_START_TAG_LENGTH = HTML_BODY_START_TAG |
| .length(); |
| |
| private static final String HTML_COPY = "©";//$NON-NLS-1$ |
| |
| private static final String HTML_EURO = "€";//$NON-NLS-1$ |
| |
| private static final String HTML_REG = "®";//$NON-NLS-1$ |
| |
| private static final String HTML_TRADEMARK = "™";//$NON-NLS-1$ |
| |
| public static final String DIAGNOSTIC_SOURCE = "org.eclipse.epf.common.HTMLFormatter"; |
| |
| private int lineWidth; |
| |
| private boolean indent; |
| |
| private int indentSize; |
| |
| private String lastErrorStr; |
| |
| /* String location = m.group(1); |
| String lineStr = m.group(2); |
| String columnStr = m.group(3); |
| String errorMsg = m.group(4); |
| */ |
| public static final Pattern jTidyErrorParser = Pattern.compile("(line\\s+(\\d+)\\s+column\\s+(\\d+))\\s+-\\s+(.+)", Pattern.CASE_INSENSITIVE); //$NON-NLS-1$ |
| /** |
| * Creates a new instance. |
| */ |
| public HTMLFormatter() { |
| this(132, true, 4); |
| } |
| |
| /** |
| * Creates a new instance. |
| */ |
| public HTMLFormatter(int lineWidth, boolean indent, int indentSize) { |
| this.lineWidth = lineWidth; |
| this.indent = indent; |
| this.indentSize = indentSize; |
| } |
| |
| /** |
| * Sets the maximum character width of a line. |
| * |
| * @param lineWidth |
| * The line width (in number of characters). |
| */ |
| public void setLineWidth(int lineWidth) { |
| this.lineWidth = lineWidth; |
| } |
| |
| /** |
| * Enables or disables tags indent. |
| * |
| * @param indent |
| * If true, ident the tags. |
| */ |
| public void setIndent(boolean indent) { |
| this.indent = indent; |
| } |
| |
| /** |
| * Sets the indent size. |
| * |
| * @param indentSize |
| * The indent size (in number of characters). |
| */ |
| public void setIndentSize(int indentSize) { |
| this.indentSize = indentSize; |
| } |
| |
| /** |
| * Formats the given HTML source. |
| * |
| * @param html |
| * The HTML source. |
| * @return The pretty-formatted HTML source. |
| */ |
| public String formatHTML(String html) throws UnsupportedEncodingException { |
| if (html == null || html.length() == 0) { |
| return html; |
| } |
| |
| lastErrorStr = null; |
| |
| Tidy tidy = new Tidy(); |
| tidy.setXHTML(true); |
| tidy.setDropEmptyParas(false); |
| tidy.setDropFontTags(false); |
| tidy.setQuiet(true); |
| tidy.setShowWarnings(false); |
| tidy.setSmartIndent(false); |
| tidy.setTidyMark(false); |
| tidy.setWraplen(lineWidth); |
| tidy.setIndentAttributes(false); |
| tidy.setIndentContent(indent); |
| tidy.setSpaces(indentSize); |
| tidy.setInputEncoding("UTF-8"); //$NON-NLS-1$ |
| tidy.setOutputEncoding("UTF-8"); //$NON-NLS-1$ |
| tidy.setForceOutput(true); |
| |
| ByteArrayInputStream input = new ByteArrayInputStream(html |
| .getBytes("UTF-8")); //$NON-NLS-1$ |
| ByteArrayOutputStream output = new ByteArrayOutputStream(); |
| |
| StringWriter sw = new StringWriter(); |
| PrintWriter pw = new PrintWriter(sw); |
| tidy.setErrout(pw); |
| tidy.parse(input, output); |
| String error = sw.getBuffer().toString(); |
| if (error != null && error.length() > 0 |
| && error.startsWith("line") && error.indexOf("column") > 0) { //$NON-NLS-1$ //$NON-NLS-2$ |
| lastErrorStr = error; |
| } |
| |
| String formattedHTML = new String(output.toByteArray(), "UTF-8"); //$NON-NLS-1$ |
| formattedHTML = escapeHTML(formattedHTML); |
| |
| if (!html.startsWith("<!DOCTYPE") && !html.startsWith("<html")) { //$NON-NLS-1$ //$NON-NLS-2$ |
| int start = formattedHTML.indexOf(HTML_BODY_START_TAG); |
| int end = formattedHTML.indexOf(HTML_BODY_END_TAG); |
| if (start == -1 || end == -1) { |
| return ""; //$NON-NLS-1$ |
| } |
| start += HTML_BODY_START_TAG_LENGTH; |
| if (start >= end) { |
| return ""; //$NON-NLS-1$ |
| } |
| start += FileUtil.LINE_SEP_LENGTH; |
| end -= FileUtil.LINE_SEP_LENGTH; |
| if (indent && indentSize > 0) { |
| end -= indentSize; |
| } |
| if (start >= end) { |
| return ""; //$NON-NLS-1$ |
| } |
| String result = formattedHTML.substring(start, end); |
| if (indent && indentSize > 0) { |
| String indentStr = getIndentStr(indentSize * 2); |
| result = fixIndentation(result, indentStr); |
| return result; |
| } |
| } |
| return formattedHTML; |
| } |
| |
| /** |
| * Returns the indent string. |
| */ |
| private static String getIndentStr(int indentLength) { |
| if (indentLength == 0) { |
| return ""; //$NON-NLS-1$ |
| } |
| StringBuffer indentStr = new StringBuffer(); |
| for (int i = 0; i < indentLength; i++) { |
| indentStr.append(' '); |
| } |
| return indentStr.toString(); |
| } |
| |
| /** |
| * Escapes HTML special characters that are not handled correctly by JTidy. |
| * |
| * @param html |
| * The HTML source. |
| * @return The HTML source with HTML special characters preserved in escaped |
| * form. |
| */ |
| private static String escapeHTML(String html) { |
| if (html == null || html.length() == 0) |
| return ""; //$NON-NLS-1$ |
| StringBuffer sb = new StringBuffer(); |
| int len = html.length(); |
| for (int i = 0; i < len; i++) { |
| char ch = html.charAt(i); |
| switch (ch) { |
| case '\u00a9': |
| sb.append(HTML_COPY); |
| break; |
| case '\u00ae': |
| sb.append(HTML_REG); |
| break; |
| case '\u20ac': |
| sb.append(HTML_EURO); |
| break; |
| case '\u2122': |
| sb.append(HTML_TRADEMARK); |
| break; |
| default: |
| sb.append(ch); |
| break; |
| } |
| } |
| return sb.toString(); |
| } |
| |
| public static final String PRE_TAG_START = "<pre>"; //$NON-NLS-1$ |
| public static final String PRE_TAG_END = "</pre>"; //$NON-NLS-1$ |
| public static final int PRE_TAG_END_LENGTH = PRE_TAG_END.length(); |
| /** |
| * Undo the JTidy indent, but ignore <pre> tags |
| * @param html |
| * @param indentStr |
| * @return |
| */ |
| private static String fixIndentation(String html, String indentStr) { |
| if (html.startsWith(indentStr)) { |
| html = html.substring(indentStr.length()); |
| } |
| StringBuffer strBuf = new StringBuffer(); |
| int pre_index = -1; |
| int last_pre_end_index = -1; |
| while ((pre_index = html.indexOf(PRE_TAG_START, last_pre_end_index)) != -1) { |
| strBuf.append(html.substring(last_pre_end_index < 0 ? 0 : last_pre_end_index + PRE_TAG_END_LENGTH, pre_index).replaceAll("\r\n" + indentStr, "\r\n")); |
| last_pre_end_index = html.indexOf(PRE_TAG_END, pre_index); |
| if (last_pre_end_index != -1) { |
| strBuf.append(html.substring(pre_index, last_pre_end_index + PRE_TAG_END_LENGTH)); |
| } |
| else { |
| // found <pre>, but no ending </pre> - shouldn't ever get here |
| // append rest of string and return it |
| strBuf.append(html.substring(pre_index)); |
| return strBuf.toString(); |
| } |
| } |
| strBuf.append(html.substring(last_pre_end_index < 0 ? 0 : last_pre_end_index + PRE_TAG_END_LENGTH).replaceAll("\r\n" + indentStr, "\r\n")); |
| return strBuf.toString(); |
| } |
| |
| public String getLastErrorStr() { |
| return lastErrorStr; |
| } |
| } |