/*******************************************************************************
 * Copyright (c) 2016 BSI Business Systems Integration AG.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     BSI Business Systems Integration AG - initial API and implementation
 ******************************************************************************/
| package org.eclipse.scout.commons.html; |
| |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.eclipse.scout.commons.StringUtility; |
| |
| /** |
| * @since 6.0 (backported) |
| */ |
| public final class HtmlHelper { |
| |
| private static final Pattern HTML_PARAGRAPH_END_TAGS = Pattern.compile("<br/?></div>|</div>|<br/?>|</p>|<p/>|</tr>|</table>", Pattern.CASE_INSENSITIVE); |
| |
| /** |
| * Very basic HTML to plain text conversion, without parsing and building a model. |
| * <p> |
| * The following rules are applied: |
| * <ul> |
| * <li>If the string contains a valid body tag (something between <code><body></code> and |
| * <code></body></code>), only plain text of the body's content is returned. Otherwise, the plain text of the |
| * entire string is returned. |
| * <li><code>null</code> is only returned if the input is <code>null</code>. If no plain text is contained, the empty |
| * string (<code>""</code>) is returned. |
| * <li>The following tags are considered "end of paragraph" and are converted to <code>\n</code>: |
| * <ul> |
| * <li><code><br></div></code> |
| * <li><code><br/></div></code> |
| * <li><code></div></code> |
| * <li><code><br></code> |
| * <li><code><br/></code> |
| * <li><code></p></code> |
| * <li><code><p/></code> |
| * <li><code></tr></code> |
| * <li><code></table></code> |
| * </ul> |
| * <li>All other tags are replaced by a space. |
| * <li>Multiple consecutive spaces are merged to one space. |
| * <li>Leading and trailing whitespace line is removed from each line. |
| * </ul> |
| * <p> |
| */ |
| public static String toPlainText(String html) { |
| if (html == null || html.length() == 0) { |
| return html; |
| } |
| String s = StringUtility.getTag(html, "body", true); |
| if (s == null) { |
| // <body> not found, use entire input |
| s = html; |
| } |
| //newlines |
| s = StringUtility.replace(s, "\r", ""); |
| s = StringUtility.replace(s, "\n", " "); |
| Matcher matcher = HTML_PARAGRAPH_END_TAGS.matcher(s); |
| s = matcher.replaceAll("\n"); |
| //tabs |
| s = StringUtility.replace(s, StringUtility.HTML_ENCODED_TAB, "\t"); |
| //remove tags |
| s = Pattern.compile("<[^>]+>", Pattern.DOTALL).matcher(s).replaceAll(" "); |
| //remove multiple spaces |
| s = s.replaceAll("[ ]+", " "); |
| //remove spaces at the beginning and end of each line |
| s = s.replaceAll("[ ]+\n", "\n"); |
| s = s.replaceAll("\n[ ]+", "\n"); |
| s = unescape(s); |
| |
| // space |
| s = StringUtility.replace(s, " ", " "); |
| s = StringUtility.replace(s, " ", " "); |
| s = StringUtility.replace(s, " ", " "); |
| |
| // tab |
| s = StringUtility.replace(s, "	", "\t"); |
| s = StringUtility.replace(s, "	", "\t"); |
| |
| s = s.trim(); |
| return s; |
| } |
| |
| /** |
| * Escapes the given string for use in HTML code. Useful when inserting data from an untrusted source directly inside |
| * HTML. Unlike {@link StringUtility#htmlEncode(String)}, this method does not alter whitespace. |
| * <p> |
| * According to <a href= |
| * "https://www.owasp.org/index.php/XSS_%28Cross_Site_Scripting%29_Prevention_Cheat_Sheet#RULE_.231_-_HTML_Escape_Before_Inserting_Untrusted_Data_into_HTML_Element_Content" |
| * > OWASP recommendations</a>, the following characters are replaced: |
| * <ul> |
| * <li><code>&</code> --> <code>&amp;</code> |
| * <li><code><</code> --> <code>&lt;</code> |
| * <li><code>></code> --> <code>&gt;</code> |
| * <li><code>"</code> --> <code>&quot;</code> |
| * <li><code>'</code> --> <code>&#39;</code> |
| * <li><code>/</code> --> <code>&#47;</code> |
| * </ul> |
| * |
| * @see https://www.owasp.org/index.php/XSS_%28Cross_Site_Scripting%29_Prevention_Cheat_Sheet |
| */ |
| public static String escape(String text) { |
| if (text == null || text.length() == 0) { |
| return text; |
| } |
| text = StringUtility.replace(text, "&", "&"); |
| text = StringUtility.replace(text, "<", "<"); |
| text = StringUtility.replace(text, ">", ">"); |
| text = StringUtility.replace(text, "\"", """); |
| text = StringUtility.replace(text, "/", "/"); |
| text = StringUtility.replace(text, "'", "'"); |
| return text; |
| } |
| |
| /** |
| * Reverse operation of {@link #escape(String)}. Unlike {@link StringUtility#htmlDecode(String)}, this method does not |
| * alter whitespace. |
| */ |
| public static String unescape(String html) { |
| if (html == null || html.length() == 0) { |
| return html; |
| } |
| |
| String decoded = StringUtility.replace(html, "&", "&"); |
| decoded = StringUtility.replace(decoded, "&", "&"); |
| decoded = StringUtility.replace(decoded, "&", "&"); |
| |
| decoded = StringUtility.replace(decoded, "<", "<"); |
| decoded = StringUtility.replace(decoded, "<", "<"); |
| decoded = StringUtility.replace(decoded, "<", "<"); |
| |
| decoded = StringUtility.replace(decoded, ">", ">"); |
| decoded = StringUtility.replace(decoded, ">", ">"); |
| decoded = StringUtility.replace(decoded, ">", ">"); |
| |
| decoded = StringUtility.replace(decoded, """, "\""); |
| decoded = StringUtility.replace(decoded, """, "\""); |
| decoded = StringUtility.replace(decoded, """, "\""); |
| |
| decoded = StringUtility.replace(decoded, "/", "/"); // no named entity for the slash |
| decoded = StringUtility.replace(decoded, "/", "/"); |
| |
| decoded = StringUtility.replace(decoded, "'", "'"); |
| decoded = StringUtility.replace(decoded, "'", "'"); |
| decoded = StringUtility.replace(decoded, "'", "'"); |
| return decoded; |
| } |
| } |