Bug 559037 - Support bare hyperlinks (without delimiters) in markdown Implemented hyperlink detection for markdown, which has to be enabled by passing "true" to the MarkdownLanguage constructor. The feature uses a heuristic to detect (already escaped) URLs and turn them into links in the output. Change-Id: I5a90da0a5afa8cac86948386a76d5d6db13956df Signed-off-by: Max Bureck <max.bureck@fokus.fraunhofer.de>

commit: 97ec756860751a6a448075d05502fe9caf127ffb [log] [tgz]
author: Max Bureck <max.bureck@fokus.fraunhofer.de> Fri Jan 10 18:18:38 2020 +0100
committer: Max Bureck <max.bureck@fokus.fraunhofer.de> Fri Jan 10 18:26:40 2020 +0100
tree: 2420eb1e4b0eebb42135ed7763fe82a8ae096d73
parent: b583ca6da660743bc791e4bd684ed5648580de21 [diff]
diff --git a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java
index 8092527..2f62a86 100644
--- a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java
+++ b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java

@@ -12,6 +12,7 @@
  *     Alexander Nyßen - support for inline links in phrases
  *     Pierre-Yves B. <pyvesdev@gmail.com> - Bug 552231 - Styling should not apply inside words
  *     Pierre-Yves B. <pyvesdev@gmail.com> - Bug 509033 - markdown misses
+ *     Max Bureck (Fraunhofer FOKUS) - Bug 559037 - Extended automatic link replacement
  *******************************************************************************/
 
 package org.eclipse.mylyn.wikitext.markdown;
@@ -32,6 +33,7 @@
 import org.eclipse.mylyn.wikitext.markdown.internal.block.QuoteBlock;
 import org.eclipse.mylyn.wikitext.markdown.internal.block.UnderlinedHeadingBlock;
 import org.eclipse.mylyn.wikitext.markdown.internal.phrase.BackslashEscapePhraseModifier;
+import org.eclipse.mylyn.wikitext.markdown.internal.phrase.ExtendedAutomaticLinkReplacementToken;
 import org.eclipse.mylyn.wikitext.markdown.internal.phrase.SimplePhraseModifier;
 import org.eclipse.mylyn.wikitext.markdown.internal.phrase.SimpleWordModifier;
 import org.eclipse.mylyn.wikitext.markdown.internal.token.AutomaticLinkReplacementToken;
@@ -60,7 +62,24 @@
  */
 public class MarkdownLanguage extends AbstractMarkupLanguage {
 
+	private final boolean enableHeuristicFeatures;
+
+	/**
+	 * Constructs an instance of MarkdownLanguage with heuristic features disabled.
+	 */
 	public MarkdownLanguage() {
+		this(false);
+	}
+
+	/**
+	 * Constructs an instance of MarkdownLanguage, with the choice to enable heuristic features. Currently only extended
+	 * hyperlink detection (without delimiters) is supported as a heuristic feature.
+	 * 
+	 * @param enableHeuristicFeatures
+	 *            if {@code true} enables heuristic features.
+	 */
+	public MarkdownLanguage(boolean enableHeuristicFeatures) {
+		this.enableHeuristicFeatures = enableHeuristicFeatures;
 		setName("Markdown"); //$NON-NLS-1$
 	}
 
@@ -116,6 +135,9 @@
 		phraseModifierSyntax.add(new SimpleWordModifier("_", SpanType.EMPHASIS)); //$NON-NLS-1$
 		phraseModifierSyntax.add(new SimplePhraseModifier("~~", SpanType.DELETED)); //$NON-NLS-1$
 		phraseModifierSyntax.add(new SimplePhraseModifier("~", SpanType.DELETED)); //$NON-NLS-1$
+		if (enableHeuristicFeatures) {
+			phraseModifierSyntax.add(new ExtendedAutomaticLinkReplacementToken());
+		}
 	}
 
 	@Override

diff --git a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/internal/phrase/ExtendedAutomaticLinkReplacementToken.java b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/internal/phrase/ExtendedAutomaticLinkReplacementToken.java
new file mode 100644
index 0000000..8098f22
--- /dev/null
+++ b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/internal/phrase/ExtendedAutomaticLinkReplacementToken.java

@@ -0,0 +1,180 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Fraunhofer FOKUS and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ *     Max Bureck (Fraunhofer FOKUS) - initial API and implementation
+ *******************************************************************************/
+
+package org.eclipse.mylyn.wikitext.markdown.internal.phrase;
+
+import java.util.Set;
+
+import org.eclipse.mylyn.wikitext.parser.markup.PatternBasedElement;
+import org.eclipse.mylyn.wikitext.parser.markup.PatternBasedElementProcessor;
+
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * Heuristic replacement for bare hyperlinks (e.g. http://www.eclipse.org) without &lt; and &gt; delimiters to links in
+ * the output. <br>
+ * Links are detected when the prefix "http://", "https://" or "www." occurs after punctuation, white space or a line
+ * beginning. Links starting with "www." will be prefixed with "http://" in the actual link href. The heuristic is
+ * conservative in the way that it only captures URLs containing only characters valid for URLs. However, it does not
+ * check the format and if characters with semantic meaning are only used in valid positions. URLs with un-escaped
+ * characters are not detected. The reason is mostly complexity, since different parts of a URL have to be escaped
+ * differently, and parsing the URL before escaping is complex.<br>
+ * URLs end with either white space, line end, quotation mark " or a &lt; character. This prevents most URLs with
+ * invalid characters from being turned into broken links.<br>
+ * <br>
+ * The following rules regarding trailing characters are in place:
+ * <ul>
+ * <li>Trailing punctuation ({@code ?!'\"*.:_~}) is stripped, since it seems more likely that it is not part of the
+ * URL.</li>
+ * <li>Trailing closing parentheses ')' are stripped, as long as the URL contains more closing than opening
+ * parentheses.</li>
+ * <li>If the URL trail contains a sequence that resembles an HTML entity (pattern {@code &[a-zA-Z0-9]*;}) the detected
+ * sequence will be removed.</li>
+ * <li>If after stripping the URL is simply "http://", "https://" or "www.", the sequence is not recognized as a
+ * URL at all.</li>
+ * </ul>
+ * This behavior is loosely aligned with the
+ * <a href="https://github.github.com/gfm/#extended-autolink-path-validation">GitHub flavored Markdown autolink
+ * extension</a> and GitHub's actual Markdown implementation.
+ */
+public class ExtendedAutomaticLinkReplacementToken extends PatternBasedElement {
+
+	/**
+	 * Regex based on characters mentioned in RFC-3986: https://www.ietf.org/rfc/rfc3986.txt Note that this Regex does
+	 * not check for a completely valid HTTP link, it only checks for the {@code http(s)://} or {@code www.} prefix and
+	 * if the following characters are valid for URLs.
+	 */
+	private static final String AUTOMATIC_LINK_REGEX = "(?<=^|\\s|\\p{Punct})((https?://(?!/)|www\\.)[a-zA-Z0-9:/?#\\[\\]@!$&'\\(\\)\\*+,;=\\-\\._~%]+)(?=$|\"|\\s|<)"; //$NON-NLS-1$
+
+	/** Link texts consisting of nothing but a prefix; these are emitted as plain text instead of links. */
+	private static final Set<String> EMPTY_LINKS = ImmutableSet.of("www.", "http://", "https://"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+
+	@Override
+	protected String getPattern(int groupOffset) {
+		return AUTOMATIC_LINK_REGEX;
+	}
+
+	@Override
+	protected int getPatternGroupCount() {
+		// group 1 is the complete URL, group 2 its prefix (http://, https:// or www.)
+		return 2;
+	}
+
+	@Override
+	protected PatternBasedElementProcessor newProcessor() {
+		return new PatternBasedElementProcessor() {
+			@Override
+			public void emit() {
+				String href = group(1);
+				// surplus of closing over opening parentheses; only that many trailing ')' may be stripped
+				int parensBalance = href.codePoints().map(c -> {
+					switch (c) {
+					case '(':
+						return -1;
+					case ')':
+						return +1;
+					default:
+						return 0;
+					}
+				}).sum();
+				// omit punctuation
+				int endIndex = -1;
+				// the first four characters (e.g. the "www." prefix) are never examined
+				charLoop: for (int i = href.length() - 1; i > 3; i--) {
+					switch (href.charAt(i)) {
+					case ')':
+						if (parensBalance <= 0) {
+							// no surplus of closing parentheses left, so this ')' belongs to the URL
+							break charLoop;
+						}
+						parensBalance--;
+						// a surplus ')' is stripped exactly like trailing punctuation, without entity detection
+						//$FALL-THROUGH$
+					case '?':
+					case '!':
+					case '\'':
+					case '"':
+					case '*':
+					case '.':
+					case ':':
+					case '_':
+					case '~':
+						endIndex = i;
+						break;
+					case ';':
+						// jump back to the '&' if the ';' terminates something that resembles an HTML entity
+						i = skipHtmlEntity(href, i);
+						endIndex = i;
+						break;
+					default:
+						break charLoop;
+					}
+				}
+
+				String linkText;
+				String linkHref;
+				if (endIndex > -1) {
+					linkText = href.substring(0, endIndex);
+				} else {
+					linkText = href;
+				}
+				if (linkText.startsWith("www.")) { //$NON-NLS-1$
+					linkHref = "http://" + linkText; //$NON-NLS-1$
+				} else {
+					linkHref = linkText;
+				}
+
+				if (EMPTY_LINKS.contains(linkText)) {
+					// do not convert "empty" links
+					builder.characters(href);
+				} else {
+					builder.link(linkHref, linkText);
+					// if characters were stripped, add them as regular text after link.
+					if (endIndex > -1) {
+						builder.characters(href.substring(endIndex));
+					}
+				}
+			}
+
+			/**
+			 * Looks backwards from a ';' for a '&' that is separated from it only by characters in [a-zA-Z0-9].
+			 *
+			 * @param href
+			 *            the matched URL text
+			 * @param endIndex
+			 *            index of the ';' in {@code href}
+			 * @return the index of the '&', or {@code endIndex} if the text before the ';' does not resemble an entity
+			 */
+			private int skipHtmlEntity(String href, int endIndex) {
+				for (int i = endIndex - 1; i > 3; i--) {
+					char c = href.charAt(i);
+					boolean isAlphaNum = inRange(c, 'a', 'z') || inRange(c, 'A', 'Z') || inRange(c, '0', '9');
+					if (c == '&') {
+						return i;
+					}
+					// if the character is not in [a-zA-Z0-9], don't skip anything
+					if (!isAlphaNum) {
+						return endIndex;
+					}
+				}
+				// no & found, don't skip anything
+				return endIndex;
+			}
+
+			private boolean inRange(char toCheck, char lowerBound, char upperBound) {
+				return (toCheck >= lowerBound && toCheck <= upperBound);
+			}
+		};
+	}
+
+}

diff --git a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/test/java/org/eclipse/mylyn/internal/wikitext/markdown/tests/MarkdownLanguageExtendedAutomaticLinkReplacementTest.java b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/test/java/org/eclipse/mylyn/internal/wikitext/markdown/tests/MarkdownLanguageExtendedAutomaticLinkReplacementTest.java
new file mode 100644
index 0000000..d6dbc0e
--- /dev/null
+++ b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/test/java/org/eclipse/mylyn/internal/wikitext/markdown/tests/MarkdownLanguageExtendedAutomaticLinkReplacementTest.java

@@ -0,0 +1,168 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Fraunhofer FOKUS and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ *     Max Bureck (Fraunhofer FOKUS) - initial API and implementation
+ *******************************************************************************/
+
+package org.eclipse.mylyn.internal.wikitext.markdown.tests;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.eclipse.mylyn.wikitext.markdown.MarkdownLanguage;
+import org.eclipse.mylyn.wikitext.toolkit.AbstractMarkupGenerationTest;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class MarkdownLanguageExtendedAutomaticLinkReplacementTest
+		extends AbstractMarkupGenerationTest<MarkdownLanguage> {
+
+	@Parameters
+	public static Collection<Object[]> data() {
+		return Arrays.asList(new Object[][] { { "http://" }, { "https://" }, { "" } });
+	}
+
+	private final String urlPrefix;
+
+	private final String hrefPrefix;
+
+	public MarkdownLanguageExtendedAutomaticLinkReplacementTest(String prefix) {
+		urlPrefix = prefix;
+		if (prefix.equals("")) {
+			hrefPrefix = "http://";
+		} else {
+			hrefPrefix = prefix;
+		}
+	}
+
+	@Test
+	public void testOnlyLink() {
+		String markdown = urlPrefix + "www.eclipse.org:80/p2%20update/!+*,';$[foo]/(bar)/~/_emf_/-?bar=baz&oomph#foo";
+		// note that in XHTML attribute values are escaped, therefore
+		// ' is escaped as &apos; and & is escaped as &amp;
+		String expected = "<p><a href=\"" + hrefPrefix
+				+ "www.eclipse.org:80/p2%20update/!+*,&apos;;$[foo]/(bar)/~/_emf_/-?bar=baz&amp;oomph#foo\">"
+				+ urlPrefix
+				+ "www.eclipse.org:80/p2%20update/!+*,';$[foo]/(bar)/~/_emf_/-?bar=baz&amp;oomph#foo</a></p>";
+		assertMarkup(expected, markdown);
+	}
+
+	@Test
+	public void testLinkWithBalancedParens() {
+		String markdown = urlPrefix + "www.eclipse.org/()((fo(o(bar()g)ee)))";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org/()((fo(o(bar()g)ee)))\">" + urlPrefix
+				+ "www.eclipse.org/()((fo(o(bar()g)ee)))</a></p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkWithUnbalancedParens() {
+		String markdown = urlPrefix + "www.eclipse.org/)((foo(bar()g)ee)))";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org/)((foo(bar()g)ee)\">" + urlPrefix
+				+ "www.eclipse.org/)((foo(bar()g)ee)</a>))</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkWithUnbalancedParensAndPunctuation() {
+		String markdown = urlPrefix + "www.eclipse.org/)((foo(bar()g)ee.)!*);)";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org/)((foo(bar()g)ee.)\">" + urlPrefix
+				+ "www.eclipse.org/)((foo(bar()g)ee.)</a>!*);)</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkWithInvalidCharacter() {
+		String markdown = urlPrefix + "www.eclipse.örg";
+		String expectedOutput = "<p>" + urlPrefix + "www.eclipse.örg</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testEmptyLinkAfterStrip() {
+		String prefix = urlPrefix.isEmpty() ? "www." : urlPrefix;
+		String markdown = prefix + "?.~";
+		String expectedOutput = "<p>" + markdown + "</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkAtBeginning() {
+		String markdown = urlPrefix + "www.eclipse.org foo bar";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a> foo bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testInvalidHtmlEntity() {
+		String markdown = urlPrefix + "www.eclipse.org&@mp;";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org&amp;@mp\">" + urlPrefix
+				+ "www.eclipse.org&amp;@mp</a>;</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testHtmlEntityDetection() {
+		String markdown = urlPrefix + "www.eclipse.org&amp;";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a>&amp;amp;</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkEndOfQuote() {
+		String markdown = "foo " + urlPrefix + "www.eclipse.org/downloads\" bar";
+		String expectedOutput = "<p>foo <a href=\"" + hrefPrefix + "www.eclipse.org/downloads\">" + urlPrefix
+				+ "www.eclipse.org/downloads</a>\" bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkEndWithMultiplePunctuationChars() {
+		String markdown = "foo " + urlPrefix + "www.eclipse.org/downloads\"~. bar";
+		String expectedOutput = "<p>foo <a href=\"" + hrefPrefix + "www.eclipse.org/downloads\">" + urlPrefix
+				+ "www.eclipse.org/downloads</a>\"~. bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkInText() {
+		String markdown = "foo " + urlPrefix + "www.eclipse.org bar";
+		String expectedOutput = "<p>foo <a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a> bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkInList() {
+		String markdown = "\n  - " + urlPrefix + "www.eclipse.org\n";
+		String expectedOutput = "<ul><li><a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a></li></ul>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkAtEnd() {
+		String markdown = "foo bar " + urlPrefix + "www.eclipse.org";
+		String expectedOutput = "<p>foo bar <a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a></p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Override
+	protected MarkdownLanguage createMarkupLanguage() {
+		return new MarkdownLanguage(true);
+	}
+
+}
commit	97ec756860751a6a448075d05502fe9caf127ffb	[log] [tgz]
author	Max Bureck <max.bureck@fokus.fraunhofer.de>	Fri Jan 10 18:18:38 2020 +0100
committer	Max Bureck <max.bureck@fokus.fraunhofer.de>	Fri Jan 10 18:26:40 2020 +0100
tree	2420eb1e4b0eebb42135ed7763fe82a8ae096d73
parent	b583ca6da660743bc791e4bd684ed5648580de21 [diff]