Bug 559037 - Support bare hyperlinks (without delimiters) in markdown
Implemented hyperlink detection for markdown, which has to be
enabled by passing "true" to the MarkdownLanguage constructor.
The feature uses a heuristic to detect (already escaped) URLs and turn
them into links in the output.
Change-Id: I5a90da0a5afa8cac86948386a76d5d6db13956df
Signed-off-by: Max Bureck <max.bureck@fokus.fraunhofer.de>
diff --git a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java
index 8092527..2f62a86 100644
--- a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java
+++ b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/MarkdownLanguage.java
@@ -12,6 +12,7 @@
* Alexander Nyßen - support for inline links in phrases
* Pierre-Yves B. <pyvesdev@gmail.com> - Bug 552231 - Styling should not apply inside words
* Pierre-Yves B. <pyvesdev@gmail.com> - Bug 509033 - markdown misses
+ * Max Bureck (Fraunhofer FOKUS) - Bug 559037 - Extended automatic link replacement
*******************************************************************************/
package org.eclipse.mylyn.wikitext.markdown;
@@ -32,6 +33,7 @@
import org.eclipse.mylyn.wikitext.markdown.internal.block.QuoteBlock;
import org.eclipse.mylyn.wikitext.markdown.internal.block.UnderlinedHeadingBlock;
import org.eclipse.mylyn.wikitext.markdown.internal.phrase.BackslashEscapePhraseModifier;
+import org.eclipse.mylyn.wikitext.markdown.internal.phrase.ExtendedAutomaticLinkReplacementToken;
import org.eclipse.mylyn.wikitext.markdown.internal.phrase.SimplePhraseModifier;
import org.eclipse.mylyn.wikitext.markdown.internal.phrase.SimpleWordModifier;
import org.eclipse.mylyn.wikitext.markdown.internal.token.AutomaticLinkReplacementToken;
@@ -60,7 +62,24 @@
*/
public class MarkdownLanguage extends AbstractMarkupLanguage {
+ private final boolean enableHeuristicFeatures;
+
+ /**
+ * Constructs an instance of MarkdownLanguage with heuristic features disabled. Same as {@code MarkdownLanguage(false)}.
+ */
public MarkdownLanguage() {
+ this(false);
+ }
+
+ /**
+ * Constructs an instance of MarkdownLanguage, with the choice to enable heuristic features. Currently only extended
+ * hyperlink detection (without delimiters) is supported as a heuristic feature.
+ *
+ * @param enableHeuristicFeatures
+ * if {@code true}, enables heuristic features.
+ */
+ public MarkdownLanguage(boolean enableHeuristicFeatures) {
+ this.enableHeuristicFeatures = enableHeuristicFeatures;
setName("Markdown"); //$NON-NLS-1$
}
@@ -116,6 +135,9 @@
phraseModifierSyntax.add(new SimpleWordModifier("_", SpanType.EMPHASIS)); //$NON-NLS-1$
phraseModifierSyntax.add(new SimplePhraseModifier("~~", SpanType.DELETED)); //$NON-NLS-1$
phraseModifierSyntax.add(new SimplePhraseModifier("~", SpanType.DELETED)); //$NON-NLS-1$
+ if (enableHeuristicFeatures) {
+ phraseModifierSyntax.add(new ExtendedAutomaticLinkReplacementToken());
+ }
}
@Override
diff --git a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/internal/phrase/ExtendedAutomaticLinkReplacementToken.java b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/internal/phrase/ExtendedAutomaticLinkReplacementToken.java
new file mode 100644
index 0000000..8098f22
--- /dev/null
+++ b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/main/java/org/eclipse/mylyn/wikitext/markdown/internal/phrase/ExtendedAutomaticLinkReplacementToken.java
@@ -0,0 +1,164 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Fraunhofer FOKUS and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Max Bureck (Fraunhofer FOKUS) - initial API and implementation
+ *******************************************************************************/
+
+package org.eclipse.mylyn.wikitext.markdown.internal.phrase;
+
+import java.util.Set;
+
+import org.eclipse.mylyn.wikitext.parser.markup.PatternBasedElement;
+import org.eclipse.mylyn.wikitext.parser.markup.PatternBasedElementProcessor;
+
+import com.google.common.collect.ImmutableSet;
+
+/**
+ * Heuristic replacement for bare hyperlinks (e.g. http://www.eclipse.org) without {@code <} and {@code >} delimiters
+ * to links in the output. <br>
+ * Links are detected when the prefix "http://", "https://" or "www." follows punctuation, white space or a line
+ * beginning. Links starting with "www." will be prefixed with "http://" in the actual link href. The heuristic is
+ * conservative in the way that it only captures URLs containing only characters valid for URLs. However, it does not
+ * check the format or whether characters with semantic meaning are only used in valid positions. URLs with un-escaped
+ * characters are not detected. The reason is mostly complexity, since different parts of a URL have to be escaped
+ * differently, and parsing the URL before escaping is complex.<br>
+ * URLs end with either white space, line end, quotation mark {@code "} or a {@code <} character. This prevents most
+ * URLs with invalid characters from turning into broken links.<br>
+ * <br>
+ * The following rules regarding trailing characters are in place:
+ * <ul>
+ * <li>Trailing punctuation ({@code ?!'"*.:_~}) is stripped, since it seems more likely that it is not part of the
+ * URL.</li>
+ * <li>Trailing closing parentheses ')' are stripped, as long as the counts of opening and closing parentheses are not
+ * equal/balanced.</li>
+ * <li>A trailing ';' is stripped; if it terminates a sequence that resembles an HTML entity (pattern
+ * {@code &[a-zA-Z0-9]*;}), the whole sequence is stripped.</li>
+ * <li>If after stripping the URL is simply "http://", "https://" or "www.", the sequence is not recognized as a
+ * URL at all.</li>
+ * </ul>
+ * This behavior is loosely aligned with the
+ * <a href="https://github.github.com/gfm/#extended-autolink-path-validation">GitHub flavored Markdown autolink
+ * extension</a> and GitHub's actual Markdown implementation.
+ */
+public class ExtendedAutomaticLinkReplacementToken extends PatternBasedElement {
+
+	/**
+	 * Regex based on characters mentioned in RFC-3986: https://www.ietf.org/rfc/rfc3986.txt Note that this Regex does
+	 * not check for a completely valid HTTP link, it only checks for the {@code http(s)://} or {@code www.} prefix and
+	 * if the following characters are valid for URLs.
+	 */
+	private static final String AUTOMATIC_LINK_REGEX = "(?<=^|\\s|\\p{Punct})((https?://(?!/)|www\\.)[a-zA-Z0-9:/?#\\[\\]@!$&'\\(\\)\\*+,;=\\-\\._~%]+)(?=$|\"|\\s|<)"; //$NON-NLS-1$
+
+	private static final Set<String> EMPTY_LINKS = ImmutableSet.of("www.", "http://", "https://"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+
+	@Override
+	protected String getPattern(int groupOffset) {
+		return AUTOMATIC_LINK_REGEX;
+	}
+
+	@Override
+	protected int getPatternGroupCount() {
+		return 2; // group 1: the whole URL, group 2: its "http(s)://" or "www." prefix
+	}
+
+	@Override
+	protected PatternBasedElementProcessor newProcessor() {
+		return new PatternBasedElementProcessor() {
+			@Override
+			public void emit() {
+				String href = group(1);
+				// surplus of closing over opening parentheses; at most that many trailing ')' are stripped
+				int parensBalance = href.codePoints().map(c -> {
+					switch (c) {
+					case '(':
+						return -1;
+					case ')':
+						return +1;
+					default:
+						return 0;
+					}
+				}).sum();
+				// omit trailing punctuation; endIndex is the exclusive end of the link text, -1 if nothing is stripped
+				int endIndex = -1;
+				charLoop: for (int i = href.length() - 1; i > 3; i--) {
+					switch (href.charAt(i)) {
+					case '?':
+					case '!':
+					case '\'':
+					case '"':
+					case '*':
+					case '.':
+					case ':':
+					case '_':
+					case '~':
+						endIndex = i;
+						break;
+					case ')':
+						if (parensBalance <= 0) {
+							break charLoop;
+						}
+						// strip only the parenthesis itself; HTML entity detection applies to ';' only
+						parensBalance--;
+						endIndex = i;
+						break;
+					case ';':
+						i = skipHtmlEntity(href, i);
+						endIndex = i;
+						break;
+					default:
+						break charLoop;
+					}
+				}
+
+				String linkText = endIndex > -1 ? href.substring(0, endIndex) : href;
+				String linkHref = linkText.startsWith("www.") ? "http://" + linkText : linkText; //$NON-NLS-1$ //$NON-NLS-2$
+
+				if (EMPTY_LINKS.contains(linkText)) {
+					// do not convert "empty" links
+					builder.characters(href);
+				} else {
+					builder.link(linkHref, linkText);
+					// if characters were stripped, add them as regular text after link.
+					if (endIndex > -1) {
+						builder.characters(href.substring(endIndex));
+					}
+				}
+			}
+
+			/**
+			 * Looks backwards from the ';' at {@code endIndex} for an HTML entity like sequence ({@code &[a-zA-Z0-9]*;}).
+			 * Like the stripping loop in {@code emit()}, the first four characters of {@code href} are never inspected.
+			 *
+			 * @return the index of the {@code &} starting the sequence, or {@code endIndex} if there is no such sequence
+			 */
+			private int skipHtmlEntity(String href, int endIndex) {
+				for (int i = endIndex - 1; i > 3; i--) {
+					char c = href.charAt(i);
+					if (c == '&') {
+						return i;
+					}
+					boolean isAlphaNum = inRange(c, 'a', 'z') || inRange(c, 'A', 'Z') || inRange(c, '0', '9');
+					// if the character is not in [a-zA-Z0-9], don't skip anything
+					if (!isAlphaNum) {
+						return endIndex;
+					}
+				}
+				// no & found, don't skip anything
+				return endIndex;
+			}
+
+			/** Checks whether {@code toCheck} lies within the inclusive range of the given bounds. */
+			private boolean inRange(char toCheck, char lowerBound, char upperBound) {
+				return (toCheck >= lowerBound && toCheck <= upperBound);
+			}
+		};
+	}
+
+}
diff --git a/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/test/java/org/eclipse/mylyn/internal/wikitext/markdown/tests/MarkdownLanguageExtendedAutomaticLinkReplacementTest.java b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/test/java/org/eclipse/mylyn/internal/wikitext/markdown/tests/MarkdownLanguageExtendedAutomaticLinkReplacementTest.java
new file mode 100644
index 0000000..d6dbc0e
--- /dev/null
+++ b/wikitext/core/org.eclipse.mylyn.wikitext.markdown/src/test/java/org/eclipse/mylyn/internal/wikitext/markdown/tests/MarkdownLanguageExtendedAutomaticLinkReplacementTest.java
@@ -0,0 +1,168 @@
+/*******************************************************************************
+ * Copyright (c) 2020 Fraunhofer FOKUS and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *
+ * Contributors:
+ * Max Bureck (Fraunhofer FOKUS) - initial API and implementation
+ *******************************************************************************/
+
+package org.eclipse.mylyn.internal.wikitext.markdown.tests;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.eclipse.mylyn.wikitext.markdown.MarkdownLanguage;
+import org.eclipse.mylyn.wikitext.toolkit.AbstractMarkupGenerationTest;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+/**
+ * Tests the extended (bare) hyperlink detection of {@link MarkdownLanguage}. Every test is run once per URL prefix
+ * ("http://", "https://" and no prefix, i.e. a plain "www." URL).
+ */
+@RunWith(Parameterized.class)
+public class MarkdownLanguageExtendedAutomaticLinkReplacementTest
+		extends AbstractMarkupGenerationTest<MarkdownLanguage> {
+
+	@Parameters
+	public static Collection<Object[]> data() {
+		return Arrays.asList(new Object[][] { { "http://" }, { "https://" }, { "" } });
+	}
+
+	private final String urlPrefix;
+
+	private final String hrefPrefix;
+
+	public MarkdownLanguageExtendedAutomaticLinkReplacementTest(String prefix) {
+		urlPrefix = prefix;
+		hrefPrefix = prefix.isEmpty() ? "http://" : prefix;
+	}
+
+	@Test
+	public void testOnlyLink() {
+		String markdown = urlPrefix + "www.eclipse.org:80/p2%20update/!+*,';$[foo]/(bar)/~/_emf_/-?bar=baz&oomph#foo";
+		// note that in xhtml attribute values are escaped, therefore
+		// ' is escaped as &apos; and & is escaped as &amp;
+		String expected = "<p><a href=\"" + hrefPrefix
+				+ "www.eclipse.org:80/p2%20update/!+*,&apos;;$[foo]/(bar)/~/_emf_/-?bar=baz&amp;oomph#foo\">"
+				+ urlPrefix
+				+ "www.eclipse.org:80/p2%20update/!+*,';$[foo]/(bar)/~/_emf_/-?bar=baz&amp;oomph#foo</a></p>";
+		assertMarkup(expected, markdown);
+	}
+
+	@Test
+	public void testLinkWithBalancedParens() {
+		String markdown = urlPrefix + "www.eclipse.org/()((fo(o(bar()g)ee)))";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org/()((fo(o(bar()g)ee)))\">" + urlPrefix
+				+ "www.eclipse.org/()((fo(o(bar()g)ee)))</a></p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkWithUnbalancedParens() {
+		String markdown = urlPrefix + "www.eclipse.org/)((foo(bar()g)ee)))";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org/)((foo(bar()g)ee)\">" + urlPrefix
+				+ "www.eclipse.org/)((foo(bar()g)ee)</a>))</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkWithUnbalancedParensAndPunctuation() {
+		String markdown = urlPrefix + "www.eclipse.org/)((foo(bar()g)ee.)!*);)";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org/)((foo(bar()g)ee.)\">" + urlPrefix
+				+ "www.eclipse.org/)((foo(bar()g)ee.)</a>!*);)</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkWithInvalidCharacter() {
+		String markdown = urlPrefix + "www.eclipse.örg";
+		String expectedOutput = "<p>" + urlPrefix + "www.eclipse.örg</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testEmptyLinkAfterStrip() {
+		String prefix = urlPrefix.isEmpty() ? "www." : urlPrefix;
+		String markdown = prefix + "?.~";
+		String expectedOutput = "<p>" + markdown + "</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkAtBeginning() {
+		String markdown = urlPrefix + "www.eclipse.org foo bar";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a> foo bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testInvalidHtmlEntity() {
+		String markdown = urlPrefix + "www.eclipse.org&@mp;";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org&amp;@mp\">" + urlPrefix
+				+ "www.eclipse.org&amp;@mp</a>;</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testHtmlEntityDetection() {
+		String markdown = urlPrefix + "www.eclipse.org&amp;";
+		String expectedOutput = "<p><a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a>&amp;amp;</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkEndOfQuote() {
+		String markdown = "foo " + urlPrefix + "www.eclipse.org/downloads\" bar";
+		String expectedOutput = "<p>foo <a href=\"" + hrefPrefix + "www.eclipse.org/downloads\">" + urlPrefix
+				+ "www.eclipse.org/downloads</a>\" bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkEndWithMultiplePunctuationChars() {
+		String markdown = "foo " + urlPrefix + "www.eclipse.org/downloads\"~. bar";
+		String expectedOutput = "<p>foo <a href=\"" + hrefPrefix + "www.eclipse.org/downloads\">" + urlPrefix
+				+ "www.eclipse.org/downloads</a>\"~. bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkInText() {
+		String markdown = "foo " + urlPrefix + "www.eclipse.org bar";
+		String expectedOutput = "<p>foo <a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a> bar</p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkInList() {
+		String markdown = "\n - " + urlPrefix + "www.eclipse.org\n";
+		String expectedOutput = "<ul><li><a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a></li></ul>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Test
+	public void testLinkAtEnd() {
+		String markdown = "foo bar " + urlPrefix + "www.eclipse.org";
+		String expectedOutput = "<p>foo bar <a href=\"" + hrefPrefix + "www.eclipse.org\">" + urlPrefix
+				+ "www.eclipse.org</a></p>";
+		assertMarkup(expectedOutput, markdown);
+	}
+
+	@Override
+	protected MarkdownLanguage createMarkupLanguage() {
+		return new MarkdownLanguage(true);
+	}
+
+}