| /*=============================================================================# |
| # Copyright (c) 2015, 2021 Stephan Wahlbrink and others. |
| # |
| # This program and the accompanying materials are made available under the |
| # terms of the Eclipse Public License 2.0 which is available at |
| # https://www.eclipse.org/legal/epl-2.0, or the Apache License, Version 2.0 |
| # which is available at https://www.apache.org/licenses/LICENSE-2.0. |
| # |
| # SPDX-License-Identifier: EPL-2.0 OR Apache-2.0 |
| # |
| # Contributors: |
| # Stephan Wahlbrink <sw@wahlbrink.eu> - initial API and implementation |
| #=============================================================================*/ |
| |
| package org.eclipse.statet.jcommons.text.core.util; |
| |
| import org.eclipse.statet.jcommons.lang.NonNullByDefault; |
| import org.eclipse.statet.jcommons.lang.Nullable; |
| import org.eclipse.statet.jcommons.text.core.input.FilterParserInput; |
| import org.eclipse.statet.jcommons.text.core.input.StringParserInput; |
| import org.eclipse.statet.jcommons.text.core.input.TextParserInput; |
| |
| |
| /** |
| * Text parser input stripping out HTML markup providing decoded content text. |
| */ |
| @NonNullByDefault |
| public class HtmlStripParserInput extends FilterParserInput { |
| |
| private static class Match { |
| |
| private final char[] chars; |
| private final int n; |
| |
| public Match(final char[] chars, final int n) { |
| this.chars= chars; |
| this.n= n; |
| } |
| |
| } |
| |
| |
| public HtmlStripParserInput(final TextParserInput source, final int defaultBufferSize) { |
| super(source, defaultBufferSize); |
| } |
| |
| public HtmlStripParserInput(final TextParserInput source) { |
| this(source, DEFAULT_BUFFER_SIZE); |
| } |
| |
| public HtmlStripParserInput(final String source) { |
| this(new StringParserInput(source).init(), |
| Math.min(source.length(), DEFAULT_BUFFER_SIZE) ); |
| } |
| |
| |
| @Override |
| protected int read(final TextParserInput in, final char[] buffer, |
| final int[] beginIndexes, final int[] endIndexes, |
| final int beginIdx, final int requiredEnd, final int recommendEnd) { |
| int idx= beginIdx; |
| ITER_C0: while (idx < recommendEnd) { |
| final int c0= in.get(0); |
| C0: switch (c0) { |
| case EOF: |
| break ITER_C0; |
| case '<': |
| if (consumeTag(in)) { |
| continue ITER_C0; |
| } |
| break C0; |
| case '&': |
| { final Match match= readEntity(in); |
| if (match != null) { |
| if (idx + match.chars.length <= buffer.length) { |
| final int beginIndex= in.getIndex(); |
| final int endIndex= beginIndex + in.getLengthInSource(match.n); |
| for (int i= 0; i < match.chars.length; i++, idx++) { |
| buffer[idx]= match.chars[i]; |
| beginIndexes[idx]= beginIndex; |
| endIndexes[idx]= endIndex; |
| } |
| in.consume(match.n); |
| continue ITER_C0; |
| } |
| else { |
| break ITER_C0; |
| } |
| } |
| break C0; |
| } |
| default: |
| break C0; |
| } |
| |
| buffer[idx]= (char) c0; |
| beginIndexes[idx]= in.getIndex(); |
| endIndexes[idx]= in.getIndex() + in.getLengthInSource(1); |
| idx++; |
| in.consume(1); |
| continue; |
| } |
| beginIndexes[idx]= in.getIndex(); |
| return idx; |
| } |
| |
| |
| private boolean consumeTag(final TextParserInput in) { |
| // after: < |
| int n; |
| switch (in.get(1)) { |
| // EOF: |
| // return false; -> default |
| |
| case '?': // pi |
| n= 2; |
| ITER_CN: while (true) { |
| switch (in.get(n++)) { |
| case EOF: |
| in.consume(n - 1); |
| return true; |
| case '"': |
| n= consumeQuoteD(in, n); |
| continue ITER_CN; |
| case '\'': |
| n= consumeQuoteS(in, n); |
| continue ITER_CN; |
| case '?': |
| if (in.get(n) == '>') { |
| in.consume(n + 1); |
| return true; |
| } |
| //$FALL-THROUGH$ |
| default: |
| if (n >= 0x400) { |
| in.consume(n); |
| n= 0; |
| } |
| continue ITER_CN; |
| } |
| } |
| |
| case '!': |
| if (in.matches(2, '-', '-')) { // comment |
| n= 4; |
| ITER_CN: while (true) { |
| switch (in.get(n++)) { |
| case EOF: |
| in.consume(n - 1); |
| return true; |
| case '>': |
| if (in.matches(n - 3, '-', '-')) { |
| in.consume(n); |
| return true; |
| } |
| //$FALL-THROUGH$ |
| default: |
| if (n >= 0x400) { |
| in.consume(n); |
| n= 0; |
| } |
| continue ITER_CN; |
| } |
| } |
| } |
| //$FALL-THROUGH$ |
| |
| case '/': |
| case 'A': |
| case 'B': |
| case 'C': |
| case 'D': |
| case 'E': |
| case 'F': |
| case 'G': |
| case 'H': |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| case 'O': |
| case 'P': |
| case 'Q': |
| case 'R': |
| case 'S': |
| case 'T': |
| case 'U': |
| case 'V': |
| case 'W': |
| case 'X': |
| case 'Y': |
| case 'Z': |
| case 'a': |
| case 'b': |
| case 'c': |
| case 'd': |
| case 'e': |
| case 'f': |
| case 'g': |
| case 'h': |
| case 'i': |
| case 'j': |
| case 'k': |
| case 'l': |
| case 'm': |
| case 'n': |
| case 'o': |
| case 'p': |
| case 'q': |
| case 'r': |
| case 's': |
| case 't': |
| case 'u': |
| case 'v': |
| case 'w': |
| case 'x': |
| case 'y': |
| case 'z': |
| // tag start |
| n= 2; |
| ITER_CN: while (true) { |
| switch (in.get(n++)) { |
| case EOF: |
| in.consume(n - 1); |
| return true; |
| case '"': |
| n= consumeQuoteD(in, n); |
| continue ITER_CN; |
| case '\'': |
| n= consumeQuoteS(in, n); |
| continue ITER_CN; |
| case '>': |
| in.consume(n); |
| return true; |
| default: |
| if (n >= 0x400) { |
| in.consume(n); |
| n= 0; |
| } |
| continue ITER_CN; |
| } |
| } |
| |
| default: |
| return false; |
| } |
| } |
| |
| private int consumeQuoteD(final TextParserInput in, int n) { |
| while (true) { |
| switch (in.get(n++)) { |
| case EOF: |
| return n - 1; |
| case '"': |
| return n; |
| default: |
| if (n >= 0x400) { |
| in.consume(n); |
| n= 0; |
| } |
| continue; |
| } |
| } |
| } |
| |
| private int consumeQuoteS(final TextParserInput in, int n) { |
| while (true) { |
| switch (in.get(n++)) { |
| case EOF: |
| return n - 1; |
| case '\'': |
| return n; |
| default: |
| if (n >= 0x400) { |
| in.consume(n); |
| n= 0; |
| } |
| continue; |
| } |
| } |
| } |
| |
| private @Nullable Match readEntity(final TextParserInput in) { |
| // after: & |
| int n; |
| switch (in.get(1)) { |
| // case EOF: |
| // return null; -> default |
| |
| case '#': |
| switch (in.get(2)) { |
| // case EOF: |
| // return null; -> default |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| n= 3; |
| ITER_CN: while (n < 10) { |
| switch (in.get(n++)) { |
| // case EOF: |
| // break; -> default |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| continue ITER_CN; |
| case ';': |
| return resolveEntity(in, Integer.parseInt(in.getString(2, n - 1), 10), n); |
| default: |
| break ITER_CN; |
| } |
| } |
| return null; |
| case 'x': |
| case 'X': |
| n= 3; |
| ITER_CN: while (n < 10) { |
| switch (in.get(n++)) { |
| // case EOF: |
| // break; // -> default |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| case 'A': |
| case 'B': |
| case 'C': |
| case 'D': |
| case 'E': |
| case 'F': |
| case 'a': |
| case 'b': |
| case 'c': |
| case 'd': |
| case 'e': |
| case 'f': |
| continue ITER_CN; |
| case ';': |
| if (n > 4) { |
| return resolveEntity(in, Integer.parseInt(in.getString(3, n - 1), 16), n); |
| } |
| break ITER_CN; |
| default: |
| break ITER_CN; |
| } |
| } |
| return null; |
| default: |
| return null; |
| } |
| |
| case 'A': |
| case 'B': |
| case 'C': |
| case 'D': |
| case 'E': |
| case 'F': |
| case 'G': |
| case 'H': |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| case 'O': |
| case 'P': |
| case 'Q': |
| case 'R': |
| case 'S': |
| case 'T': |
| case 'U': |
| case 'V': |
| case 'W': |
| case 'X': |
| case 'Y': |
| case 'Z': |
| case 'a': |
| case 'b': |
| case 'c': |
| case 'd': |
| case 'e': |
| case 'f': |
| case 'g': |
| case 'h': |
| case 'i': |
| case 'j': |
| case 'k': |
| case 'l': |
| case 'm': |
| case 'n': |
| case 'o': |
| case 'p': |
| case 'q': |
| case 'r': |
| case 's': |
| case 't': |
| case 'u': |
| case 'v': |
| case 'w': |
| case 'x': |
| case 'y': |
| case 'z': |
| n= 2; |
| ITER_CN: while (n < 40) { |
| switch (in.get(n++)) { |
| // case EOF: |
| // break; -> default |
| case 'A': |
| case 'B': |
| case 'C': |
| case 'D': |
| case 'E': |
| case 'F': |
| case 'G': |
| case 'H': |
| case 'I': |
| case 'J': |
| case 'K': |
| case 'L': |
| case 'M': |
| case 'N': |
| case 'O': |
| case 'P': |
| case 'Q': |
| case 'R': |
| case 'S': |
| case 'T': |
| case 'U': |
| case 'V': |
| case 'W': |
| case 'X': |
| case 'Y': |
| case 'Z': |
| case 'a': |
| case 'b': |
| case 'c': |
| case 'd': |
| case 'e': |
| case 'f': |
| case 'g': |
| case 'h': |
| case 'i': |
| case 'j': |
| case 'k': |
| case 'l': |
| case 'm': |
| case 'n': |
| case 'o': |
| case 'p': |
| case 'q': |
| case 'r': |
| case 's': |
| case 't': |
| case 'u': |
| case 'v': |
| case 'w': |
| case 'x': |
| case 'y': |
| case 'z': |
| continue ITER_CN; |
| case ';': |
| return resolveEntity(in, in.getString(1, n - 1), n); |
| default: |
| break ITER_CN; |
| } |
| } |
| return null; |
| |
| default: |
| return null; |
| } |
| } |
| |
| private static @Nullable Match resolveEntity(final TextParserInput in, final int codePoint, final int n) { |
| try { |
| return new Match(Character.toChars(codePoint), n); |
| } |
| catch (final IllegalArgumentException e) { |
| } |
| return null; |
| } |
| |
| private static @Nullable Match resolveEntity(final TextParserInput in, final String name, final int n) { |
| final HtmlUtils.Entity entity= HtmlUtils.getNamedEntity(name); |
| if (entity != null) { |
| return new Match(entity.getChars(), n); |
| } |
| return null; |
| } |
| |
| } |