/*=============================================================================#
 # Copyright (c) 2015, 2021 Stephan Wahlbrink and others.
 # 
 # This program and the accompanying materials are made available under the
 # terms of the Eclipse Public License 2.0 which is available at
 # https://www.eclipse.org/legal/epl-2.0, or the Apache License, Version 2.0
 # which is available at https://www.apache.org/licenses/LICENSE-2.0.
 # 
 # SPDX-License-Identifier: EPL-2.0 OR Apache-2.0
 # 
 # Contributors:
 #     Stephan Wahlbrink <sw@wahlbrink.eu> - initial API and implementation
 #=============================================================================*/

package org.eclipse.statet.jcommons.text.core.util;

import org.eclipse.statet.jcommons.lang.NonNullByDefault;
import org.eclipse.statet.jcommons.lang.Nullable;
import org.eclipse.statet.jcommons.text.core.input.FilterParserInput;
import org.eclipse.statet.jcommons.text.core.input.StringParserInput;
import org.eclipse.statet.jcommons.text.core.input.TextParserInput;


/**
 * Text parser input stripping out HTML markup providing decoded content text.
 * 
 * <p>The filter drops processing instructions ({@code <?...?>}), comments
 * ({@code <!--...-->}) and tags (a {@code <} followed by {@code /}, {@code !} or an ASCII
 * letter, up to the next {@code >} outside of quotes) and replaces numeric and named character
 * references ({@code &#...;}, {@code &#x...;}, {@code &name;}) by the chars they denote.
 * Markup which is not closed is dropped up to the end of the input.  All other chars, including
 * {@code <} and {@code &} which do not start one of the constructs above, are passed through
 * unchanged.</p>
 */
@NonNullByDefault
public class HtmlStripParserInput extends FilterParserInput {
	
	
	/**
	 * Look-ahead offset at which the chars of a markup construct scanned so far are consumed,
	 * so that the look-ahead in the source input stays bounded for long constructs.
	 */
	private static final int MARKUP_CHUNK_SIZE= 0x400;
	
	/** Exclusive limit for the scan offset of a numeric character reference. */
	private static final int NUM_ENTITY_SCAN_LIMIT= 10;
	
	/** Exclusive limit for the scan offset of a named character reference. */
	private static final int NAMED_ENTITY_SCAN_LIMIT= 40;
	
	
	/**
	 * Result of a successfully resolved character reference.
	 */
	private static final class Match {
		
		/** The replacement chars for the reference. */
		private final char[] chars;
		/** The number of chars the reference occupies in the input (including {@code &} and {@code ;}). */
		private final int n;
		
		public Match(final char[] chars, final int n) {
			this.chars= chars;
			this.n= n;
		}
		
	}
	
	
	/**
	 * Creates a new input filtering the specified source input.
	 * 
	 * @param source the input providing the HTML text
	 * @param defaultBufferSize the default buffer size, passed on to the super class
	 */
	public HtmlStripParserInput(final TextParserInput source, final int defaultBufferSize) {
		super(source, defaultBufferSize);
	}
	
	/**
	 * Creates a new input filtering the specified source input using {@code DEFAULT_BUFFER_SIZE}.
	 * 
	 * @param source the input providing the HTML text
	 */
	public HtmlStripParserInput(final TextParserInput source) {
		this(source, DEFAULT_BUFFER_SIZE);
	}
	
	/**
	 * Creates a new input filtering the specified HTML text.
	 * 
	 * <p>The default buffer size is limited to the length of the text.</p>
	 * 
	 * @param source the HTML text
	 */
	public HtmlStripParserInput(final String source) {
		this(new StringParserInput(source).init(),
				Math.min(source.length(), DEFAULT_BUFFER_SIZE) );
	}
	
	
	/**
	 * Fills the buffer with content chars read from the source input.
	 * 
	 * <p>For each char written to {@code buffer}, the index in the source where the char (or the
	 * character reference it results from) begins and ends is written to the same position in
	 * {@code beginIndexes} and {@code endIndexes}.  Finally the current source index is written
	 * to {@code beginIndexes} at the returned position.</p>
	 * 
	 * @param in the source input
	 * @param buffer the buffer for the content chars
	 * @param beginIndexes the buffer for the begin indexes in the source
	 * @param endIndexes the buffer for the end indexes in the source
	 * @param beginIdx the position in the buffers where to start writing
	 * @param requiredEnd not used by this implementation
	 * @param recommendEnd the position in the buffers at which reading stops
	 * @return the position in the buffers after the last char written
	 */
	@Override
	protected int read(final TextParserInput in, final char[] buffer,
			final int[] beginIndexes, final int[] endIndexes,
			final int beginIdx, final int requiredEnd, final int recommendEnd) {
		int idx= beginIdx;
		while (idx < recommendEnd) {
			final int c0= in.get(0);
			if (c0 == EOF) {
				break;
			}
			if (c0 == '<' && consumeTag(in)) {
				// markup dropped, nothing to write
				continue;
			}
			if (c0 == '&') {
				final Match match= readEntity(in);
				if (match != null) {
					if (idx + match.chars.length > buffer.length) {
						// replacement does not fit any more; leave the reference unconsumed
						break;
					}
					// all replacement chars map to the complete reference in the source
					final int refBeginIndex= in.getIndex();
					final int refEndIndex= refBeginIndex + in.getLengthInSource(match.n);
					for (final char c : match.chars) {
						buffer[idx]= c;
						beginIndexes[idx]= refBeginIndex;
						endIndexes[idx]= refEndIndex;
						idx++;
					}
					in.consume(match.n);
					continue;
				}
			}
			
			// plain content char (including '<' and '&' not starting a known construct)
			final int charBeginIndex= in.getIndex();
			buffer[idx]= (char)c0;
			beginIndexes[idx]= charBeginIndex;
			endIndexes[idx]= charBeginIndex + in.getLengthInSource(1);
			idx++;
			in.consume(1);
		}
		beginIndexes[idx]= in.getIndex();
		return idx;
	}
	
	
	/**
	 * Consumes the markup construct starting with the {@code <} at the current position of the
	 * input, if there is one.
	 * 
	 * @param in the source input, positioned at {@code <}
	 * @return {@code true} if a construct was recognized and consumed, otherwise {@code false}
	 *     (nothing is consumed in this case)
	 */
	private boolean consumeTag(final TextParserInput in) {
		// after: <
		final int c1= in.get(1);
		if (c1 == '?') { // pi
			consumeProcessingInstruction(in);
			return true;
		}
		if (c1 == '!' && in.matches(2, '-', '-')) { // comment
			consumeComment(in);
			return true;
		}
		if (c1 == '!' || c1 == '/' || isAsciiLetter(c1)) { // tag start
			consumeElementTag(in);
			return true;
		}
		// includes EOF
		return false;
	}
	
	/**
	 * Consumes a processing instruction up to and including the closing {@code ?>}, or up to the
	 * end of the input.  Quoted sections are skipped.
	 * 
	 * @param in the source input, positioned at {@code <?}
	 */
	private void consumeProcessingInstruction(final TextParserInput in) {
		int n= 2;
		while (true) {
			switch (in.get(n++)) {
			case EOF:
				in.consume(n - 1);
				return;
			case '"':
				n= consumeQuoted(in, n, '"');
				continue;
			case '\'':
				n= consumeQuoted(in, n, '\'');
				continue;
			case '?':
				if (in.get(n) == '>') {
					in.consume(n + 1);
					return;
				}
				//$FALL-THROUGH$
			default:
				if (n >= MARKUP_CHUNK_SIZE) {
					in.consume(n);
					n= 0;
				}
				continue;
			}
		}
	}
	
	/**
	 * Consumes a comment up to and including the closing {@code -->}, or up to the end of the
	 * input.
	 * 
	 * @param in the source input, positioned at {@code <!--}
	 */
	private void consumeComment(final TextParserInput in) {
		int n= 4;
		while (true) {
			final int c= in.get(n++);
			if (c == EOF) {
				in.consume(n - 1);
				return;
			}
			// n - 3 is the offset of the first of the two chars preceding '>'
			if (c == '>' && in.matches(n - 3, '-', '-')) {
				in.consume(n);
				return;
			}
			if (n >= MARKUP_CHUNK_SIZE) {
				// Keep the last two scanned chars so that a closing "-->" crossing the chunk
				// boundary is detected and the look-behind above never uses a negative offset.
				in.consume(n - 2);
				n= 2;
			}
		}
	}
	
	/**
	 * Consumes a tag up to and including the closing {@code >}, or up to the end of the input.
	 * Quoted sections are skipped.
	 * 
	 * @param in the source input, positioned at the {@code <} of the tag
	 */
	private void consumeElementTag(final TextParserInput in) {
		int n= 2;
		while (true) {
			switch (in.get(n++)) {
			case EOF:
				in.consume(n - 1);
				return;
			case '"':
				n= consumeQuoted(in, n, '"');
				continue;
			case '\'':
				n= consumeQuoted(in, n, '\'');
				continue;
			case '>':
				in.consume(n);
				return;
			default:
				if (n >= MARKUP_CHUNK_SIZE) {
					in.consume(n);
					n= 0;
				}
				continue;
			}
		}
	}
	
	/**
	 * Skips a quoted section inside of markup.
	 * 
	 * <p>Long sections are consumed in chunks; the returned offset is relative to the position
	 * of the input when the method returns.</p>
	 * 
	 * @param in the source input
	 * @param n the offset of the first char after the opening quote
	 * @param quote the quote char closing the section
	 * @return the offset after the closing quote, or the offset of the end of the input if the
	 *     section is not closed
	 */
	private int consumeQuoted(final TextParserInput in, int n, final char quote) {
		while (true) {
			final int c= in.get(n++);
			if (c == EOF) {
				return n - 1;
			}
			if (c == quote) {
				return n;
			}
			if (n >= MARKUP_CHUNK_SIZE) {
				in.consume(n);
				n= 0;
			}
		}
	}
	
	
	/**
	 * Reads the character reference starting with the {@code &} at the current position of the
	 * input, if there is one.  Nothing is consumed.
	 * 
	 * @param in the source input, positioned at {@code &}
	 * @return the resolved reference, or {@code null} if there is no valid/known reference
	 */
	private @Nullable Match readEntity(final TextParserInput in) {
		// after: &
		final int c1= in.get(1);
		if (c1 == '#') {
			final int c2= in.get(2);
			if (isDecDigit(c2)) {
				return readDecEntity(in);
			}
			if (c2 == 'x' || c2 == 'X') {
				return readHexEntity(in);
			}
			return null;
		}
		if (isAsciiLetter(c1)) {
			return readNamedEntity(in);
		}
		// includes EOF
		return null;
	}
	
	/**
	 * Reads a decimal character reference.
	 * 
	 * @param in the source input, positioned at {@code &#} followed by a decimal digit
	 * @return the resolved reference, or {@code null}
	 */
	private @Nullable Match readDecEntity(final TextParserInput in) {
		int n= 3;
		while (n < NUM_ENTITY_SCAN_LIMIT) {
			final int c= in.get(n++);
			if (isDecDigit(c)) {
				continue;
			}
			if (c == ';') {
				// digits at [2, n - 1); the scan limit keeps the value within the int range
				return resolveEntity(Integer.parseInt(in.getString(2, n - 1), 10), n);
			}
			break;
		}
		return null;
	}
	
	/**
	 * Reads a hexadecimal character reference.
	 * 
	 * @param in the source input, positioned at {@code &#x} or {@code &#X}
	 * @return the resolved reference, or {@code null}
	 */
	private @Nullable Match readHexEntity(final TextParserInput in) {
		int n= 3;
		while (n < NUM_ENTITY_SCAN_LIMIT) {
			final int c= in.get(n++);
			if (isHexDigit(c)) {
				continue;
			}
			if (c == ';' && n > 4) { // at least one digit
				// digits at [3, n - 1); the scan limit keeps the value within the int range
				return resolveEntity(Integer.parseInt(in.getString(3, n - 1), 16), n);
			}
			break;
		}
		return null;
	}
	
	/**
	 * Reads a named character reference.
	 * 
	 * <p>The name must start with an ASCII letter, followed by ASCII letters or digits
	 * (e.g. {@code frac12}).</p>
	 * 
	 * @param in the source input, positioned at {@code &} followed by an ASCII letter
	 * @return the resolved reference, or {@code null}
	 */
	private @Nullable Match readNamedEntity(final TextParserInput in) {
		int n= 2;
		while (n < NAMED_ENTITY_SCAN_LIMIT) {
			final int c= in.get(n++);
			if (isAsciiLetter(c) || isDecDigit(c)) {
				continue;
			}
			if (c == ';') {
				// name at [1, n - 1)
				return resolveEntity(in.getString(1, n - 1), n);
			}
			break;
		}
		return null;
	}
	
	/**
	 * Resolves a numeric character reference.
	 * 
	 * @param codePoint the code point denoted by the reference
	 * @param n the length of the reference in the input
	 * @return the match, or {@code null} if the value is not a valid Unicode code point
	 */
	private static @Nullable Match resolveEntity(final int codePoint, final int n) {
		if (!Character.isValidCodePoint(codePoint)) {
			return null;
		}
		return new Match(Character.toChars(codePoint), n);
	}
	
	/**
	 * Resolves a named character reference.
	 * 
	 * @param name the name of the entity (without {@code &} and {@code ;})
	 * @param n the length of the reference in the input
	 * @return the match, or {@code null} if no entity is found for the name
	 */
	private static @Nullable Match resolveEntity(final String name, final int n) {
		final HtmlUtils.Entity entity= HtmlUtils.getNamedEntity(name);
		if (entity != null) {
			return new Match(entity.getChars(), n);
		}
		return null;
	}
	
	
	private static boolean isAsciiLetter(final int c) {
		return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'));
	}
	
	private static boolean isDecDigit(final int c) {
		return (c >= '0' && c <= '9');
	}
	
	private static boolean isHexDigit(final int c) {
		return (isDecDigit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'));
	}
	
}
