blob: a6652fcde03872cb0c2c04e6aa10303fa528df16 [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2000, 2012 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/legal/epl-2.0/
*
* SPDX-License-Identifier: EPL-2.0
*
* Contributors:
* IBM Corporation - initial API and implementation
* Sergey Prigogin, Google
*******************************************************************************/
package org.eclipse.cdt.internal.ui.text;
import java.text.CharacterIterator;
import org.eclipse.core.runtime.Assert;
import com.ibm.icu.text.BreakIterator;
/**
* A C break iterator. It returns all breaks, including before and after
* whitespace, and it returns all camel case breaks.
* <p>
* A line break may be any of "\n", "\r", "\r\n", "\n\r".
* </p>
*
* @since 4.0
*/
public class CBreakIterator extends BreakIterator {
/**
 * A run of characters that belong to the same category. A run keeps count
 * of the characters it has accepted in <code>length</code>.
 */
protected static abstract class Run {
    /** Number of characters accepted since the last call to <code>init()</code>. */
    protected int length;

    /**
     * Creates a run and puts it into its initial state via <code>init()</code>.
     */
    public Run() {
        init();
    }

    /**
     * Offers <code>ch</code> to this run. The default implementation accepts
     * the character if <code>isValid(ch)</code> holds and, in that case,
     * increments <code>length</code> by one.
     *
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> was consumed,
     *         <code>false</code> otherwise
     */
    protected boolean consume(char ch) {
        final boolean accepted = isValid(ch);
        if (accepted) {
            length++;
        }
        return accepted;
    }

    /**
     * Tells whether this kind of run accepts <code>ch</code>. Implementations
     * only test the character; the default <code>consume</code> calls this
     * method to decide whether to count it.
     *
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> is accepted
     */
    protected abstract boolean isValid(char ch);

    /**
     * Puts this run back into its initial state by resetting
     * <code>length</code> to zero.
     */
    protected void init() {
        length = 0;
    }
}
/**
 * A run of whitespace characters, excluding the line delimiter characters
 * <code>'\n'</code> and <code>'\r'</code>.
 */
static final class Whitespace extends Run {
    /**
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> is whitespace but neither
     *         a line feed nor a carriage return
     */
    @Override
    protected boolean isValid(char ch) {
        if (ch == '\n' || ch == '\r') {
            return false;
        }
        return Character.isWhitespace(ch);
    }
}
/**
 * A run covering a single line delimiter: one of <code>'\n'</code> or
 * <code>'\r'</code>, optionally followed by the other one.
 */
static final class LineDelimiter extends Run {
    /** Marker state: nothing consumed yet. */
    private static final char INIT = '\0';
    /** Marker state: two different delimiter characters consumed, run is complete. */
    private static final char EXIT = '\1';
    /**
     * Current state: <code>INIT</code>, then the first delimiter character
     * that was consumed, then <code>EXIT</code>.
     */
    private char fState;

    /**
     * Resets the length and returns to the <code>INIT</code> state.
     */
    @Override
    protected void init() {
        super.init();
        fState = INIT;
    }

    /**
     * Accepts the first delimiter character, and a second one only if it
     * differs from the first. Everything else is rejected.
     *
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> was consumed
     */
    @Override
    protected boolean consume(char ch) {
        if (fState == EXIT || !isValid(ch)) {
            return false;
        }
        if (fState == INIT) {
            // first delimiter character: remember which one it was
            fState = ch;
        } else if (fState == ch) {
            // same character again, e.g. "\n\n": that is a new line delimiter
            return false;
        } else {
            // the complementary character completes a two-character delimiter
            fState = EXIT;
        }
        length++;
        return true;
    }

    /**
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> is a line feed or a
     *         carriage return
     */
    @Override
    protected boolean isValid(char ch) {
        return ch == '\n' || ch == '\r';
    }
}
/**
 * A run of identifier characters, consumed as a whole without any
 * camel case breaks.
 */
static final class Identifier extends Run {
    /**
     * @param ch the character to test
     * @return <code>true</code> if
     *         <code>Character.isJavaIdentifierPart(ch)</code> holds
     */
    @Override
    protected boolean isValid(char ch) {
        return Character.isJavaIdentifierPart(ch);
    }
}
/**
 * A run covering one segment of a camel case identifier. The segment is
 * found by a small state machine driven by the kind of each character;
 * see <code>MATRIX</code> for the transitions.
 */
static final class CamelCaseIdentifier extends Run {
    /* states */
    private static final int S_INIT = 0;
    private static final int S_LOWER = 1;
    private static final int S_ONE_CAP = 2;
    private static final int S_ALL_CAPS = 3;
    private static final int S_UNDERSCORE = 4;
    /** Terminal state: the offered character is not part of the run. */
    private static final int S_EXIT = 5;
    /**
     * Terminal state: the offered character is not part of the run and the
     * previously consumed character is given back as well.
     */
    private static final int S_EXIT_MINUS_ONE = 6;

    /* character types */
    private static final int K_INVALID = 0;
    private static final int K_LOWER = 1;
    private static final int K_UPPER = 2;
    private static final int K_UNDERSCORE = 3;
    private static final int K_OTHER = 4;

    /** The current state, one of the <code>S_*</code> constants. */
    private int fState;

    /**
     * Transition table indexed by <code>[state][character kind]</code>.
     * There are rows only for the non-terminal states.
     */
    private static final int[][] MATRIX = new int[][] {
        // K_INVALID, K_LOWER, K_UPPER, K_UNDERSCORE, K_OTHER
        { S_EXIT, S_LOWER, S_ONE_CAP, S_UNDERSCORE, S_LOWER }, // S_INIT
        { S_EXIT, S_LOWER, S_EXIT, S_UNDERSCORE, S_LOWER }, // S_LOWER
        { S_EXIT, S_LOWER, S_ALL_CAPS, S_UNDERSCORE, S_LOWER }, // S_ONE_CAP
        { S_EXIT, S_EXIT_MINUS_ONE, S_ALL_CAPS, S_UNDERSCORE, S_LOWER }, // S_ALL_CAPS
        { S_EXIT, S_EXIT, S_EXIT, S_UNDERSCORE, S_EXIT }, // S_UNDERSCORE
    };

    /**
     * Resets the length and returns to the <code>S_INIT</code> state.
     */
    @Override
    protected void init() {
        super.init();
        fState = S_INIT;
    }

    /**
     * Advances the state machine by <code>ch</code> and adjusts the length
     * according to the state that was entered.
     *
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> was consumed,
     *         <code>false</code> if the run ended
     */
    @Override
    protected boolean consume(char ch) {
        fState = MATRIX[fState][getKind(ch)];
        if (fState == S_EXIT) {
            return false;
        }
        if (fState == S_EXIT_MINUS_ONE) {
            // a lower case letter after several capitals: give the last
            // capital back so that it is not counted in this run
            length--;
            return false;
        }
        if (fState == S_LOWER || fState == S_ONE_CAP || fState == S_ALL_CAPS || fState == S_UNDERSCORE) {
            length++;
            return true;
        }
        // no transition leads to any other state
        Assert.isTrue(false);
        return false;
    }

    /**
     * Determines the kind of a character.
     *
     * @param ch the character to test
     * @return one of the <code>K_*</code> constants
     */
    private int getKind(char ch) {
        final int kind;
        if (Character.isUpperCase(ch)) {
            kind = K_UPPER;
        } else if (Character.isLowerCase(ch)) {
            kind = K_LOWER;
        } else if (ch == '_') {
            kind = K_UNDERSCORE;
        } else if (Character.isJavaIdentifierPart(ch)) { // digits...
            kind = K_OTHER;
        } else {
            kind = K_INVALID;
        }
        return kind;
    }

    /**
     * @param ch the character to test
     * @return <code>true</code> if
     *         <code>Character.isJavaIdentifierPart(ch)</code> holds
     */
    @Override
    protected boolean isValid(char ch) {
        return Character.isJavaIdentifierPart(ch);
    }
}
/**
 * A run of characters that are neither whitespace nor identifier parts.
 */
static final class Other extends Run {
    /**
     * @param ch the character to test
     * @return <code>true</code> if <code>ch</code> is neither whitespace nor
     *         a Java identifier part
     */
    @Override
    protected boolean isValid(char ch) {
        return !(Character.isWhitespace(ch) || Character.isJavaIdentifierPart(ch));
    }
}
/*
 * Shared run instances, one per character category. getRun(char) selects
 * one of them and calls init() on it before handing it out, so the same
 * objects are reused for every run.
 * NOTE(review): these runs are static and carry mutable state (length, plus
 * a state field in some subclasses), so they are shared by all
 * CBreakIterator instances - confirm that concurrent use is not expected.
 */
private static final Run WHITESPACE = new Whitespace();
private static final Run DELIMITER = new LineDelimiter();
private static final Run IDENTIFIER = new Identifier();
private static final Run CAMELCASE = new CamelCaseIdentifier();
private static final Run OTHER = new Other();
/** The platform break iterator (word instance) used as a base. */
protected final BreakIterator fIterator;
/** The text we operate on; assigned by the <code>setText</code> methods. */
protected CharSequence fText;
/**
 * Our current position for the stateful methods (<code>current()</code>,
 * <code>first()</code>, <code>last()</code>, <code>next()</code>,
 * <code>previous()</code>).
 */
private int fIndex;
/** Whether to break on camel case word boundaries; enabled by default. */
private boolean fCamelCaseBreakEnabled = true;
/**
 * Creates a new break iterator on top of a word-instance
 * <code>BreakIterator</code>. The current position is initialized from the
 * current position of that base iterator.
 */
public CBreakIterator() {
    this.fIterator = BreakIterator.getWordInstance();
    this.fIndex = this.fIterator.current();
}
/**
 * Returns the current position of this iterator.
 *
 * @return the offset stored by the most recent stateful operation
 * @see com.ibm.icu.text.BreakIterator#current()
 */
@Override
public int current() {
    return this.fIndex;
}
/**
 * Moves to the first boundary as reported by the base iterator.
 *
 * @return the offset of the first boundary, which is also the new current
 *         position
 * @see com.ibm.icu.text.BreakIterator#first()
 */
@Override
public int first() {
    final int start = fIterator.first();
    fIndex = start;
    return start;
}
/**
 * Returns the first boundary after <code>offset</code>. The boundary is the
 * end of the run of equal-category characters that starts at
 * <code>offset</code>. This method does not change the current position
 * reported by <code>current()</code>.
 *
 * @param offset the offset to start from
 * @return the offset of the following boundary, or <code>DONE</code> if
 *         <code>offset</code> is the end of the text or the base iterator
 *         reports no following boundary
 * @see com.ibm.icu.text.BreakIterator#following(int)
 */
@Override
public int following(int offset) {
    // work around too eager IAEs in standard implementation:
    // only ask the base iterator when we are not at the end of the text
    final boolean atEnd = offset == getText().getEndIndex();
    if (atEnd || fIterator.following(offset) == DONE) {
        return DONE;
    }
    // TODO deal with complex script word boundaries
    // Math.min(offset + run.length, next) does not work
    // since BreakIterator.getWordInstance considers _ as boundaries
    // seems to work fine, however
    return offset + consumeRun(offset).length;
}
/**
 * Consumes a run of characters at the limits of which we introduce a break.
 * The run is chosen from the character at <code>offset</code> and is then
 * fed consecutive characters until it rejects one or the last character of
 * the text has been consumed.
 *
 * @param offset the offset to start at; must be less than the text length
 * @return the run that was consumed; its <code>length</code> tells how many
 *         characters it covers
 */
private Run consumeRun(int offset) {
    // assert offset < length
    char current = fText.charAt(offset);
    final int lastIndex = fText.length() - 1;
    final Run run = getRun(current);
    int position = offset;
    while (run.consume(current)) {
        if (position >= lastIndex) {
            break;
        }
        position++;
        current = fText.charAt(position);
    }
    return run;
}
/**
 * Selects the run matching the category of a character and resets it.
 * Identifier characters map to the camel case run or to the plain
 * identifier run, depending on whether camel case breaks are enabled.
 *
 * @param ch the character to test
 * @return the freshly initialized run that accepts <code>ch</code>
 */
private Run getRun(char ch) {
    Run match = null;
    if (WHITESPACE.isValid(ch)) {
        match = WHITESPACE;
    } else if (DELIMITER.isValid(ch)) {
        match = DELIMITER;
    } else if (IDENTIFIER.isValid(ch)) {
        match = fCamelCaseBreakEnabled ? CAMELCASE : IDENTIFIER;
    } else if (OTHER.isValid(ch)) {
        match = OTHER;
    }
    if (match == null) {
        // the four categories are expected to cover every character
        Assert.isTrue(false);
        return null;
    }
    match.init();
    return match;
}
/**
 * Returns the text being scanned, as held by the base iterator.
 *
 * @return the character iterator of the base iterator
 * @see com.ibm.icu.text.BreakIterator#getText()
 */
@Override
public CharacterIterator getText() {
    final CharacterIterator text = fIterator.getText();
    return text;
}
/**
 * Tells whether <code>offset</code> is a boundary. The beginning of the
 * text always is one; any other offset is a boundary if it is the boundary
 * following <code>offset - 1</code>.
 *
 * @param offset the offset to check
 * @return <code>true</code> if <code>offset</code> is a boundary
 * @see com.ibm.icu.text.BreakIterator#isBoundary(int)
 */
@Override
public boolean isBoundary(int offset) {
    return offset == getText().getBeginIndex() || following(offset - 1) == offset;
}
/**
 * Moves to the last boundary as reported by the base iterator.
 *
 * @return the offset of the last boundary, which is also the new current
 *         position
 * @see com.ibm.icu.text.BreakIterator#last()
 */
@Override
public int last() {
    final int end = fIterator.last();
    fIndex = end;
    return end;
}
/**
 * Advances to the boundary following the current position.
 * <p>
 * If there is no further boundary, <code>DONE</code> is returned and the
 * current position is left unchanged, so that <code>current()</code> keeps
 * reporting a valid offset and the iterator can still be moved afterwards.
 * </p>
 *
 * @return the offset of the next boundary, or <code>DONE</code> if there
 *         is none
 * @see com.ibm.icu.text.BreakIterator#next()
 */
@Override
public int next() {
    final int boundary = following(fIndex);
    // Never store DONE as the current position: it is not a text offset,
    // and later relative moves would start from it.
    if (boundary != DONE) {
        fIndex = boundary;
    }
    return boundary;
}
/**
 * Moves the current position by <code>n</code> boundaries of this iterator:
 * forward for positive <code>n</code>, backward for negative <code>n</code>.
 * For <code>n == 0</code> the current position is returned unchanged.
 * <p>
 * The steps use <code>following(int)</code> and <code>preceding(int)</code>,
 * i.e. the same boundaries as <code>next()</code> and
 * <code>previous()</code>, rather than the boundaries of the base word
 * iterator. If the move runs off either end of the text,
 * <code>DONE</code> is returned and the current position stays at the last
 * boundary that was reached.
 * </p>
 *
 * @param n the number of boundaries to move by
 * @return the offset of the boundary reached, or <code>DONE</code> if the
 *         move ran off the text
 * @see com.ibm.icu.text.BreakIterator#next(int)
 */
@Override
public int next(int n) {
    int position = fIndex;
    for (int remaining = n; remaining > 0; remaining--) {
        position = following(position);
        if (position == DONE) {
            return DONE;
        }
        fIndex = position;
    }
    for (int remaining = n; remaining < 0; remaining++) {
        position = preceding(position);
        if (position == DONE) {
            return DONE;
        }
        fIndex = position;
    }
    return position;
}
/**
 * Returns the last boundary before <code>offset</code>. This method does
 * not change the current position reported by <code>current()</code>.
 *
 * @param offset the offset to start from
 * @return the offset of the preceding boundary, or <code>DONE</code> if
 *         <code>offset</code> is the beginning of the text
 * @see com.ibm.icu.text.BreakIterator#preceding(int)
 */
@Override
public int preceding(int offset) {
    if (offset == getText().getBeginIndex()) {
        return DONE;
    }
    final int oneBack = offset - 1;
    if (isBoundary(oneBack)) {
        return oneBack;
    }
    // step back over the base iterator's boundaries until we hit one
    // that is also a boundary of this iterator
    int anchor = oneBack;
    do {
        anchor = fIterator.preceding(anchor);
    } while (!isBoundary(anchor));
    // then walk forward over our own boundaries and keep the last one
    // that lies before offset
    int result = DONE;
    for (int cursor = anchor; cursor < offset; cursor = following(cursor)) {
        result = cursor;
    }
    return result;
}
/**
 * Moves back to the boundary preceding the current position.
 * <p>
 * If there is no preceding boundary, <code>DONE</code> is returned and the
 * current position is left unchanged, so that <code>current()</code> keeps
 * reporting a valid offset and the iterator can still be moved afterwards.
 * </p>
 *
 * @return the offset of the previous boundary, or <code>DONE</code> if
 *         there is none
 * @see com.ibm.icu.text.BreakIterator#previous()
 */
@Override
public int previous() {
    final int boundary = preceding(fIndex);
    // Never store DONE as the current position: it is not a text offset,
    // and later relative moves would start from it.
    if (boundary != DONE) {
        fIndex = boundary;
    }
    return boundary;
}
/**
 * Sets the text to scan. Delegates to <code>setText(CharSequence)</code>.
 *
 * @param newText the new text
 * @see com.ibm.icu.text.BreakIterator#setText(java.lang.String)
 */
@Override
public void setText(String newText) {
    // widen the static type so that the CharSequence overload is selected
    final CharSequence sequence = newText;
    setText(sequence);
}
/**
 * Sets the text to scan from a char sequence. The sequence is remembered,
 * handed to the base iterator wrapped in a
 * <code>SequenceCharacterIterator</code>, and the current position is
 * reset via <code>first()</code>.
 *
 * @param newText the new text
 */
public void setText(CharSequence newText) {
    this.fText = newText;
    final SequenceCharacterIterator wrapped = new SequenceCharacterIterator(newText);
    this.fIterator.setText(wrapped);
    first();
}
/**
 * Sets the text to scan from a character iterator. Only iterators that
 * also implement <code>CharSequence</code> are supported; the current
 * position is reset via <code>first()</code>.
 *
 * @param newText the new text
 * @throws UnsupportedOperationException if <code>newText</code> is not a
 *         <code>CharSequence</code>
 * @see com.ibm.icu.text.BreakIterator#setText(java.text.CharacterIterator)
 */
@Override
public void setText(CharacterIterator newText) {
    if (!(newText instanceof CharSequence)) {
        throw new UnsupportedOperationException("CharacterIterator not supported"); //$NON-NLS-1$
    }
    fText = (CharSequence) newText;
    fIterator.setText(newText);
    first();
}
/**
 * Turns breaks at word boundaries inside a camel case identifier on or off.
 *
 * @param camelCaseBreakEnabled <code>true</code> to break inside camel case
 *        identifiers, <code>false</code> to treat an identifier as one run
 */
public void setCamelCaseBreakEnabled(boolean camelCaseBreakEnabled) {
    this.fCamelCaseBreakEnabled = camelCaseBreakEnabled;
}
/**
 * Tells whether breaks inside camel case identifiers are enabled.
 *
 * @return <code>true</code> if breaks at word boundaries inside
 *         a camel case identifier are enabled
 */
public boolean isCamelCaseBreakEnabled() {
    return this.fCamelCaseBreakEnabled;
}
}