blob: 7fa3f61057c2ecc3f8d9da463653c843d55149ce [file] [log] [blame]
/*******************************************************************************
* Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved.
* This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
*
* Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
*******************************************************************************/
package org.eclipse.smila.tika.test;
import java.util.Set;
import org.eclipse.smila.common.language.Language;
import org.eclipse.smila.common.language.LanguageIdentifyService;
import org.eclipse.smila.test.DeclarativeServiceTestCase;
/**
 * Tests for the {@link LanguageIdentifyService} that is looked up from the test environment in {@link #setUp()}.
 * Judging by the package and class name this is the Tika based implementation (assumption, not visible here).
 *
 * The tests cover the set of supported languages, detection of short sample texts in several languages, the
 * handling of empty input and the mapping of ISO codes to alternative language names.
 */
public class TestTikaLanguageIdentifier extends DeclarativeServiceTestCase {

  /** the service under test, fetched anew for each test in {@link #setUp()}. */
  private LanguageIdentifyService _identifier;

  /**
   * Runs the inherited setup and then looks up the {@link LanguageIdentifyService} to test.
   *
   * @throws Exception
   *           if the inherited setup or the service lookup fails.
   */
  @Override
  protected void setUp() throws Exception {
    super.setUp();
    _identifier = getService(LanguageIdentifyService.class);
  }

  /**
   * Checks that the service reports exactly the expected ISO language codes: same number of entries and every
   * expected code contained.
   */
  public void testSupportedLanguages() {
    final String[] expectedLanguages =
      { "ro", "ca", "no", "hu", "lt", "th", "de", "fi", "sv", "fr", "be", "sl", "sk", "uk", "da", "is", "it", "gl",
        "el", "pl", "pt", "eo", "en", "ru", "et", "es", "nl" };
    final Set<String> supportedLanguages = _identifier.getSupportedLanguages();
    assertNotNull("Service returned no set of supported languages.", supportedLanguages);
    // include the actual set in the message so that a size mismatch shows which languages differ.
    assertEquals("Unexpected number of supported languages, got " + supportedLanguages, expectedLanguages.length,
      supportedLanguages.size());
    for (final String lang : expectedLanguages) {
      assertTrue("Language " + lang + " should be supported.", supportedLanguages.contains(lang));
    }
  }

  /** German sample must be detected as "de"/"german". */
  public void testGermanText() throws Exception {
    assertLanguage("de", "german", false, "Sein oder nicht sein, das ist hier die Frage.");
  }

  /** English sample must be detected as "en"/"english". */
  public void testEnglishText() throws Exception {
    assertLanguage("en", "english", false, "To be or not to be that is the question.");
  }

  /** French sample must be detected as "fr"/"french". */
  public void testFrenchText() throws Exception {
    assertLanguage("fr", "french", false, "Être, ou ne pas être, telle est la question.");
  }

  /** Italian sample must be detected as "it"/"italian". */
  public void testItalianText() throws Exception {
    assertLanguage("it", "italian", false, "Essere, o non essere, questo è il dilemma");
  }

  /** Spanish sample must be detected as "es"/"spanish". */
  public void testSpanishText() throws Exception {
    assertLanguage("es", "spanish", false, "Ser o no ser, ésa es la pregunta. "
      + "¿Qué es más noble para el espíritu?"
      + "¿Sufrir los dardos y golpes del destino o tomar las armas contra un "
      + "mar de angustias y terminar con ellas combatiéndolas?");
  }

  /** Portuguese sample must be detected as "pt"/"portuguese". */
  public void testPortugueseText() throws Exception {
    assertLanguage("pt", "portuguese", false, "Ser ou não ser, eis a questão");
  }

  /** Dutch sample must be detected as "nl"/"dutch". */
  public void testDutchText() throws Exception {
    assertLanguage("nl", "dutch", false, "Te zijn of niet te zijn, dat is de kwestie");
  }

  /**
   * Intentionally empty: the Russian check is disabled, see the TODO in the method body.
   */
  public void testRussianText() throws Exception {
    // TODO the sample below is reported as "is" instead of "ru", maybe we can find a better text.
    // assertLanguage("ru", "russian", false, "Быть или не быть. Вопрос в том, что благородней");
  }

  /** Slovak sample must be detected as "sk"/"slovak". */
  public void testSlovakText() throws Exception {
    assertLanguage("sk", "slovak", false,
      "Hamlet, celý názov Tragédia o Hamletovi, Princovi dánskom je nájznámejšou a "
        + "najčastejšie citovanou divadelnou hrou anglického dramatika Williama Shakespearea.");
  }

  /** Slovenian sample must be detected as "sl"/"slovenian". */
  public void testSlovenianText() throws Exception {
    assertLanguage("sl", "slovenian", false, "Hamlet je tragedija angleškega dramatika Williama Shakespeara. "
      + "Igra je nastala v njegovem tako imenovanem »drugem obdobju« njegovega ustvarjanja v letih med "
      + "1601 in 1608, ki štejejo za dramatikovo najzrelejše obdobje.");
  }

  /** Danish sample must be detected as "da"/"danish". */
  public void testDanishText() throws Exception {
    assertLanguage("da", "danish", false, "At være eller ikke at være: det er spørgsmålet");
  }

  /** Nynorsk sample must be detected with ISO code "no" and alternative name "nynorsk". */
  public void testNynorskText() throws Exception {
    assertLanguage("no", "nynorsk", false, "å vere eller å ikkje vere, det er spørsmålet");
  }

  /** Swedish sample must be detected as "sv"/"swedish". */
  public void testSwedish() throws Exception {
    assertLanguage("sv", "swedish", false, "Att vara eller icke vara, det är frågan");
  }

  /** Finnish sample must be detected as "fi"/"finnish". */
  public void testFinnishText() throws Exception {
    assertLanguage("fi", "finnish", false, "ollako vai eikö olla");
  }

  /** Esperanto sample must be detected as "eo"; no alternative name is expected for it. */
  public void testEsperantoText() {
    assertLanguage("eo", null, false, "Ĉu esti aŭ ne esti, tio estas la demando");
  }

  /** Neither an empty string nor a null text may yield a detected language. */
  public void testEmptyString() {
    assertNull(_identifier.identify(""));
    assertNull(_identifier.identify(null));
  }

  /** Checks the ISO code to alternative name mapping, including an unknown code ("tlh") that must yield null. */
  public void testAlternativeNames() throws Exception {
    assertEquals("german", _identifier.getAlternativeName("de"));
    assertEquals("english", _identifier.getAlternativeName("en"));
    assertEquals("french", _identifier.getAlternativeName("fr"));
    assertEquals("spanish", _identifier.getAlternativeName("es"));
    assertEquals("dutch", _identifier.getAlternativeName("nl"));
    assertEquals("russian", _identifier.getAlternativeName("ru"));
    assertEquals("italian", _identifier.getAlternativeName("it"));
    assertNull(_identifier.getAlternativeName("tlh"));
  }

  /**
   * Identifies the language of the given text and compares the result to the expected values.
   *
   * @param expectedLanguage
   *          expected ISO language code.
   * @param expectedName
   *          expected alternative language name, may be null.
   * @param expectedIsCertain
   *          expected value of the "certain" flag of the detection result.
   * @param text
   *          the text to identify.
   */
  private void assertLanguage(final String expectedLanguage, final String expectedName,
    final boolean expectedIsCertain, final String text) {
    final Language detectedLanguage = _identifier.identify(text);
    // identify() may return null (see testEmptyString), so fail with a readable message instead of an NPE.
    assertNotNull("no language detected for text: " + text, detectedLanguage);
    assertEquals("wrong ISO code", expectedLanguage, detectedLanguage.getIsoLanguage());
    assertEquals("wrong language name", expectedName, detectedLanguage.getAlternativeName());
    assertEquals("wrong isCertain flag", expectedIsCertain, detectedLanguage.isCertain());
  }
}