/*******************************************************************************
 * Copyright (c) 2008, 2013 Empolis Information Management GmbH and brox IT Solutions GmbH. All rights reserved.
 * This program and the accompanying materials are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors: Andreas Weber (Empolis Information Management GmbH) - initial API and implementation
 *******************************************************************************/
| package org.eclipse.smila.tika.test; |
| |
| import java.util.Set; |
| |
| import org.eclipse.smila.common.language.Language; |
| import org.eclipse.smila.common.language.LanguageIdentifyService; |
| import org.eclipse.smila.test.DeclarativeServiceTestCase; |
| |
| public class TestTikaLanguageIdentifier extends DeclarativeServiceTestCase { |
| |
| private LanguageIdentifyService _identifier; |
| |
| @Override |
| protected void setUp() throws Exception { |
| super.setUp(); |
| _identifier = getService(LanguageIdentifyService.class); |
| } |
| |
| public void testSupportedLanguages() { |
| final String[] expectedLanguages = |
| { "ro", "ca", "no", "hu", "lt", "th", "de", "fi", "sv", "fr", "be", "sl", "sk", "uk", "da", "is", "it", "gl", |
| "el", "pl", "pt", "eo", "en", "ru", "et", "es", "nl" }; |
| final Set<String> supportedLanguages = _identifier.getSupportedLanguages(); |
| assertEquals(expectedLanguages.length, supportedLanguages.size()); |
| for (final String lang : expectedLanguages) { |
| assertTrue("Language " + lang + " should be supported.", supportedLanguages.contains(lang)); |
| } |
| } |
| |
| public void testGermanText() throws Exception { |
| assertLanguage("de", "german", false, "Sein oder nicht sein, das ist hier die Frage."); |
| } |
| |
| public void testEnglishText() throws Exception { |
| assertLanguage("en", "english", false, "To be or not to be that is the question."); |
| } |
| |
| public void testFrenchText() throws Exception { |
| assertLanguage("fr", "french", false, "Être, ou ne pas être, telle est la question."); |
| } |
| |
| public void testItalianText() throws Exception { |
| assertLanguage("it", "italian", false, "Essere, o non essere, questo è il dilemma"); |
| } |
| |
| public void testSpanishText() throws Exception { |
| assertLanguage("es", "spanish", false, "Ser o no ser, ésa es la pregunta. " |
| + "¿Qué es más noble para el espÃritu?" |
| + "¿Sufrir los dardos y golpes del destino o tomar las armas contra un " |
| + "mar de angustias y terminar con ellas combatiéndolas?"); |
| } |
| |
| public void testPortugueseText() throws Exception { |
| assertLanguage("pt", "portuguese", false, "Ser ou não ser, eis a questão"); |
| } |
| |
| public void testDutchText() throws Exception { |
| assertLanguage("nl", "dutch", false, "Te zijn of niet te zijn, dat is de kwestie"); |
| } |
| |
| public void testRussianText() throws Exception { |
| // TODO this is reported as "is", maybe we can find a better text. |
| // assertLanguage("ru", "russian", false, "Быть или не быть. Вопро� в том, что благородней"); |
| } |
| |
| public void testSlovakText() throws Exception { |
| assertLanguage("sk", "slovak", false, |
| "Hamlet, celý názov Tragédia o Hamletovi, Princovi dánskom je nájznámejšou a " |
| + "naj�astejšie citovanou divadelnou hrou anglického dramatika Williama Shakespearea."); |
| } |
| |
| public void testSlovenianText() throws Exception { |
| assertLanguage("sl", "slovenian", false, "Hamlet je tragedija angleškega dramatika Williama Shakespeara. " |
| + "Igra je nastala v njegovem tako imenovanem »drugem obdobju« njegovega ustvarjanja v letih med " |
| + "1601 in 1608, ki štejejo za dramatikovo najzrelejše obdobje."); |
| } |
| |
| public void testDanishText() throws Exception { |
| assertLanguage("da", "danish", false, "At være eller ikke at være: det er spørgsmålet"); |
| } |
| |
| public void testNynorskText() throws Exception { |
| assertLanguage("no", "nynorsk", false, "å vere eller å ikkje vere, det er spørsmålet"); |
| } |
| |
| public void testSwedish() throws Exception { |
| assertLanguage("sv", "swedish", false, "Att vara eller icke vara, det är frågan"); |
| } |
| |
| public void testFinnishText() throws Exception { |
| assertLanguage("fi", "finnish", false, "ollako vai eikö olla"); |
| } |
| |
| public void testEsperantoText() { |
| assertLanguage("eo", null, false, "Ĉu esti aŠne esti, tio estas la demando"); |
| } |
| |
| public void testEmptyString() { |
| assertNull(_identifier.identify("")); |
| assertNull(_identifier.identify(null)); |
| } |
| |
| public void testAlternativeNames() throws Exception { |
| assertEquals("german", _identifier.getAlternativeName("de")); |
| assertEquals("english", _identifier.getAlternativeName("en")); |
| assertEquals("french", _identifier.getAlternativeName("fr")); |
| assertEquals("spanish", _identifier.getAlternativeName("es")); |
| assertEquals("dutch", _identifier.getAlternativeName("nl")); |
| assertEquals("russian", _identifier.getAlternativeName("ru")); |
| assertEquals("italian", _identifier.getAlternativeName("it")); |
| assertNull(_identifier.getAlternativeName("tlh")); |
| } |
| |
| private void assertLanguage(final String expectedLanguage, final String expectedName, |
| final boolean expectedIsCertain, final String text) { |
| final Language detectedLanguage = _identifier.identify(text); |
| assertEquals("wrong ISO code", expectedLanguage, detectedLanguage.getIsoLanguage()); |
| assertEquals("wrong language name", expectedName, detectedLanguage.getAlternativeName()); |
| assertEquals("wrong isCertain flag", expectedIsCertain, detectedLanguage.isCertain()); |
| } |
| |
| } |