diff --git a/setup.cfg b/setup.cfg
index dff1260d..835f55b1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,7 @@ install_requires =
     numpy>=1.17
     scikit-learn>=0.22
     spacy>=2.2.2
+    langdetect>=1.0.7
     tqdm>=4.3
     nltk>=3.3
     plotly>=4.2.0
diff --git a/tests/test_indexes.py b/tests/test_indexes.py
index adc08008..67900051 100644
--- a/tests/test_indexes.py
+++ b/tests/test_indexes.py
@@ -1,10 +1,11 @@
 import pandas as pd
+from parameterized import parameterized
+
 from texthero import nlp, visualization, preprocessing, representation
 
 from . import PandasTestCase
 import unittest
 import string
-from parameterized import parameterized
 
 
 # Define valid inputs for different functions.
@@ -25,6 +26,7 @@
 test_cases_nlp = [
     ["named_entities", nlp.named_entities, (s_text,)],
     ["noun_chunks", nlp.noun_chunks, (s_text,)],
+    ["infer_lang", nlp.infer_lang, (s_text,)],
 ]
 
 test_cases_preprocessing = [
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 2df9db61..7a2c3066 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -68,3 +68,135 @@ def test_count_sentences_wrong_index(self):
         t_different_index = pd.Series(["", ""], index=[5, 7])
 
         self.assertFalse(counted_sentences_s.index.equals(t_different_index.index))
+
+    def test_infer_lang(self):
+        # langdetect is randomised; pin its seed so repeated runs detect the
+        # same languages.
+        # NOTE(review): re-verify the expected codes below under this seed.
+        from langdetect import DetectorFactory
+
+        DetectorFactory.seed = 0
+
+        # No sample text was found for it, hr and hi that the detector
+        # classifies correctly, so those three cases are commented out below.
+        s = pd.Series(
+            [
+                "Wêreld",
+                "مرحبا بالعالم",
+                "български",
+                "ওহে বিশ্ব",
+                "català",
+                "Ahoj světe",
+                "Helo Byd",
+                "dansk",
+                "Deutsch",
+                "Γειά σου Κόσμε",
+                "fox",
+                "Hola Mundo",
+                "Tere, Maailm",
+                "فارسی",
+                "Hei maailma",
+                "Bonjour le monde",
+                "હેલો વર્લ્ડ",
+                "שלום עולם",
+                "Helló Világ",
+                "Bahasa",
+                "こんにちは世界",
+                "ಹಲೋ ವರ್ಲ್ಡ್",
+                "안녕하세요 세계",
+                "lietuvių kalba",
+                "Sveika pasaule",
+                "Здраво свету",
+                "ഹലോ വേൾഡ്",
+                "मराठी",
+                "नेपाली",
+                "Vlaams",
+                "Norsk",
+                "ਸਤਿ ਸ੍ਰੀ ਅਕਾਲ ਦੁਨਿਆ",
+                "Witaj świecie",
+                "Olá Mundo",
+                "Română",
+                "русский",
+                "Slovenský",
+                "Pozdravljen, svet",
+                "Soomaaliga",
+                "Përshendetje Botë",
+                "Hej världen",
+                "Kiswahili",
+                "வணக்கம் உலகம்",
+                "హలో ప్రపంచ",
+                "สวัสดีชาวโลก",
+                "Wikang Tagalog",
+                "Selam Dünya",
+                "Привіт Світ",
+                "ہیلو دنیا",
+                "Chào thế giới",
+                "中文",
+                "中華民國國歌",
+                # "धन्यवाद",
+                # "Lijepa naša domovino",
+                # "Italiano",
+            ]
+        )
+
+        s_true = pd.Series(
+            [
+                "af",
+                "ar",
+                "bg",
+                "bn",
+                "ca",
+                "cs",
+                "cy",
+                "da",
+                "de",
+                "el",
+                "en",
+                "es",
+                "et",
+                "fa",
+                "fi",
+                "fr",
+                "gu",
+                "he",
+                "hu",
+                "id",
+                "ja",
+                "kn",
+                "ko",
+                "lt",
+                "lv",
+                "mk",
+                "ml",
+                "mr",
+                "ne",
+                "nl",
+                "no",
+                "pa",
+                "pl",
+                "pt",
+                "ro",
+                "ru",
+                "sk",
+                "sl",
+                "so",
+                "sq",
+                "sv",
+                "sw",
+                "ta",
+                "te",
+                "th",
+                "tl",
+                "tr",
+                "uk",
+                "ur",
+                "vi",
+                "zh-cn",
+                "zh-tw",
+                # 'hi',
+                # 'hr',
+                # 'it'
+            ]
+        )
+        s_result = nlp.infer_lang(s)
+        self.assertEqual(s_result, s_true)
diff --git a/texthero/nlp.py b/texthero/nlp.py
index 52956d5c..85404b34 100644
--- a/texthero/nlp.py
+++ b/texthero/nlp.py
@@ -4,6 +4,9 @@
 
 import spacy
 import pandas as pd
+from langdetect import detect_langs
+from langdetect.lang_detect_exception import LangDetectException
+from langdetect.language import Language
 
 
 def named_entities(s, package="spacy"):
@@ -132,3 +135,86 @@ def count_sentences(s: pd.Series) -> pd.Series:
         number_of_sentences.append(sentences)
 
     return pd.Series(number_of_sentences, index=s.index)
+
+
+def _Language_to_tuple(lang: Language):
+    """
+    Convert a langdetect Language into a (language, probability) tuple.
+
+    The probability is formatted as a string with five decimal places.
+    """
+    return (str(lang.lang), "%.5f" % float(lang.prob))
+
+
+def _detect_language_probability(s):
+    """
+    Detect the candidate languages of one text with detect_langs.
+
+    Return a list of (language, probability) tuples. When langdetect raises
+    LangDetectException the list holds the single placeholder
+    ("UNKNOWN", "0.00000"), so every row has the same shape.
+
+    :param s: text to analyse
+    """
+    try:
+        return list(map(_Language_to_tuple, detect_langs(s)))
+    except LangDetectException:
+        # Same list-of-tuples shape and probability format as a successful call.
+        return [("UNKNOWN", "%.5f" % 0.0)]
+
+
+def _detect_language(s):
+    """
+    Detect the language of one text with detect_langs.
+
+    Return the code of the first candidate reported by detect_langs, or
+    "UNKNOWN" when langdetect raises LangDetectException.
+
+    :param s: text to analyse
+    """
+    try:
+        return str(detect_langs(s)[0].lang)
+    except LangDetectException:
+        return "UNKNOWN"
+
+
+def infer_lang(s, probability=False):
+    """
+    Infer the language of each text in a Pandas Series.
+
+    Return a Pandas Series where each row contains the ISO code of the
+    language inferred for that row, or "UNKNOWN" when detection fails.
+
+    If probability = True then each row contains a list of tuples
+
+    Tuple : (language, probability)
+
+    with the probability formatted as a string with five decimal places;
+    a row whose detection fails contains [("UNKNOWN", "0.00000")].
+
+    Note: infer_lang is a nondeterministic function.
+
+    Parameters
+    ----------
+    s : Pandas Series
+    probability (optional) : boolean
+
+    supports 55 languages out of the box (ISO 639-1 codes)
+    ------------------------------------------------------
+    af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
+    hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
+    pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
+
+    Examples
+    --------
+    >>> import texthero as hero
+    >>> import pandas as pd
+    >>> s = pd.Series("This is an English text!.")
+    >>> hero.infer_lang(s)
+    0    en
+    dtype: object
+
+    """
+
+    detector = _detect_language_probability if probability else _detect_language
+    return s.apply(detector)