franco-bach committed on
Commit
5f102c2
1 Parent(s): 351494a

feat: language detection

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. utils.py +21 -33
app.py CHANGED
@@ -92,7 +92,7 @@ def check_errors(text):
92
  gr.Warning(f'La longitud del texto ({len(text)} caracteres) sobrepasa el máximo permitido.')
93
  error = True
94
  if not CheckLanguageIsSpanish().detect_spanish(text):
95
- gr.Warning('El texto tiene que estar en Español.')
96
  error = True
97
  return error
98
 
 
92
  gr.Warning(f'La longitud del texto ({len(text)} caracteres) sobrepasa el máximo permitido.')
93
  error = True
94
  if not CheckLanguageIsSpanish().detect_spanish(text):
95
+ gr.Warning('El texto está en Ingles o posee oraciones en dicho idioma.')
96
  error = True
97
  return error
98
 
utils.py CHANGED
@@ -1,37 +1,25 @@
1
- # import spacy
2
- # from spacy.language import Language
3
- # from spacy_langdetect import LanguageDetector
4
 
5
- # class CheckLanguageIsSpanish:
6
- # def __init__(self):
7
-
8
- # self.nlp_es = spacy.load("es_core_news_sm")
9
- # Language.factory("language_detector", func=self.get_lang_detector)
10
- # self.nlp_es.add_pipe('language_detector', last=True)
11
-
12
- # def get_lang_detector(self, nlp, name):
13
- # return LanguageDetector()
14
 
15
- # def detect_spanish(self, text):
16
- # doc = self.nlp_es(text)
17
- # lang_prediction = doc._.language
18
- # confidence = lang_prediction['score'] * 100
19
- # if confidence > 95:
20
- # print(f"espanio perri {confidence}")
21
- # return True
22
- # else:
23
- # print(f"no espanio perri {confidence}")
24
- # return False
25
-
26
- from langdetect import detect_langs
27
 
28
- class CheckLanguageIsSpanish:
29
- def detect_spanish(self, text, confidence_threshold=0.85):
30
- lang_results = detect_langs(text)
31
- top_language = lang_results[0].lang
32
- top_confidence = lang_results[0].prob
33
 
34
- if top_language == 'es' and top_confidence >= confidence_threshold:
35
- return True
36
- else:
37
- return False
 
 
 
 
 
 
1
+ from lingua import Language, LanguageDetectorBuilder
 
 
2
 
3
class CheckLanguageIsSpanish:
    """Decide whether a text should be accepted as Spanish.

    The detector is built for exactly two languages, Spanish and English,
    so every decision is a comparison between those two candidates.
    """

    # An English segment with at least this many words rejects the text.
    MIN_ENGLISH_WORDS = 5

    def __init__(self):
        self.languages = [Language.SPANISH, Language.ENGLISH]
        self.detector = LanguageDetectorBuilder.from_languages(*self.languages).build()

    def detect_spanish(self, text):
        """Return True when ``text`` is accepted as Spanish, False otherwise.

        The text is rejected when either of these holds:

        * the overall confidence for English is higher than for Spanish;
        * the detector reports at least one English segment containing
          ``MIN_ENGLISH_WORDS`` words or more.

        Args:
            text: The text to classify.

        Returns:
            bool: True if the text passes both checks, False if English
            dominates or a long enough English segment is present.
        """
        # Overall comparison of the per-language confidence values.
        confidence_by_language = {
            confidence.language.name: confidence.value
            for confidence in self.detector.compute_language_confidence_values(text)
        }
        # .get() keeps a missing entry from raising; a missing language is
        # treated as zero confidence.
        english_confidence = confidence_by_language.get("ENGLISH", 0.0)
        spanish_confidence = confidence_by_language.get("SPANISH", 0.0)
        if english_confidence > spanish_confidence:
            return False

        # Mixed-language check: a mostly Spanish text is still rejected when
        # it embeds a sufficiently long English segment.
        segments = self.detector.detect_multiple_languages_of(text)
        return not any(
            segment.language.name == "ENGLISH"
            and segment.word_count >= self.MIN_ENGLISH_WORDS
            for segment in segments
        )

    def detect_english(self, text):
        """Backward-compatible alias of :meth:`detect_spanish`.

        Despite its name, this returns True when the text is Spanish and
        False when English is detected. It is kept so that callers written
        against this name keep working; new code should call
        ``detect_spanish``.
        """
        return self.detect_spanish(text)