Demosthene-OR committed on
Commit
cb4b492
1 Parent(s): dea6b5e

Création de la route /lang_id_dl

Browse files

pour identifier la langue d'une liste de phrases grâce au Deep Learning

data/multilingue/lan_code.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ heb,ind,tha,ido,fin,zsm,ukr,vol,nld,wuu,mkd,bel,sqi,est,tuk,avk,dan,gos,ilo,tok,eng,gcf,lfn,glg,cat,asm,hye,bul,jbo,lat,ota,spa,uig,urd,cbk,aze,ara,ron,tur,hin,yid,cmn,grn,run,gle,ben,por,shi,afr,isl,rus,eus,arq,tlh,nob,slk,mhr,war,jpn,ber,vie,fra,kmr,tgl,kat,pol,lit,kab,mon,epo,kaz,ell,ina,kzj,deu,swe,ile,hrv,nds,tat,dtp,kor,mar,oci,ita,hun,yue,srp,cor,pes,frr,ckb,lvs,bre,ces
data/multilingue/lan_to_language.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"cmn": "Mandarin Chinese", "deu": "German", "rus": "Russian", "fra": "French", "eng": "English", "jpn": "Japanese", "spa": "Spanish", "ita": "Italian", "kor": "Korean", "vie": "Vietnamese", "nld": "Dutch", "epo": "Esperanto", "por": "Portuguese", "tur": "Turkish", "heb": "Hebrew", "hun": "Hungarian", "ell": "Modern Greek (1453-)", "ind": "Indonesian", "ara": "Arabic", "arz": "Egyptian Arabic", "fin": "Finnish", "bul": "Bulgarian", "yue": "Yue Chinese", "swe": "Swedish", "ukr": "Ukrainian", "bel": "Belarusian", "que": "Quechua", "ces": "Czech", "swh": "Swahili (individual language)", "nno": "Norwegian Nynorsk", "wuu": "Wu Chinese", "nob": "Norwegian Bokm\u00e5l", "zsm": "Standard Malay", "est": "Estonian", "kat": "Georgian", "pol": "Polish", "lat": "Latin", "urd": "Urdu", "sqi": "Albanian", "isl": "Icelandic", "fry": "Western Frisian", "afr": "Afrikaans", "ron": "Romanian", "fao": "Faroese", "san": "Sanskrit", "bre": "Breton", "tat": "Tatar", "yid": "Yiddish", "uig": "Uighur", "uzb": "Uzbek", "srp": "Serbian", "qya": "Quenya", "dan": "Danish", "pes": "Iranian Persian", "slk": "Slovak", "eus": "Basque", "cycl": "CycL", "acm": "Mesopotamian Arabic", "tgl": "Tagalog", "lvs": "Standard Latvian", "kaz": "Kazakh", "hye": "Armenian", "hin": "Hindi", "lit": "Lithuanian", "ben": "Bengali", "cat": "Catalan", "bos": "Bosnian", "hrv": "Croatian", "tha": "Thai", "orv": "Old Russian", "cha": "Chamorro", "mon": "Mongolian", "lzh": "Literary Chinese", "scn": "Sicilian", "gle": "Irish", "mkd": "Macedonian", "slv": "Slovenian", "frm": "Middle French (ca. 
1400-1600)", "glg": "Galician", "vol": "Volap\u00fck", "ain": "Ainu (Japan)", "jbo": "Lojban", "tok": "Toki Pona", "ina": "Interlingua (International Auxiliary Language Association)", "nds": "Low German", "mal": "Malayalam", "tlh": "Klingon", "roh": "Romansh", "ltz": "Luxembourgish", "oss": "Ossetian", "ido": "Ido", "gla": "Scottish Gaelic", "mlt": "Maltese", "sco": "Scots", "ast": "Asturian", "jav": "Javanese", "oci": "Occitan (post 1500)", "ile": "Interlingue", "ota": "Ottoman Turkish (1500-1928)", "xal": "Kalmyk", "tel": "Telugu", "sjn": "Sindarin", "nov": "Novial", "khm": "Central Khmer", "tpi": "Tok Pisin", "ang": "Old English (ca. 450-1100)", "aze": "Azerbaijani", "tgk": "Tajik", "tuk": "Turkmen", "chv": "Chuvash", "hsb": "Upper Sorbian", "dsb": "Lower Sorbian", "bod": "Tibetan", "sme": "Northern Sami", "cym": "Welsh", "mri": "Maori", "ksh": "K\u00f6lsch", "kmr": "Northern Kurdish", "ewe": "Ewe", "kab": "Kabyle", "ber": "Berber languages", "tpw": "Tup\u00ed", "udm": "Udmurt", "lld": "Ladin", "pms": "Piemontese", "lad": "Ladino", "grn": "Guarani", "mlg": "Malagasy", "xho": "Xhosa", "pnb": "Western Panjabi", "grc": "Ancient Greek (to 1453)", "hat": "Haitian", "lao": "Lao", "npi": "Nepali (individual language)", "cor": "Cornish", "nah": "Nahuatl", "avk": "Kotava", "mar": "Marathi", "guj": "Gujarati", "pan": "Panjabi", "kir": "Kirghiz", "myv": "Erzya", "prg": "Prussian", "sux": "Sumerian", "crs": "Seselwa Creole French", "ckt": "Chukot", "bak": "Bashkir", "zlm": "Malay (individual language)", "hil": "Hiligaynon", "cbk": "Chavacano", "chr": "Cherokee", "nav": "Navajo", "lkt": "Lakota", "enm": "Middle English (1100-1500)", "arq": "Algerian Arabic", "lin": "Lingala", "abk": "Abkhazian", "pcd": "Picard", "rom": "Romany", "gsw": "Swiss German", "tam": "Tamil", "zul": "Zulu", "awa": "Awadhi", "wln": "Walloon", "amh": "Amharic", "bar": "Bavarian", "hbo": "Ancient Hebrew", "mhr": "Eastern Mari", "bho": "Bhojpuri", "mrj": "Western Mari", "ckb": "Central Kurdish", "osx": 
"Old Saxon", "pfl": "Pfaelzisch", "mgm": "Mambae", "sna": "Shona", "mah": "Marshallese", "hau": "Hausa", "kan": "Kannada", "nog": "Nogai", "sin": "Sinhala", "glv": "Manx", "dng": "Dungan", "kal": "Kalaallisut", "liv": "Liv", "vro": "V\u00f5ro", "apc": "North Levantine Arabic", "jdt": "Judeo-Tat", "fur": "Friulian", "che": "Chechen", "haw": "Hawaiian", "yor": "Yoruba", "crh": "Crimean Tatar", "pdc": "Pennsylvania German", "ppl": "Pipil", "kin": "Kinyarwanda", "shs": "Shuswap", "mnw": "Mon", "tet": "Tetum", "sah": "Yakut", "kum": "Kumyk", "ngt": "Ngeq", "nya": "Nyanja", "pus": "Pushto", "hif": "Fiji Hindi", "mya": "Burmese", "moh": "Mohawk", "wol": "Wolof", "tir": "Tigrinya", "ton": "Tonga (Tonga Islands)", "lzz": "Laz", "oar": "Old Aramaic (up to 700 BCE)", "lug": "Ganda", "brx": "Bodo (India)", "non": "Old Norse", "mww": "Hmong Daw", "hak": "Hakka Chinese", "nlv": "Orizaba Nahuatl", "ngu": "Guerrero Nahuatl", "bua": "Buriat", "aym": "Aymara", "vec": "Venetian", "ibo": "Igbo", "tkl": "Tokelau", "bam": "Bambara", "kha": "Khasi", "ceb": "Cebuano", "lou": "Louisiana Creole", "fuc": "Pulaar", "smo": "Samoan", "gag": "Gagauz", "lfn": "Lingua Franca Nova", "arg": "Aragonese", "umb": "Umbundu", "tyv": "Tuvinian", "kjh": "Khakas", "oji": "Ojibwa", "cyo": "Cuyonon", "urh": "Urhobo", "kzj": "Coastal Kadazan", "pam": "Pampanga", "srd": "Sardinian", "lmo": "Lombard", "swg": "Swabian", "mdf": "Moksha", "gil": "Gilbertese", "snd": "Sindhi", "tso": "Tsonga", "sot": "Southern Sotho", "zza": "Zaza", "tsn": "Tswana", "pau": "Palauan", "som": "Somali", "egl": "Emilian", "ady": "Adyghe", "asm": "Assamese", "ori": "Oriya (macrolanguage)", "dtp": "Kadazan Dusun", "cho": "Choctaw", "max": "North Moluccan Malay", "kam": "Kamba (Kenya)", "niu": "Niuean", "sag": "Sango", "ilo": "Iloko", "kaa": "Kara-Kalpak", "fuv": "Nigerian Fulfulde", "nch": "Central Huasteca Nahuatl", "hoc": "Ho", "iba": "Iban", "gbm": "Garhwali", "sun": "Sundanese", "war": "Waray (Philippines)", "mvv": "Tagal Murut", 
"pap": "Papiamento", "ary": "Moroccan Arabic", "kxi": "Keningau Murut", "csb": "Kashubian", "pag": "Pangasinan", "cos": "Corsican", "rif": "Tarifit", "kek": "Kekch\u00ed", "krc": "Karachay-Balkar", "aii": "Assyrian Neo-Aramaic", "ban": "Balinese", "ssw": "Swati", "tvl": "Tuvalu", "mfe": "Morisyen", "tah": "Tahitian", "bvy": "Baybayanon", "bcl": "Central Bikol", "hnj": "Hmong Njua", "nau": "Nauru", "nst": "Tase Naga", "afb": "Gulf Arabic", "quc": "K'iche'", "min": "Minangkabau", "tmw": "Temuan", "mad": "Madurese", "bjn": "Banjar", "mai": "Maithili", "cjy": "Jinyu Chinese", "got": "Gothic", "hsn": "Xiang Chinese", "gan": "Gan Chinese", "tzl": "Talossan", "dws": "Dutton World Speedwords", "ldn": "L\u00e1adan", "afh": "Afrihili", "sgs": "Samogitian", "krl": "Karelian", "vep": "Veps", "rue": "Rusyn", "tly": "Talysh", "mic": "Mi'kmaq", "ext": "Extremaduran", "izh": "Ingrian", "sma": "Southern Sami", "jam": "Jamaican Creole English", "cmo": "Central Mnong", "mwl": "Mirandese", "kpv": "Komi-Zyrian", "koi": "Komi-Permyak", "bis": "Bislama", "ike": "Eastern Canadian Inuktitut", "run": "Rundi", "evn": "Evenki", "ryu": "Central Okinawan", "mnc": "Manchu", "aoz": "Uab Meto", "otk": "Old Turkish", "kas": "Kashmiri", "aln": "Gheg Albanian", "akl": "Aklanon", "yua": "Yucateco", "shy": "Tachawit", "fkv": "Kven Finnish", "gos": "Gronings", "fij": "Fijian", "thv": "Tahaggart Tamahaq", "zgh": "Standard Moroccan Tamazight", "gcf": "Guadeloupean Creole French", "cay": "Cayuga", "xmf": "Mingrelian", "tig": "Tigre", "div": "Dhivehi", "lij": "Ligurian", "rap": "Rapanui", "hrx": "Hunsrik", "cpi": "Chinese Pidgin English", "tts": "Northeastern Thai", "gaa": "Ga", "tmr": "Jewish Babylonian Aramaic (ca. 200-1200 CE)", "iii": "Sichuan Yi", "ltg": "Latgalian", "bzt": "Brithenig", "syc": "Classical Syriac", "emx": "Erromintxela", "gom": "Goan Konkani", "chg": "Chagatai", "osp": "Old Spanish", "stq": "Saterfriesisch", "frr": "Northern Frisian", "fro": "Old French (842-ca. 
1400)", "nys": "Nyunga", "toi": "Tonga (Zambia)", "new": "Newari", "phn": "Phoenician", "jpa": "Jewish Palestinian Aramaic", "rel": "Rendille", "drt": "Drents", "chn": "Chinook jargon", "pli": "Pali", "laa": "Southern Subanen", "bal": "Baluchi", "hdn": "Northern Haida", "hax": "Southern Haida", "mik": "Mikasuki", "ajp": "South Levantine Arabic", "xqa": "Karakhanid", "pal": "Pahlavi", "crk": "Plains Cree", "mni": "Manipuri", "lut": "Lushootseed", "ayl": "Libyan Arabic", "ood": "Tohono O'odham", "sdh": "Southern Kurdish", "ofs": "Old Frisian", "nus": "Nuer", "kiu": "Kirmanjki (individual language)", "diq": "Dimli (individual language)", "qxq": "Qashqa'i", "alt": "Southern Altai", "bfz": "Mahasu Pahari", "klj": "Turkic Khalaj", "mus": "Creek", "srn": "Sranan Tongo", "guc": "Wayuu", "lim": "Limburgan", "zea": "Zeeuws", "shi": "Tachelhit", "mnr": "Mono (USA)", "bom": "Berom", "sat": "Santali", "szl": "Silesian", "igs": "Interglossa"}
main_dl.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import FastAPI, HTTPException, Header, Depends, Request, Response
2
  from fastapi.responses import JSONResponse
3
  from fastapi.security import HTTPBasic, HTTPBasicCredentials
4
  from fastapi.exceptions import RequestValidationError
@@ -12,9 +12,15 @@ from filesplit.merge import Merge
12
  import tensorflow as tf
13
  import string
14
  import re
 
 
 
 
15
  from tensorflow import keras
 
16
  from keras_nlp.layers import TransformerEncoder
17
  from tensorflow.keras import layers
 
18
  from tensorflow.keras.utils import plot_model
19
 
20
  api = FastAPI()
@@ -237,14 +243,55 @@ def load_all_data():
237
 
238
  rnn_en_fr, rnn_fr_en, transformer_en_fr, transformer_fr_en = load_all_data()
239
 
240
-
241
- def find_lang_label(lang_sel):
242
- global lang_tgt, label_lang
243
- return label_lang[lang_tgt.index(lang_sel)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
  @api.get('/', name="Vérification que l'API fonctionne")
246
  def check_api():
247
  load_all_data()
 
 
248
  return {'message': "L'API fonctionne"}
249
 
250
  @api.get('/small_vocab/rnn', name="Traduction par RNN")
@@ -293,115 +340,7 @@ def affiche_modele(lang_tgt:str,
293
 
294
  # Retourner l'image en tant que réponse HTTP avec le type de contenu approprié
295
  return Response(content=image_data, media_type="image/png")
296
- '''
297
- def run():
298
-
299
- global n1, df_data_src, df_data_tgt, translation_model, placeholder, model_speech
300
- global df_data_en, df_data_fr, lang_classifier, translation_en_fr, translation_fr_en
301
- global lang_tgt, label_lang
302
-
303
- st.write("")
304
- st.title(tr(title))
305
- #
306
- st.write("## **"+tr("Explications")+" :**\n")
307
-
308
- st.markdown(tr(
309
- """
310
- Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
311
- """)
312
- , unsafe_allow_html=True)
313
- st.markdown(tr(
314
- """
315
- La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
316
- un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
317
- avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
318
- """)
319
- , unsafe_allow_html=True)
320
- st.image("assets/deepnlp_graph1.png",use_column_width=True)
321
- st.markdown(tr(
322
- """
323
- Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
324
- Vous en trouverez :red[**5 illustrations**] ci-dessous.
325
- """)
326
- , unsafe_allow_html=True)
327
-
328
- # Utilisation du module translate
329
- lang_tgt = ['en','fr','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
330
- label_lang = ['Anglais','Français','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
331
-
332
- lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
333
- 'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
334
- 'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
335
-
336
- st.write("#### "+tr("Choisissez le type de traduction")+" :")
337
-
338
- chosen_id = tab_bar(data=[
339
- TabBarItemData(id="tab1", title="small vocab", description=tr("avec Keras et un RNN")),
340
- TabBarItemData(id="tab2", title="small vocab", description=tr("avec Keras et un Transformer")),
341
- TabBarItemData(id="tab3", title=tr("Phrase personnelle"), description=tr("à écrire")),
342
- TabBarItemData(id="tab4", title=tr("Phrase personnelle"), description=tr("à dicter")),
343
- TabBarItemData(id="tab5", title=tr("Funny translation !"), description=tr("avec le Fine Tuning"))],
344
- default="tab1")
345
-
346
- if (chosen_id == "tab1") or (chosen_id == "tab2") :
347
- if (chosen_id == "tab1"):
348
- st.write("<center><h5><b>"+tr("Schéma d'un Réseau de Neurones Récurrents")+"</b></h5></center>", unsafe_allow_html=True)
349
- st.image("assets/deepnlp_graph3.png",use_column_width=True)
350
- else:
351
- st.write("<center><h5><b>"+tr("Schéma d'un Transformer")+"</b></h5></center>", unsafe_allow_html=True)
352
- st.image("assets/deepnlp_graph12.png",use_column_width=True)
353
- st.write("## **"+tr("Paramètres")+" :**\n")
354
- TabContainerHolder = st.container()
355
- Sens = TabContainerHolder.radio(tr('Sens')+':',('Anglais -> Français','Français -> Anglais'), horizontal=True)
356
- Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
357
-
358
- if (Lang=='en_fr'):
359
- df_data_src = df_data_en
360
- df_data_tgt = df_data_fr
361
- if (chosen_id == "tab1"):
362
- translation_model = rnn_en_fr
363
- else:
364
- translation_model = transformer_en_fr
365
- else:
366
- df_data_src = df_data_fr
367
- df_data_tgt = df_data_en
368
- if (chosen_id == "tab1"):
369
- translation_model = rnn_fr_en
370
- else:
371
- translation_model = transformer_fr_en
372
- sentence1 = st.selectbox(tr("Selectionnez la 1ere des 3 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
373
- n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
374
-
375
- st.write("## **"+tr("Résultats")+" :**\n")
376
- if (chosen_id == "tab1"):
377
- display_translation(n1, Lang,1)
378
- else:
379
- display_translation(n1, Lang,2)
380
-
381
- st.write("## **"+tr("Details sur la méthode")+" :**\n")
382
- if (chosen_id == "tab1"):
383
- st.markdown(tr(
384
- """
385
- Nous avons utilisé 2 Gated Recurrent Units.
386
- Vous pouvez constater que la traduction avec un RNN est relativement lente.
387
- Ceci est notamment du au fait que les tokens passent successivement dans les GRU,
388
- alors que les calculs sont réalisés en parrallèle dans les Transformers.
389
- Le score BLEU est bien meilleur que celui des traductions mot à mot.
390
- <br>
391
- """)
392
- , unsafe_allow_html=True)
393
- else:
394
- st.markdown(tr(
395
- """
396
- Nous avons utilisé un encodeur et décodeur avec 8 têtes d'entention.
397
- La dimension de l'embedding des tokens = 256
398
- La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
399
- <br>
400
- """)
401
- , unsafe_allow_html=True)
402
- st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5>", unsafe_allow_html=True)
403
- plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file=st.session_state.ImagePath+'/model_plot.png')
404
- st.image(st.session_state.ImagePath+'/model_plot.png',use_column_width=True)
405
- st.write("</center>", unsafe_allow_html=True)
406
-
407
- '''
 
1
+ from fastapi import FastAPI, HTTPException, Header, Depends, Request, Response, Query
2
  from fastapi.responses import JSONResponse
3
  from fastapi.security import HTTPBasic, HTTPBasicCredentials
4
  from fastapi.exceptions import RequestValidationError
 
12
  import tensorflow as tf
13
  import string
14
  import re
15
+ import json
16
+ import csv
17
+ import tiktoken
18
+ from sklearn.preprocessing import LabelEncoder
19
  from tensorflow import keras
20
+ import keras
21
  from keras_nlp.layers import TransformerEncoder
22
  from tensorflow.keras import layers
23
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
24
  from tensorflow.keras.utils import plot_model
25
 
26
  api = FastAPI()
 
243
 
244
  rnn_en_fr, rnn_fr_en, transformer_en_fr, transformer_fr_en = load_all_data()
245
 
246
+ # ==== Language identifier ====
247
+
248
def encode_text(textes, max_length=250):
    """Tokenize a batch of texts and pad/truncate them to a fixed length.

    Args:
        textes: iterable of strings to encode.
        max_length: maximum sequence length in tokens (default 250 —
            presumably the length the identification model was trained
            with; TODO confirm against the model config).

    Returns:
        A 2-D int array of shape (len(textes), max_length), zero-padded
        at the end of each sequence ('post').
    """
    # `tokenizer` is the module-level tiktoken encoder created by
    # init_dl_identifier(); reading a global needs no `global` statement.
    sequences = tokenizer.encode_batch(textes)
    return pad_sequences(sequences, maxlen=max_length, padding='post')
254
+
255
def read_list_lan():
    """Read the first row of lan_code.csv and return it as a list of the
    ISO-639-3 codes that the DL language identifier supports."""
    csv_path = dataPath + '/multilingue/lan_code.csv'
    with open(csv_path, 'r') as fichier_csv:
        first_row = next(csv.reader(fichier_csv))
    return first_row
261
+
262
def init_dl_identifier():
    """Initialise the deep-learning language identifier.

    Loads the tiktoken tokenizer, the code->language-name mapping, fits
    the label encoder on the known language codes, reassembles the split
    model file, and loads the Keras model.

    Side effects: sets the module-level globals `tokenizer`, `dl_model`,
    `label_encoder` and `lan_to_language`.
    """
    global tokenizer, dl_model, label_encoder, lan_to_language

    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Mapping from ISO-639-3 code to full language name (JSON file).
    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
        lan_to_language = json.load(fichier)
    # Fit the encoder on the supported codes so that model output indices
    # can be mapped back to language codes at prediction time.
    label_encoder = LabelEncoder()
    label_encoder.fit(read_list_lan())
    # The model is stored as split chunks; merge them back into a single
    # .h5 file before loading (cleanup=False keeps the chunk files).
    Merge(dataPath+"/dl_id_lang_split", dataPath, "dl_tiktoken_id_language_model.h5").merge(cleanup=False)
    dl_model = keras.models.load_model(dataPath+"/dl_tiktoken_id_language_model.h5")
276
+
277
def lang_id_dl(sentences):
    """Identify the language of one sentence or of a list of sentences.

    Args:
        sentences: a single string, or a list of strings.

    Returns:
        For a single string: the full language name (e.g. "French").
        For a list: a list of predicted ISO-639-3 language codes, one per
        sentence.
        NOTE(review): the two branches return different vocabularies
        (full name vs. code) — confirm this asymmetry is intended by the
        callers before unifying it.
    """
    global dl_model, label_encoder, lan_to_language

    # isinstance is the idiomatic type test (the original compared
    # str(type(...)) substrings).
    single = isinstance(sentences, str)
    batch = [sentences] if single else sentences
    predictions = dl_model.predict(encode_text(batch))
    # Map each row's argmax back to its language code via the encoder.
    predicted_labels_encoded = np.argmax(predictions, axis=1)
    predicted_languages = label_encoder.classes_[predicted_labels_encoded]
    if single:
        return lan_to_language[predicted_languages[0]]
    return list(predicted_languages)
287
+
288
+ # ==== Endpoints ====
289
 
290
@api.get('/', name="Vérification que l'API fonctionne")
def check_api():
    """Health-check endpoint: (re)loads the translation models and the
    DL language identifier, then confirms the API is up."""
    load_all_data()
    # read_list_lan() is already invoked inside init_dl_identifier(),
    # so a separate call here would be redundant.
    init_dl_identifier()
    return {'message': "L'API fonctionne"}
296
 
297
  @api.get('/small_vocab/rnn', name="Traduction par RNN")
 
340
 
341
  # Retourner l'image en tant que réponse HTTP avec le type de contenu approprié
342
  return Response(content=image_data, media_type="image/png")
343
+
344
+ @api.get('/lang_id_dl', name="Id de langue DL")
345
+ async def language_id_dl(sentences:List[str] = Query(..., min_length=1)):
346
+ return lang_id_dl(sentences)