avr23-cds-translation2

Sleeping

App Files Files Community

Demosthene-OR committed on Dec 3, 2023

Commit

babcb18

•

1 Parent(s): 95043cb

Add

Browse files

Files changed (9) hide show

app.py +35 -13
tabs/data_viz_tab.py +42 -37
tabs/exploration_tab.py +98 -100
tabs/game_tab.py +43 -22
tabs/id_lang_tab.py +102 -72
tabs/intro.py +48 -20
tabs/modelisation_dict_tab.py +182 -168
tabs/modelisation_seq2seq_tab.py +94 -96
translate_app.py +18 -0

app.py CHANGED Viewed

@@ -5,8 +5,8 @@ from streamlit_option_menu import option_menu
 # Define TITLE, TEAM_MEMBERS and PROMOTION values, in config.py.
 import config
 from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor
 # Initialize a session state variable that tracks the sidebar state (either 'expanded' or 'collapsed').
 if 'sidebar_state' not in st.session_state:
@@ -20,10 +20,23 @@ st.set_page_config (
     initial_sidebar_state=st.session_state.sidebar_state
 )
 # Define the root folders depending on local/cloud run
 thisfile = os.path.abspath(__file__)
 if ('/' in thisfile):
     os.chdir(os.path.dirname(thisfile))
 # Nécessaire pour la version windows 11
 os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
@@ -43,20 +56,27 @@ st.markdown(f"<style>{style}</style>", unsafe_allow_html=True)
 # as value as follow :
 TABS = OrderedDict(
     [
-        (intro.sidebar_name, intro),
-        (exploration_tab.sidebar_name, exploration_tab),
-        (data_viz_tab.sidebar_name, data_viz_tab),
-        (id_lang_tab.sidebar_name, id_lang_tab),
-        (modelisation_dict_tab.sidebar_name, modelisation_dict_tab),
-        (modelisation_seq2seq_tab.sidebar_name, modelisation_seq2seq_tab),
-        (game_tab.sidebar_name, game_tab ),
     ]
 )
-def run():
     global lang_tgt, label_lang
     st.sidebar.image(
         "assets/demosthene_logo.png",
         width=270,
@@ -64,7 +84,7 @@ def run():
     with st.sidebar:
         tab_name = option_menu(None, list(TABS.keys()),
                                # icons=['house', 'bi-binoculars', 'bi bi-graph-up', 'bi-chat-right-text','bi-book', 'bi-body-text'], menu_icon="cast", default_index=0,
-                               icons=['house', 'binoculars', 'graph-up', 'search','book', 'chat-right-text', 'controller'], menu_icon="cast", default_index=0,
                                styles={"container": {"padding": "0!important","background-color": "#10b8dd", "border-radius": "0!important"},
                                        "nav-link": {"font-size": "1rem", "text-align": "left", "margin":"0em", "padding": "0em",
                                                     "padding-left": "0.2em", "--hover-color": "#eee", "font-weight": "400",
@@ -78,9 +98,11 @@ def run():
     for member in config.TEAM_MEMBERS:
         st.sidebar.markdown(member.sidebar_markdown(), unsafe_allow_html=True)
     tab = TABS[tab_name]
     tab.run()
 if __name__ == "__main__":
     run()

 # Define TITLE, TEAM_MEMBERS and PROMOTION values, in config.py.
 import config
 from tabs.custom_vectorizer import custom_tokenizer, custom_preprocessor
+import os
+from translate_app import tr
 # Initialize a session state variable that tracks the sidebar state (either 'expanded' or 'collapsed').
 if 'sidebar_state' not in st.session_state:
     initial_sidebar_state=st.session_state.sidebar_state
 )
+# Si l'application tourne localement, session_state.Cloud == 0
+# Si elle tourne sur le Cloud de Hugging Face, ==1
+st.session_state.Cloud = 1
+# En fonction de la valeur de la variable précédente, le data path est différent
+if st.session_state.Cloud == 0:
+    st.session_state.DataPath = "../data"
+    st.session_state.reCalcule = False
+else:
+    st.session_state.DataPath = "data"
+    st.session_state.reCalcule = False
 # Define the root folders depending on local/cloud run
 thisfile = os.path.abspath(__file__)
+print("Path before:",os.path.abspath(__file__))
 if ('/' in thisfile):
     os.chdir(os.path.dirname(thisfile))
+print("Path after:",os.path.abspath(__file__))
 # Nécessaire pour la version windows 11
 os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
 # as value as follow :
 TABS = OrderedDict(
     [
+        (tr(intro.sidebar_name), intro),
+        (tr(exploration_tab.sidebar_name), exploration_tab),
+        (tr(data_viz_tab.sidebar_name), data_viz_tab),
+        (tr(id_lang_tab.sidebar_name), id_lang_tab),
+        (tr(modelisation_dict_tab.sidebar_name), modelisation_dict_tab),
+        (tr(modelisation_seq2seq_tab.sidebar_name), modelisation_seq2seq_tab),
+        (tr(game_tab.sidebar_name), game_tab ),
     ]
 )
+lang_tgt   = ['fr','en','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
+label_lang = ['Français', 'Anglais','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
+@st.cache_data
+def find_lang_label(lang_sel):
     global lang_tgt, label_lang
+    return label_lang[lang_tgt.index(lang_sel)]
+def run():
     st.sidebar.image(
         "assets/demosthene_logo.png",
         width=270,
     with st.sidebar:
         tab_name = option_menu(None, list(TABS.keys()),
                                # icons=['house', 'bi-binoculars', 'bi bi-graph-up', 'bi-chat-right-text','bi-book', 'bi-body-text'], menu_icon="cast", default_index=0,
+                               icons=['house', 'binoculars', 'graph-up', 'search','book', 'chat-right-text','controller'], menu_icon="cast", default_index=0,
                                styles={"container": {"padding": "0!important","background-color": "#10b8dd", "border-radius": "0!important"},
                                        "nav-link": {"font-size": "1rem", "text-align": "left", "margin":"0em", "padding": "0em",
                                                     "padding-left": "0.2em", "--hover-color": "#eee", "font-weight": "400",
     for member in config.TEAM_MEMBERS:
         st.sidebar.markdown(member.sidebar_markdown(), unsafe_allow_html=True)
+    with st.sidebar:
+        st.selectbox("langue:",lang_tgt, format_func = find_lang_label, key="Language", label_visibility="hidden")
     tab = TABS[tab_name]
     tab.run()
 if __name__ == "__main__":
     run()

tabs/data_viz_tab.py CHANGED Viewed

@@ -17,10 +17,11 @@ from gensim import corpora
 import networkx as nx
 from sklearn.manifold import TSNE
 from gensim.models import KeyedVectors
 title = "Data Vizualization"
 sidebar_name = "Data Vizualization"
 with contextlib.redirect_stdout(open(os.devnull, "w")):
     nltk.download('stopwords')
@@ -34,7 +35,7 @@ if ((first_line+max_lines)>137860):
 # Nombre maximum de ligne à afficher pour les DataFrame
 max_lines_to_display = 50
-@st.cache_data(ttl='1h00s')
 def load_data(path):
     input_file = os.path.join(path)
@@ -47,7 +48,7 @@ def load_data(path):
     data = data.split('\n')
     return data[first_line:min(len(data),first_line+max_lines)]
-@st.cache_data(ttl='1h00s')
 def load_preprocessed_data(path,data_type):
     input_file = os.path.join(path)
@@ -68,14 +69,14 @@ def load_preprocessed_data(path,data_type):
             data=data2
         return data
-@st.cache_data(ttl='1h00s')
 def load_all_preprocessed_data(lang):
-    txt           =load_preprocessed_data('data/preprocess_txt_'+lang,0)
-    corpus        =load_preprocessed_data('data/preprocess_corpus_'+lang,0)
-    txt_split     = load_preprocessed_data('data/preprocess_txt_split_'+lang,3)
-    df_count_word = pd.concat([load_preprocessed_data('data/preprocess_df_count_word1_'+lang,1), load_preprocessed_data('data/preprocess_df_count_word2_'+lang,1)])
-    sent_len      =load_preprocessed_data('data/preprocess_sent_len_'+lang,2)
-    vec_model= KeyedVectors.load_word2vec_format('data/mini.wiki.'+lang+'.align.vec')
     return txt, corpus, txt_split, df_count_word,sent_len, vec_model
 #Chargement des textes complet dans les 2 langues
@@ -92,7 +93,7 @@ def plot_word_cloud(text, title, masque, stop_words, background_color = "white")
                    max_font_size=50, random_state=42)
     # Générer et afficher le nuage de mots
     fig=plt.figure(figsize= (20,10))
-    plt.title(title, fontsize=25, color="green")
     wc.generate(text)
     # getting current axes
@@ -130,7 +131,7 @@ def dist_frequence_mots(df_count_word):
     sns.set()
     fig = plt.figure() #figsize=(4,4)
-    plt.title("Nombre d'apparitions des mots", fontsize=16)
     chart = sns.barplot(x='mots',y='occurences',data=nb_occurences.iloc[:40]);
     chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
@@ -174,7 +175,7 @@ def dist_longueur_phrase(sent_len,sent_len2, lang1, lang2 ):
     chart = sns.histplot(df, color=['r','b'], label=[lang1,lang2], binwidth=1, binrange=[2,22], element="step",
                          common_norm=False, multiple="layer", discrete=True, stat='proportion')
     plt.xticks([2,4,6,8,10,12,14,16,18,20,22])
-    chart.set(title='Distribution du nombre de mots sur '+str(len(sent_len))+' phrase(s)');
     st.pyplot(fig)
     '''
@@ -245,8 +246,8 @@ def proximite():
     labels = []
     tokens = []
-    nb_words = st.slider('Nombre de mots à afficher :',10,50, value=20)
-    df = pd.read_csv('data/dict_we_en_fr',header=0,index_col=0, encoding ="utf-8", keep_default_na=False)
     words_en = df.index.to_list()[:nb_words]
     words_fr = df['Francais'].to_list()[:nb_words]
@@ -280,7 +281,7 @@ def proximite():
                      va='bottom',
                      color= color,
                      size=20)
-    plt.title("Proximité des mots anglais avec leur traduction", fontsize=30, color="green")
     plt.legend(loc='best');
     st.pyplot(fig)
@@ -292,13 +293,13 @@ def run():
     global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr
     st.write("")
-    st.title(title)
     #
-    st.write("## **Paramètres :**\n")
-    Langue = st.radio('Langue:',('Anglais','Français'), horizontal=True)
-    first_line = st.slider('No de la premiere ligne à analyser :',0,137859)
-    max_lines = st.select_slider('Nombre de lignes à analyser :',
                               options=[1,5,10,15,100, 500, 1000,'Max'])
     if max_lines=='Max':
         max_lines=137860
@@ -328,74 +329,78 @@ def run():
         st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
     st.write("")
-    tab1, tab2, tab3, tab4, tab5 = st.tabs(["World Cloud", "Frequence","Distribution longueur", "Co-occurence", "Proximité"])
     with tab1:
-        st.subheader("World Cloud")
-        st.markdown(
             """
             On remarque, en changeant de langue, que certains mot de taille importante dans une langue,
             apparaissent avec une taille identique dans l'autre langue.
             La traduction mot à mot sera donc peut-être bonne.
-            """
         )
         if (Langue == 'Anglais'):
             text = ""
             # Initialiser la variable des mots vides
             stop_words = set(stopwords.words('english'))
             for e in txt_en : text += e
-            plot_word_cloud(text, "English words corpus", "images/coeur.png", stop_words)
         else:
             text = ""
             # Initialiser la variable des mots vides
             stop_words = set(stopwords.words('french'))
             for e in txt_fr : text += e
-            plot_word_cloud(text,"Mots français du corpus", "images/coeur.png", stop_words)
     with tab2:
-        st.subheader("Frequence d'apparition des mots")
-        st.markdown(
             """
             On remarque, en changeant de langue, que certains mot fréquents dans une langue,
             apparaissent aussi fréquemment dans l'autre langue.
             Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
-            """
         )
         if (Langue == 'Anglais'):
             dist_frequence_mots(df_count_word_en)
         else:
             dist_frequence_mots(df_count_word_fr)
     with tab3:
-        st.subheader("Distribution des longueurs de phrases")
-        st.markdown(
             """
             Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
             on constate une certaine similitude dans les ditributions de longueur de phrases.
             Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
-            """
         )
         if (Langue == 'Anglais'):
             dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais','Français')
         else:
             dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')
     with tab4:
-        st.subheader("Co-occurence des mots dans une phrase")
         if (Langue == 'Anglais'):
             graphe_co_occurence(txt_split_en[:1000],corpus_en)
         else:
             graphe_co_occurence(txt_split_fr[:1000],corpus_fr)
     with tab5:
-        st.subheader("Proximité sémantique des mots (Word Embedding)")
-        st.markdown(
             """
             MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
             notamment des "Word Embedding" multilingues
             Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia pour 30 langues qui ont été alignés dans un espace espace vectoriel unique.
             Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).
             En novembre 2015, l'équipe de recherche de Facebook a créé fastText qui est une extension de la bibliothèque word2vec.
             Elle s'appuie sur Word2Vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
-            """
         )
         st.write("")
         proximite()

 import networkx as nx
 from sklearn.manifold import TSNE
 from gensim.models import KeyedVectors
+from translate_app import tr
 title = "Data Vizualization"
 sidebar_name = "Data Vizualization"
+dataPath = st.session_state.DataPath
 with contextlib.redirect_stdout(open(os.devnull, "w")):
     nltk.download('stopwords')
 # Nombre maximum de ligne à afficher pour les DataFrame
 max_lines_to_display = 50
+@st.cache_data
 def load_data(path):
     input_file = os.path.join(path)
     data = data.split('\n')
     return data[first_line:min(len(data),first_line+max_lines)]
+@st.cache_data
 def load_preprocessed_data(path,data_type):
     input_file = os.path.join(path)
             data=data2
         return data
+@st.cache_data
 def load_all_preprocessed_data(lang):
+    txt           =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
+    corpus        =load_preprocessed_data(dataPath+'/preprocess_corpus_'+lang,0)
+    txt_split     = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
+    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
+    sent_len      =load_preprocessed_data(dataPath+'/preprocess_sent_len_'+lang,2)
+    vec_model= KeyedVectors.load_word2vec_format(dataPath+'/mini.wiki.'+lang+'.align.vec')
     return txt, corpus, txt_split, df_count_word,sent_len, vec_model
 #Chargement des textes complet dans les 2 langues
                    max_font_size=50, random_state=42)
     # Générer et afficher le nuage de mots
     fig=plt.figure(figsize= (20,10))
+    plt.title(tr(title), fontsize=25, color="green")
     wc.generate(text)
     # getting current axes
     sns.set()
     fig = plt.figure() #figsize=(4,4)
+    plt.title(tr("Nombre d'apparitions des mots"), fontsize=16)
     chart = sns.barplot(x='mots',y='occurences',data=nb_occurences.iloc[:40]);
     chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right', size=8)
     chart = sns.histplot(df, color=['r','b'], label=[lang1,lang2], binwidth=1, binrange=[2,22], element="step",
                          common_norm=False, multiple="layer", discrete=True, stat='proportion')
     plt.xticks([2,4,6,8,10,12,14,16,18,20,22])
+    chart.set(title=tr('Distribution du nombre de mots sur '+str(len(sent_len))+' phrase(s)'));
     st.pyplot(fig)
     '''
     labels = []
     tokens = []
+    nb_words = st.slider(tr('Nombre de mots à afficher')+' :',10,50, value=20)
+    df = pd.read_csv('../data/dict_we_en_fr',header=0,index_col=0, encoding ="utf-8", keep_default_na=False)
     words_en = df.index.to_list()[:nb_words]
     words_fr = df['Francais'].to_list()[:nb_words]
                      va='bottom',
                      color= color,
                      size=20)
+    plt.title(tr("Proximité des mots anglais avec leur traduction"), fontsize=30, color="green")
     plt.legend(loc='best');
     st.pyplot(fig)
     global full_txt_fr, full_corpus_fr, full_txt_split_fr, full_df_count_word_fr,full_sent_len_fr, vec_model_fr
     st.write("")
+    st.title(tr(title))
     #
+    st.write("## **"+tr("Paramètres")+" :**\n")
+    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
+    first_line = st.slider(tr('No de la premiere ligne à analyser')+' :',0,137859)
+    max_lines = st.select_slider(tr('Nombre de lignes à analyser')+' :',
                               options=[1,5,10,15,100, 500, 1000,'Max'])
     if max_lines=='Max':
         max_lines=137860
         st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
     st.write("")
+    tab1, tab2, tab3, tab4, tab5 = st.tabs([tr("World Cloud"), tr("Frequence"),tr("Distribution longueur"), tr("Co-occurence"), tr("Proximité")])
     with tab1:
+        st.subheader(tr("World Cloud"))
+        st.markdown(tr(
             """
             On remarque, en changeant de langue, que certains mot de taille importante dans une langue,
             apparaissent avec une taille identique dans l'autre langue.
             La traduction mot à mot sera donc peut-être bonne.
+            """)
         )
         if (Langue == 'Anglais'):
             text = ""
             # Initialiser la variable des mots vides
             stop_words = set(stopwords.words('english'))
             for e in txt_en : text += e
+            plot_word_cloud(text, "English words corpus", "../images/coeur.png", stop_words)
         else:
             text = ""
             # Initialiser la variable des mots vides
             stop_words = set(stopwords.words('french'))
             for e in txt_fr : text += e
+            plot_word_cloud(text,"Mots français du corpus", "../images/coeur.png", stop_words)
     with tab2:
+        st.subheader(tr("Frequence d'apparition des mots"))
+        st.markdown(tr(
             """
             On remarque, en changeant de langue, que certains mot fréquents dans une langue,
             apparaissent aussi fréquemment dans l'autre langue.
             Cela peut nous laisser penser que la traduction mot à mot sera peut-être bonne.
+            """)
         )
         if (Langue == 'Anglais'):
             dist_frequence_mots(df_count_word_en)
         else:
             dist_frequence_mots(df_count_word_fr)
     with tab3:
+        st.subheader(tr("Distribution des longueurs de phrases"))
+        st.markdown(tr(
             """
             Malgré quelques différences entre les 2 langues (les phrases anglaises sont généralement un peu plus courtes),
             on constate une certaine similitude dans les ditributions de longueur de phrases.
             Cela peut nous laisser penser que la traduction mot à mot ne sera pas si mauvaise.
+            """)
         )
         if (Langue == 'Anglais'):
             dist_longueur_phrase(sent_len_en, sent_len_fr, 'Anglais','Français')
         else:
             dist_longueur_phrase(sent_len_fr, sent_len_en, 'Français', 'Anglais')
     with tab4:
+        st.subheader(tr("Co-occurence des mots dans une phrase"))
         if (Langue == 'Anglais'):
             graphe_co_occurence(txt_split_en[:1000],corpus_en)
         else:
             graphe_co_occurence(txt_split_fr[:1000],corpus_fr)
     with tab5:
+        st.subheader(tr("Proximité sémantique des mots (Word Embedding)") )
+        st.markdown(tr(
             """
             MUSE est une bibliothèque Python pour l'intégration de mots multilingues, qui fournit
             notamment des "Word Embedding" multilingues
             Facebook fournit des dictionnaires de référence. Ces embeddings sont des embeddings fastText Wikipedia pour 30 langues qui ont été alignés dans un espace espace vectoriel unique.
             Dans notre cas, nous avons utilisé 2 mini-dictionnaires d'environ 3000 mots (Français et Anglais).
+            """)
+        )
+        st.markdown(tr(
+            """
             En novembre 2015, l'équipe de recherche de Facebook a créé fastText qui est une extension de la bibliothèque word2vec.
             Elle s'appuie sur Word2Vec en apprenant des représentations vectorielles pour chaque mot et les n-grammes trouvés dans chaque mot.
+            """)
         )
         st.write("")
         proximite()

tabs/exploration_tab.py CHANGED Viewed

@@ -6,13 +6,21 @@ import collections
 from nltk.tokenize import word_tokenize
 from nltk import download
 from ast import literal_eval
-# import contextlib
-# import re
-# import nltk
-# from nltk.corpus import stopwords
 title = "Exploration et Preprocessing"
 sidebar_name = "Exploration et Preprocessing"
 # Indiquer si l'on veut enlever les stop words. C'est un processus long
 stopwords_to_do = True
@@ -29,10 +37,12 @@ if ((first_line+max_lines)>137860):
 # Nombre maximum de ligne à afficher pour les DataFrame
 max_lines_to_display = 50
 download('punkt')
-# nltk.download('averaged_perceptron_tagger')
-# nltk.download('stopwords')
 @st.cache_data
 def load_data(path):
@@ -67,23 +77,25 @@ def load_preprocessed_data(path,data_type):
             data=data2
         return data
-# @st.cache_data(ttl='1h00s')
 def load_all_preprocessed_data(lang):
-    txt             =load_preprocessed_data('data/preprocess_txt_'+lang,0)
-    txt_split       = load_preprocessed_data('data/preprocess_txt_split_'+lang,3)
-    txt_lem         = load_preprocessed_data('data/preprocess_txt_lem_'+lang,0)
-    txt_wo_stopword = load_preprocessed_data('data/preprocess_txt_wo_stopword_'+lang,0)
-    df_count_word   = pd.concat([load_preprocessed_data('data/preprocess_df_count_word1_'+lang,1), load_preprocessed_data('data/preprocess_df_count_word2_'+lang,1)])
     return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word
 #Chargement des textes complet dans les 2 langues
-full_txt_en = load_data('data/small_vocab_en')
-full_txt_fr = load_data('data/small_vocab_fr')
-# Chargement du résultat du préprocessing
-_ , full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
-_ , full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
-"""
 def remove_stopwords(text, lang):
     stop_words = set(stopwords.words(lang))
     # stop_words will contain  set all english stopwords
@@ -245,7 +257,7 @@ def preprocess_txt (data, lang):
     txt_n_unique_val=  pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)
     return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
- """
 def count_world(data):
     word_count = collections.Counter()
@@ -269,47 +281,45 @@ def display_preprocess_results(lang, data, data_split, data_lem, data_wosw, txt_
     txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)
     # Affichage du nombre de mot en fonction du pré-processing réalisé
-    tab1, tab2, tab3, tab4 = st.tabs(["Résumé", "Tokenisation","Lemmatisation", "Sans Stopword"])
     with tab1:
-        st.subheader("Résumé du pré-processing")
-        st.write("**Nombre de phrases                     : "+str(nb_phrases)+"**")
-        st.write("**Nombre de mots                        : "+str(nb_mots)+"**")
-        st.write("**Nombre de mots uniques                : "+str(nb_mots_uniques)+"**")
         st.write("")
-        st.write("\n**Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):**")
         st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
     with tab2:
-        st.subheader("Tokenisation")
-        st.write('Texte "splited":')
         st.dataframe(pd.DataFrame(data=data_split, index=range(first_line,last_line)).head(max_lines_to_display).fillna(''), width=800)
-        st.write("**Nombre de mots uniques                : "+str(nb_mots_uniques)+"**")
         st.write("")
-        st.write("\n**Mots uniques:**")
         st.markdown(corpus[:500])
-        st.write("\n**Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):**")
         st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
     with tab3:
-        st.subheader("Lemmatisation")
         if lemmatize_to_do:
-            st.dataframe(pd.DataFrame(data=data_lem,columns=['Texte lemmatisé'],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
             # Si langue anglaise, affichage du taggage des mots
-            """
-            if lang == 'en':
-                for i in range(min(5,len(data))):
-                    s = str(nltk.pos_tag(data_split[i]))
-                    st.markdown("**Texte avec Tags     "+str(i)+"** : "+s)
-            """
-            st.write("**Nombre de mots uniques lemmatisés     : "+str(nb_mots_lem)+"**")
             st.write("")
-            st.write("\n**Mots uniques lemmatisés:**")
             st.markdown(mots_lem[:500])
     with tab4:
-        st.subheader("Sans Stopword")
         if stopwords_to_do:
             st.dataframe(pd.DataFrame(data=data_wosw,columns=['Texte sans stopwords'],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
-            st.write("**Nombre de mots uniques sans stop words: "+str(nb_mots_wo_stopword)+"**")
             st.write("")
-            st.write("\n**Mots uniques sans stop words:**")
             st.markdown(mots_wo_sw[:500])
@@ -319,40 +329,40 @@ def run():
     global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr
     st.write("")
-    st.title(title)
-    st.write("## **Explications :**\n")
-    st.markdown(
         """
         Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
         Lors de notre projet, nous avons étudié le dataset small_vocab, proposés par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
         Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
         :red[**Small_vocab**] contient 137 860 phrases en anglais et français.
         Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
         Ces taches sont, par exemple:
-        * le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)
-        * la :red[**tokenisation**] (découpage du texte en mots)
-        * la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les "variations" d'un même mot)
-        * l'élimination des :red[**mots "transparents**"] (sans utilité pour la compréhension, tels que les articles).
-        Ce prétraintement se conclut avec la contruction d'un :red[**Bag Of Worlds**], c'est à dire une matrice qui compte le nombre d'apparition de chaque mots (colonne) dans chaque phrase (ligne)
-        """
-    )
     #
-    st.write("## **Paramètres :**\n")
-    Langue = st.radio('Langue:',('Anglais','Français'), horizontal=True)
-    first_line = st.slider('No de la premiere ligne à analyser:',0,137859)
-    max_lines = st.select_slider('Nombre de lignes à analyser:',
                               options=[1,5,10,15,100, 500, 1000,'Max'])
     if max_lines=='Max':
         max_lines=137860
     if ((first_line+max_lines)>137860):
         max_lines = max(137860-first_line,0)
-    # if ((max_lines-first_line)>1000):
-    #     lemmatize_to_do = True
-    # else:
-    #     lemmatize_to_do = False
     last_line = first_line+max_lines
     if (Langue=='Anglais'):
@@ -361,62 +371,50 @@ def run():
         st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
     st.write("")
-    # Chargement du résultat du préprocessing (max lignes = max_lines)
     txt_en = full_txt_en[first_line:last_line]
-    txt_split_en = full_txt_split_en[first_line:last_line]
-    txt_lem_en = full_txt_lem_en[first_line:last_line]
-    txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
-    df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
     txt_fr = full_txt_fr[first_line:last_line]
-    txt_split_fr = full_txt_split_fr[first_line:last_line]
-    txt_lem_fr = full_txt_lem_fr[first_line:last_line]
-    txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
-    df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]
     # Lancement du préprocessing du texte qui va spliter nettoyer les phrases et les spliter en mots
     # et calculer nombre d'occurences des mots dans chaque phrase
     if (Langue == 'Anglais'):
-        st.write("## **Préprocessing de small_vocab_en :**\n")
         if max_lines>10000:
             with st.status(":sunglasses:", expanded=True):
-                # txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en  = preprocess_txt (txt_en,'en')
                 display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
         else:
-            # txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en  = preprocess_txt (txt_en,'en')
             display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
     else:
-        st.write("## **Préprocessing de small_vocab_fr :**\n")
         if max_lines>10000:
             with st.status(":sunglasses:", expanded=True):
-                # txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr  = preprocess_txt (txt_fr,'fr')
                 display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
         else:
-            # txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr  = preprocess_txt (txt_fr,'fr')
             display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
-    # Might be used later....
-    # DEFAULT_TEXT = """Google was founded in September 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University in California. Together they own about 14 percent of its shares and control 56 percent of the stockholder voting power through supervoting stock. They incorporated Google as a California privately held company on September 4, 1998, in California. Google was then reincorporated in Delaware on October 22, 2002."""
-    """
-    spacy_model = "en_core_web_sm"
-    text = st.text_area("Text to analyze", DEFAULT_TEXT, height=200)
-    doc = spacy_streamlit.process_text(spacy_model, text)
-    spacy_streamlit.visualize_ner(
-        doc,
-        labels=["PERSON", "DATE", "GPE"],
-        show_table=False,
-        title="Persons, dates and locations",
-        )
-    st.text(f"Analyzed using spaCy model {spacy_model}")
-    """
-    # models = ["en_core_web_sm"]
-    # default_text = "Google was founded in September 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University in California. Together they own about 14 percent of its shares and control 56 percent of the stockholder voting power through supervoting stock. They incorporated Google as a California privately held company on September 4, 1998, in California. Google was then reincorporated in Delaware on October 22, 2002."
-    # spacy_streamlit.visualize(models, default_text)

 from nltk.tokenize import word_tokenize
 from nltk import download
 from ast import literal_eval
+from translate_app import tr
+if st.session_state.Cloud == 0:
+    import nltk
+    import contextlib
+    import re
+    from nltk.corpus import stopwords
+    import warnings
+    warnings.filterwarnings('ignore')
+# from PIL import Image
+# import time
+# import random
 title = "Exploration et Preprocessing"
 sidebar_name = "Exploration et Preprocessing"
+dataPath = st.session_state.DataPath
 # Indiquer si l'on veut enlever les stop words. C'est un processus long
 stopwords_to_do = True
 # Nombre maximum de ligne à afficher pour les DataFrame
 max_lines_to_display = 50
 download('punkt')
+if st.session_state.Cloud == 0:
+    download('averaged_perceptron_tagger')
+    with contextlib.redirect_stdout(open(os.devnull, "w")):
+        download('stopwords')
 @st.cache_data
 def load_data(path):
             data=data2
         return data
+@st.cache_data
 def load_all_preprocessed_data(lang):
     # Load every saved preprocessing artefact for the language suffix `lang`
     # (called below with 'en' and 'fr') from files located under dataPath.
     # The second argument of load_preprocessed_data (0, 3 or 1) selects how the
     # file is read; its exact meaning is defined in load_preprocessed_data,
     # which is not visible here — TODO confirm.
     # Returns a 5-tuple: (txt, txt_split, txt_lem, txt_wo_stopword, df_count_word).
     # The result is memoized by st.cache_data.
+    txt             =load_preprocessed_data(dataPath+'/preprocess_txt_'+lang,0)
+    txt_split       = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang,3)
+    txt_lem         = load_preprocessed_data(dataPath+'/preprocess_txt_lem_'+lang,0)
+    txt_wo_stopword = load_preprocessed_data(dataPath+'/preprocess_txt_wo_stopword_'+lang,0)
     # The word-count table is stored in two files (…word1_… and …word2_…)
     # that are concatenated back into a single object here.
+    df_count_word   = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang,1), load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang,1)])
     return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word
 # Load the complete texts in both languages
+full_txt_en = load_data(dataPath+'/small_vocab_en')
+full_txt_fr = load_data(dataPath+'/small_vocab_fr')
+# Load the saved preprocessing results when st.session_state.reCalcule is False
 # NOTE: in that case full_txt_en / full_txt_fr assigned just above are rebound
 # to the first element returned by load_all_preprocessed_data.
+if not st.session_state.reCalcule:
+    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
+    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
 def remove_stopwords(text, lang):
     stop_words = set(stopwords.words(lang))
     # stop_words will contain  set all english stopwords
     txt_n_unique_val=  pd.DataFrame(columns=corpus,index=range(nb_phrases), data=countvectors.todense()).astype(float)
     return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length
 def count_world(data):
     word_count = collections.Counter()
     txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)
     # Affichage du nombre de mot en fonction du pré-processing réalisé
+    tab1, tab2, tab3, tab4 = st.tabs([tr("Résumé"), tr("Tokenisation"),tr("Lemmatisation"), tr("Sans Stopword")])
     with tab1:
+        st.subheader(tr("Résumé du pré-processing"))
+        st.write("**"+tr("Nombre de phrases")+"                     : "+str(nb_phrases)+"**")
+        st.write("**"+tr("Nombre de mots")+"                        : "+str(nb_mots)+"**")
+        st.write("**"+tr("Nombre de mots uniques")+"                : "+str(nb_mots_uniques)+"**")
         st.write("")
+        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
         st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
     with tab2:
+        st.subheader(tr("Tokenisation"))
+        st.write(tr('Texte "splited":'))
         st.dataframe(pd.DataFrame(data=data_split, index=range(first_line,last_line)).head(max_lines_to_display).fillna(''), width=800)
+        st.write("**"+tr("Nombre de mots uniques")+"                : "+str(nb_mots_uniques)+"**")
         st.write("")
+        st.write("\n**"+tr("Mots uniques")+":**")
         st.markdown(corpus[:500])
+        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
         st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
     with tab3:
+        st.subheader(tr("Lemmatisation"))
         if lemmatize_to_do:
+            st.dataframe(pd.DataFrame(data=data_lem,columns=[tr('Texte lemmatisé')],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
             # Si langue anglaise, affichage du taggage des mots
+            # if lang == 'en':
+            #     for i in range(min(5,len(data))):
+            #         s = str(nltk.pos_tag(data_split[i]))
+            #         st.markdown("**Texte avec Tags     "+str(i)+"** : "+s)
+            st.write("**"+tr("Nombre de mots uniques lemmatisés")+"     : "+str(nb_mots_lem)+"**")
             st.write("")
+            st.write("\n**"+tr("Mots uniques lemmatisés:")+"**")
             st.markdown(mots_lem[:500])
     with tab4:
+        st.subheader(tr("Sans Stopword"))
         if stopwords_to_do:
             st.dataframe(pd.DataFrame(data=data_wosw,columns=['Texte sans stopwords'],index=range(first_line,last_line)).head(max_lines_to_display), width=800)
+            st.write("**"+tr("Nombre de mots uniques sans stop words")+": "+str(nb_mots_wo_stopword)+"**")
             st.write("")
+            st.write("\n**"+tr("Mots uniques sans stop words")+":**")
             st.markdown(mots_wo_sw[:500])
     global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr
     st.write("")
+    st.title(tr(title))
+    st.write("## **"+tr("Explications")+" :**\n")
+    st.markdown(tr(
         """
         Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
         Lors de notre projet, nous avons étudié le dataset small_vocab, proposés par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
         Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
         :red[**Small_vocab**] contient 137 860 phrases en anglais et français.
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
         Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
         Ces taches sont, par exemple:
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(
+        "* "+tr("le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)")+"\n"+ \
+        "* "+tr("la :red[**tokenisation**] (découpage du texte en mots)")+"\n"+ \
+        "* "+tr("la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les \"variations\" d'un même mot)")+"\n"+ \
+        "* "+tr("l'élimination des :red[**mots \"transparents\"**] (sans utilité pour la compréhension, tels que les articles).")+"  \n"+ \
+        tr("Ce prétraintement se conclut avec la contruction d'un :red[**Bag Of Worlds**], c'est à dire une matrice qui compte le nombre d'apparition de chaque mots (colonne) dans chaque phrase (ligne)")
+    , unsafe_allow_html=True)
     #
+    st.write("## **"+tr("Paramètres")+" :**\n")
+    Langue = st.radio(tr('Langue:'),('Anglais','Français'), horizontal=True)
+    first_line = st.slider(tr('No de la premiere ligne à analyser:'),0,137859)
+    max_lines = st.select_slider(tr('Nombre de lignes à analyser:'),
                               options=[1,5,10,15,100, 500, 1000,'Max'])
     if max_lines=='Max':
         max_lines=137860
     if ((first_line+max_lines)>137860):
         max_lines = max(137860-first_line,0)
     last_line = first_line+max_lines
     if (Langue=='Anglais'):
         st.dataframe(pd.DataFrame(data=full_txt_fr,columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
     st.write("")
+    # Chargement des textes sélectionnés dans les 2 langues (max lignes = max_lines)
     txt_en = full_txt_en[first_line:last_line]
     txt_fr = full_txt_fr[first_line:last_line]
+    # Elimination des phrases non traduites
+    # txt_en, txt_fr = clean_untranslated_sentence(txt_en, txt_fr)
+    if not st.session_state.reCalcule:
+        txt_split_en = full_txt_split_en[first_line:last_line]
+        txt_lem_en = full_txt_lem_en[first_line:last_line]
+        txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
+        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
+        txt_split_fr = full_txt_split_fr[first_line:last_line]
+        txt_lem_fr = full_txt_lem_fr[first_line:last_line]
+        txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
+        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]
     # Lancement du préprocessing du texte qui va spliter nettoyer les phrases et les spliter en mots
     # et calculer nombre d'occurences des mots dans chaque phrase
     if (Langue == 'Anglais'):
+        st.write("## **"+tr("Préprocessing de small_vocab_en")+" :**\n")
         if max_lines>10000:
             with st.status(":sunglasses:", expanded=True):
+                if st.session_state.reCalcule:
+                    txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en  = preprocess_txt (txt_en,'en')
                 display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
         else:
+            if st.session_state.reCalcule:
+                txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en,sent_len_en, sent_wo_sw_len_en, sent_lem_len_en  = preprocess_txt (txt_en,'en')
             display_preprocess_results('en',txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
     else:
+        st.write("## **"+tr("Préprocessing de small_vocab_fr")+" :**\n")
         if max_lines>10000:
             with st.status(":sunglasses:", expanded=True):
+                if st.session_state.reCalcule:
+                    txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr  = preprocess_txt (txt_fr,'fr')
                 display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
         else:
+            if st.session_state.reCalcule:
+                txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr,sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr  = preprocess_txt (txt_fr,'fr')
             display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)

tabs/game_tab.py CHANGED Viewed

@@ -10,17 +10,20 @@ import csv
 from extra_streamlit_components import tab_bar, TabBarItemData
 import matplotlib.pyplot as plt
 from datetime import datetime
 title = "Jouez avec nous !"
 sidebar_name = "Jeu"
 @st.cache_data
 def init_game():
     new = int(time.time())
-    sentence_test = pd.read_csv('data/multilingue/sentence_test_extract.csv')
     sentence_test = sentence_test[4750:]
     # Lisez le contenu du fichier JSON
-    with open('data/multilingue/lan_to_language.json', 'r') as fichier:
         lan_to_language = json.load(fichier)
     t_now = time.time()
     return sentence_test, lan_to_language, new, t_now
@@ -65,16 +68,16 @@ def calc_score(n_rep,duration):
     return s
 def read_leaderboard():
-    return pd.read_csv('data/game_leaderboard.csv', index_col=False,encoding='utf8')
 def write_leaderboard(lb):
     lb['Nom'] = lb['Nom'].astype(str)
     lb['Rang'] = lb['Rang'].astype(int)
-    lb.to_csv(path_or_buf='data/game_leaderboard.csv',columns=['Rang','Nom','Score','Timestamp','BR','Duree'],index=False, header=True,encoding='utf8')
 def display_leaderboard():
     lb = read_leaderboard()
-    st.write("**Leaderboard :**")
     list_champ = """
         | Rang | Nom        | Score |
         |------|------------|-------|"""
@@ -86,38 +89,56 @@ def display_leaderboard():
     return lb
 def write_log(TS,Nom,Score,BR,Duree):
-    log = pd.read_csv('data/game_log.csv', index_col=False,encoding='utf8')
     date_heure = datetime.fromtimestamp(TS)
     Date = date_heure.strftime('%Y-%m-%d %H:%M:%S')
     log = pd.concat([log, pd.DataFrame(data={'Date':[Date], 'Nom':[Nom],'Score':[Score],'BR':[BR],'Duree':[Duree]})], ignore_index=True)
-    log.to_csv(path_or_buf='data/game_log.csv',columns=['Date','Nom','Score','BR','Duree'],index=False, header=True,encoding='utf8')
 def display_files():
-    log = pd.read_csv('data/game_log.csv', index_col=False,encoding='utf8')
-    lb = pd.read_csv('data/game_leaderboard.csv', index_col=False,encoding='utf8')
     st.dataframe(lb)
     st.dataframe(log)
 def run():
     global sentence_test, lan_to_language
     sentence_test, lan_to_language, new, t_debut = init_game()
     st.write("")
-    st.title(title)
-    st.write("#### **Etes vous un expert es Langues ?**\n")
-    st.markdown(
         """
         Essayer de trouvez, sans aide, la langue des 5 phrases suivantes.
         Attention : Vous devez être le plus rapide possible !
-        """, unsafe_allow_html=True
         )
     st.write("")
-    player_name = st.text_input("Quel est votre nom ?")
     if player_name == 'display_files':
         display_files()
         return
     score = 0
     col1, col2 = st.columns([0.7,0.3])
@@ -133,7 +154,7 @@ def run():
         t_previous_debut = t_debut
         t_debut = time.time()
-        if st.button(label="Valider", type="primary"):
             st.cache_data.clear()
             nb_bonnes_reponses = 0
@@ -147,21 +168,21 @@ def run():
             score = calc_score(nb_bonnes_reponses,duration)
             write_log(time.time(),player_name,score,nb_bonnes_reponses,duration)
             if nb_bonnes_reponses >=4:
-                st.write(":red[**Félicitations, vous avez "+str(nb_bonnes_reponses)+" bonnes réponses !**]")
-                st.write(":red[Votre score est de "+str(score)+" points]")
             else:
                 if nb_bonnes_reponses >1 : s="s"
                 else: s=""
-                st.write("**:red[Vous avez "+str(nb_bonnes_reponses)+" bonne"+s+" réponse"+s+".]**")
                 if nb_bonnes_reponses >0 : s="s"
                 else: s=""
-                st.write(":red[Votre score est de "+str(score)+" point"+s+"]")
-            st.write("Bonne réponses:")
             for i in range(5):
                 st.write("- "+sentence_test['sentence'].iloc[sent_sel[i]]+" -> :blue[**"+lan_to_language[sentence_test['lan_code'].iloc[sent_sel[i]]]+"**]")
                 new = int(time.time())
-            st.button(label="Play again ?", type="primary")
             with col2:
                 now = time.time()

 from extra_streamlit_components import tab_bar, TabBarItemData
 import matplotlib.pyplot as plt
 from datetime import datetime
+import tracemalloc
+from translate_app import tr
 title = "Jouez avec nous !"
 sidebar_name = "Jeu"
+dataPath = st.session_state.DataPath
 @st.cache_data
 def init_game():
     # Build the game's initial state. The result is memoized by st.cache_data,
     # so the timestamps below stay the same until the cache is cleared.
     # Returns (sentence_test, lan_to_language, new, t_now).
     # `new` is the current time truncated to whole seconds.
     new = int(time.time())
+    sentence_test = pd.read_csv(dataPath+'/multilingue/sentence_test_extract.csv')
     # Keep only the rows from position 4750 onwards.
     sentence_test = sentence_test[4750:]
     # Read the content of the JSON file
     # NOTE(review): open() is called without an explicit encoding, so the
     # platform default is used — consider encoding='utf-8' if the file may
     # contain non-ASCII characters.
+    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
         lan_to_language = json.load(fichier)
     t_now = time.time()
     return sentence_test, lan_to_language, new, t_now
     return s
 def read_leaderboard():
     # Return the leaderboard read from <dataPath>/game_leaderboard.csv (UTF-8)
     # as a pandas DataFrame; index_col=False keeps the first column as data.
+    return pd.read_csv(dataPath+'/game_leaderboard.csv', index_col=False,encoding='utf8')
 def write_leaderboard(lb):
     # Save the leaderboard DataFrame `lb` to <dataPath>/game_leaderboard.csv.
     # NOTE: the two casts below assign back into `lb`, so the caller's
     # DataFrame is modified in place.
     lb['Nom'] = lb['Nom'].astype(str)
     lb['Rang'] = lb['Rang'].astype(int)
     # Only the listed columns are written, in this order, with a header row
     # and without the index.
+    lb.to_csv(path_or_buf=dataPath+'/game_leaderboard.csv',columns=['Rang','Nom','Score','Timestamp','BR','Duree'],index=False, header=True,encoding='utf8')
 def display_leaderboard():
     lb = read_leaderboard()
+    st.write("**"+tr("Leaderboard")+" :**")
     list_champ = """
         | Rang | Nom        | Score |
         |------|------------|-------|"""
     return lb
 def write_log(TS,Nom,Score,BR,Duree):
     # Append one row to <dataPath>/game_log.csv: the whole file is read,
     # the new row is concatenated, and the file is rewritten.
     # TS is passed to datetime.fromtimestamp, i.e. it is a POSIX timestamp;
     # no tz is given, so the formatted date is in local time.
     # Nom, Score, BR and Duree are stored as-is in the columns of the same name.
+    log = pd.read_csv(dataPath+'/game_log.csv', index_col=False,encoding='utf8')
     date_heure = datetime.fromtimestamp(TS)
     Date = date_heure.strftime('%Y-%m-%d %H:%M:%S')
     log = pd.concat([log, pd.DataFrame(data={'Date':[Date], 'Nom':[Nom],'Score':[Score],'BR':[BR],'Duree':[Duree]})], ignore_index=True)
+    log.to_csv(path_or_buf=dataPath+'/game_log.csv',columns=['Date','Nom','Score','BR','Duree'],index=False, header=True,encoding='utf8')
 def display_files():
     # Show the raw game files in the page: the leaderboard first, then the log.
     # Both CSV files are read directly from dataPath on every call.
+    log = pd.read_csv(dataPath+'/game_log.csv', index_col=False,encoding='utf8')
+    lb = pd.read_csv(dataPath+'/game_leaderboard.csv', index_col=False,encoding='utf8')
     st.dataframe(lb)
     st.dataframe(log)
 def run():
     global sentence_test, lan_to_language
     sentence_test, lan_to_language, new, t_debut = init_game()
     st.write("")
+    st.title(tr(title))
+    st.write("#### **"+tr("Etes vous un expert es Langues ?")+"**\n")
+    st.markdown(tr(
         """
         Essayer de trouvez, sans aide, la langue des 5 phrases suivantes.
         Attention : Vous devez être le plus rapide possible !
+        """), unsafe_allow_html=True
         )
     st.write("")
+    player_name = st.text_input(tr("Quel est votre nom ?"))
     if player_name == 'display_files':
         display_files()
         return
+    elif player_name == 'malloc_start':
+        tracemalloc.start(30)
+        return
+    elif player_name == 'malloc_stop':
+        snapshot = tracemalloc.take_snapshot()
+        top_stats = snapshot.statistics('traceback')
+        # pick the biggest memory block
+        for k in range(3):
+            stat = top_stats[k]
+            print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
+            for line in stat.traceback.format():
+                print(line)
+        total_mem = sum(stat.size for stat in top_stats)
+        print("Total allocated size: %.1f KiB" % (total_mem / 1024))
+        return
     score = 0
     col1, col2 = st.columns([0.7,0.3])
         t_previous_debut = t_debut
         t_debut = time.time()
+        if st.button(label=tr("Valider"), type="primary"):
             st.cache_data.clear()
             nb_bonnes_reponses = 0
             score = calc_score(nb_bonnes_reponses,duration)
             write_log(time.time(),player_name,score,nb_bonnes_reponses,duration)
             if nb_bonnes_reponses >=4:
+                st.write(":red[**"+tr("Félicitations, vous avez "+str(nb_bonnes_reponses)+" bonnes réponses !")+"**]")
+                st.write(":red["+tr("Votre score est de "+str(score)+" points")+"]")
             else:
                 if nb_bonnes_reponses >1 : s="s"
                 else: s=""
+                st.write("**:red["+tr("Vous avez "+str(nb_bonnes_reponses)+" bonne"+s+" réponse"+s+".")+"]**")
                 if nb_bonnes_reponses >0 : s="s"
                 else: s=""
+                st.write(":red["+tr("Votre score est de "+str(score)+" point"+s)+"]")
+            st.write(tr("Bonne réponses")+":")
             for i in range(5):
                 st.write("- "+sentence_test['sentence'].iloc[sent_sel[i]]+" -> :blue[**"+lan_to_language[sentence_test['lan_code'].iloc[sent_sel[i]]]+"**]")
                 new = int(time.time())
+            st.button(label=tr("Play again ?"), type="primary")
             with col2:
                 now = time.time()

tabs/id_lang_tab.py CHANGED Viewed

@@ -20,9 +20,11 @@ from sklearn.decomposition import PCA
 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn import naive_bayes
 title = "Identification de langue"
 sidebar_name = "Identification de langue"
 # CountVectorizer a une liste de phrase en entrée.
 # Cette fonction met les données d'entrée dans le bon format
@@ -47,7 +49,7 @@ def create_BOW(data):
 def load_vectorizer(tokenizer):
     global dict_token, dict_ids, nb_token
-    path = 'data/vectorizer_tiktoken_big.pkl'
     vectorizer = joblib.load(path)
     dict_token = {tokenizer.decode([cle]): cle for cle, valeur in vectorizer.vocabulary_.items()}
     dict_ids = {cle: tokenizer.decode([cle]) for cle, valeur in vectorizer.vocabulary_.items()} #dict_ids.items()}
@@ -67,11 +69,11 @@ def init_nb_identifier():
     tokenizer = tiktoken.get_encoding("cl100k_base")
     # Chargement du classificateur sauvegardé
-    clf_nb = joblib.load("data/id_lang_tiktoken_nb_sparse_big.pkl")
     vectorizer = load_vectorizer(tokenizer)
     # Lisez le contenu du fichier JSON
-    with open('data/multilingue/lan_to_language.json', 'r') as fichier:
         lan_to_language = json.load(fichier)
     return tokenizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb, vectorizer
@@ -84,7 +86,7 @@ def encode_text(textes):
 def read_list_lan():
     # Return the first row of data/multilingue/lan_code.csv as a list of strings
     # (one entry per comma-separated field); later rows, if any, are ignored.
     # NOTE(review): next(reader) raises StopIteration if the file is empty.
-    with open('data/multilingue/lan_code.csv', 'r') as fichier_csv:
         reader = csv.reader(fichier_csv)
         lan_code = next(reader)
         return lan_code
@@ -96,8 +98,8 @@ def init_dl_identifier():
     list_lan = read_list_lan()
     lan_identified = [lan_to_language[l] for l in list_lan]
     label_encoder.fit(list_lan)
-    merge = Merge("data/dl_id_lang_split",  "data", "dl_tiktoken_id_language_model.h5").merge(cleanup=False)
-    dl_model = keras.models.load_model("data/dl_tiktoken_id_language_model.h5")
     return dl_model, label_encoder, list_lan, lan_identified
 def lang_id_dl(sentences):
@@ -117,7 +119,7 @@ def init_lang_id_external():
     lang_id_model_ext = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
     dict_xlmr  = {"ar":"ara", "bg":"bul", "de":"deu", "el": "ell", "en":"eng", "es":"spa", "fr":"fra", "hi": "hin","it":"ita","ja":"jpn", \
                   "nl":"nld", "pl":"pol", "pt":"por", "ru":"rus", "sw":"swh", "th":"tha", "tr":"tur", "ur": "urd", "vi":"vie", "zh":"cmn"}
-    sentence_test = pd.read_csv('data//multilingue/sentence_test_extract.csv')
     sentence_test = sentence_test[:4750]
     # Instanciation d'un exemple
     exemples = ["Er weiß überhaupt nichts über dieses Buch",                               # Phrase 0
@@ -135,8 +137,8 @@ def init_lang_id_external():
     return lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples
 @st.cache_data
-def display_acp():
-    data = np.load('data/data_lang_id_acp.npz')
     X_train_scaled = data['X_train_scaled']
     y_train_pred = data['y_train_pred']
     label_arrow = ['.', ',', '?', ' a', ' de', ' la', ' que', 'Tom', ' un', ' the', ' in', \
@@ -167,7 +169,7 @@ def display_acp():
     plt.rc("xtick", labelsize=14)  # Taille des étiquettes de l'axe des x
     plt.rc("ytick", labelsize=14)  # Taille des étiquettes de l'axe des y
-    st.write("Affichage de 10 000 phrases (points) et des 50 tokens les + utilisés (flèches)")
     st.write("")
     fig = plt.figure(figsize=(20, 15))
     sns.scatterplot(x='PC1', y='PC2', hue='Langue', data=finalDF, alpha=0.5)
@@ -175,7 +177,7 @@ def display_acp():
         plt.arrow(0, 0, coeff[i, 0]*1.5, coeff[i, 1]*0.8,color='k', alpha=0.08, head_width=0.01, )
         plt.text(coeff[i, 0]*1.5, coeff[i, 1] * 0.8, label_arrow[i], color='k', weight='bold')
-    plt.title("Importance des principaux tokens dans\nl'identification de langue par l'algorithme Naive Bayes")
     plt.xlim(-0.4, 0.45)
     plt.ylim(-0.15, 0.28);
     st.pyplot(fig)
@@ -183,7 +185,7 @@ def display_acp():
 @st.cache_data
 def read_BOW_examples():
     # Return the content of data/lang_id_small_BOW.csv as a pandas DataFrame;
     # the result is memoized by st.cache_data.
-    return pd.read_csv('data/lang_id_small_BOW.csv')
 def analyse_nb(sel_phrase):
     global lang_exemples,exemples
@@ -199,9 +201,9 @@ def analyse_nb(sel_phrase):
             if sb[i] > 0: nb_unique_token +=1
         return sb, nb_unique_token
-    st.write("#### **Probabilité d'appartenance de la phrase à une langue :**")
     st.image("./assets/formule_proba_naive_bayes.png")
-    st.write("où **C** est la classe (lan_code), **Fi** est la caractéristique i du BOW, **Z** est l'\"evidence\" servant à regulariser la proba")
     st.write("")
     nb_lang = 5
     lan_code = ['deu','eng','fra','spa','ita']
@@ -214,14 +216,14 @@ def analyse_nb(sel_phrase):
     nb_phrases_lang =[]
     for l in lan_code:
         nb_phrases_lang.append(sum(df_BOW['lan_code']==l))
-    st.write("Phrase à analyser :",'**:'+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase],']** - **"'+exemples[sel_phrase]+'"**')
     # Tokenisation et encodage de la phrase
     encodage = tokenizer.encode(exemples[sel_phrase])
     # Création du vecteur BOW de la phrase
     bow_exemple,  nb_unique_token = create_small_BOW(exemples[sel_phrase])
-    st.write("Nombre de tokens retenus dans le BOW: "+ str(nb_unique_token))
     masque_tokens_retenus = [(1 if token in list(dict_ids.keys()) else 0) for token in encodage]
     str_token = " "
     for i in range(len(encodage)):
@@ -232,7 +234,7 @@ def analyse_nb(sel_phrase):
                 str_token += "**:violet["+tokenizer.decode([encodage[i]])+"]** "
         else: str_token += ":green["+tokenizer.decode([encodage[i]])+"] "
-    st.write("Tokens se trouvant dans le modèle (en :red[rouge] ou :violet[violet]) :"+str_token+" ")
     st.write("")
     # Afin de continuer l'analyse on ne garde que les token de la phrase disponibles dans le BOW
@@ -251,7 +253,7 @@ def analyse_nb(sel_phrase):
     col_name = [str(i+1)+'-'+tokenizer.decode([int(token_used[i])]) for i in range(len(token_used))]
     df_count = pd.DataFrame(data=votes,columns=token_used, index=lan_code)
     df_count.columns = col_name
-    st.write("\n**Nombre d'apparitions des tokens, dans chaque langue**")
     # Lissage de Laplace n°1 (Laplace smoothing )
     # df_count = df_count+1
@@ -284,7 +286,7 @@ def analyse_nb(sel_phrase):
     df_proba['Proba'] = df_proba['Proba'].round(3)
     # Affichage de la matrice des probabilités
-    st.write("**Probabilités conditionnelles d'apparition des tokens retenus, dans chaque langue:**")
     st.dataframe(df_proba)
     str_token = "Lang proba max: "#&nbsp;"*20
     for i,token in enumerate(df_proba.columns[:-1]):
@@ -292,17 +294,17 @@ def analyse_nb(sel_phrase):
     st.write(str_token)
     st.write("")
-    st.write("Langue réelle de la phrase"+"&nbsp;"*35+": **:"+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase]+']**')
-    st.write("Langue dont la probabilité est la plus forte "+": **:"+lan_color[df_proba['Proba'].idxmax()]+'['+df_proba['Proba'].idxmax(),"]** (proba={:.2f}".format(max(df_proba['Proba']))+")")
     prediction = clf_nb2.predict([bow_exemple])
-    st.write("Langue prédite par Naiva Bayes"+"&nbsp;"*23+": **:"+lan_color[prediction[0]]+'['+prediction[0]+"]** (proba={:.2f}".format(max(clf_nb2.predict_proba([bow_exemple])[0]))+")")
     st.write("")
     fig, axs = plt.subplots(1, 2, figsize=(10, 6))
     df_proba_sorted =df_proba.sort_index(ascending=True)
-    axs[0].set_title("Probabilités calculée manuellement", fontsize=12)
     axs[0].barh(df_proba_sorted.index, df_proba_sorted['Proba'])
-    axs[1].set_title("Probabilités du classifieur Naive Bayes", fontsize=12)
     axs[1].barh(df_proba_sorted.index, clf_nb2.predict_proba([bow_exemple])[0]);
     st.pyplot(fig)
     return
@@ -313,9 +315,9 @@ def find_exemple(lang_sel):
     return exemples[lang_sel]
 def display_shapley(lang_sel):
     # Display two pre-rendered images for the selection `lang_sel`, each
     # preceded by a bold caption. `lang_sel` is converted with str() and
     # inserted into the asset file names. Returns None.
-    st.write("**Analyse de l'importance de chaque token dans l'identification de la langue**")
     st.image('assets/fig_schapley'+str(lang_sel)+'.png')
-    st.write("**Recapitulatif de l'influence des tokens sur la selection de la langue**")
     st.image('assets/fig_schapley_recap'+str(lang_sel)+'.png')
     return
@@ -330,54 +332,58 @@ def run():
     lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples= init_lang_id_external()
     st.write("")
-    st.title(title)
-    st.write("## **Explications :**\n")
-    st.markdown(
         """
-        Afin de mettre en oeuvre cette fonctionnalité nous avons utilisé un jeu d'entrainement multilinge de **9.757.778 phrases** dans **95 langues**.
-        Les 95 langues identifiées sont:
         """
-    )
-    st.selectbox(label="",options=sorted(lan_identified))
-    st.markdown("""
         Nous avons utilisé 2 méthodes pour identifier la langue d'un texte:
         1. un classificateur **Naïve Bayes**
         2. un modèle de **Deep Learning**
         Les 2 modèles ont un accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
         <br>
-        """
         , unsafe_allow_html=True)
     chosen_id = tab_bar(data=[
-        TabBarItemData(id="tab1", title="Id. Naïve Bayes", description="avec le Bag Of Words"),
-        TabBarItemData(id="tab2", title="Id. Deep Learning", description=" avec Keras"),
-        TabBarItemData(id="tab3", title="Interpretabilité", description="du modèle Naïve Bayes ")],
         default="tab1")
     if (chosen_id == "tab1") or (chosen_id == "tab2"):
-        st.write("## **Paramètres :**\n")
-        toggle_val = st.toggle('Phrase à saisir/Phrase test', value=True, help="Off = phrase à saisir, On = selection d'une phrase test parmi 9500 phraseq")
         if toggle_val:
-            custom_sentence= st.selectbox("Selectionnez une phrases test à identifier:", sentence_test['sentence'] )
         else:
-            custom_sentence = st.text_area(label="Saisir le texte dont vous souhaitez identifier la langue:")
-            st.button(label="Valider", type="primary")
         if custom_sentence!='':
-            st.write("## **Résultats :**\n")
             md = """
-                |Identifieur                          |Langue détectée|
                 |-------------------------------------|---------------|"""
             md1 = ""
             if toggle_val:
                 lan_reelle = sentence_test['lan_code'].loc[sentence_test['sentence']==custom_sentence].tolist()[0]
                 md1 = """
-                |Langue réelle                        |**:blue["""+lan_to_language[lan_reelle]+"""]**|"""
             md2 = """
-                |Classificateur Naïve Bayes           |**:red["""+lang_id_nb(custom_sentence)+"""]**|
-                |Modèle de Deep Learning           |**:red["""+lang_id_dl(custom_sentence)+"""]**|"""
             md3 = """
                 |XLM-RoBERTa (Hugging Face)           |**:red["""+lan_to_language[dict_xlmr[lang_id_model_ext(custom_sentence)[0]['label']]]+"""]**|"""
             if toggle_val:
@@ -386,64 +392,88 @@ def run():
             st.markdown(md+md1+md2+md3, unsafe_allow_html=True)
-        st.write("## **Details sur la méthode :**\n")
         if (chosen_id == "tab1"):
-            st.markdown(
                 """
-                Afin d'utiliser le classificateur Naïve Bayes, il nous a fallu:
-                - Créer un Bag of Words de token..
-                - ..Tokeniser le texte d'entrainement avec CountVectorizer et un tokenizer 'custom', **Tiktoken** d'OpenAI.
-                - Utiliser des matrices creuses (Sparse Matrix), car notre BOW contenait 10 M de lignes x 59122 tokens.
-                - Sauvegarder le vectorizer (non serialisable) et le classificateur entrainé.
                 L'execution de toutes ces étapes est assez rapide: une dizaine de minutes
                 <br>
                 Le résultat est très bon: L'Accuracy sur le jeu de test est =
                 **:red[96%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp)
                 <br>
                 **Note 1:** Les 2 modèles ont un accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
                 **Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = **97,8%**,
                 versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
-                """
             , unsafe_allow_html=True)
         else:
-            st.markdown(
                 """
-                Nous avons mis en oeuvre un modèle Keras avec une couche d'embedding et 4 couches denses *(Voir architecture ci-dessous)*.
-                Nous avons utilisé le tokeniser **Tiktoken** d'OpenAI.
                 La couche d'embedding accepte 250 tokens, ce qui signifie que la détection de langue s'effectue sur approximativement les 200 premiers mots.
                 <br>
                 L'entrainement a duré plus de 10 heures..
                 Finalement, le résultat est très bon: L'Accuracy sur le jeu de test est =
                 **:red[97,5%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp).
                 Néanmoins, la durée pour une prédiction est relativement longue: approximativement 5/100 de seconde
                 <br>
-                **Note 1:** Les 2 modèles ont un accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
-                **Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = **97,8%**,
                 versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
                 <br>
-                """
                 , unsafe_allow_html=True)
-            st.write("<center><h5>Architecture du modèle utilisé:</h5></center>", unsafe_allow_html=True)
             plot_model(dl_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file='./assets/model_plot.png')
             col1, col2, col3 = st.columns([0.15,0.7,0.15])
             with col2:
                  st.image('./assets/model_plot.png',use_column_width="auto")
     elif (chosen_id == "tab3"):
-        st.write("### **Interpretabilité du classifieur Naïve Bayes sur 5 langues**")
-        st.write("##### ..et un Training set réduit (15000 phrases et 94 tokens)")
         st.write("")
         chosen_id2 = tab_bar(data=[
-            TabBarItemData(id="tab1", title="Analyse en Compos. Princ.", description=""),
-            TabBarItemData(id="tab2", title="Simul. calcul NB", description=""),
-            TabBarItemData(id="tab3", title="Shapley", description="")],
             default="tab1")
         if (chosen_id2 == "tab1"):
-            display_acp()
         if (chosen_id2 == "tab2") or (chosen_id2 == "tab3"):
-            sel_phrase = st.selectbox('Selectionnez une phrase à "interpréter":', range(9), format_func=find_exemple)
             if (chosen_id2 == "tab2"):
                 analyse_nb(sel_phrase)
             if (chosen_id2 == "tab3"):
                 display_shapley(sel_phrase)

 import matplotlib.pyplot as plt
 import seaborn as sns
 from sklearn import naive_bayes
+from translate_app import tr
 title = "Identification de langue"
 sidebar_name = "Identification de langue"
+dataPath = st.session_state.DataPath
 # CountVectorizer a une liste de phrase en entrée.
 # Cette fonction met les données d'entrée dans le bon format
 def load_vectorizer(tokenizer):
     global dict_token, dict_ids, nb_token
+    path = dataPath+'/vectorizer_tiktoken_big.pkl'
     vectorizer = joblib.load(path)
     dict_token = {tokenizer.decode([cle]): cle for cle, valeur in vectorizer.vocabulary_.items()}
     dict_ids = {cle: tokenizer.decode([cle]) for cle, valeur in vectorizer.vocabulary_.items()} #dict_ids.items()}
     tokenizer = tiktoken.get_encoding("cl100k_base")
     # Chargement du classificateur sauvegardé
+    clf_nb = joblib.load(dataPath+"/id_lang_tiktoken_nb_sparse_big.pkl")
     vectorizer = load_vectorizer(tokenizer)
     # Lisez le contenu du fichier JSON
+    with open(dataPath+'/multilingue/lan_to_language.json', 'r') as fichier:
         lan_to_language = json.load(fichier)
     return tokenizer, dict_token, dict_ids, nb_token, lan_to_language, clf_nb, vectorizer
 def read_list_lan():
+    with open(dataPath+'/multilingue/lan_code.csv', 'r') as fichier_csv:
         reader = csv.reader(fichier_csv)
         lan_code = next(reader)
         return lan_code
     list_lan = read_list_lan()
     lan_identified = [lan_to_language[l] for l in list_lan]
     label_encoder.fit(list_lan)
+    merge = Merge(dataPath+"/dl_id_lang_split",  "../data", "dl_tiktoken_id_language_model.h5").merge(cleanup=False)
+    dl_model = keras.models.load_model(dataPath+"/dl_tiktoken_id_language_model.h5")
     return dl_model, label_encoder, list_lan, lan_identified
 def lang_id_dl(sentences):
     lang_id_model_ext = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
     dict_xlmr  = {"ar":"ara", "bg":"bul", "de":"deu", "el": "ell", "en":"eng", "es":"spa", "fr":"fra", "hi": "hin","it":"ita","ja":"jpn", \
                   "nl":"nld", "pl":"pol", "pt":"por", "ru":"rus", "sw":"swh", "th":"tha", "tr":"tur", "ur": "urd", "vi":"vie", "zh":"cmn"}
+    sentence_test = pd.read_csv(dataPath+'//multilingue/sentence_test_extract.csv')
     sentence_test = sentence_test[:4750]
     # Instanciation d'un exemple
     exemples = ["Er weiß überhaupt nichts über dieses Buch",                               # Phrase 0
     return lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples
 @st.cache_data
+def display_acp(title, comment):
+    data = np.load(dataPath+'/data_lang_id_acp.npz')
     X_train_scaled = data['X_train_scaled']
     y_train_pred = data['y_train_pred']
     label_arrow = ['.', ',', '?', ' a', ' de', ' la', ' que', 'Tom', ' un', ' the', ' in', \
     plt.rc("xtick", labelsize=14)  # Taille des étiquettes de l'axe des x
     plt.rc("ytick", labelsize=14)  # Taille des étiquettes de l'axe des y
+    st.write(comment)
     st.write("")
     fig = plt.figure(figsize=(20, 15))
     sns.scatterplot(x='PC1', y='PC2', hue='Langue', data=finalDF, alpha=0.5)
         plt.arrow(0, 0, coeff[i, 0]*1.5, coeff[i, 1]*0.8,color='k', alpha=0.08, head_width=0.01, )
         plt.text(coeff[i, 0]*1.5, coeff[i, 1] * 0.8, label_arrow[i], color='k', weight='bold')
+    plt.title(title)
     plt.xlim(-0.4, 0.45)
     plt.ylim(-0.15, 0.28);
     st.pyplot(fig)
 @st.cache_data
 def read_BOW_examples():
+    return pd.read_csv(dataPath+'/lang_id_small_BOW.csv')
 def analyse_nb(sel_phrase):
     global lang_exemples,exemples
             if sb[i] > 0: nb_unique_token +=1
         return sb, nb_unique_token
+    st.write("#### **"+tr("Probabilité d'appartenance de la phrase à une langue")+" :**")
     st.image("./assets/formule_proba_naive_bayes.png")
+    st.write(tr("où **C** est la classe (lan_code), **Fi** est la caractéristique i du BOW, **Z** est l'\"evidence\" servant à regulariser la probabilité"))
     st.write("")
     nb_lang = 5
     lan_code = ['deu','eng','fra','spa','ita']
     nb_phrases_lang =[]
     for l in lan_code:
         nb_phrases_lang.append(sum(df_BOW['lan_code']==l))
+    st.write(tr("Phrase à analyser")+" :",'**:'+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase],']** - **"'+exemples[sel_phrase]+'"**')
     # Tokenisation et encodage de la phrase
     encodage = tokenizer.encode(exemples[sel_phrase])
     # Création du vecteur BOW de la phrase
     bow_exemple,  nb_unique_token = create_small_BOW(exemples[sel_phrase])
+    st.write(tr("Nombre de tokens retenus dans le BOW")+": "+ str(nb_unique_token))
     masque_tokens_retenus = [(1 if token in list(dict_ids.keys()) else 0) for token in encodage]
     str_token = " "
     for i in range(len(encodage)):
                 str_token += "**:violet["+tokenizer.decode([encodage[i]])+"]** "
         else: str_token += ":green["+tokenizer.decode([encodage[i]])+"] "
+    st.write(tr("Tokens se trouvant dans le modèle (en")+" :red["+tr("rouge")+"] "+tr("ou")+" :violet["+tr("violet")+"]) :"+str_token+" ")
     st.write("")
     # Afin de continuer l'analyse on ne garde que les token de la phrase disponibles dans le BOW
     col_name = [str(i+1)+'-'+tokenizer.decode([int(token_used[i])]) for i in range(len(token_used))]
     df_count = pd.DataFrame(data=votes,columns=token_used, index=lan_code)
     df_count.columns = col_name
+    st.write("\n**"+tr("Nombre d'apparitions des tokens, dans chaque langue")+"**")
     # Lissage de Laplace n°1 (Laplace smoothing )
     # df_count = df_count+1
     df_proba['Proba'] = df_proba['Proba'].round(3)
     # Affichage de la matrice des probabilités
+    st.write("**"+tr("Probabilités conditionnelles d'apparition des tokens retenus, dans chaque langue")+":**")
     st.dataframe(df_proba)
     str_token = "Lang proba max: "#&nbsp;"*20
     for i,token in enumerate(df_proba.columns[:-1]):
     st.write(str_token)
     st.write("")
+    st.write(tr("Langue réelle de la phrase")+"&nbsp;"*35+": **:"+lan_color[lang_exemples[sel_phrase]]+'['+lang_exemples[sel_phrase]+']**')
+    st.write(tr("Langue dont la probabilité est la plus forte ")+": **:"+lan_color[df_proba['Proba'].idxmax()]+'['+df_proba['Proba'].idxmax(),"]** (proba={:.2f}".format(max(df_proba['Proba']))+")")
     prediction = clf_nb2.predict([bow_exemple])
+    st.write(tr("Langue prédite par Naiva Bayes")+"&nbsp;"*23+": **:"+lan_color[prediction[0]]+'['+prediction[0]+"]** (proba={:.2f}".format(max(clf_nb2.predict_proba([bow_exemple])[0]))+")")
     st.write("")
     fig, axs = plt.subplots(1, 2, figsize=(10, 6))
     df_proba_sorted =df_proba.sort_index(ascending=True)
+    axs[0].set_title(tr("Probabilités calculée manuellement"), fontsize=12)
     axs[0].barh(df_proba_sorted.index, df_proba_sorted['Proba'])
+    axs[1].set_title(tr("Probabilités du classifieur Naive Bayes"), fontsize=12)
     axs[1].barh(df_proba_sorted.index, clf_nb2.predict_proba([bow_exemple])[0]);
     st.pyplot(fig)
     return
     return exemples[lang_sel]
 def display_shapley(lang_sel):
+    st.write("**"+tr("Analyse de l'importance de chaque token dans l'identification de la langue")+"**")
     st.image('assets/fig_schapley'+str(lang_sel)+'.png')
+    st.write("**"+tr("Recapitulatif de l'influence des tokens sur la selection de la langue")+"**")
     st.image('assets/fig_schapley_recap'+str(lang_sel)+'.png')
     return
     lang_id_model_ext, dict_xlmr, sentence_test, lang_exemples, exemples= init_lang_id_external()
     st.write("")
+    st.title(tr(title))
+    st.write("## **"+tr("Explications")+" :**\n")
+    st.markdown(tr(
         """
+        Afin de mettre en oeuvre cette fonctionnalité nous avons utilisé un jeu d'entrainement multilinge de <b> 9.757.778 phrases dans 95 langues</b>.
+        Les 95 langues identifiées sont:
+        """)
+    , unsafe_allow_html=True)
+    st.selectbox(label="Lang",options=sorted(lan_identified),label_visibility="hidden")
+    st.markdown(tr(
         """
         Nous avons utilisé 2 méthodes pour identifier la langue d'un texte:
         1. un classificateur **Naïve Bayes**
         2. un modèle de **Deep Learning**
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
         Les 2 modèles ont un accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
         <br>
+        """)
         , unsafe_allow_html=True)
     chosen_id = tab_bar(data=[
+        TabBarItemData(id="tab1", title=tr("Id. Naïve Bayes"), description=tr("avec le Bag Of Words")),
+        TabBarItemData(id="tab2", title=tr("Id. Deep Learning"), description=tr(" avec Keras")),
+        TabBarItemData(id="tab3", title=tr("Interpretabilité"), description=tr("du modèle Naïve Bayes "))],
         default="tab1")
     if (chosen_id == "tab1") or (chosen_id == "tab2"):
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        toggle_val = st.toggle(tr('Phrase à saisir/Phrase test'), value=True, help=tr("Off = phrase à saisir, On = selection d'une phrase test parmi 9500 phrases"))
         if toggle_val:
+            custom_sentence= st.selectbox(tr("Selectionnez une phrases test à identifier")+":", sentence_test['sentence'] )
         else:
+            custom_sentence = st.text_area(label=tr("Saisir le texte dont vous souhaitez identifier la langue:"))
+            st.button(label=tr("Valider"), type="primary")
         if custom_sentence!='':
+            st.write("## **"+tr("Résultats")+" :**\n")
             md = """
+                |"""+tr("Identifieur")+"""                          |"""+tr("Langue identifiée")+"""|
                 |-------------------------------------|---------------|"""
             md1 = ""
             if toggle_val:
                 lan_reelle = sentence_test['lan_code'].loc[sentence_test['sentence']==custom_sentence].tolist()[0]
                 md1 = """
+                |"""+tr("Langue réelle")+"""                        |**:blue["""+lan_to_language[lan_reelle]+"""]**|"""
             md2 = """
+                |"""+tr("Classificateur Naïve Bayes")+"""           |**:red["""+lang_id_nb(custom_sentence)+"""]**|
+                |"""+tr("Modèle de Deep Learning")+"""           |**:red["""+lang_id_dl(custom_sentence)+"""]**|"""
             md3 = """
                 |XLM-RoBERTa (Hugging Face)           |**:red["""+lan_to_language[dict_xlmr[lang_id_model_ext(custom_sentence)[0]['label']]]+"""]**|"""
             if toggle_val:
             st.markdown(md+md1+md2+md3, unsafe_allow_html=True)
+        st.write("## **"+tr("Details sur la méthode")+" :**\n")
         if (chosen_id == "tab1"):
+            st.markdown(tr(
+                """
+                Afin d'utiliser le classificateur Naïve Bayes, il nous a fallu:""")+"\n"+
+                "* "+tr("Créer un Bag of Words de token..")+"\n"+
+                "* "+tr("..Tokeniser le texte d'entrainement avec CountVectorizer et un tokenizer 'custom', **Tiktoken** d'OpenAI.  ")+"\n"+
+                "* "+tr("Utiliser des matrices creuses (Sparse Matrix), car notre BOW contenait 10 Millions de lignes x 59122 tokens.  ")+"\n"+
+                "* "+tr("Sauvegarder le vectorizer (non serialisable) et le classificateur entrainé.  ")
+            , unsafe_allow_html=True)
+            st.markdown(tr(
                 """
                 L'execution de toutes ces étapes est assez rapide: une dizaine de minutes
                 <br>
                 Le résultat est très bon: L'Accuracy sur le jeu de test est =
                 **:red[96%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp)
                 <br>
+                """)
+            , unsafe_allow_html=True)
+            st.markdown(tr(
+                """
                 **Note 1:** Les 2 modèles ont un accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**
                 **Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = **97,8%**,
                 versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
+                """)
             , unsafe_allow_html=True)
         else:
+            st.markdown(tr(
                 """
+                Nous avons mis en oeuvre un modèle Keras avec une couche d'embedding et 4 couches denses (*Voir architecture ci-dessous*).
+                Nous avons utilisé le tokeniser <b>Tiktoken</b> d'OpenAI.
                 La couche d'embedding accepte 250 tokens, ce qui signifie que la détection de langue s'effectue sur approximativement les 200 premiers mots.
                 <br>
+                """)
+            , unsafe_allow_html=True)
+            st.markdown(tr(
+                """
                 L'entrainement a duré plus de 10 heures..
                 Finalement, le résultat est très bon: L'Accuracy sur le jeu de test est =
                 **:red[97,5%]** sur les 95 langues, et **:red[99,1%]** sur les 5 langues d'Europe de l'Ouest (en,fr,de,it,sp).
                 Néanmoins, la durée pour une prédiction est relativement longue: approximativement 5/100 de seconde
                 <br>
+                """)
+                , unsafe_allow_html=True)
+            st.markdown(tr(
+                """
+                **Note 1:** Les 2 modèles ont un accuracy similaire sur le jeu de test: **:red[96% pour NB et 97,5% pour DL]**""")+"<br>"+
+                tr("""
+                **Note 2:** Le modèle *XLM-RoBERTa* de Hugging Face (qui identifie 20 langues seulement) a une accuracy, sur notre jeu de test = <b>97,8%</b>,
                 versus **99,3% pour NB** et **99,2% pour DL** sur ces 20 langues.
                 <br>
+                """)
                 , unsafe_allow_html=True)
+            st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5></center>", unsafe_allow_html=True)
             plot_model(dl_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file='./assets/model_plot.png')
             col1, col2, col3 = st.columns([0.15,0.7,0.15])
             with col2:
                  st.image('./assets/model_plot.png',use_column_width="auto")
     elif (chosen_id == "tab3"):
+        st.write("### **"+tr("Interpretabilité du classifieur Naïve Bayes sur 5 langues")+"**")
+        st.write("##### "+tr("..et un Training set réduit (15000 phrases et 94 tokens)"))
         st.write("")
         chosen_id2 = tab_bar(data=[
+            TabBarItemData(id="tab1", title=tr("Analyse en Compos. Princ."), description=""),
+            TabBarItemData(id="tab2", title=tr("Simul. calcul NB"), description=""),
+            TabBarItemData(id="tab3", title=tr("Shapley"), description="")],
             default="tab1")
         if (chosen_id2 == "tab1"):
+            display_acp(tr("Importance des principaux tokens dans \n l'identification de langue par l'algorithme Naive Bayes"),tr("Affichage de 10 000 phrases (points) et des 50 tokens les + utilisés (flèches)"))
         if (chosen_id2 == "tab2") or (chosen_id2 == "tab3"):
+            sel_phrase = st.selectbox(tr('Selectionnez une phrase à "interpréter"')+':', range(9), format_func=find_exemple)
             if (chosen_id2 == "tab2"):
                 analyse_nb(sel_phrase)
             if (chosen_id2 == "tab3"):
                 display_shapley(sel_phrase)

tabs/intro.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import streamlit as st
 title = "Démosthène"
 sidebar_name = "Introduction"
@@ -15,51 +14,80 @@ def run():
     # st.image("assets/tough-communication.gif",use_column_width=True)
     st.write("")
-    st.image("https://media.tenor.com/pfOeAfytY98AAAAC/miss-honey-glasses-off.gif",use_column_width=True)
-    st.title(title)
     st.markdown('''
-                ## **Système de traduction adapté aux lunettes connectées**
                 ---
                 ''')
-    st.header("**A propos**")
-    st.markdown(
         """
         Ce projet a été réalisé dans le cadre d’une formation de Data Scientist, entre juin et novembre 2023.
         <br>
         :red[**Démosthène**] est l'un des plus grands orateurs de l'Antiquité. Il savait s’exprimer,  et se faire comprendre.
         Se faire comprendre est l’un des principaux objectifs de la traduction.
         Démosthène avait de gros problèmes d’élocution.
         Il les a surmontés en s’entraînant à parler avec des cailloux dans la bouche.
         À l’image de l’Intelligence Artificielle,  où des entraînements sont nécessaires pour obtenir de bons résultats.
         Il nous a semblé pertinent de donner le nom de cet homme à un projet qu’il a fort bien illustré, il y a 2300 ans.
-        """
     , unsafe_allow_html=True)
-    st.header("**Contexte**")
-    st.markdown(
         """
         Les personnes malentendantes communiquent difficilement avec autrui. Par ailleurs, toute personne se trouvant dans un pays étranger
         dont il ne connaît pas la langue se retrouve dans la situation d’une personne malentendante.
         L’usage de lunettes connectées, dotées de la technologie de reconnaissance vocale et d’algorithmes IA de deep learning, permettrait
         de détecter la voix d’un interlocuteur, puis d’afficher la transcription textuelle, sur les verres en temps réel.
         À partir de cette transcription, il est possible d’:red[**afficher la traduction dans la langue du porteur de ces lunettes**].
-        """
-    )
-    st.header("**Objectifs**")
-    st.markdown(
         """
         L’objectif de ce projet est de développer une brique technologique de traitement, de transcription et de traduction,
         qui par la suite serait implémentable dans des lunettes connectées. Nous avons concentré nos efforts sur la construction
         d’un :red[**système de traduction**] plutôt que sur la reconnaissance vocale,
         et ce, pour tout type de public, afin de faciliter le dialogue entre deux individus ne pratiquant pas la même langue.
-        Il est bien sûr souhaitable que le système puisse rapidement :red[**identifier la langue**] des phrases fournies.
         Lors de la traduction, nous ne prendrons pas en compte le contexte des phrases précédentes ou celles préalablement traduites.
-        Nous évaluerons la qualité de nos résultats en les comparant avec des systèmes performants tels que “[Google translate](https://translate.google.fr/)”
-        Le projet est enregistré sur [Github](https://github.com/Demosthene-OR/AVR23_CDS_Text_translation)
         """
-    )

 import streamlit as st
+from translate_app import tr
 title = "Démosthène"
 sidebar_name = "Introduction"
     # st.image("assets/tough-communication.gif",use_column_width=True)
     st.write("")
+    if st.session_state.Cloud == 0:
+        st.image("assets/miss-honey-glasses-off.gif",use_column_width=True)
+    else:
+        st.image("https://media.tenor.com/pfOeAfytY98AAAAC/miss-honey-glasses-off.gif",use_column_width=True)
+    st.title(tr(title))
     st.markdown('''
+                ## **'''+tr("Système de traduction adapté aux lunettes connectées")+'''**
                 ---
                 ''')
+    st.header("**"+tr("A propos")+"**")
+    st.markdown(tr(
         """
         Ce projet a été réalisé dans le cadre d’une formation de Data Scientist, entre juin et novembre 2023.
         <br>
         :red[**Démosthène**] est l'un des plus grands orateurs de l'Antiquité. Il savait s’exprimer,  et se faire comprendre.
         Se faire comprendre est l’un des principaux objectifs de la traduction.
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
         Démosthène avait de gros problèmes d’élocution.
         Il les a surmontés en s’entraînant à parler avec des cailloux dans la bouche.
         À l’image de l’Intelligence Artificielle,  où des entraînements sont nécessaires pour obtenir de bons résultats.
         Il nous a semblé pertinent de donner le nom de cet homme à un projet qu’il a fort bien illustré, il y a 2300 ans.
+        """)
     , unsafe_allow_html=True)
+    st.header("**"+tr("Contexte")+"**")
+    st.markdown(tr(
         """
         Les personnes malentendantes communiquent difficilement avec autrui. Par ailleurs, toute personne se trouvant dans un pays étranger
         dont il ne connaît pas la langue se retrouve dans la situation d’une personne malentendante.
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
         L’usage de lunettes connectées, dotées de la technologie de reconnaissance vocale et d’algorithmes IA de deep learning, permettrait
         de détecter la voix d’un interlocuteur, puis d’afficher la transcription textuelle, sur les verres en temps réel.
         À partir de cette transcription, il est possible d’:red[**afficher la traduction dans la langue du porteur de ces lunettes**].
+        """)
+    , unsafe_allow_html=True)
+    st.header("**"+tr("Objectifs")+"**")
+    st.markdown(tr(
         """
         L’objectif de ce projet est de développer une brique technologique de traitement, de transcription et de traduction,
         qui par la suite serait implémentable dans des lunettes connectées. Nous avons concentré nos efforts sur la construction
         d’un :red[**système de traduction**] plutôt que sur la reconnaissance vocale,
         et ce, pour tout type de public, afin de faciliter le dialogue entre deux individus ne pratiquant pas la même langue.
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
+        Il est bien sûr souhaitable que le système puisse rapidement :red[**identifier la langue**] des phrases fournies.
         Lors de la traduction, nous ne prendrons pas en compte le contexte des phrases précédentes ou celles préalablement traduites.
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
+        Nous évaluerons la qualité de nos résultats en les comparant avec des systèmes performants tels que “[Google translate](https://translate.google.fr/)”
+        """)
+    , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
+        Le projet est enregistré sur "[Github](https://github.com/Demosthene-OR/AVR23_CDS_Text_translation)"
+        """)
+    , unsafe_allow_html=True)
+    '''
+    sent = \
+        """
         """
+    st.markdown(tr(sent), unsafe_allow_html=True)
+    '''

tabs/modelisation_dict_tab.py CHANGED Viewed

@@ -3,12 +3,15 @@ import pandas as pd
 import numpy as np
 import os
 from sacrebleu import corpus_bleu
-# from sklearn.cluster import KMeans
-# from sklearn.neighbors import KNeighborsClassifier
-# from sklearn.ensemble import RandomForestClassifier
 title = "Traduction mot à mot"
 sidebar_name = "Traduction mot à mot"
 @st.cache_data
 def load_corpus(path):
@@ -19,15 +22,7 @@ def load_corpus(path):
         data=data[:-1]
     return pd.DataFrame(data)
-df_data_en = load_corpus('data/preprocess_txt_en')
-df_data_fr = load_corpus('data/preprocess_txt_fr')
-n1 = 0
-"""
-nb_mots_en = 199 # len(corpus_en)
-nb_mots_fr = 330 # len(corpus_fr)
-# @st.cache_data(ttl='1h00s')
 def load_BOW(path, l):
     input_file = os.path.join(path)
     df1 = pd.read_csv(input_file+'1_'+l, encoding="utf-8", index_col=0)
@@ -35,10 +30,11 @@ def load_BOW(path, l):
     df_count_word  = pd.concat([df1, df2])
     return df_count_word
-df_count_word_en = load_BOW('../data/preprocess_df_count_word', 'en')
-df_count_word_fr = load_BOW('../data/preprocess_df_count_word', 'fr')
-"""
 def accuracy(dict_ref,dict):
     correct_words = 0
@@ -51,122 +47,122 @@ def accuracy(dict_ref,dict):
     print(correct_words," mots corrects / ",min(dict.shape[1],dict_ref.shape[1]))
     return correct_words/min(dict.shape[1],dict_ref.shape[1])
-"""
-# On modifie df_count_word en indiquant la présence d'un mot par 1 (au lieu du nombre d'occurences)
-df_count_word_en = df_count_word_en[df_count_word_en==0].fillna(1)
-df_count_word_fr = df_count_word_fr[df_count_word_fr==0].fillna(1)
-# On triche un peu parce que new et jersey sont toujours dans la même phrase et donc dans la même classe
-if ('new' in df_count_word_en.columns):
-    df_count_word_en['new']=df_count_word_en['new']*2
-    df_count_word_fr['new']=df_count_word_fr['new']*2
-# ============
-def calc_kmeans(l_src,l_tgt):
-    global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
-    # Algorithme de K-means
-    init_centroids = df_count_word_tgt.T
-    kmeans = KMeans(n_clusters = nb_mots_tgt, n_init=1, max_iter=1, init=init_centroids, verbose=0)
-    kmeans.fit(df_count_word_tgt.T)
-    # Centroids and labels
-    centroids= kmeans.cluster_centers_
-    labels = kmeans.labels_
-    # Création et affichage du dictionnaire
-    df_dic = pd.DataFrame(data=df_count_word_tgt.columns[kmeans.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
-    df_dic.index.name= l_src
-    df_dic = df_dic.T
-    # print("Dictionnaire Anglais -> Français:")
-    # translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR'] =round(accuracy(dict_EN_FR_ref,dict_EN_FR)*100, 2)
-    # print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR']}%")
-    # display(dict_EN_FR)
-    return df_dic
-def calc_knn(l_src,l_tgt, metric):
-    global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
-    #Définition de la metrique (pour les 2 dictionnaires
-    knn_metric = metric   # minkowski, cosine, chebyshev, manhattan, euclidean
-    # Algorithme de KNN
-    X_train = df_count_word_tgt.T
-    y_train = range(nb_mots_tgt)
-    # Création du classifieur et construction du modèle sur les données d'entraînement
-    knn = KNeighborsClassifier(n_neighbors=1, metric=knn_metric)
-    knn.fit(X_train, y_train)
-    # Création et affichage du dictionnaire
-    df_dic = pd.DataFrame(data=df_count_word_tgt.columns[knn.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
-    df_dic.index.name = l_src
-    df_dic = df_dic.T
-    # print("Dictionnaire Anglais -> Français:")
-    # translation_quality['Précision du dictionnaire'].loc['KNN EN->FR'] =round(accuracy(dict_EN_FR_ref,knn_dict_EN_FR)*100, 2)
-    # print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['KNN EN->FR']}%")
-    # display(knn_dict_EN_FR)
-    return df_dic
-def calc_rf(l_src,l_tgt):
-    # Algorithme de Random Forest
-    X_train = df_count_word_tgt.T
-    y_train = range(nb_mots_tgt)
-    # Création du classifieur et construction du modèle sur les données d'entraînement
-    rf = RandomForestClassifier(n_jobs=-1, random_state=321)
-    rf.fit(X_train, y_train)
-    # Création et affichage du dictionnaire
-    df_dic = pd.DataFrame(data=df_count_word_tgt.columns[rf.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
-    df_dic.index.name= l_src
-    df_dic = df_dic.T
-    # print("Dictionnaire Anglais -> Français:")
-    # translation_quality['Précision du dictionnaire'].loc['RF EN->FR'] = round(accuracy(dict_EN_FR_ref,rf_dict_EN_FR)*100, 2)
-    # print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['RF EN->FR']}%")
-    # display(rf_dict_EN_FR)
-    return df_dic
-def calcul_dic(Lang,Algo,Metrique):
-    if Lang[:2]=='en':
-        l_src = 'Anglais'
-        l_tgt = 'Francais'
-    else:
-        l_src = 'Francais'
-        l_tgt = 'Anglais'
-    if Algo=='Manuel':
-        df_dic = pd.read_csv('../data/dict_ref_'+Lang+'.csv',header=0,index_col=0, encoding ="utf-8", sep=';',keep_default_na=False).T.sort_index(axis=1)
-    elif Algo=='KMeans':
-         df_dic = calc_kmeans(l_src,l_tgt)
-    elif Algo=='KNN':
-        df_dic = calc_knn(l_src,l_tgt, Metrique)
-    elif Algo=='Random Forest':
-         df_dic = calc_rf(l_src,l_tgt)
-    else:
-        df_dic = pd.read_csv('../data/dict_ref_'+Lang+'.csv',header=0,index_col=0, encoding ="utf-8", sep=';',keep_default_na=False).T.sort_index(axis=1)
-    return df_dic
-"""
-def load_dic(Lang,Algo,Metrique):
-    Algo = Algo.lower()
-    if Algo=='random forest' : Algo = "rf"
-    else:
-        if Algo=='word embedding' : Algo = "we"
         else:
-            if Algo!='knn': Metrique = ''
-            else: Metrique = Metrique+'_'
-    input_file = os.path.join('data/dict_'+Algo+'_'+Metrique+Lang)
-    return pd.read_csv(input_file, encoding="utf-8", index_col=0).T.sort_index(axis=1)
-# ============
 def display_translation(n1,dict, Lang):
     global df_data_src, df_data_tgt, placeholder
@@ -186,79 +182,97 @@ def display_translation(n1,dict, Lang):
         st.write("**ref. :** "+s_trad_ref[i])
         st.write("")
     with placeholder:
-        st.write("<p style='text-align:center;background-color:red; color:white')>Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
                  unsafe_allow_html=True)
 def display_dic(df_dic):
     st.dataframe(df_dic.T, height=600)
 def run():
-    global n1, df_data_src, df_data_tgt, df_data_en, df_data_fr, placeholder # , df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
-    # global  nb_mots_en, df_count_word_en, df_count_word_fr, nb_mots_en, nb_mots_fr
     st.write("")
-    st.title(title)
     #
-    st.write("## **Explications :**\n")
-    st.markdown(
         """
         Dans une première approche naïve, nous avons implémenté un système de traduction mot à mot.
         Cette traduction est réalisée grâce à un dictionnaire qui associe un mot de la langue source à un mot de la langue cible, dans small_vocab
         Ce dictionnaire est calculé de 3 manières:
-        * :red[**Manuellement**] en choisissant pour chaque mot source le mot cible. Ceci nous a permis de définir un dictionnaire de référence
-        * Avec le :red[**Bag Of World**] (chaque mot dans la langue cible = une classe, BOW = features)
         """)
     st.image("assets/BOW.jpg",use_column_width=True)
     st.markdown(
-        """
-        * Avec le :red[**Word Embedding**], c'est à dire en associant chaque mot à un vecteur "sémantique" de dimensions=300, et en selectionnant le vecteur de langue cible
-        le plus proche du vecteur de langue source.
-        Enfin nous calculons:
-        * la :red[**précision**] du dictionnaire par rapport à notre dictionnaire de réference (manuel)
-        * le :red[**score BLEU**] ("BiLingual Evaluation Understudy"), qui mesure la précision de notre traduction par rapport à celle de notre corpus référence.
-        """
-    )
     #
-    st.write("## **Paramètres :**\n")
-    Sens = st.radio('Sens :',('Anglais -> Français','Français -> Anglais'), horizontal=True)
     Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
-    Algo = st.radio('Algorithme :',('Manuel', 'KMeans','KNN','Random Forest','Word Embedding'), horizontal=True)
     Metrique = ''
     if (Algo == 'KNN'):
-        Metrique = st.radio('Metrique:',('minkowski', 'cosine', 'chebyshev', 'manhattan', 'euclidean'), horizontal=True)
     if (Lang=='en_fr'):
         df_data_src = df_data_en
         df_data_tgt = df_data_fr
-        # df_count_word_src = df_count_word_en
-        # df_count_word_tgt = df_count_word_fr
-        # nb_mots_src = nb_mots_en
-        # nb_mots_tgt = nb_mots_fr
     else:
         df_data_src = df_data_fr
         df_data_tgt = df_data_en
-        # df_count_word_src = df_count_word_fr
-        # df_count_word_tgt = df_count_word_en
-        # nb_mots_src = nb_mots_fr
-        # nb_mots_tgt = nb_mots_en
     # df_data_src.columns = ['Phrase']
-    sentence1 = st.selectbox("Selectionnez la 1ere des 5 phrases à traduire avec le dictionnaire sélectionné", df_data_src.iloc[:-4],index=int(n1) )
     n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
-    df_dic = load_dic(Lang,Algo,Metrique)
-    df_dic_ref = load_dic(Lang,'Manuel',Metrique)
-    st.write("## **Dictionnaire calculé et traduction mot à mot :**\n")
     col1, col2 = st.columns([0.25, 0.75])
     with col1:
-        st.write("#### **Dictionnaire**")
         precision = int(round(accuracy(df_dic_ref,df_dic)*100, 0))
-        st.write("<p style='text-align:center;background-color:red; color:white')>Précision = {:2d}%</p>".format(precision), unsafe_allow_html=True)
         display_dic(df_dic)
     with col2:
-        st.write("#### **Traduction**")
         placeholder = st.empty()
         display_translation(n1, df_dic, Lang)

 import numpy as np
 import os
 from sacrebleu import corpus_bleu
+if st.session_state.Cloud == 0:
+    from sklearn.cluster import KMeans
+    from sklearn.neighbors import KNeighborsClassifier
+    from sklearn.ensemble import RandomForestClassifier
+from translate_app import tr
 title = "Traduction mot à mot"
 sidebar_name = "Traduction mot à mot"
+dataPath = st.session_state.DataPath
 @st.cache_data
 def load_corpus(path):
         data=data[:-1]
     return pd.DataFrame(data)
+@st.cache_data
 def load_BOW(path, l):
     input_file = os.path.join(path)
     df1 = pd.read_csv(input_file+'1_'+l, encoding="utf-8", index_col=0)
     df_count_word  = pd.concat([df1, df2])
     return df_count_word
+df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
+df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
+df_count_word_en = load_BOW(dataPath+'/preprocess_df_count_word', 'en')
+df_count_word_fr = load_BOW(dataPath+'/preprocess_df_count_word', 'fr')
+n1 = 0
 def accuracy(dict_ref,dict):
     correct_words = 0
     print(correct_words," mots corrects / ",min(dict.shape[1],dict_ref.shape[1]))
     return correct_words/min(dict.shape[1],dict_ref.shape[1])
+if st.session_state.reCalcule:  # the definitions below exist only when dictionaries are recomputed at runtime
+    nb_mots_en = 199 # number of English words, hard-coded (original note: len(corpus_en))
+    nb_mots_fr = 330 # number of French words, hard-coded (original note: len(corpus_fr))
+    # Binarize df_count_word: the presence of a word is marked with 1 (instead of its number of occurrences)
+    df_count_word_en = df_count_word_en[df_count_word_en==0].fillna(1)
+    df_count_word_fr = df_count_word_fr[df_count_word_fr==0].fillna(1)
+    # Small cheat: "new" and "jersey" always appear in the same sentence and would therefore end up in the same class
+    if ('new' in df_count_word_en.columns):
+        df_count_word_en['new']=df_count_word_en['new']*2  # doubling 'new' makes its column differ from 'jersey'
+        df_count_word_fr['new']=df_count_word_fr['new']*2  # NOTE(review): only the English frame is checked for 'new' - confirm the French one always has it
+    def calc_kmeans(l_src,l_tgt):
+        global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
+        # Algorithme de K-means
+        init_centroids = df_count_word_tgt.T
+        kmeans = KMeans(n_clusters = nb_mots_tgt, n_init=1, max_iter=1, init=init_centroids, verbose=0)
+        kmeans.fit(df_count_word_tgt.T)
+        # Centroids and labels
+        centroids= kmeans.cluster_centers_
+        labels = kmeans.labels_
+        # Création et affichage du dictionnaire
+        df_dic = pd.DataFrame(data=df_count_word_tgt.columns[kmeans.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
+        df_dic.index.name= l_src
+        df_dic = df_dic.T
+        # print("Dictionnaire Anglais -> Français:")
+        # translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR'] =round(accuracy(dict_EN_FR_ref,dict_EN_FR)*100, 2)
+        # print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['K-Means EN->FR']}%")
+        # display(dict_EN_FR)
+        return df_dic
+    def calc_knn(l_src,l_tgt, metric):
+        global df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt
+        #Définition de la metrique (pour les 2 dictionnaires
+        knn_metric = metric   # minkowski, cosine, chebyshev, manhattan, euclidean
+        # Algorithme de KNN
+        X_train = df_count_word_tgt.T
+        y_train = range(nb_mots_tgt)
+        # Création du classifieur et construction du modèle sur les données d'entraînement
+        knn = KNeighborsClassifier(n_neighbors=1, metric=knn_metric)
+        knn.fit(X_train, y_train)
+        # Création et affichage du dictionnaire
+        df_dic = pd.DataFrame(data=df_count_word_tgt.columns[knn.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
+        df_dic.index.name = l_src
+        df_dic = df_dic.T
+        # print("Dictionnaire Anglais -> Français:")
+        # translation_quality['Précision du dictionnaire'].loc['KNN EN->FR'] =round(accuracy(dict_EN_FR_ref,knn_dict_EN_FR)*100, 2)
+        # print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['KNN EN->FR']}%")
+        # display(knn_dict_EN_FR)
+        return df_dic
+    def calc_rf(l_src,l_tgt):
+        # Algorithme de Random Forest
+        X_train = df_count_word_tgt.T
+        y_train = range(nb_mots_tgt)
+        # Création du classifieur et construction du modèle sur les données d'entraînement
+        rf = RandomForestClassifier(n_jobs=-1, random_state=321)
+        rf.fit(X_train, y_train)
+        # Création et affichage du dictionnaire
+        df_dic = pd.DataFrame(data=df_count_word_tgt.columns[rf.predict(df_count_word_src.T)],index=df_count_word_src.T.index,columns=[l_tgt])
+        df_dic.index.name= l_src
+        df_dic = df_dic.T
+        # print("Dictionnaire Anglais -> Français:")
+        # translation_quality['Précision du dictionnaire'].loc['RF EN->FR'] = round(accuracy(dict_EN_FR_ref,rf_dict_EN_FR)*100, 2)
+        # print(f"Précision du dictionnaire = {translation_quality['Précision du dictionnaire'].loc['RF EN->FR']}%")
+        # display(rf_dict_EN_FR)
+        return df_dic
+    def calcul_dic(Lang,Algo,Metrique):
+        if Lang[:2]=='en':
+            l_src = 'Anglais'
+            l_tgt = 'Francais'
+        else:
+            l_src = 'Francais'
+            l_tgt = 'Anglais'
+        if Algo=='Manuel':
+            df_dic = pd.read_csv('../data/dict_ref_'+Lang+'.csv',header=0,index_col=0, encoding ="utf-8", sep=';',keep_default_na=False).T.sort_index(axis=1)
+        elif Algo=='KMeans':
+            df_dic = calc_kmeans(l_src,l_tgt)
+        elif Algo=='KNN':
+            df_dic = calc_knn(l_src,l_tgt, Metrique)
+        elif Algo=='Random Forest':
+            df_dic = calc_rf(l_src,l_tgt)
+        else:
+            df_dic = pd.read_csv('../data/dict_we_'+Lang,header=0,index_col=0, encoding ="utf-8", keep_default_na=False).T.sort_index(axis=1)
+        return df_dic
+else:
+    def load_dic(Lang,Algo,Metrique):
+        Algo = Algo.lower()
+        if Algo=='random forest' : Algo = "rf"
         else:
+            if Algo=='word embedding' : Algo = "we"
+            else:
+                if Algo!='knn': Metrique = ''
+                else: Metrique = Metrique+'_'
+        input_file = os.path.join(dataPath+'/dict_'+Algo+'_'+Metrique+Lang)
+        return pd.read_csv(input_file, encoding="utf-8", index_col=0).T.sort_index(axis=1)
 def display_translation(n1,dict, Lang):
     global df_data_src, df_data_tgt, placeholder
         st.write("**ref. :** "+s_trad_ref[i])
         st.write("")
     with placeholder:
+        st.write("<p style='text-align:center;background-color:red; color:white')>"+"Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
                  unsafe_allow_html=True)
 def display_dic(df_dic):  # Show the dictionary as a Streamlit dataframe widget
     st.dataframe(df_dic.T, height=600)  # transposed before display, fixed height of 600
+def save_dic(path, df_dic):
+    output_file = os.path.join(path)
+    df_dic.T.to_csv(output_file, encoding="utf-8")
+    return
 def run():
+    global df_data_src, df_data_tgt, df_count_word_src, df_count_word_tgt, nb_mots_src, nb_mots_tgt, n1, placeholder
+    global df_data_en, df_data_fr, nb_mots_en, df_count_word_en, df_count_word_fr, nb_mots_en, nb_mots_fr
     st.write("")
+    st.title(tr(title))
     #
+    st.write("## **"+tr("Explications")+" :**\n")
+    st.markdown(tr(
         """
         Dans une première approche naïve, nous avons implémenté un système de traduction mot à mot.
         Cette traduction est réalisée grâce à un dictionnaire qui associe un mot de la langue source à un mot de la langue cible, dans small_vocab
         Ce dictionnaire est calculé de 3 manières:
         """)
+    , unsafe_allow_html=True)
+    st.markdown(
+        "* "+tr(":red[**Manuellement**] en choisissant pour chaque mot source le mot cible. Ceci nous a permis de définir un dictionnaire de référence")+"\n"+ \
+        "* "+tr("Avec le :red[**Bag Of World**] (chaque mot dans la langue cible = une classe, BOW = features)")
+    , unsafe_allow_html=True)
     st.image("assets/BOW.jpg",use_column_width=True)
     st.markdown(
+        "* "+tr("Avec le :red[**Word Embedding**], c'est à dire en associant chaque mot à un vecteur \"sémantique\" de dimensions=300, et en selectionnant le vecteur de langue cible "
+        "le plus proche du vecteur de langue source.")+" \n\n"+
+        tr("Enfin nous calculons :")+"\n"+ \
+        "* "+tr("la :red[**précision**] du dictionnaire par rapport à notre dictionnaire de réference (manuel)")+"\n"+ \
+        "* "+tr("le ")+" :red[**score BLEU**] (\"BiLingual Evaluation Understudy\")"+tr(", qui mesure la précision de notre traduction par rapport à celle de notre corpus référence. ")
+    , unsafe_allow_html=True)
     #
+    st.write("## **"+tr("Paramètres ")+" :**\n")
+    Sens = st.radio(tr('Sens')+' :',('Anglais -> Français','Français -> Anglais'), horizontal=True)
     Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
+    Algo = st.radio(tr('Algorithme')+' :',('Manuel', 'KMeans','KNN','Random Forest','Word Embedding'), horizontal=True)
     Metrique = ''
     if (Algo == 'KNN'):
+        Metrique = st.radio(tr('Metrique')+':',('minkowski', 'cosine', 'chebyshev', 'manhattan', 'euclidean'), horizontal=True)
     if (Lang=='en_fr'):
         df_data_src = df_data_en
         df_data_tgt = df_data_fr
+        if st.session_state.reCalcule:
+            df_count_word_src = df_count_word_en
+            df_count_word_tgt = df_count_word_fr
+            nb_mots_src = nb_mots_en
+            nb_mots_tgt = nb_mots_fr
     else:
         df_data_src = df_data_fr
         df_data_tgt = df_data_en
+        if st.session_state.reCalcule:
+            df_count_word_src = df_count_word_fr
+            df_count_word_tgt = df_count_word_en
+            nb_mots_src = nb_mots_fr
+            nb_mots_tgt = nb_mots_en
     # df_data_src.columns = ['Phrase']
+    sentence1 = st.selectbox(tr("Selectionnez la 1ere des 5 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
     n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
+    if st.session_state.reCalcule:
+        df_dic = calcul_dic(Lang,Algo,Metrique)
+        df_dic_ref = calcul_dic(Lang,'Manuel',Metrique)
+    else:
+        df_dic = load_dic(Lang,Algo,Metrique)
+        df_dic_ref = load_dic(Lang,'Manuel',Metrique)
+    """
+    save_dico = st.checkbox('Save dic ?')
+    if save_dico:
+        dic_name = st.text_input('Nom du fichier :',dataPath+'/dict_')
+        save_dic(dic_name, df_dic)
+    """
+    st.write("## **"+tr("Dictionnaire calculé et traduction mot à mot")+" :**\n")
     col1, col2 = st.columns([0.25, 0.75])
     with col1:
+        st.write("#### **"+tr("Dictionnaire")+"**")
         precision = int(round(accuracy(df_dic_ref,df_dic)*100, 0))
+        st.write("<p style='text-align:center;background-color:red; color:white')>"+tr("Précision")+" = {:2d}%</p>".format(precision), unsafe_allow_html=True)
         display_dic(df_dic)
     with col2:
+        st.write("#### **"+tr("Traduction")+"**")
         placeholder = st.empty()
         display_translation(n1, df_dic, Lang)

tabs/modelisation_seq2seq_tab.py CHANGED Viewed

@@ -16,17 +16,17 @@ import tensorflow as tf
 import string
 import re
 from tensorflow import keras
-from tensorflow.keras import layers
 from keras_nlp.layers import TransformerEncoder
 from tensorflow.keras.utils import plot_model
 from PIL import Image
 from gtts import gTTS
 from extra_streamlit_components import tab_bar, TabBarItemData
 title = "Traduction Sequence à Sequence"
 sidebar_name = "Traduction Seq2Seq"
 @st.cache_data
 def load_corpus(path):
@@ -65,7 +65,7 @@ def decode_sequence_rnn(input_sentence, src, tgt):
         output_mode="int",
         output_sequence_length=sequence_length,
         standardize=custom_standardization,
-        vocabulary = load_vocab("data/vocab_"+src+".txt"),
     )
     target_vectorization = layers.TextVectorization(
@@ -73,7 +73,7 @@ def decode_sequence_rnn(input_sentence, src, tgt):
         output_mode="int",
         output_sequence_length=sequence_length + 1,
         standardize=custom_standardization,
-        vocabulary = load_vocab("data/vocab_"+tgt+".txt"),
     )
     tgt_vocab = target_vectorization.get_vocabulary()
@@ -190,18 +190,6 @@ class PositionalEmbedding(layers.Layer):
             "input_dim": self.input_dim,
         })
         return config
-    def compute_mask(self, inputs, mask=None):
-        return tf.math.not_equal(inputs, 0)
-    def get_config(self):
-        config = super(PositionalEmbedding, self).get_config()
-        config.update({
-            "output_dim": self.output_dim,
-            "sequence_length": self.sequence_length,
-            "input_dim": self.input_dim,
-        })
-        return config
 def decode_sequence_tranf(input_sentence, src, tgt):
     global translation_model
@@ -214,7 +202,7 @@ def decode_sequence_tranf(input_sentence, src, tgt):
         output_mode="int",
         output_sequence_length=sequence_length,
         standardize=custom_standardization,
-        vocabulary = load_vocab("data/vocab_"+src+".txt"),
     )
     target_vectorization = layers.TextVectorization(
@@ -222,7 +210,7 @@ def decode_sequence_tranf(input_sentence, src, tgt):
         output_mode="int",
         output_sequence_length=sequence_length + 1,
         standardize=custom_standardization,
-        vocabulary = load_vocab("data/vocab_"+tgt+".txt"),
     )
     tgt_vocab = target_vectorization.get_vocabulary()
@@ -246,29 +234,33 @@ def decode_sequence_tranf(input_sentence, src, tgt):
 @st.cache_resource
 def load_all_data():
-    df_data_en = load_corpus('data/preprocess_txt_en')
-    df_data_fr = load_corpus('data/preprocess_txt_fr')
     lang_classifier = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
     translation_en_fr = pipeline('translation_en_to_fr', model="t5-base")
     translation_fr_en = pipeline('translation_fr_to_en', model="Helsinki-NLP/opus-mt-fr-en")
     finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
     model_speech = whisper.load_model("base")
-    merge = Merge( "data/rnn_en-fr_split",  "data", "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
-    merge = Merge( "data/rnn_fr-en_split",  "data", "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
-    rnn_en_fr = keras.models.load_model("data/seq2seq_rnn-model-en-fr.h5", compile=False)
-    rnn_fr_en = keras.models.load_model("data/seq2seq_rnn-model-fr-en.h5", compile=False)
     rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
-    with keras.saving.custom_object_scope(custom_objects):
-        transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
-        transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
-    merge = Merge( "data/transf_en-fr_weight_split",  "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
-    merge = Merge( "data/transf_fr-en_weight_split",  "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
-    transformer_en_fr.load_weights("data/transformer-model-en-fr.weights.h5")
-    transformer_fr_en.load_weights("data/transformer-model-fr-en.weights.h5")
     transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
@@ -331,41 +323,49 @@ def run():
     global lang_tgt, label_lang
     st.write("")
-    st.title(title)
     #
-    st.write("## **Explications :**\n")
-    st.markdown(
         """
         Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
         La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
         un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
         avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
         Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
         Vous en trouverez :red[**5 illustrations**] ci-dessous.
-        """
-    )
-    lang_tgt   = ['en','fr','ab','aa','af','ak','sq','de','am','en','ar','an','hy','as','av','ae','ay','az','ba','bm','eu','bn','bi','be','bh','my','bs','br','bg','ks','ca','ch','ny','zh','si','ko','kw','co','ht','cr','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','kl','gn','gu','ha','he','hz','hi','ho','hu','io','ig','id','ia','iu','ik','ga','is','it','ja','jv','kn','kr','kk','km','kg','ki','rw','ky','rn','kv','kj','ku','lo','la','lv','li','ln','lt','lu','lb','mk','ms','ml','dv','mg','mt','gv','mi','mr','mh','mo','mn','na','nv','ng','nl','ne','no','nb','nn','nr','ie','oc','oj','or','om','os','ug','ur','uz','ps','pi','pa','fa','ff','pl','pt','qu','rm','ro','ru','se','sm','sg','sa','sc','sr','sh','sn','nd','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','ce','cv','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','ve','vi','cu','vo','wa','wo','xh','ii','yi','yo','za','zu']
-    label_lang = ['Anglais','Français','Abkhaze','Afar','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Aragonais','Arménien','Assamais','Avar','Avestique','Aymara','Azéri','Bachkir','Bambara','Basque','Bengali','Bichelamar','Biélorusse','Bihari','Birman','Bosnien','Breton','Bulgare','Cachemiri','Catalan','Chamorro','Chichewa','Chinois','Cingalais','Coréen','Cornique','Corse','Créolehaïtien','Cri','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Groenlandais','Guarani','Gujarati','Haoussa','Hébreu','Héréro','Hindi','Hirimotu','Hongrois','Ido','Igbo','Indonésien','Interlingua','Inuktitut','Inupiak','Irlandais','Islandais','Italien','Japonais','Javanais','Kannada','Kanouri','Kazakh','Khmer','Kikongo','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Komi','Kuanyama','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luba','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','Mannois','MaorideNouvelle-Zélande','Marathi','Marshallais','Moldave','Mongol','Nauruan','Navajo','Ndonga','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Nrebele','Occidental','Occitan','Ojibwé','Oriya','Oromo','Ossète','Ouïghour','Ourdou','Ouzbek','Pachto','Pali','Pendjabi','Persan','Peul','Polonais','Portugais','Quechua','Romanche','Roumain','Russe','SameduNord','Samoan','Sango','Sanskrit','Sarde','Serbe','Serbo-croate','Shona','Sindebele','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Tchétchène','Tchouvache','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Venda','Vietnamien','Vieux-slave','Volapük','Wallon','Wolof','Xhosa','Yi','Yiddish','Yoruba','Zhuang','Zoulou']
     lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
                 'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
                 'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
-    st.write("#### Choisissez le type de traduction:")
     chosen_id = tab_bar(data=[
-        TabBarItemData(id="tab1", title="small vocab", description="avec Keras et un RNN"),
-        TabBarItemData(id="tab2", title="small vocab", description="avec Keras et un Transformer"),
-        TabBarItemData(id="tab3", title="Phrase personnelle", description="à saisir"),
-        TabBarItemData(id="tab4", title="Phrase personnelle", description="à dicter"),
-        TabBarItemData(id="tab5", title="Funny translation !", description="avec le Fine Tuning")],
         default="tab1")
     if (chosen_id == "tab1") or (chosen_id == "tab2") :
-        st.write("## **Paramètres :**\n")
         TabContainerHolder = st.container()
-        Sens = TabContainerHolder.radio('Sens de la traduction:',('Anglais -> Français','Français -> Anglais'), horizontal=True)
         Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
         if (Lang=='en_fr'):
@@ -382,18 +382,18 @@ def run():
                 translation_model = rnn_fr_en
             else:
                 translation_model = transformer_fr_en
-        sentence1 = st.selectbox("Selectionnez la 1ere des 5 phrases à traduire avec le dictionnaire sélectionné", df_data_src.iloc[:-4],index=int(n1) )
         n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
-        st.write("## **Résultats :**\n")
         if (chosen_id == "tab1"):
             display_translation(n1, Lang,1)
         else:
             display_translation(n1, Lang,2)
-        st.write("## **Explications :**\n")
         if (chosen_id == "tab1"):
-            st.markdown(
                 """
                 Nous avons utilisé 2 Gated Recurrent Units.
                 Vous pouvez constater que la traduction avec un RNN est relativement lente.
@@ -401,32 +401,32 @@ def run():
                 alors que les calculs sont réalisés en parrallèle dans les Transformers.
                 Le score BLEU est bien meilleur que celui des traductions mot à mot.
                 <br>
-                """
                 , unsafe_allow_html=True)
         else:
-            st.markdown(
                 """
                 Nous avons utilisé un encodeur et décodeur avec 8 têtes d'entention.
                 La dimension de l'embedding des tokens = 256
                 La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
                 <br>
-                """
                 , unsafe_allow_html=True)
-        st.write("<center><h5>Architecture du modèle utilisé:</h5>", unsafe_allow_html=True)
-        plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file='images/model_plot.png')
-        st.image('images/model_plot.png',use_column_width=True)
         st.write("</center>", unsafe_allow_html=True)
     elif chosen_id == "tab3":
-        st.write("## **Paramètres :**\n")
-        custom_sentence = st.text_area(label="Saisir le texte à traduire")
-        l_tgt = st.selectbox("Choisir la langue cible pour Google Translate (uniquement):",lang_tgt, format_func = find_lang_label )
-        st.button(label="Valider", type="primary")
         if custom_sentence!="":
-            st.write("## **Résultats :**\n")
             Lang_detected = lang_classifier (custom_sentence)[0]['label']
-            st.write('Langue détectée : **'+lang_src.get(Lang_detected)+'**')
             audio_stream_bytesio_src = io.BytesIO()
             tts = gTTS(custom_sentence,lang=Lang_detected)
             tts.write_to_fp(audio_stream_bytesio_src)
@@ -435,7 +435,7 @@ def run():
         else: Lang_detected=""
         col1, col2 = st.columns(2, gap="small")
         with col1:
-            st.write(":red[**Trad. t5-base & Helsinki**] *(Anglais/Français)*")
             audio_stream_bytesio_tgt = io.BytesIO()
             if (Lang_detected=='en'):
                 translation = translation_en_fr(custom_sentence, max_length=400)[0]['translation_text']
@@ -464,19 +464,19 @@ def run():
                     tts.write_to_fp(audio_stream_bytesio_tgt)
                     st.audio(audio_stream_bytesio_tgt)
             except:
-                st.write("Problème, essayer de nouveau..")
     elif chosen_id == "tab4":
-        st.write("## **Paramètres :**\n")
-        detection = st.toggle("Détection de langue ?", value=True)
         if not detection:
-            l_src = st.selectbox("Choisissez la langue parlée :",lang_tgt, format_func = find_lang_label, index=1 )
-        l_tgt = st.selectbox("Choisissez la langue cible  :",lang_tgt, format_func = find_lang_label )
-        audio_bytes = audio_recorder (pause_threshold=1.0,  sample_rate=16000, text="Cliquez pour parler, puis attendre 2s..", \
                                       recording_color="#e8b62c", neutral_color="#1ec3bc", icon_size="6x",)
         if audio_bytes:
-            st.write("## **Résultats :**\n")
             st.audio(audio_bytes, format="audio/wav")
             try:
                 if detection:
@@ -494,7 +494,7 @@ def run():
                     audio_input = np.mean(audio_input, axis=1)/32768
                     result = model_speech.transcribe(audio_input)
-                    st.write("Langue détectée : "+result["language"])
                     Lang_detected = result["language"]
                     # Transcription Whisper (si result a été préalablement calculé)
                     custom_sentence = result["text"]
@@ -519,22 +519,22 @@ def run():
                     tts = gTTS(translation,lang=l_tgt)
                     tts.write_to_fp(audio_stream_bytesio_tgt)
                     st.audio(audio_stream_bytesio_tgt)
-                    st.write("Prêt pour la phase suivante..")
                     audio_bytes = False
             except KeyboardInterrupt:
-                st.write("Arrêt de la reconnaissance vocale.")
             except:
-                st.write("Problème, essayer de nouveau..")
     elif chosen_id == "tab5":
-        st.markdown(
              """
             Pour cette section, nous avons "fine tuné" un transformer Hugging Face, :red[**t5-small**], qui traduit des textes de l'anglais vers le français.
             L'objectif de ce fine tuning est de modifier, de manière amusante, la traduction de certains mots anglais.
             Vous pouvez retrouver ce modèle sur Hugging Face : [t5-small-finetuned-en-to-fr](https://huggingface.co/Demosthene-OR/t5-small-finetuned-en-to-fr)
             Par exemple:
-            """
-            )
         col1, col2 = st.columns(2, gap="small")
         with col1:
             st.markdown(
@@ -557,13 +557,13 @@ def run():
                 """
             )
         st.write("")
-        st.markdown(
         """
         Ainsi **la data science devient :red[magique] et fait disparaitre certaines choses, pour en faire apparaitre d'autres..**
         Voici quelques illustrations :
         (*vous noterez que DataScientest a obtenu le monopole de l'enseignement de la data science*)
-        """
-        )
         s, t = translate_examples()
         placeholder2 = st.empty()
         with placeholder2:
@@ -572,20 +572,18 @@ def run():
                     st.write("**en   :**  :blue["+ s[i]+"]")
                     st.write("**fr   :**  "+t[i])
                     st.write("")
-        st.write("## **Paramètres :**\n")
-        st.write("A vous d'essayer:")
-        custom_sentence2 = st.text_area(label="Saisissez le texte anglais à traduire")
-        but2 = st.button(label="Valider", type="primary")
         if custom_sentence2!="":
-            st.write("## **Résultats :**\n")
             st.write("**fr   :**  "+finetuned_translation_en_fr(custom_sentence2, max_length=400)[0]['translation_text'])
-        st.write("## **Explications :**\n")
-        st.markdown(
-            """
-            Afin d'affiner :red[**t5-small**], il nous a fallu:
-            - 22 phrases d'entrainement
-            - approximatement 400 epochs pour obtenir une val loss proche de 0
-            La durée d'entrainement est très rapide (quelques minutes), et le résultat plutôt probant.
             """
-        )

 import string
 import re
 from tensorflow import keras
 from keras_nlp.layers import TransformerEncoder
+from tensorflow.keras import layers
 from tensorflow.keras.utils import plot_model
 from PIL import Image
 from gtts import gTTS
 from extra_streamlit_components import tab_bar, TabBarItemData
+from translate_app import tr
 title = "Traduction Sequence à Sequence"
 sidebar_name = "Traduction Seq2Seq"
+dataPath = st.session_state.DataPath
 @st.cache_data
 def load_corpus(path):
         output_mode="int",
         output_sequence_length=sequence_length,
         standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
     )
     target_vectorization = layers.TextVectorization(
         output_mode="int",
         output_sequence_length=sequence_length + 1,
         standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
     )
     tgt_vocab = target_vectorization.get_vocabulary()
             "input_dim": self.input_dim,
         })
         return config
 def decode_sequence_tranf(input_sentence, src, tgt):
     global translation_model
         output_mode="int",
         output_sequence_length=sequence_length,
         standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
     )
     target_vectorization = layers.TextVectorization(
         output_mode="int",
         output_sequence_length=sequence_length + 1,
         standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
     )
     tgt_vocab = target_vectorization.get_vocabulary()
 @st.cache_resource
 def load_all_data():
+    df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
+    df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
     lang_classifier = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
     translation_en_fr = pipeline('translation_en_to_fr', model="t5-base")
     translation_fr_en = pipeline('translation_fr_to_en', model="Helsinki-NLP/opus-mt-fr-en")
     finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
     model_speech = whisper.load_model("base")
+    merge = Merge( dataPath+"/rnn_en-fr_split",  dataPath, "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
+    merge = Merge( dataPath+"/rnn_fr-en_split",  dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
+    rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
+    rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
     rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
+    if st.session_state.Cloud == 1:
+        with keras.saving.custom_object_scope(custom_objects):
+            transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
+            transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
+        merge = Merge( "data/transf_en-fr_weight_split",  "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
+        merge = Merge( "data/transf_fr-en_weight_split",  "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
+    else:
+        transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
+        transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
+        transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
+        transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
     transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
     global lang_tgt, label_lang
     st.write("")
+    st.title(tr(title))
     #
+    st.write("## **"+tr("Explications")+" :**\n")
+    st.markdown(tr(
         """
         Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
+        """)
+        , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
         La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
         un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
         avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
+        """)
+        , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
         Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
         Vous en trouverez :red[**5 illustrations**] ci-dessous.
+        """)
+    , unsafe_allow_html=True)
+    lang_tgt   = ['en','fr','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
+    label_lang = ['Anglais','Français','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
     lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
                 'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
                 'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
+    st.write("#### "+tr("Choisissez le type de traduction")+" :")
     chosen_id = tab_bar(data=[
+        TabBarItemData(id="tab1", title="small vocab", description=tr("avec Keras et un RNN")),
+        TabBarItemData(id="tab2", title="small vocab", description=tr("avec Keras et un Transformer")),
+        TabBarItemData(id="tab3", title=tr("Phrase personnelle"), description=tr("à saisir")),
+        TabBarItemData(id="tab4", title=tr("Phrase personnelle"), description=tr("à dicter")),
+        TabBarItemData(id="tab5", title=tr("Funny translation !"), description=tr("avec le Fine Tuning"))],
         default="tab1")
     if (chosen_id == "tab1") or (chosen_id == "tab2") :
+        st.write("## **"+tr("Paramètres")+" :**\n")
         TabContainerHolder = st.container()
+        Sens = TabContainerHolder.radio(tr('Sens')+':',('Anglais -> Français','Français -> Anglais'), horizontal=True)
         Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
         if (Lang=='en_fr'):
                 translation_model = rnn_fr_en
             else:
                 translation_model = transformer_fr_en
+        sentence1 = st.selectbox(tr("Selectionnez la 1ere des 5 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
         n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
+        st.write("## **"+tr("Résultats")+" :**\n")
         if (chosen_id == "tab1"):
             display_translation(n1, Lang,1)
         else:
             display_translation(n1, Lang,2)
+        st.write("## **"+tr("Details sur la méthode")+" :**\n")
         if (chosen_id == "tab1"):
+            st.markdown(tr(
                 """
                 Nous avons utilisé 2 Gated Recurrent Units.
                 Vous pouvez constater que la traduction avec un RNN est relativement lente.
                 alors que les calculs sont réalisés en parrallèle dans les Transformers.
                 Le score BLEU est bien meilleur que celui des traductions mot à mot.
                 <br>
+                """)
                 , unsafe_allow_html=True)
         else:
+            st.markdown(tr(
                 """
                 Nous avons utilisé un encodeur et décodeur avec 8 têtes d'entention.
                 La dimension de l'embedding des tokens = 256
                 La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
                 <br>
+                """)
                 , unsafe_allow_html=True)
+        st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5>", unsafe_allow_html=True)
+        plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file='../images/model_plot.png')
+        st.image('../images/model_plot.png',use_column_width=True)
         st.write("</center>", unsafe_allow_html=True)
     elif chosen_id == "tab3":
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        custom_sentence = st.text_area(label=tr("Saisir le texte à traduire"))
+        l_tgt = st.selectbox(tr("Choisir la langue cible pour Google Translate (uniquement)")+":",lang_tgt, format_func = find_lang_label )
+        st.button(label=tr("Valider"), type="primary")
         if custom_sentence!="":
+            st.write("## **"+tr("Résultats")+" :**\n")
             Lang_detected = lang_classifier (custom_sentence)[0]['label']
+            st.write(tr('Langue détectée')+' : **'+lang_src.get(Lang_detected)+'**')
             audio_stream_bytesio_src = io.BytesIO()
             tts = gTTS(custom_sentence,lang=Lang_detected)
             tts.write_to_fp(audio_stream_bytesio_src)
         else: Lang_detected=""
         col1, col2 = st.columns(2, gap="small")
         with col1:
+            st.write(":red[**Trad. t5-base & Helsinki**] *("+tr("Anglais/Français")+")*")
             audio_stream_bytesio_tgt = io.BytesIO()
             if (Lang_detected=='en'):
                 translation = translation_en_fr(custom_sentence, max_length=400)[0]['translation_text']
                     tts.write_to_fp(audio_stream_bytesio_tgt)
                     st.audio(audio_stream_bytesio_tgt)
             except:
+                st.write(tr("Problème, essayer de nouveau.."))
     elif chosen_id == "tab4":
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        detection = st.toggle(tr("Détection de langue ?"), value=True)
         if not detection:
+            l_src = st.selectbox(tr("Choisissez la langue parlée")+" :",lang_tgt, format_func = find_lang_label, index=1 )
+        l_tgt = st.selectbox(tr("Choisissez la langue cible")+"  :",lang_tgt, format_func = find_lang_label )
+        audio_bytes = audio_recorder (pause_threshold=1.0,  sample_rate=16000, text=tr("Cliquez pour parler, puis attendre 2sec."), \
                                       recording_color="#e8b62c", neutral_color="#1ec3bc", icon_size="6x",)
         if audio_bytes:
+            st.write("## **"+tr("Résultats")+" :**\n")
             st.audio(audio_bytes, format="audio/wav")
             try:
                 if detection:
                     audio_input = np.mean(audio_input, axis=1)/32768
                     result = model_speech.transcribe(audio_input)
+                    st.write(tr("Langue détectée")+" : "+result["language"])
                     Lang_detected = result["language"]
                     # Transcription Whisper (si result a été préalablement calculé)
                     custom_sentence = result["text"]
                     tts = gTTS(translation,lang=l_tgt)
                     tts.write_to_fp(audio_stream_bytesio_tgt)
                     st.audio(audio_stream_bytesio_tgt)
+                    st.write(tr("Prêt pour la phase suivante.."))
                     audio_bytes = False
             except KeyboardInterrupt:
+                st.write(tr("Arrêt de la reconnaissance vocale."))
             except:
+                st.write(tr("Problème, essayer de nouveau.."))
     elif chosen_id == "tab5":
+        st.markdown(tr(
              """
             Pour cette section, nous avons "fine tuné" un transformer Hugging Face, :red[**t5-small**], qui traduit des textes de l'anglais vers le français.
             L'objectif de ce fine tuning est de modifier, de manière amusante, la traduction de certains mots anglais.
             Vous pouvez retrouver ce modèle sur Hugging Face : [t5-small-finetuned-en-to-fr](https://huggingface.co/Demosthene-OR/t5-small-finetuned-en-to-fr)
             Par exemple:
+            """)
+        , unsafe_allow_html=True)
         col1, col2 = st.columns(2, gap="small")
         with col1:
             st.markdown(
                 """
             )
         st.write("")
+        st.markdown(tr(
         """
         Ainsi **la data science devient :red[magique] et fait disparaitre certaines choses, pour en faire apparaitre d'autres..**
         Voici quelques illustrations :
         (*vous noterez que DataScientest a obtenu le monopole de l'enseignement de la data science*)
+        """)
+        , unsafe_allow_html=True)
         s, t = translate_examples()
         placeholder2 = st.empty()
         with placeholder2:
                     st.write("**en   :**  :blue["+ s[i]+"]")
                     st.write("**fr   :**  "+t[i])
                     st.write("")
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        st.write(tr("A vous d'essayer")+":")
+        custom_sentence2 = st.text_area(label=tr("Saisissez le texte anglais à traduire"))
+        but2 = st.button(label=tr("Valider"), type="primary")
         if custom_sentence2!="":
+            st.write("## **"+tr("Résultats")+" :**\n")
             st.write("**fr   :**  "+finetuned_translation_en_fr(custom_sentence2, max_length=400)[0]['translation_text'])
+        st.write("## **"+tr("Details sur la méthode")+" :**\n")
+        st.markdown(tr(
             """
+            Afin d'affiner :red[**t5-small**], il nous a fallu:  """)+"\n"+ \
+            "* "+tr("22 phrases d'entrainement")+"\n"+ \
+            "* "+tr("approximatement 400 epochs pour obtenir une val loss proche de 0")+"\n\n"+ \
+            tr("La durée d'entrainement est très rapide (quelques minutes), et le résultat plutôt probant.")
+        , unsafe_allow_html=True)

translate_app.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import streamlit as st
+from translate import Translator
+@st.cache_data(ttl="1d")
+def _trad_cached(message, l):
+    """Translate *message* from French into language *l*, raising on failure.
+
+    Successful results are memoised for one day. An exception leaves this
+    function before any value is stored, so a failed call is retried next
+    time instead of being served from the cache.
+    """
+    translator = Translator(to_lang=l, from_lang="fr")
+    return translator.translate(message)
+def trad(message,l):
+    """Return *message* translated from French into language *l*.
+
+    Args:
+        message: Text to translate; the source language is always "fr".
+        l: Target language code passed to ``Translator(to_lang=...)``.
+
+    Returns:
+        The translated text, or the fallback string
+        "Problème de traduction.." when the translation raised.
+    """
+    try:
+        return _trad_cached(message, l)
+    except Exception:
+        # Not a bare ``except``: KeyboardInterrupt / SystemExit must still
+        # propagate. The fallback is produced outside the cached helper so
+        # a transient failure is not remembered for the whole TTL.
+        return "Problème de traduction.."
+def tr(message):
+    """Return *message* in the language selected for the current session.
+
+    The target language is read from ``st.session_state['Language']`` and
+    defaults to 'fr' when that key is absent. French text is returned
+    unchanged. For any other language the ``:red[**`` and ``**]`` markers
+    are removed from the message, which is then passed to ``trad``.
+    """
+    target = st.session_state['Language'] if 'Language' in st.session_state else 'fr'
+    if target == 'fr':
+        return message
+    stripped = message.replace(":red[**","").replace("**]","")
+    return trad(stripped, target)