import streamlit as st
import os
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
from translate_app import tr

if st.session_state.Cloud == 0:
    # import nltk
    import contextlib
    import re
    from nltk.corpus import stopwords
    import warnings
    warnings.filterwarnings('ignore')
    # from PIL import Image
    # import time
    # import random
title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"

dataPath = st.session_state.DataPath

# Whether to remove stop words. This is a slow process.
stopwords_to_do = True
# Whether to lemmatize the sentences once stop words have been removed. This is a slow process (about 8 minutes).
lemmatize_to_do = True
# Whether to compute the BLEU score for the whole corpus. This is a very slow process (about 10 minutes for the 10 dictionaries).
bleu_score_to_do = True
# First line to load
first_line = 0
# Maximum number of lines to load
max_lines = 140000
if ((first_line + max_lines) > 137860):
    max_lines = max(137860 - first_line, 0)
# Maximum number of rows to display for the DataFrames
max_lines_to_display = 50
download('punkt')
if st.session_state.Cloud == 0:
    download('averaged_perceptron_tagger')
    with contextlib.redirect_stdout(open(os.devnull, "w")):
        download('stopwords')
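
# NLTK resources: 'punkt' is used by word_tokenize below; 'stopwords' feeds nltk.corpus.stopwords,
# and 'averaged_perceptron_tagger' would be needed by the (currently commented-out) nltk.pos_tag display.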

def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert to lowercase
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data), first_line + max_lines)]
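
# data_type codes for load_preprocessed_data (inferred from the calls in load_all_preprocessed_data):
#   0 = plain text file, one sentence per line
#   1 = CSV file loaded into a DataFrame (word-count matrix)
#   2 = one list literal per line, parsed in a list comprehension
#   3 = one list literal per line, parsed with literal_eval in a loop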
def load_preprocessed_data(path, data_type):
    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type == 0:
            data = data[:-1]
        elif data_type == 2:
            # literal_eval instead of eval: safer, same result for list literals
            data = [literal_eval(i) for i in data[:-1]]
        elif data_type == 3:
            data2 = []
            for d in data[:-1]:
                data2.append(literal_eval(d))
            data = data2
        return data

def load_all_preprocessed_data(lang):
    txt = load_preprocessed_data(dataPath + '/preprocess_txt_' + lang, 0)
    txt_split = load_preprocessed_data(dataPath + '/preprocess_txt_split_' + lang, 3)
    txt_lem = load_preprocessed_data(dataPath + '/preprocess_txt_lem_' + lang, 0)
    txt_wo_stopword = load_preprocessed_data(dataPath + '/preprocess_txt_wo_stopword_' + lang, 0)
    df_count_word = pd.concat([load_preprocessed_data(dataPath + '/preprocess_df_count_word1_' + lang, 1),
                               load_preprocessed_data(dataPath + '/preprocess_df_count_word2_' + lang, 1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word

# Load the full texts in both languages
full_txt_en = load_data(dataPath + '/small_vocab_en')
full_txt_fr = load_data(dataPath + '/small_vocab_fr')

# Load the preprocessing results if st.session_state.reCalcule == False
if not st.session_state.reCalcule:
    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
else:
    def remove_stopwords(text, lang):
        # stop_words is the set of stop words for the given language
        stop_words = set(stopwords.words(lang))
        filtered_sentence = []
        for word in text.split():
            if word not in stop_words:
                filtered_sentence.append(word)
        return " ".join(filtered_sentence)

    def clean_undesirable_from_text(sentence, lang):
        # Removing URLs
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)

        # Removing punctuation (we keep the . character)
        REPLACEMENTS = [("..", "."),
                        (",", ""),
                        (";", ""),
                        (":", ""),
                        ("?", ""),
                        ('"', ""),
                        ("-", " "),
                        ("it's", "it is"),
                        ("isn't", "is not"),
                        ("'", " ")
                        ]
        for old, new in REPLACEMENTS:
            sentence = sentence.replace(old, new)

        # Removing digits
        sentence = re.sub(r'[0-9]', '', sentence)

        # Removing additional spaces
        sentence = re.sub(' +', ' ', sentence)
        return sentence
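
    # Illustrative example (not executed): clean_undesirable_from_text("it's a test, isn't it ?", 'en')
    # returns roughly "it is a test is not it" (contractions expanded, punctuation and digits removed).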

    # Remove sentence pairs that are identical in both languages (i.e. left untranslated)
    def clean_untranslated_sentence(data1, data2):
        i = 0
        while i < len(data1):
            if data1[i] == data2[i]:
                data1.pop(i)
                data2.pop(i)
            else:
                i += 1
        return data1, data2

    import spacy
    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')

    def lemmatize(sentence, lang):
        # Select the spaCy pipeline matching the language
        if lang == 'en':
            nlp = nlp_en
        elif lang == 'fr':
            nlp = nlp_fr
        else:
            return
        # Create a Doc object and join the lemma of each token
        doc = nlp(sentence)
        lemmatized_sentence = " ".join([token.lemma_ for token in doc])
        return lemmatized_sentence
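
    # Illustrative example (not executed): lemmatize("the cats are running", 'en')
    # should return roughly "the cat be run" with the en_core_web_sm pipeline.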

    def preprocess_txt(data, lang):
        word_count = collections.Counter()
        word_lem_count = collections.Counter()
        word_wosw_count = collections.Counter()
        corpus = []
        data_split = []
        sentence_length = []
        data_split_wo_stopwords = []
        data_length_wo_stopwords = []
        data_lem = []
        data_lem_length = []

        txt_en_one_string = ". ".join([s for s in data])
        txt_en_one_string = txt_en_one_string.replace('..', '.')
        txt_en_one_string = " " + clean_undesirable_from_text(txt_en_one_string, lang)
        data = txt_en_one_string.split('.')
        if data[-1] == "":
            data.pop(-1)
        # Strip the leading and trailing spaces of each sentence
        for i in range(len(data)):
            if data[i][0] == ' ':
                data[i] = data[i][1:]
            if data[i][-1] == ' ':
                data[i] = data[i][:-1]
        nb_phrases = len(data)

        # Build a list of tokens for each sentence (sentence_split)
        for i, sentence in enumerate(data):
            sentence_split = word_tokenize(sentence)
            word_count.update(sentence_split)
            data_split.append(sentence_split)
            sentence_length.append(len(sentence_split))

        # Lemmatization and stop-word removal are done in batches (rather than sentence by sentence) for speed.
        # Both steps need to know the language of the corpus.
        if lang == 'en':
            l = 'english'
        elif lang == 'fr':
            l = 'french'
        else:
            l = "unknown"

        if l != "unknown":
            # Lemmatization in 12 batches (no more than 1M characters can be lemmatized at once)
            data_lemmatized = ""
            if lemmatize_to_do:
                n_batch = 12
                batch_size = round((nb_phrases / n_batch) + 0.5)
                for i in range(n_batch):
                    to_lem = ".".join([s for s in data[i * batch_size:(i + 1) * batch_size]])
                    data_lemmatized = data_lemmatized + "." + lemmatize(to_lem, lang).lower()
                data_lem_for_sw = data_lemmatized[1:]
                data_lemmatized = data_lem_for_sw.split('.')
                for i in range(nb_phrases):
                    data_lem.append(data_lemmatized[i].split())
                    data_lem_length.append(len(data_lemmatized[i].split()))
                    word_lem_count.update(data_lem[-1])

            # Stop-word removal in a single batch.
            # Stop words are removed from the lemmatized sentences when lemmatization has been done.
            # (wosw stands for "WithOut Stop Words")
            if stopwords_to_do:
                if lemmatize_to_do:
                    data_wosw = remove_stopwords(data_lem_for_sw, l)
                else:
                    data_wosw = remove_stopwords(txt_en_one_string, l)
                data_wosw = data_wosw.split('.')
                for i in range(nb_phrases):
                    data_split_wo_stopwords.append(data_wosw[i].split())
                    data_length_wo_stopwords.append(len(data_wosw[i].split()))
                    word_wosw_count.update(data_split_wo_stopwords[-1])

        corpus = list(word_count.keys())

        # Build the Bag Of Words DataFrame txt_n_unique_val:
        #   columns = words
        #   rows = sentences
        #   cell value = number of occurrences of the word in the sentence
        from sklearn.feature_extraction.text import CountVectorizer
        count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+")
        # Count how many times each word appears in each sentence
        countvectors = count_vectorizer.fit_transform(data)
        corpus = count_vectorizer.get_feature_names_out()
        txt_n_unique_val = pd.DataFrame(columns=corpus, index=range(nb_phrases), data=countvectors.todense()).astype(float)

        return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length

def count_world(data):
    word_count = collections.Counter()
    for sentence in data:
        word_count.update(word_tokenize(sentence))
    corpus = list(word_count.keys())
    nb_mots = sum(word_count.values())
    nb_mots_uniques = len(corpus)
    return corpus, nb_mots, nb_mots_uniques
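
# Illustrative example (not executed): count_world(["the cat sleeps", "the dog runs"])
# returns (['the', 'cat', 'sleeps', 'dog', 'runs'], 6, 5): the unique words, the total word count,
# and the number of unique words.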

def display_preprocess_results(lang, data, data_split, data_lem, data_wosw, txt_n_unique_val):
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do

    corpus = []
    nb_phrases = len(data)
    corpus, nb_mots, nb_mots_uniques = count_world(data)
    mots_lem, _, nb_mots_lem = count_world(data_lem)
    mots_wo_sw, _, nb_mots_wo_stopword = count_world(data_wosw)

    # Identify the columns that contain only zeros and drop them
    columns_with_only_zeros = txt_n_unique_val.columns[txt_n_unique_val.eq(0).all()]
    txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)

    # Display the word counts for each preprocessing step
    tab1, tab2, tab3, tab4 = st.tabs([tr("Résumé"), tr("Tokenisation"), tr("Lemmatisation"), tr("Sans Stopword")])

    with tab1:
        st.subheader(tr("Résumé du pré-processing"))
        st.write("**"+tr("Nombre de phrases")+" : "+str(nb_phrases)+"**")
        st.write("**"+tr("Nombre de mots")+" : "+str(nb_mots)+"**")
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab2:
        st.subheader(tr("Tokenisation"))
        st.write(tr('Texte "splited":'))
        st.dataframe(pd.DataFrame(data=data_split, index=range(first_line, last_line)).head(max_lines_to_display).fillna(''), width=800)
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Mots uniques")+":**")
        st.markdown(corpus[:500])
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab3:
        st.subheader(tr("Lemmatisation"))
        if lemmatize_to_do:
            st.dataframe(pd.DataFrame(data=data_lem, columns=[tr('Texte lemmatisé')], index=range(first_line, last_line)).head(max_lines_to_display), width=800)
            # For English, the POS tagging of the words could be displayed:
            # if lang == 'en':
            #     for i in range(min(5, len(data))):
            #         s = str(nltk.pos_tag(data_split[i]))
            #         st.markdown("**Texte avec Tags "+str(i)+"** : "+s)
            st.write("**"+tr("Nombre de mots uniques lemmatisés")+" : "+str(nb_mots_lem)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques lemmatisés:")+"**")
            st.markdown(mots_lem[:500])
    with tab4:
        st.subheader(tr("Sans Stopword"))
        if stopwords_to_do:
            st.dataframe(pd.DataFrame(data=data_wosw, columns=['Texte sans stopwords'], index=range(first_line, last_line)).head(max_lines_to_display), width=800)
            st.write("**"+tr("Nombre de mots uniques sans stop words")+": "+str(nb_mots_wo_stopword)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques sans stop words")+":**")
            st.markdown(mots_wo_sw[:500])

def run():
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    global full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en
    global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr

    st.write("")
    st.title(tr(title))

    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
        """
        Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
        Lors de notre projet, nous avons étudié le dataset small_vocab, proposé par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
        Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
        :red[**Small_vocab**] contient 137 860 phrases en anglais et français.
        """)
        , unsafe_allow_html=True)
    st.markdown(tr(
        """
        Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
        Ces tâches sont, par exemple :
        """)
        , unsafe_allow_html=True)
    st.markdown(
        "* "+tr("le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)")+"\n"+ \
        "* "+tr("la :red[**tokenisation**] (découpage du texte en mots)")+"\n"+ \
        "* "+tr("la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les \"variations\" d'un même mot)")+"\n"+ \
        "* "+tr("l'élimination des :red[**mots \"transparents\"**] (sans utilité pour la compréhension, tels que les articles).")+" \n"+ \
        tr("Ce prétraitement se conclut avec la construction d'un :red[**Bag Of Words**], c'est-à-dire une matrice qui compte le nombre d'apparitions de chaque mot (colonne) dans chaque phrase (ligne)")
        , unsafe_allow_html=True)
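
    # Illustration (hypothetical sentences, not executed): for two corpus-style sentences
    #   "new jersey is sometimes quiet"  /  "paris is sometimes quiet"
    # the Bag Of Words matrix has one row per sentence and one column per word:
    #               is  jersey  new  paris  quiet  sometimes
    #   sentence 0   1       1    1      0      1          1
    #   sentence 1   1       0    0      1      1          1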

    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'), ('Anglais', 'Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser:'), 0, 137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser:'),
                                 options=[1, 5, 10, 15, 100, 500, 1000, 'Max'])
    if max_lines == 'Max':
        max_lines = 137860
    if ((first_line + max_lines) > 137860):
        max_lines = max(137860 - first_line, 0)
    last_line = first_line + max_lines

    if (Langue == 'Anglais'):
        st.dataframe(pd.DataFrame(data=full_txt_en, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    # Load the selected text range in both languages (at most max_lines lines)
    txt_en = full_txt_en[first_line:last_line]
    txt_fr = full_txt_fr[first_line:last_line]

    # Remove untranslated sentence pairs (disabled)
    # txt_en, txt_fr = clean_untranslated_sentence(txt_en, txt_fr)
    if not st.session_state.reCalcule:
        txt_split_en = full_txt_split_en[first_line:last_line]
        txt_lem_en = full_txt_lem_en[first_line:last_line]
        txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        txt_lem_fr = full_txt_lem_fr[first_line:last_line]
        txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]

    # Run the text preprocessing: clean the sentences, split them into words,
    # and count the number of occurrences of each word in each sentence
    if (Langue == 'Anglais'):
        st.write("## **"+tr("Préprocessing de small_vocab_en")+" :**\n")
        if max_lines > 10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
                display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
        else:
            if st.session_state.reCalcule:
                txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
            display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
    else:
        st.write("## **"+tr("Préprocessing de small_vocab_fr")+" :**\n")
        if max_lines > 10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
                display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
        else:
            if st.session_state.reCalcule:
                txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
            display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)