import streamlit as st
import os
import pandas as pd
import collections
from nltk.tokenize import word_tokenize
from nltk import download
from ast import literal_eval
from translate_app import tr

if st.session_state.Cloud == 0:
    # Imports only needed when running locally (Cloud == 0), e.g. when the
    # preprocessing is recomputed on the fly below
    import contextlib
    import re
    from nltk.corpus import stopwords
    import warnings
    warnings.filterwarnings('ignore')

title = "Exploration et Preprocessing"
sidebar_name = "Exploration et Preprocessing"

dataPath = st.session_state.DataPath

stopwords_to_do = True
lemmatize_to_do = True
bleu_score_to_do = True

first_line = 0
max_lines = 140000
if (first_line + max_lines) > 137860:
    max_lines = max(137860 - first_line, 0)

max_lines_to_display = 50

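# Worked example of the window clipping above (illustrative): with first_line = 0
# and max_lines = 140000, 0 + 140000 > 137860, so max_lines is reduced to 137860,
# i.e. the whole small_vocab corpus. Only max_lines_to_display rows are shown in
# the tables below.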
# Download the NLTK resources needed: the 'punkt' tokenizer models and, when
# running locally, the POS tagger and the stop word lists
download('punkt')

if st.session_state.Cloud == 0:
    download('averaged_perceptron_tagger')
    # Silence the NLTK download messages
    with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
        download('stopwords')


@st.cache_data
def load_data(path):
    input_file = os.path.join(path)
    with open(input_file, "r", encoding="utf-8") as f:
        data = f.read()

    # Lowercase the whole corpus, split it into one sentence per line and
    # keep only the selected window of lines
    data = data.lower()
    data = data.split('\n')
    return data[first_line:min(len(data), first_line + max_lines)]


@st.cache_data
def load_preprocessed_data(path, data_type):
    input_file = os.path.join(path)
    if data_type == 1:
        return pd.read_csv(input_file, encoding="utf-8", index_col=0)
    else:
        with open(input_file, "r", encoding="utf-8") as f:
            data = f.read()
        data = data.split('\n')
        if data_type == 0:
            data = data[:-1]
        elif data_type == 2:
            # literal_eval is a safe replacement for eval here: each line is a
            # plain Python literal
            data = [literal_eval(i) for i in data[:-1]]
        elif data_type == 3:
            data2 = []
            for d in data[:-1]:
                data2.append(literal_eval(d))
            data = data2
        return data

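# Summary of the data_type convention of load_preprocessed_data (illustrative,
# inferred from the calls below):
#   0    -> plain text file, one sentence per line
#   1    -> CSV file loaded as a pandas DataFrame (Bag of Words counts)
#   2, 3 -> one Python list literal per line, e.g. "['new', 'jersey', 'is']",
#           parsed with literal_eval into a list of token lists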
@st.cache_data
def load_all_preprocessed_data(lang):
    txt = load_preprocessed_data(dataPath+'/preprocess_txt_'+lang, 0)
    txt_split = load_preprocessed_data(dataPath+'/preprocess_txt_split_'+lang, 3)
    txt_lem = load_preprocessed_data(dataPath+'/preprocess_txt_lem_'+lang, 0)
    txt_wo_stopword = load_preprocessed_data(dataPath+'/preprocess_txt_wo_stopword_'+lang, 0)
    df_count_word = pd.concat([load_preprocessed_data(dataPath+'/preprocess_df_count_word1_'+lang, 1),
                               load_preprocessed_data(dataPath+'/preprocess_df_count_word2_'+lang, 1)])
    return txt, txt_split, txt_lem, txt_wo_stopword, df_count_word


full_txt_en = load_data(dataPath+'/small_vocab_en')
full_txt_fr = load_data(dataPath+'/small_vocab_fr')

if not st.session_state.reCalcule:
    # Use the preprocessing results precomputed offline
    full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en = load_all_preprocessed_data('en')
    full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr = load_all_preprocessed_data('fr')
else:
    # Otherwise define the helpers needed to recompute the preprocessing here

    def remove_stopwords(text, lang):
        # Remove, word by word, the stop words of the given language
        # ('english' or 'french')
        stop_words = set(stopwords.words(lang))
        filtered_sentence = []
        for word in text.split():
            if word not in stop_words:
                filtered_sentence.append(word)
        return " ".join(filtered_sentence)

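    # Example (illustrative): remove_stopwords("the cat is on the mat", 'english')
    # returns "cat mat", since 'the', 'is' and 'on' are NLTK English stop words.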
    def clean_undesirable_from_text(sentence, lang):
        # (the lang argument is currently unused)

        # Remove URLs
        sentence = re.sub(r"https?://\S+|www\.\S+", "", sentence)

        # Remove punctuation and expand a few common English contractions
        REPLACEMENTS = [("..", "."),
                        (",", ""),
                        (";", ""),
                        (":", ""),
                        ("?", ""),
                        ('"', ""),
                        ("-", " "),
                        ("it's", "it is"),
                        ("isn't", "is not"),
                        ("'", " ")
                        ]
        for old, new in REPLACEMENTS:
            sentence = sentence.replace(old, new)

        # Remove digits
        sentence = re.sub(r'[0-9]', '', sentence)

        # Collapse multiple spaces into one
        sentence = re.sub(' +', ' ', sentence)

        return sentence

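    # Example (illustrative):
    #   clean_undesirable_from_text("it's cold, isn't it?", 'en')
    #   -> "it is cold is not it"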
    def clean_untranslated_sentence(data1, data2):
        # Drop the sentence pairs whose "translation" is identical to the source
        i = 0
        while i < len(data1):
            if data1[i] == data2[i]:
                data1.pop(i)
                data2.pop(i)
            else:
                i += 1
        return data1, data2

    import spacy

    # spaCy pipelines used for lemmatisation
    nlp_en = spacy.load('en_core_web_sm')
    nlp_fr = spacy.load('fr_core_news_sm')

    def lemmatize(sentence, lang):
        if lang == 'en':
            nlp = nlp_en
        elif lang == 'fr':
            nlp = nlp_fr
        else:
            return
        doc = nlp(sentence)

        # Join the lemma of every token back into a single string
        lemmatized_sentence = " ".join([token.lemma_ for token in doc])
        return lemmatized_sentence

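    # Example (illustrative): lemmatize("les enfants jouaient", 'fr') should return
    # something like "le enfant jouer" (the exact lemmas depend on the spaCy model).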
    def preprocess_txt(data, lang):
        word_count = collections.Counter()
        word_lem_count = collections.Counter()
        word_wosw_count = collections.Counter()
        corpus = []
        data_split = []
        sentence_length = []
        data_split_wo_stopwords = []
        data_length_wo_stopwords = []
        data_lem = []
        data_lem_length = []

        # Join the corpus into one string, clean it, then split it back into sentences
        txt_one_string = ". ".join([s for s in data])
        txt_one_string = txt_one_string.replace('..', '.')
        txt_one_string = " " + clean_undesirable_from_text(txt_one_string, lang)
        data = txt_one_string.split('.')
        if data[-1] == "":
            data.pop(-1)
        for i in range(len(data)):
            data[i] = data[i].strip()
        nb_phrases = len(data)

        # Tokenisation
        for i, sentence in enumerate(data):
            sentence_split = word_tokenize(sentence)
            word_count.update(sentence_split)
            data_split.append(sentence_split)
            sentence_length.append(len(sentence_split))

        if lang == 'en':
            l = 'english'
        elif lang == 'fr':
            l = 'french'
        else:
            l = "unknown"

        # Defaults, so the return below is always valid
        data_lemmatized = ""
        data_wosw = []
        if l != "unknown":
            # Lemmatisation, processed in batches so each spaCy document stays small
            if lemmatize_to_do:
                n_batch = 12
                batch_size = round((nb_phrases / n_batch) + 0.5)
                for i in range(n_batch):
                    to_lem = ".".join([s for s in data[i*batch_size:(i+1)*batch_size]])
                    data_lemmatized = data_lemmatized + "." + lemmatize(to_lem, lang).lower()

                data_lem_for_sw = data_lemmatized[1:]
                data_lemmatized = data_lem_for_sw.split('.')
                for i in range(nb_phrases):
                    data_lem.append(data_lemmatized[i].split())
                    data_lem_length.append(len(data_lemmatized[i].split()))
                    word_lem_count.update(data_lem[-1])

            # Stop word removal (on the lemmatised text when available)
            if stopwords_to_do:
                if lemmatize_to_do:
                    data_wosw = remove_stopwords(data_lem_for_sw, l)
                else:
                    data_wosw = remove_stopwords(txt_one_string, l)

                data_wosw = data_wosw.split('.')
                for i in range(nb_phrases):
                    data_split_wo_stopwords.append(data_wosw[i].split())
                    data_length_wo_stopwords.append(len(data_wosw[i].split()))
                    word_wosw_count.update(data_split_wo_stopwords[-1])

        corpus = list(word_count.keys())

        # Bag of Words: one row per sentence, one column per token
        from sklearn.feature_extraction.text import CountVectorizer
        count_vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 1), token_pattern=r"[^' ']+")
        countvectors = count_vectorizer.fit_transform(data)
        corpus = count_vectorizer.get_feature_names_out()

        txt_n_unique_val = pd.DataFrame(columns=corpus, index=range(nb_phrases), data=countvectors.todense()).astype(float)

        return data, corpus, data_split, data_lemmatized, data_wosw, txt_n_unique_val, sentence_length, data_length_wo_stopwords, data_lem_length

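# Illustrative shape of the Bag of Words DataFrame built by preprocess_txt
# (column order is alphabetical, values are floats). For the two sentences
# "new jersey is never hot" and "paris is never hot" it would look like:
#
#        hot   is  jersey  never  new  paris
#   0    1.0  1.0     1.0    1.0  1.0    0.0
#   1    1.0  1.0     0.0    1.0  0.0    1.0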
def count_world(data):
    # Count the words of a list of sentences: return the vocabulary, the total
    # number of words and the number of unique words
    word_count = collections.Counter()
    for sentence in data:
        word_count.update(word_tokenize(sentence))
    corpus = list(word_count.keys())
    nb_mots = sum(word_count.values())
    nb_mots_uniques = len(corpus)
    return corpus, nb_mots, nb_mots_uniques

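# Example (illustrative):
#   count_world(["new jersey is never hot", "paris is never hot"])
#   -> (['new', 'jersey', 'is', 'never', 'hot', 'paris'], 9, 6)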
def display_preprocess_results(lang, data, data_split, data_lem, data_wosw, txt_n_unique_val):
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do

    corpus = []
    nb_phrases = len(data)
    corpus, nb_mots, nb_mots_uniques = count_world(data)
    mots_lem, _, nb_mots_lem = count_world(data_lem)
    mots_wo_sw, _, nb_mots_wo_stopword = count_world(data_wosw)

    # Drop the Bag of Words columns that never appear in the selected window
    columns_with_only_zeros = txt_n_unique_val.columns[txt_n_unique_val.eq(0).all()]
    txt_n_unique_val = txt_n_unique_val.drop(columns=columns_with_only_zeros)

    tab1, tab2, tab3, tab4 = st.tabs([tr("Résumé"), tr("Tokenisation"), tr("Lemmatisation"), tr("Sans Stopword")])
    with tab1:
        st.subheader(tr("Résumé du pré-processing"))
        st.write("**"+tr("Nombre de phrases")+" : "+str(nb_phrases)+"**")
        st.write("**"+tr("Nombre de mots")+" : "+str(nb_mots)+"**")
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab2:
        st.subheader(tr("Tokenisation"))
        st.write(tr('Texte "splited":'))
        st.dataframe(pd.DataFrame(data=data_split, index=range(first_line, last_line)).head(max_lines_to_display).fillna(''), width=800)
        st.write("**"+tr("Nombre de mots uniques")+" : "+str(nb_mots_uniques)+"**")
        st.write("")
        st.write("\n**"+tr("Mots uniques")+":**")
        st.markdown(corpus[:500])
        st.write("\n**"+tr("Nombre d'apparitions de chaque mot dans chaque phrase (:red[Bag Of Words]):")+"**")
        st.dataframe(txt_n_unique_val.head(max_lines_to_display), width=800)
    with tab3:
        st.subheader(tr("Lemmatisation"))
        if lemmatize_to_do:
            st.dataframe(pd.DataFrame(data=data_lem, columns=[tr('Texte lemmatisé')], index=range(first_line, last_line)).head(max_lines_to_display), width=800)
            st.write("**"+tr("Nombre de mots uniques lemmatisés")+" : "+str(nb_mots_lem)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques lemmatisés:")+"**")
            st.markdown(mots_lem[:500])
    with tab4:
        st.subheader(tr("Sans Stopword"))
        if stopwords_to_do:
            st.dataframe(pd.DataFrame(data=data_wosw, columns=['Texte sans stopwords'], index=range(first_line, last_line)).head(max_lines_to_display), width=800)
            st.write("**"+tr("Nombre de mots uniques sans stop words")+": "+str(nb_mots_wo_stopword)+"**")
            st.write("")
            st.write("\n**"+tr("Mots uniques sans stop words")+":**")
            st.markdown(mots_wo_sw[:500])


def run():
    global max_lines, first_line, last_line, lemmatize_to_do, stopwords_to_do
    global full_txt_en, full_txt_split_en, full_txt_lem_en, full_txt_wo_stopword_en, full_df_count_word_en
    global full_txt_fr, full_txt_split_fr, full_txt_lem_fr, full_txt_wo_stopword_fr, full_df_count_word_fr

    st.write("")
    st.title(tr(title))

    st.write("## **"+tr("Explications")+" :**\n")
    st.markdown(tr(
        """
Le traitement du langage naturel permet à l'ordinateur de comprendre et de traiter les langues humaines.
Lors de notre projet, nous avons étudié le dataset small_vocab, proposé par Suzan Li, Chief Data Scientist chez Campaign Research à Toronto.
Celui-ci représente un corpus de phrases simples en anglais, et sa traduction (approximative) en français.
:red[**Small_vocab**] contient 137 860 phrases en anglais et français.
""")
        , unsafe_allow_html=True)
    st.markdown(tr(
        """
Afin de découvrir ce corpus et de préparer la traduction, nous allons effectuer un certain nombre de tâches de pré-traitement (preprocessing).
Ces tâches sont, par exemple :
""")
        , unsafe_allow_html=True)
    st.markdown(
        "* "+tr("le :red[**nettoyage**] du texte (enlever les majuscules et la ponctuation)")+"\n" +
        "* "+tr("la :red[**tokenisation**] (découpage du texte en mots)")+"\n" +
        "* "+tr("la :red[**lemmatisation**] (traitement lexical qui permet de donner une forme unique à toutes les \"variations\" d'un même mot)")+"\n" +
        "* "+tr("l'élimination des :red[**mots \"transparents\"**] (sans utilité pour la compréhension, tels que les articles).")+" \n" +
        tr("Ce prétraitement se conclut avec la construction d'un :red[**Bag Of Words**], c'est-à-dire une matrice qui compte le nombre d'apparitions de chaque mot (colonne) dans chaque phrase (ligne).")
        , unsafe_allow_html=True)

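    # Concrete (illustrative) walk-through of these steps for one sentence,
    # "The cats are sleeping.":
    #   cleaning       -> "the cats are sleeping"
    #   tokenisation   -> ["the", "cats", "are", "sleeping"]
    #   lemmatisation  -> "the cat be sleep"   (spaCy lemmas, model-dependent)
    #   stop words     -> "cat sleep"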
    st.write("## **"+tr("Paramètres")+" :**\n")
    Langue = st.radio(tr('Langue:'), ('Anglais', 'Français'), horizontal=True)
    first_line = st.slider(tr('No de la premiere ligne à analyser:'), 0, 137859)
    max_lines = st.select_slider(tr('Nombre de lignes à analyser:'),
                                 options=[1, 5, 10, 15, 100, 500, 1000, 'Max'])
    if max_lines == 'Max':
        max_lines = 137860
    if (first_line + max_lines) > 137860:
        max_lines = max(137860 - first_line, 0)

    last_line = first_line + max_lines
    if Langue == 'Anglais':
        st.dataframe(pd.DataFrame(data=full_txt_en, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    else:
        st.dataframe(pd.DataFrame(data=full_txt_fr, columns=['Texte']).loc[first_line:last_line-1].head(max_lines_to_display), width=800)
    st.write("")

    # Selected window of the corpus
    txt_en = full_txt_en[first_line:last_line]
    txt_fr = full_txt_fr[first_line:last_line]

    if not st.session_state.reCalcule:
        # Slice the precomputed preprocessing results to the same window
        txt_split_en = full_txt_split_en[first_line:last_line]
        txt_lem_en = full_txt_lem_en[first_line:last_line]
        txt_wo_stopword_en = full_txt_wo_stopword_en[first_line:last_line]
        df_count_word_en = full_df_count_word_en.loc[first_line:last_line-1]
        txt_split_fr = full_txt_split_fr[first_line:last_line]
        txt_lem_fr = full_txt_lem_fr[first_line:last_line]
        txt_wo_stopword_fr = full_txt_wo_stopword_fr[first_line:last_line]
        df_count_word_fr = full_df_count_word_fr.loc[first_line:last_line-1]

    if Langue == 'Anglais':
        st.write("## **"+tr("Préprocessing de small_vocab_en")+" :**\n")
        if max_lines > 10000:
            # Recomputing on a large window can be slow: show a status box
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
                display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
        else:
            if st.session_state.reCalcule:
                txt_en, corpus_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en, sent_len_en, sent_wo_sw_len_en, sent_lem_len_en = preprocess_txt(txt_en, 'en')
            display_preprocess_results('en', txt_en, txt_split_en, txt_lem_en, txt_wo_stopword_en, df_count_word_en)
    else:
        st.write("## **"+tr("Préprocessing de small_vocab_fr")+" :**\n")
        if max_lines > 10000:
            with st.status(":sunglasses:", expanded=True):
                if st.session_state.reCalcule:
                    txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
                display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)
        else:
            if st.session_state.reCalcule:
                txt_fr, corpus_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr, sent_len_fr, sent_wo_sw_len_fr, sent_lem_len_fr = preprocess_txt(txt_fr, 'fr')
            display_preprocess_results('fr', txt_fr, txt_split_fr, txt_lem_fr, txt_wo_stopword_fr, df_count_word_fr)