"""Streamlit app: sentiment prediction for a review plus a word cloud.

Two classifiers are applied to the text typed by the user: a pickled
scikit-learn style model (``model.pkl`` / ``vectorizer.pkl``) and a
CamemBERT sequence classifier whose weights come from ``camembertperso.pth``.
An uploaded ``.txt`` file is displayed and turned into a word cloud; an
uploaded ``.csv`` file is displayed as a table.
"""

import mimetypes
from io import BytesIO, StringIO

import joblib
import numpy as np
import pandas as pd
import spacy
import streamlit as st
import torch
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from wordcloud import WordCloud

# Upper bound on the number of tokens passed to CamemBERT; longer reviews are
# truncated instead of failing inside the model.
MAX_CAMEMBERT_TOKENS = 512

# Model Loading
# NOTE(review): Streamlit re-runs the script on user interaction, so these
# loads are presumably repeated on every rerun - consider wrapping them in a
# Streamlit resource cache if the installed version provides one.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')
camembert_model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", num_labels=2
)
state_dict = torch.load('camembertperso.pth', map_location='cpu')
# strict=False: keys missing from / unexpected in the checkpoint are ignored.
camembert_model.load_state_dict(state_dict, strict=False)
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
nlp = spacy.load("fr_core_news_sm")


# Text Processing Functions
def clean_text(text):
    """Return *text* without surrounding whitespace and in lower case."""
    return text.strip().lower()


def lemmatize_text(text):
    """Return the lemmas of every spaCy token in *text*, joined by spaces."""
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)


def _lemmas_without_stopwords(text):
    """Return the space-joined lemmas of *text*, skipping spaCy stop words."""
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc if not token.is_stop)


# Prediction Functions
def predict_label(text):
    """Classify *text* with the pickled model.

    The text is cleaned, lemmatized and vectorized before prediction.

    Returns:
        A ``(label, probability)`` tuple. ``label`` is the model's prediction
        for the single sample. ``probability`` is the logistic sigmoid of the
        model's decision-function score, i.e. a margin squashed into (0, 1);
        NOTE(review): presumably not a calibrated probability.
    """
    cleaned_text = clean_text(text)
    lemmatized_text = lemmatize_text(cleaned_text)
    vectorized_text = vectorizer.transform([lemmatized_text])
    label = model.predict(vectorized_text)[0]
    probability_score = model.decision_function(vectorized_text)[0]
    probability = 1 / (1 + np.exp(-probability_score))
    return label, probability


def predict_camembert(text):
    """Classify *text* with the CamemBERT model.

    Returns:
        A ``(prediction, probability)`` tuple: the arg-max class index and the
        softmax probability of class 1.
    """
    # Truncate so that long reviews do not exceed the model's input length.
    tokens = tokenizer.encode_plus(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_CAMEMBERT_TOKENS,
    )
    with torch.no_grad():
        outputs = camembert_model(**tokens)
    # No labels are passed, so no loss is returned and the logits are always
    # the first element of the output (any further elements would be hidden
    # states / attentions, not logits).
    logits = outputs[0]
    predictions = torch.argmax(logits, dim=1).item()
    probabilities = torch.softmax(logits, dim=1)[:, 1].item()
    return predictions, probabilities


# Display helpers
def _show_prediction(model_name, label, probability):
    """Write the verdict, raw label and probability for one classifier."""
    st.write(f'Résultats de {model_name}:')
    if label == 0:
        st.write('La review est négative.')
    else:
        st.write('La review est positive.')
    st.write(
        f'Score de prédiction ({model_name}) :',
        f'**{label}**',
        unsafe_allow_html=True,
    )
    st.write(
        f'Probabilité ({model_name}) :',
        f'**{probability:.2%}**',
        unsafe_allow_html=True,
    )


def _show_wordcloud(text):
    """Render a word cloud of the non-stop-word lemmas of *text*.

    Shows a warning instead of crashing when no usable word is left.
    """
    lemmas = _lemmas_without_stopwords(text)
    try:
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate(lemmas)
    except ValueError:
        # WordCloud raises ValueError when the text contains no word to plot
        # (empty text, only stop words, only punctuation, ...).
        st.warning('Aucun mot exploitable pour générer le nuage de mots.')
        return
    st.image(wordcloud.to_image())


# App Interface
st.title('Analyse de sentiments')
st.write('Cet outil permet de prédire si une review est positive ou négative.')

review_text = st.text_area('Saisir la review ou charger un fichier :')

if st.button('Prédire et générer le nuage de mots'):
    if not review_text.strip():
        # Nothing to classify: avoid meaningless predictions and a
        # word-cloud failure on empty input.
        st.warning('Veuillez saisir une review avant de lancer la prédiction.')
    else:
        # LinearSVC prediction and results
        label_linear_svc, probability_linear_svc = predict_label(review_text)
        _show_prediction('LinearSVC', label_linear_svc, probability_linear_svc)

        # CamemBERT prediction and results
        label_camembert, probability_camembert = predict_camembert(review_text)
        _show_prediction('Camembert', label_camembert, probability_camembert)

        # Word cloud of the lemmatized review without stop words
        _show_wordcloud(review_text)

# File upload widget
uploaded_file = st.file_uploader("Charger un fichier texte", type=["txt", "csv"])

if uploaded_file is not None:
    # Dispatch on the file extension: the uploader already restricts files to
    # .txt/.csv, and MIME guessing depends on the platform's MIME table.
    file_name = uploaded_file.name.lower()
    if file_name.endswith('.txt'):
        raw_bytes = uploaded_file.read()
        try:
            file_contents = raw_bytes.decode("utf-8")
        except UnicodeDecodeError:
            # Fall back to Latin-1, which can decode any byte sequence.
            file_contents = raw_bytes.decode("latin-1")
        st.text(file_contents)
        # Word cloud of the lemmatized file content without stop words
        _show_wordcloud(file_contents)
    elif file_name.endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        st.write(df)