# Spaces: Runtime error (page-status text captured along with the source; not part of the program)
import mimetypes
import warnings
from io import BytesIO, StringIO

import joblib
import numpy as np
import pandas as pd
import spacy
import streamlit as st
import torch
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from wordcloud import WordCloud
# Model Loading | |
# NOTE(review): joblib.load and torch.load both unpickle their input, which can
# execute arbitrary code. Only ship artifacts produced by a trusted pipeline.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Start from the base CamemBERT weights with a 2-class head, then overlay the
# checkpoint stored next to the app.
camembert_model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2)
state_dict = torch.load('camembertperso.pth', map_location='cpu')

# strict=False tolerates key mismatches; report them instead of silently
# running with layers that never received the checkpoint's weights.
load_report = camembert_model.load_state_dict(state_dict, strict=False)
if load_report.missing_keys or load_report.unexpected_keys:
    warnings.warn(
        "camembertperso.pth does not fully match the model: "
        f"missing keys={list(load_report.missing_keys)}, "
        f"unexpected keys={list(load_report.unexpected_keys)}"
    )

# Inference only: make sure dropout is disabled regardless of loader defaults.
camembert_model.eval()

tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
nlp = spacy.load("fr_core_news_sm")
# Text Processing Functions | |
def clean_text(text: str) -> str:
    """Return *text* with surrounding whitespace removed and all characters lowercased."""
    return text.strip().lower()
def lemmatize_text(text):
    """Return the lemmas of every token in *text*, joined by single spaces.

    The text is run through the module-level spaCy pipeline ``nlp``; no token
    is filtered out, so punctuation and stop words are kept.
    """
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)
# Prediction Functions | |
def predict_label(text):
    """Classify *text* with the pickled scikit-learn model.

    The text is cleaned, lemmatized and vectorized with the module-level
    ``vectorizer`` before being passed to ``model``.

    Returns:
        A ``(label, probability)`` tuple. ``label`` is the first element of
        ``model.predict``. ``probability`` is the logistic squashing of the
        first ``decision_function`` score into (0, 1); it is a monotone
        transform of the decision score, not a calibrated probability.
    """
    cleaned_text = clean_text(text)
    lemmatized_text = lemmatize_text(cleaned_text)
    vectorized_text = vectorizer.transform([lemmatized_text])

    label = model.predict(vectorized_text)[0]
    decision_score = model.decision_function(vectorized_text)[0]

    # sigmoid(x) == 0.5 * (1 + tanh(x / 2)); the tanh form cannot overflow,
    # unlike 1 / (1 + exp(-x)) for large negative scores.
    probability = 0.5 * (1.0 + np.tanh(0.5 * decision_score))
    return label, probability
def predict_camembert(text):
    """Classify *text* with the CamemBERT sequence-classification model.

    Returns:
        A ``(prediction, probability)`` tuple: the index of the highest logit
        as an ``int``, and the softmax probability of class index 1 as a
        ``float``.
    """
    # Truncate long reviews: without it, inputs beyond the model's position
    # range fail inside the embedding layer.
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    with torch.no_grad():
        outputs = camembert_model(**tokens)

    # No labels are passed, so the logits are always the first output. The
    # second entry, when present, is hidden states/attentions, never logits.
    logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]

    predictions = torch.argmax(logits, dim=1).item()
    probabilities = torch.softmax(logits, dim=1)[:, 1].item()
    return predictions, probabilities
# App Interface | |
def _show_prediction(model_name, label, probability):
    """Write the verdict, raw label and class-1 probability of one model to the page."""
    st.write(f'Résultats de {model_name}:')
    if label == 0:
        st.write('La review est négative.')
    else:
        st.write('La review est positive.')
    st.write(f'Score de prédiction ({model_name}) :', f'**{label}**', unsafe_allow_html=True)
    st.write(f'Probabilité ({model_name}) :', f'**{probability:.2%}**', unsafe_allow_html=True)


def _show_wordcloud(text):
    """Render a word cloud of the non-stop-word lemmas of *text*."""
    doc = nlp(text)
    lemmas_without_stopwords = " ".join(token.lemma_ for token in doc if not token.is_stop)
    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(
            lemmas_without_stopwords
        )
    except ValueError:
        # WordCloud raises ValueError when no drawable word is left
        # (empty text, or text made only of stop words).
        st.info('Pas assez de mots pour générer un nuage de mots.')
        return
    st.image(wordcloud.to_image())


st.title('Analyse de sentiments')
st.write('Cet outil permet de prédire si une review est positive ou négative.')
review_text = st.text_area('Saisir la review ou charger un fichier :')

if st.button('Prédire et générer le nuage de mots'):
    if not review_text.strip():
        # Nothing to classify: avoid sending empty input to both models.
        st.warning('Veuillez saisir une review avant de lancer la prédiction.')
    else:
        label_linear_svc, probability_linear_svc = predict_label(review_text)
        _show_prediction('LinearSVC', label_linear_svc, probability_linear_svc)

        label_camembert, probability_camembert = predict_camembert(review_text)
        _show_prediction('Camembert', label_camembert, probability_camembert)

        _show_wordcloud(review_text)

# File upload: .txt files are shown with a word cloud, .csv files as a table.
uploaded_file = st.file_uploader("Charger un fichier texte", type=["txt", "csv"])
if uploaded_file is not None:
    # Dispatch on the extension itself: the MIME type guessed for ".csv"
    # depends on the host platform's type registry.
    file_name = uploaded_file.name.lower()
    if file_name.endswith('.txt'):
        try:
            file_contents = uploaded_file.read().decode("utf-8")
        except UnicodeDecodeError:
            st.error("Impossible de décoder le fichier : l'encodage UTF-8 est attendu.")
        else:
            st.text(file_contents)
            _show_wordcloud(file_contents)
    elif file_name.endswith('.csv'):
        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
            st.error('Impossible de lire le fichier CSV.')
        else:
            st.write(df)