import streamlit as st
import pandas as pd
import plotly.express as px
import random
import requests
from streamlit_lottie import st_lottie

st.set_page_config(layout="wide")


def load_lottie_url(url: str):
    r = requests.get(url)
    if r.status_code != 200:
        return None
    return r.json()


# Load the data
@st.cache_data
def load_data():
    return pd.read_csv('predictions.csv')


# Get the data
df = load_data()
df.dropna(inplace=True)
# Reset the index so that the label lookups below (df.loc[tweet_index, ...])
# cannot hit an index that dropna() removed
df.reset_index(drop=True, inplace=True)

# Custom CSS
custom_css = """
"""

# Inject the custom CSS into the app
st.markdown(custom_css, unsafe_allow_html=True)

coll1, coll2, coll3 = st.columns([1, 4, 1])
with coll2:
    col1, col2 = st.columns([2, 5])
    with col1:
        lottie_animation_1 = "https://assets1.lottiefiles.com/packages/lf20_5mhyg2hz.json"
        lottie_anime_json = load_lottie_url(lottie_animation_1)
        st_lottie(lottie_anime_json, key="logo")
    with col2:
        st.title('Tweet Analysis on Cancer and Cannabis: Exploring Opinions')

# Map the numeric labels to text labels
label_map = {0: 'for', 1: 'against', 2: 'neutral'}
df['prediction_text'] = df['prediction'].map(label_map)

# Section header
st.header('Overview and Visualization')
st.text("")  # Add an empty line

# Mapping dictionary to replace the prediction_text values
mapping = {'for': 'PRO', 'against': 'ANTI', 'neutral': 'NEUTRAL'}

# Apply the mapping to the prediction_text column
df['prediction_text'] = df['prediction_text'].map(mapping)


def pick_random_tweet():
    # Pick a random tweet
    tweet_index = random.randint(0, len(df) - 1)
    current_tweet = df.loc[tweet_index, 'text']
    current_prediction = df.loc[tweet_index, 'prediction_text']

    # Create columns for the tweet and the button
    tweet_col, button_col = st.columns([3, 1])  # Adjust the proportions as needed

    with tweet_col:
        # Display the random tweet
        st.markdown("Random Tweet:")
        st.info(current_tweet)
        st.success(f"Label: {current_prediction}")

    with button_col:
        # Pressing the button triggers a Streamlit rerun, which re-executes the
        # script and draws a new random tweet; no recursive call is needed
        st.button("Choose another tweet", key="choose_another_tweet")


pick_random_tweet()


def plot_label_distribution():
    # Bar plot of the label counts with Plotly
    label_counts = df['prediction_text'].value_counts().reset_index()
    label_counts.columns = ['label', 'count']
    fig = px.bar(label_counts, x='label', y='count', text='count',
                 title="Label Distribution",
                 labels={'label': 'Label', 'count': 'Count of tweets'},
                 color='label')
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
    st.plotly_chart(fig, use_container_width=True)


# Display the bar plot
plot_label_distribution()

# Part 2: methodology section
st.header('Opinion Classification')
st.text("")  # Add an empty line

st.write("""
First, we developed and used a labeling script designed to classify tweets into three categories:
PRO, ANTI, and NEUTRAL. In total, 500 tweets were labeled, providing a solid foundation for training
and testing our opinion classification models. We adopted an 80/20 train/test split to maximize
learning while still validating the model's performance effectively. This relatively small dataset
was chosen because time constraints did not allow for more extensive labeling.
""")

st.subheader('BERT Fine-Tuning')
st.write("""
To analyze the opinions expressed in the tweets, we opted to fine-tune a pre-trained BERT
(Bidirectional Encoder Representations from Transformers) model. The BERT architecture is well
suited to our problem because it processes words in their bidirectional context, which lets it
capture the nuances of natural language. BERT is also widely documented, which made the
fine-tuning process easier.
""")

st.write("Fine-tuning consists of adjusting the parameters of a pre-trained model on a specific "
         "dataset (here, 450 tweets). In our study, we encoded the opinion categories as integer "
         "labels for training: 0 for 'PRO', 1 for 'ANTI', and 2 for 'NEUTRAL'.")

st.write("""
The results of the model training are presented below. They show how the loss and the learning
rate evolved across the epochs.
""")

# Model training details
data_training = {
    "Epoch": [1, 2, 3],
    "Learning Rate": ["{:.1e}".format(3.725e-5), "{:.1e}".format(1.863e-5), "{:.1e}".format(0.0)],
    "Loss": [0.3329, 0.3242, 0.195],
}
df_training = pd.DataFrame(data_training)
st.table(df_training)
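# ---------------------------------------------------------------------------
# Illustrative sketch of the fine-tuning setup described above. It is NOT
# called by this app. The checkpoint ('bert-base-uncased'), the file name
# ('labeled_tweets.csv'), the column names, and the hyperparameters are
# assumptions; the text above does not specify them. Trainer's default linear
# learning-rate decay is consistent with the decreasing rates in the table.
# ---------------------------------------------------------------------------
def finetune_bert_sketch():
    import torch
    from sklearn.model_selection import train_test_split
    from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                              Trainer, TrainingArguments)

    labeled = pd.read_csv('labeled_tweets.csv')      # hypothetical labeled file
    label2id = {'PRO': 0, 'ANTI': 1, 'NEUTRAL': 2}   # same integer mapping as above
    texts = labeled['text'].tolist()
    labels = [label2id[lab] for lab in labeled['label']]

    # 80/20 train/test split, as described in the methodology
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42, stratify=labels)

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    class TweetDataset(torch.utils.data.Dataset):
        def __init__(self, texts, labels):
            self.encodings = tokenizer(texts, truncation=True, padding=True)
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, i):
            item = {k: torch.tensor(v[i]) for k, v in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[i])
            return item

    model = AutoModelForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=3)
    args = TrainingArguments(output_dir='bert_opinion', num_train_epochs=3,
                             learning_rate=5e-5, per_device_train_batch_size=16)
    Trainer(model=model, args=args,
            train_dataset=TweetDataset(train_texts, train_labels),
            eval_dataset=TweetDataset(test_texts, test_labels)).train()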
# Results section
st.header('Results')
st.text("")  # Add an empty line

# Model accuracy, per class and overall
data_accuracy = {
    "Accuracies": ["Global Accuracy", "Class 0 PRO Accuracy",
                   "Class 1 ANTI Accuracy", "Class 2 NEUTRAL Accuracy"],
    "Values": [0.86, 0.971, 0.500, 0.643],
}
df_accuracy = pd.DataFrame(data_accuracy)
st.table(df_accuracy)

# Perspectives for improvement
st.subheader('Perspectives for Improvement')
st.text("")  # Add an empty line
st.write("""
The performance of our model was significantly affected by class imbalance in the dataset, with the
ANTI class being underrepresented. To address this issue, we are considering two main improvement
strategies:
""")
st.write("""
1. **Data Augmentation**: artificially generating new training examples by slightly modifying
existing tweets, for example by swapping in synonyms, changing certain words, or rephrasing.
2. **Loss Functions That Handle Class Imbalance**: adapting the loss function to account for class
imbalance can also be an effective approach.
""")
st.write("""
In particular, loss functions such as **Weighted Cross-Entropy Loss** and **Focal Loss** are being
considered for their effectiveness in addressing class imbalance.
""")
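# ---------------------------------------------------------------------------
# Minimal sketch of the synonym-replacement augmentation mentioned in point 1
# above. It is not called by the app; WordNet via NLTK and the replacement
# probability `p` are illustrative choices, not the project's actual method.
# ---------------------------------------------------------------------------
def augment_with_synonyms(text: str, p: float = 0.15) -> str:
    from nltk.corpus import wordnet  # requires a prior nltk.download('wordnet')

    augmented = []
    for word in text.split():
        synsets = wordnet.synsets(word)
        if synsets and random.random() < p:
            # Pick a lemma from the first synset that differs from the original word
            candidates = [lem.name().replace('_', ' ')
                          for lem in synsets[0].lemmas()
                          if lem.name().lower() != word.lower()]
            augmented.append(random.choice(candidates) if candidates else word)
        else:
            augmented.append(word)
    return ' '.join(augmented)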
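# ---------------------------------------------------------------------------
# Minimal PyTorch sketch of the two loss functions discussed above, shown for
# illustration only. The class weights are placeholders; in practice they
# would be derived from the label counts (e.g. inversely proportional to
# class frequency, so the underrepresented ANTI class weighs more).
# ---------------------------------------------------------------------------
def imbalance_loss_sketch():
    import torch
    import torch.nn.functional as F

    # 1) Weighted cross-entropy: rarer classes get a larger weight
    class_weights = torch.tensor([1.0, 3.0, 1.5])  # placeholder weights for PRO / ANTI / NEUTRAL
    weighted_ce = torch.nn.CrossEntropyLoss(weight=class_weights)

    # 2) Focal loss: down-weights easy examples so training focuses on hard ones
    def focal_loss(logits, targets, gamma=2.0):
        ce = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce)  # probability assigned to the true class
        return ((1 - pt) ** gamma * ce).mean()

    logits = torch.randn(8, 3)            # fake batch: 8 tweets, 3 classes
    targets = torch.randint(0, 3, (8,))
    print(weighted_ce(logits, targets).item(), focal_loss(logits, targets).item())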