Upload 2 files
- app.py +52 -3
- requirements.txt +5 -1
app.py
CHANGED
@@ -2,16 +2,21 @@ import streamlit as st
 import pandas as pd
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import pipeline
-
+from fuzzywuzzy import fuzz
+from sklearn.feature_extraction.text import TfidfVectorizer
 import torch.nn.functional as F
 import torch
 import io
 import base64
 from stqdm import stqdm
+import nltk
 
+from nltk.corpus import stopwords
+nltk.download('stopwords')
 import matplotlib.pyplot as plt
 import numpy as np
 
+stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
 
 # Define the model and tokenizer
 model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
@@ -42,6 +47,19 @@ def get_table_download_link(df):
     b64 = base64.b64encode(csv.encode()).decode()
     return f'<a href="data:file/csv;base64,{b64}" download="data.csv">Download csv file</a>'
 
+def filter_dataframe(df, review_column, filter_words):
+    # Return full DataFrame if filter_words is empty or contains only spaces
+    if not filter_words or all(word.isspace() for word in filter_words):
+        return df
+    filter_scores = df[review_column].apply(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]))
+    return df[filter_scores > 70]  # Adjust this threshold as necessary
+
+
+
+def process_filter_words(filter_words_input):
+    filter_words = [word.strip() for word in filter_words_input.split(',')]
+    return filter_words
+
 
 # Function for classifying with the new model
 def classify_with_new_classes(reviews, class_names):
@@ -78,7 +96,11 @@ def main():
         review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
         df[review_column] = df[review_column].astype(str)
 
+
+        filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')  # New input field for filter words
+        filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)  # Process the filter words
         class_names = st.text_input('Enter the possible class names separated by comma')  # New input field for class names
+        df = filter_dataframe(df, review_column, filter_words)  # Filter the DataFrame
     except Exception as e:
         st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
         return
@@ -109,6 +131,8 @@ def main():
 
 
 
+
+
 def process_reviews(df, review_column, class_names):
     with st.spinner('Classifying reviews...'):
         progress_bar = st.progress(0)
@@ -134,7 +158,9 @@ def process_reviews(df, review_column, class_names):
             class_scores_dict[name] = [score[i] for score in class_scores]
 
     # Add a new column with the class that has the highest score
-
+    if class_names and not all(name.isspace() for name in class_names):
+        df['Highest Class'] = df[class_names].idxmax(axis=1)
+
 
     df_new = df.copy()
     df_new['raw_scores'] = raw_scores
@@ -192,14 +218,37 @@ def display_dataframe(df, df_display):
     )
 
     st.dataframe(df_display)
+
+def important_words(reviews, num_words=5):
+    if len(reviews) == 0:
+        return []
+    vectorizer = TfidfVectorizer(stop_words=stopwords_list, max_features=10000)
+    vectors = vectorizer.fit_transform(reviews)
+    features = vectorizer.get_feature_names_out()
+    indices = np.argsort(vectorizer.idf_)[::-1]
+    top_features = [features[i] for i in indices[:num_words]]
+    return top_features
 
 def display_ratings(df, review_column):
     cols = st.columns(5)
 
     for i in range(1, 6):
-
+        rating_reviews = df[df['Rating'] == i][review_column]
+        top_words = important_words(rating_reviews)
+
+        rating_counts = rating_reviews.shape[0]
         cols[i-1].markdown(f"### {rating_counts}")
         cols[i-1].markdown(f"{'⭐' * i}")
+
+        # Display the most important words for each rating
+        cols[i-1].markdown(f"#### Most Important Words:")
+        if top_words:
+            for word in top_words:
+                cols[i-1].markdown(f"**{word}**")
+        else:
+            cols[i-1].markdown("No important words to display")
+
+
 
 
 
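The new filter_dataframe helper keeps a row only when fuzz.token_set_ratio scores it above 70 against at least one of the comma-separated filter words, and process_filter_words just splits and strips that input. A minimal standalone sketch of that scoring, using hypothetical sample reviews rather than anything from the app:

# Sketch of the fuzzy filter added above; the sample data is hypothetical.
import pandas as pd
from fuzzywuzzy import fuzz

df = pd.DataFrame({'review': ['Battery life is great', 'Awful screen glare', 'Fast delivery']})
filter_words = [word.strip() for word in 'battery, screen'.split(',')]  # what process_filter_words produces

# Best token_set_ratio match per row, mirroring filter_dataframe
scores = df['review'].apply(lambda x: max(fuzz.token_set_ratio(x, word) for word in filter_words))
print(df[scores > 70])  # only the battery and screen reviews clear the 70 threshold

Raising or lowering the 70 cut-off trades recall for precision, which is why the committed code marks it as a threshold to adjust.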
requirements.txt
CHANGED
@@ -5,4 +5,8 @@ torch
 stqdm
 openpyxl
 wordcloud
-matplotlib
+matplotlib
+fuzzywuzzy
+scikit-learn
+nltk
+numpy
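The added requirements back the new features in app.py: fuzzywuzzy supports the filter scoring, scikit-learn and nltk support the TF-IDF keyword step in important_words, and numpy, already imported by the app, is now listed explicitly. A minimal sketch of that keyword step with hypothetical reviews; it ranks vocabulary terms by descending idf_ (rarest terms first), as the committed important_words does:

# Sketch of the TF-IDF keyword ranking used by important_words; sample reviews are hypothetical.
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
reviews = ['battery drains too fast', 'screen is bright and sharp', 'battery replacement was easy']

vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=10000)
vectorizer.fit_transform(reviews)

# Highest idf_ first: terms appearing in the fewest reviews rank as "most important"
order = np.argsort(vectorizer.idf_)[::-1]
terms = vectorizer.get_feature_names_out()
print([terms[i] for i in order[:5]])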