Spaces:

BulatF
/

StreamlitSentiment

Runtime error

App Files Files Community

BulatF committed on Jul 6, 2023

Commit

4ca73d1

•

1 Parent(s): 8a6b406

Upload 2 files

Browse files

Files changed (2) hide show

app.py +35 -50
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -16,24 +16,13 @@ nltk.download('stopwords')
 import matplotlib.pyplot as plt
 import numpy as np
 stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
-st.set_page_config(layout="wide")
-@st.cache_resource
-def load_model_and_tokenizer(model_name):
-    model = AutoModelForSequenceClassification.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    return model, tokenizer
-model, tokenizer = load_model_and_tokenizer('nlptown/bert-base-multilingual-uncased-sentiment')
-@st.cache_resource
-def load_pipeline():
-    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
-    return classifier
-classifier = load_pipeline()
 # Import the new model and tokenizer
@@ -41,15 +30,12 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
 #defs
-def classify_reviews(reviews, batch_size=100):
-    probabilities = []
-    for i in range(0, len(reviews), batch_size):
-        batch_reviews = reviews[i:i+batch_size]
-        inputs = tokenizer(batch_reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
-        outputs = model(**inputs)
-        probabilities.extend(F.softmax(outputs.logits, dim=1).tolist())
     return probabilities
 def top_rating(scores):
     return scores.index(max(scores)) + 1
@@ -65,13 +51,8 @@ def filter_dataframe(df, review_column, filter_words):
     # Return full DataFrame if filter_words is empty or contains only spaces
     if not filter_words or all(word.isspace() for word in filter_words):
         return df
-    # Use multiprocessing to speed up fuzz token set ratio calculation
-    from multiprocessing import Pool
-    with Pool() as pool:
-        filter_scores = pool.map(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]), df[review_column])
-    return df[np.array(filter_scores) > 70]  # Adjust this threshold as necessary
@@ -81,15 +62,16 @@ def process_filter_words(filter_words_input):
 # Function for classifying with the new model
-def classify_with_new_classes(reviews, class_names, batch_size=100):
     class_scores = []
-    for i in range(0, len(reviews), batch_size):
-        batch_reviews = reviews[i:i+batch_size]
-        result = classifier(batch_reviews, class_names)
         scores_dict = dict(zip(result['labels'], result['scores']))
         # Reorder scores to match the original class_names order
         scores = [scores_dict[name] for name in class_names]
-        class_scores.extend(scores)
     return class_scores
@@ -101,42 +83,46 @@ def main():
     file = st.file_uploader("Upload an excel file", type=['xlsx'])
     review_column = None
     df = None
-    class_names = None
     if file is not None:
         try:
-            chunk_size = 10000  # adjust this value depending on your available memory
-            df = pd.concat(pd.read_excel(file, chunksize=chunk_size))
             df = df.dropna(how='all')
             df = df.replace(r'^\s*$', np.nan, regex=True)
             df = df.dropna(how='all')
             review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
             df[review_column] = df[review_column].astype(str)
-            filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')
-            filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)
-            class_names = st.text_input('Enter the possible class names separated by comma')
-            df = filter_dataframe(df, review_column, filter_words)
         except Exception as e:
             st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
             return
     start_button = st.button('Start Analysis')
     if start_button and df is not None:
         df = df[df[review_column].notna()]
         df = df[df[review_column].str.strip() != '']
-        class_names = [name.strip() for name in class_names.split(',')]
-        for name in class_names:
             if name not in df.columns:
                 df[name] = 0.0
         if review_column in df.columns:
             with st.spinner('Performing sentiment analysis...'):
                 df, df_display = process_reviews(df, review_column, class_names)
-            display_ratings(df, review_column)
             display_dataframe(df, df_display)
         else:
             st.write(f'No column named "{review_column}" found in the uploaded file.')
@@ -147,7 +133,6 @@ def main():
 def process_reviews(df, review_column, class_names):
     with st.spinner('Classifying reviews...'):
         progress_bar = st.progress(0)

 import matplotlib.pyplot as plt
 import numpy as np
 stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
+# Define the model and tokenizer
+model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+st.set_page_config(layout="wide")
 # Import the new model and tokenizer
 #defs
+def classify_reviews(reviews):
+    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
+    outputs = model(**inputs)
+    probabilities = F.softmax(outputs.logits, dim=1).tolist()
     return probabilities
 def top_rating(scores):
     return scores.index(max(scores)) + 1
     # Return full DataFrame if filter_words is empty or contains only spaces
     if not filter_words or all(word.isspace() for word in filter_words):
         return df
+    filter_scores = df[review_column].apply(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]))
+    return df[filter_scores > 70]  # Adjust this threshold as necessary
 # Function for classifying with the new model
+def classify_with_new_classes(reviews, class_names):
     class_scores = []
+    for review in reviews:
+        result = classifier(review, class_names)
         scores_dict = dict(zip(result['labels'], result['scores']))
         # Reorder scores to match the original class_names order
         scores = [scores_dict[name] for name in class_names]
+        class_scores.append(scores)
     return class_scores
     file = st.file_uploader("Upload an excel file", type=['xlsx'])
     review_column = None
     df = None
+    class_names = None  # New variable for class names
     if file is not None:
         try:
+            df = pd.read_excel(file)
+            # Drop rows where all columns are NaN
             df = df.dropna(how='all')
+            # Replace blank spaces with NaN, then drop rows where all columns are NaN again
             df = df.replace(r'^\s*$', np.nan, regex=True)
             df = df.dropna(how='all')
             review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
             df[review_column] = df[review_column].astype(str)
+            filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')  # New input field for filter words
+            filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)  # Process the filter words
+            class_names = st.text_input('Enter the possible class names separated by comma')  # New input field for class names
+            df = filter_dataframe(df, review_column, filter_words)  # Filter the DataFrame
         except Exception as e:
             st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
             return
     start_button = st.button('Start Analysis')
     if start_button and df is not None:
+        # Drop rows with NaN or blank values in the review_column
         df = df[df[review_column].notna()]
         df = df[df[review_column].str.strip() != '']
+        class_names = [name.strip() for name in class_names.split(',')]  # Split class names into a list
+        for name in class_names:  # Add a new column for each class name
             if name not in df.columns:
                 df[name] = 0.0
         if review_column in df.columns:
             with st.spinner('Performing sentiment analysis...'):
                 df, df_display = process_reviews(df, review_column, class_names)
+            display_ratings(df, review_column)  # updated this line
             display_dataframe(df, df_display)
         else:
             st.write(f'No column named "{review_column}" found in the uploaded file.')
 def process_reviews(df, review_column, class_names):
     with st.spinner('Classifying reviews...'):
         progress_bar = st.progress(0)

requirements.txt CHANGED Viewed

@@ -9,5 +9,4 @@ matplotlib
 fuzzywuzzy
 scikit-learn
 nltk
-numpy
-lime

 fuzzywuzzy
 scikit-learn
 nltk
+numpy