BulatF committed on
Commit
5580d32
1 Parent(s): 12a4176

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -14
app.py CHANGED
@@ -41,12 +41,15 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
41
 
42
 
43
  #defs
44
- def classify_reviews(reviews):
45
- inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
46
- outputs = model(**inputs)
47
- probabilities = F.softmax(outputs.logits, dim=1).tolist()
 
 
 
48
  return probabilities
49
-
50
  def top_rating(scores):
51
  return scores.index(max(scores)) + 1
52
 
@@ -62,8 +65,13 @@ def filter_dataframe(df, review_column, filter_words):
62
  # Return full DataFrame if filter_words is empty or contains only spaces
63
  if not filter_words or all(word.isspace() for word in filter_words):
64
  return df
65
- filter_scores = df[review_column].apply(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]))
66
- return df[filter_scores > 70] # Adjust this threshold as necessary
 
 
 
 
 
67
 
68
 
69
 
@@ -73,16 +81,15 @@ def process_filter_words(filter_words_input):
73
 
74
 
75
  # Function for classifying with the new model
76
- def classify_with_new_classes(reviews, class_names):
77
  class_scores = []
78
-
79
- for review in reviews:
80
- result = classifier(review, class_names)
81
  scores_dict = dict(zip(result['labels'], result['scores']))
82
  # Reorder scores to match the original class_names order
83
  scores = [scores_dict[name] for name in class_names]
84
- class_scores.append(scores)
85
-
86
  return class_scores
87
 
88
 
@@ -98,7 +105,9 @@ def main():
98
 
99
  if file is not None:
100
  try:
101
- df = pd.read_excel(file)
 
 
102
  # Drop rows where all columns are NaN
103
  df = df.dropna(how='all')
104
  # Replace blank spaces with NaN, then drop rows where all columns are NaN again
 
41
 
42
 
43
  #defs
44
def classify_reviews(reviews, batch_size=100):
    """Score *reviews* with the sentiment model in fixed-size batches.

    Args:
        reviews: list of review strings to classify.
        batch_size: number of reviews tokenized and scored per forward pass.

    Returns:
        A list with one softmax probability list per review.
    """
    probabilities = []
    total = len(reviews)
    start = 0
    while start < total:
        batch = reviews[start:start + batch_size]
        # Tokenize the whole batch at once; pad/truncate to the model's 512-token limit.
        encoded = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        logits = model(**encoded).logits
        probabilities.extend(F.softmax(logits, dim=1).tolist())
        start += batch_size
    return probabilities
52
+
53
def top_rating(scores):
    """Return the 1-based position of the first maximum value in *scores*."""
    best = max(scores)
    for position, value in enumerate(scores, start=1):
        if value == best:
            return position
55
 
 
65
  # Return full DataFrame if filter_words is empty or contains only spaces
66
  if not filter_words or all(word.isspace() for word in filter_words):
67
  return df
68
+
+ # NOTE: multiprocessing.Pool.map cannot pickle a lambda, so the original
+ # pool.map call would raise PicklingError at runtime; score the reviews with
+ # a plain vectorized apply instead (this also drops the unneeded np dependency).
+ filter_scores = df[review_column].apply(lambda x: max(fuzz.token_set_ratio(x, word) for word in filter_words))
+ return df[filter_scores > 70] # Adjust this threshold as necessary
74
+
75
 
76
 
77
 
 
81
 
82
 
83
  # Function for classifying with the new model
84
def classify_with_new_classes(reviews, class_names, batch_size=100):
    """Zero-shot classify *reviews* against *class_names* in batches.

    Args:
        reviews: list of review strings.
        class_names: candidate labels; also fixes the score order in the output.
        batch_size: number of reviews sent to the pipeline per call.

    Returns:
        A list with one score-list per review, each ordered to match class_names.
    """
    class_scores = []
    for i in range(0, len(reviews), batch_size):
        batch_reviews = reviews[i:i+batch_size]
        results = classifier(batch_reviews, class_names)
        # The zero-shot pipeline returns a single dict for one input but a list
        # of dicts for a batch; the original indexed results['labels'] directly,
        # which raises TypeError on a batch. Normalize to a list and iterate.
        if isinstance(results, dict):
            results = [results]
        for result in results:
            scores_dict = dict(zip(result['labels'], result['scores']))
            # Reorder scores to match the original class_names order
            scores = [scores_dict[name] for name in class_names]
            # append (not extend): keep one score-list per review instead of
            # flattening all batches into a single flat list
            class_scores.append(scores)
    return class_scores
94
 
95
 
 
105
 
106
  if file is not None:
107
  try:
108
+ chunk_size = 10000 # adjust this value depending on your available memory
109
+ df = pd.concat(pd.read_excel(file, chunksize=chunk_size))
110
+
111
  # Drop rows where all columns are NaN
112
  df = df.dropna(how='all')
113
  # Replace blank spaces with NaN, then drop rows where all columns are NaN again