Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -41,12 +41,15 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
|
|
41 |
|
42 |
|
43 |
#defs
|
44 |
-
def classify_reviews(reviews):
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
48 |
return probabilities
|
49 |
-
|
50 |
def top_rating(scores):
    """Return the 1-based position of the largest value in ``scores``.

    Args:
        scores: A non-empty list of comparable values (e.g. the per-class
            probabilities for one review).

    Returns:
        int: Index of the first maximum, plus one (so index 0 maps to 1).

    Raises:
        ValueError: If ``scores`` is empty.
    """
    # Fail with a clear message instead of the generic "max() arg is an
    # empty sequence"; the exception type is unchanged for callers.
    if not scores:
        raise ValueError("top_rating() requires at least one score")
    # list.index returns the first occurrence, so ties resolve to the
    # lowest position.
    return scores.index(max(scores)) + 1
|
52 |
|
@@ -62,8 +65,13 @@ def filter_dataframe(df, review_column, filter_words):
|
|
62 |
# Return full DataFrame if filter_words is empty or contains only spaces
|
63 |
if not filter_words or all(word.isspace() for word in filter_words):
|
64 |
return df
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
|
69 |
|
@@ -73,16 +81,15 @@ def process_filter_words(filter_words_input):
|
|
73 |
|
74 |
|
75 |
# Function for classifying with the new model
|
76 |
-
def classify_with_new_classes(reviews, class_names):
|
77 |
class_scores = []
|
78 |
-
|
79 |
-
|
80 |
-
result = classifier(
|
81 |
scores_dict = dict(zip(result['labels'], result['scores']))
|
82 |
# Reorder scores to match the original class_names order
|
83 |
scores = [scores_dict[name] for name in class_names]
|
84 |
-
class_scores.
|
85 |
-
|
86 |
return class_scores
|
87 |
|
88 |
|
@@ -98,7 +105,9 @@ def main():
|
|
98 |
|
99 |
if file is not None:
|
100 |
try:
|
101 |
-
|
|
|
|
|
102 |
# Drop rows where all columns are NaN
|
103 |
df = df.dropna(how='all')
|
104 |
# Replace blank spaces with NaN, then drop rows where all columns are NaN again
|
|
|
41 |
|
42 |
|
43 |
#defs
|
44 |
+
def classify_reviews(reviews, batch_size=100):
    """Return class probabilities for each review, computed in batches.

    Relies on the module-level ``tokenizer``, ``model`` and ``F`` objects
    defined elsewhere in this file (presumably a Hugging Face tokenizer /
    sequence-classification model and ``torch.nn.functional`` -- confirm
    against the file header).

    Args:
        reviews: Sliceable sequence of review texts.
        batch_size: Number of reviews tokenized and scored per forward pass.

    Returns:
        list[list[float]]: One row of softmax probabilities per review, in
        input order. Empty input yields an empty list.
    """
    # Local import so the block does not depend on how the file imports torch.
    import torch

    probabilities = []
    # Inference only: disabling autograd avoids building a gradient graph
    # for every batch, which otherwise inflates memory use for no benefit.
    with torch.no_grad():
        for i in range(0, len(reviews), batch_size):
            batch_reviews = reviews[i:i+batch_size]
            # Truncate/pad so every batch forms a rectangular tensor of at
            # most 512 tokens per review.
            inputs = tokenizer(batch_reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)
            # Softmax over dim=1 turns each row of logits into probabilities.
            probabilities.extend(F.softmax(outputs.logits, dim=1).tolist())
    return probabilities
|
52 |
+
|
53 |
def top_rating(scores):
    """Return the 1-based position of the largest value in ``scores``.

    Args:
        scores: A non-empty list of comparable values (e.g. the per-class
            probabilities for one review).

    Returns:
        int: Index of the first maximum, plus one (so index 0 maps to 1).

    Raises:
        ValueError: If ``scores`` is empty.
    """
    # Fail with a clear message instead of the generic "max() arg is an
    # empty sequence"; the exception type is unchanged for callers.
    if not scores:
        raise ValueError("top_rating() requires at least one score")
    # list.index returns the first occurrence, so ties resolve to the
    # lowest position.
    return scores.index(max(scores)) + 1
|
55 |
|
|
|
65 |
# Return full DataFrame if filter_words is empty or contains only spaces
|
66 |
if not filter_words or all(word.isspace() for word in filter_words):
|
67 |
return df
|
68 |
+
|
69 |
+
# Use multiprocessing to speed up fuzz token set ratio calculation
|
70 |
+
from multiprocessing import Pool
|
71 |
+
with Pool() as pool:
|
72 |
+
filter_scores = pool.map(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]), df[review_column])
|
73 |
+
return df[np.array(filter_scores) > 70] # Adjust this threshold as necessary
|
74 |
+
|
75 |
|
76 |
|
77 |
|
|
|
81 |
|
82 |
|
83 |
# Function for classifying with the new model
|
84 |
+
def classify_with_new_classes(reviews, class_names, batch_size=100):
    """Score each review against ``class_names`` with the zero-shot classifier.

    Uses the module-level ``classifier`` pipeline defined elsewhere in this
    file.

    Args:
        reviews: Sliceable sequence of review texts.
        class_names: Candidate labels; also fixes the column order of the
            returned scores.
        batch_size: Number of reviews sent to the pipeline per call.

    Returns:
        list[list[float]]: One row per review, holding the score for each
        label in ``class_names`` order. Empty input yields an empty list.
    """
    class_scores = []
    for i in range(0, len(reviews), batch_size):
        batch_reviews = reviews[i:i+batch_size]
        results = classifier(batch_reviews, class_names)
        # For a list of inputs the pipeline returns one dict per review;
        # some versions collapse a single input to a bare dict. Normalize
        # so both shapes are handled the same way.
        if isinstance(results, dict):
            results = [results]
        for result in results:
            # The pipeline orders labels by descending score, so map them
            # back by name before reading them out.
            scores_dict = dict(zip(result['labels'], result['scores']))
            # Reorder scores to match the original class_names order
            scores = [scores_dict[name] for name in class_names]
            # Append (not extend) to keep one score row per review.
            class_scores.append(scores)
    return class_scores
|
94 |
|
95 |
|
|
|
105 |
|
106 |
if file is not None:
|
107 |
try:
|
108 |
+
chunk_size = 10000 # adjust this value depending on your available memory
|
109 |
+
df = pd.concat(pd.read_excel(file, chunksize=chunk_size))
|
110 |
+
|
111 |
# Drop rows where all columns are NaN
|
112 |
df = df.dropna(how='all')
|
113 |
# Replace blank spaces with NaN, then drop rows where all columns are NaN again
|