BulatF committed on
Commit
5580d32
1 Parent(s): 12a4176

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -14
app.py CHANGED
@@ -41,12 +41,15 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
41
 
42
 
43
  #defs
44
- def classify_reviews(reviews):
45
- inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
46
- outputs = model(**inputs)
47
- probabilities = F.softmax(outputs.logits, dim=1).tolist()
 
 
 
48
  return probabilities
49
-
50
  def top_rating(scores):
51
  return scores.index(max(scores)) + 1
52
 
@@ -62,8 +65,13 @@ def filter_dataframe(df, review_column, filter_words):
62
  # Return full DataFrame if filter_words is empty or contains only spaces
63
  if not filter_words or all(word.isspace() for word in filter_words):
64
  return df
65
- filter_scores = df[review_column].apply(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]))
66
- return df[filter_scores > 70] # Adjust this threshold as necessary
 
 
 
 
 
67
 
68
 
69
 
@@ -73,16 +81,15 @@ def process_filter_words(filter_words_input):
73
 
74
 
75
  # Function for classifying with the new model
76
- def classify_with_new_classes(reviews, class_names):
77
  class_scores = []
78
-
79
- for review in reviews:
80
- result = classifier(review, class_names)
81
  scores_dict = dict(zip(result['labels'], result['scores']))
82
  # Reorder scores to match the original class_names order
83
  scores = [scores_dict[name] for name in class_names]
84
- class_scores.append(scores)
85
-
86
  return class_scores
87
 
88
 
@@ -98,7 +105,9 @@ def main():
98
 
99
  if file is not None:
100
  try:
101
- df = pd.read_excel(file)
 
 
102
  # Drop rows where all columns are NaN
103
  df = df.dropna(how='all')
104
  # Replace blank spaces with NaN, then drop rows where all columns are NaN again
 
41
 
42
 
43
  #defs
44
def classify_reviews(reviews, batch_size=100):
    """Score *reviews* with the sentiment model in fixed-size batches.

    Args:
        reviews: list of review strings to classify.
        batch_size: number of reviews tokenized and scored per forward pass.

    Returns:
        A list with one softmax probability list per review.
    """
    probabilities = []
    total = len(reviews)
    start = 0
    while start < total:
        batch = reviews[start:start + batch_size]
        # Tokenize the whole batch at once; pad/truncate to the model's 512-token limit.
        encoded = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        logits = model(**encoded).logits
        probabilities.extend(F.softmax(logits, dim=1).tolist())
        start += batch_size
    return probabilities
52
+
53
def top_rating(scores):
    """Return the 1-based position of the first maximum value in *scores*."""
    best = max(scores)
    for position, value in enumerate(scores, start=1):
        if value == best:
            return position
55
 
 
65
  # Return full DataFrame if filter_words is empty or contains only spaces
66
  if not filter_words or all(word.isspace() for word in filter_words):
67
  return df
68
+
+ # NOTE: multiprocessing.Pool.map cannot pickle a lambda, so the original
+ # pool.map call would raise PicklingError at runtime; score the reviews with
+ # a plain vectorized apply instead (this also drops the unneeded np dependency).
+ filter_scores = df[review_column].apply(lambda x: max(fuzz.token_set_ratio(x, word) for word in filter_words))
+ return df[filter_scores > 70] # Adjust this threshold as necessary
74
+
75
 
76
 
77
 
 
81
 
82
 
83
  # Function for classifying with the new model
84
def classify_with_new_classes(reviews, class_names, batch_size=100):
    """Zero-shot classify *reviews* against *class_names* in batches.

    Args:
        reviews: list of review strings.
        class_names: candidate labels; also fixes the score order in the output.
        batch_size: number of reviews sent to the pipeline per call.

    Returns:
        A list with one score-list per review, each ordered to match class_names.
    """
    class_scores = []
    for i in range(0, len(reviews), batch_size):
        batch_reviews = reviews[i:i+batch_size]
        results = classifier(batch_reviews, class_names)
        # The zero-shot pipeline returns a single dict for one input but a list
        # of dicts for a batch; the original indexed results['labels'] directly,
        # which raises TypeError on a batch. Normalize to a list and iterate.
        if isinstance(results, dict):
            results = [results]
        for result in results:
            scores_dict = dict(zip(result['labels'], result['scores']))
            # Reorder scores to match the original class_names order
            scores = [scores_dict[name] for name in class_names]
            # append (not extend): keep one score-list per review instead of
            # flattening all batches into a single flat list
            class_scores.append(scores)
    return class_scores
94
 
95
 
 
105
 
106
  if file is not None:
107
  try:
108
+ chunk_size = 10000 # adjust this value depending on your available memory
109
+ df = pd.concat(pd.read_excel(file, chunksize=chunk_size))
110
+
111
  # Drop rows where all columns are NaN
112
  df = df.dropna(how='all')
113
  # Replace blank spaces with NaN, then drop rows where all columns are NaN again