BulatF committed on
Commit
4ca73d1
1 Parent(s): 8a6b406

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +35 -50
  2. requirements.txt +1 -2
app.py CHANGED
@@ -16,24 +16,13 @@ nltk.download('stopwords')
16
  import matplotlib.pyplot as plt
17
  import numpy as np
18
 
19
-
20
  stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
21
- st.set_page_config(layout="wide")
22
- @st.cache_resource
23
- def load_model_and_tokenizer(model_name):
24
- model = AutoModelForSequenceClassification.from_pretrained(model_name)
25
- tokenizer = AutoTokenizer.from_pretrained(model_name)
26
- return model, tokenizer
27
-
28
- model, tokenizer = load_model_and_tokenizer('nlptown/bert-base-multilingual-uncased-sentiment')
29
-
30
- @st.cache_resource
31
- def load_pipeline():
32
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
33
- return classifier
34
-
35
- classifier = load_pipeline()
36
 
 
 
 
 
 
37
 
38
  # Import the new model and tokenizer
39
 
@@ -41,15 +30,12 @@ classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnl
41
 
42
 
43
  #defs
44
- def classify_reviews(reviews, batch_size=100):
45
- probabilities = []
46
- for i in range(0, len(reviews), batch_size):
47
- batch_reviews = reviews[i:i+batch_size]
48
- inputs = tokenizer(batch_reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
49
- outputs = model(**inputs)
50
- probabilities.extend(F.softmax(outputs.logits, dim=1).tolist())
51
  return probabilities
52
-
53
  def top_rating(scores):
54
  return scores.index(max(scores)) + 1
55
 
@@ -65,13 +51,8 @@ def filter_dataframe(df, review_column, filter_words):
65
  # Return full DataFrame if filter_words is empty or contains only spaces
66
  if not filter_words or all(word.isspace() for word in filter_words):
67
  return df
68
-
69
- # Use multiprocessing to speed up fuzz token set ratio calculation
70
- from multiprocessing import Pool
71
- with Pool() as pool:
72
- filter_scores = pool.map(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]), df[review_column])
73
- return df[np.array(filter_scores) > 70] # Adjust this threshold as necessary
74
-
75
 
76
 
77
 
@@ -81,15 +62,16 @@ def process_filter_words(filter_words_input):
81
 
82
 
83
  # Function for classifying with the new model
84
- def classify_with_new_classes(reviews, class_names, batch_size=100):
85
  class_scores = []
86
- for i in range(0, len(reviews), batch_size):
87
- batch_reviews = reviews[i:i+batch_size]
88
- result = classifier(batch_reviews, class_names)
89
  scores_dict = dict(zip(result['labels'], result['scores']))
90
  # Reorder scores to match the original class_names order
91
  scores = [scores_dict[name] for name in class_names]
92
- class_scores.extend(scores)
 
93
  return class_scores
94
 
95
 
@@ -101,42 +83,46 @@ def main():
101
  file = st.file_uploader("Upload an excel file", type=['xlsx'])
102
  review_column = None
103
  df = None
104
- class_names = None
105
 
106
  if file is not None:
107
  try:
108
- chunk_size = 10000 # adjust this value depending on your available memory
109
- df = pd.concat(pd.read_excel(file, chunksize=chunk_size))
110
  df = df.dropna(how='all')
 
111
  df = df.replace(r'^\s*$', np.nan, regex=True)
112
  df = df.dropna(how='all')
113
  review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
114
  df[review_column] = df[review_column].astype(str)
115
 
116
- filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)')
117
- filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input)
118
- class_names = st.text_input('Enter the possible class names separated by comma')
119
- df = filter_dataframe(df, review_column, filter_words)
 
120
  except Exception as e:
121
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
122
  return
123
 
124
  start_button = st.button('Start Analysis')
125
 
 
126
  if start_button and df is not None:
 
127
  df = df[df[review_column].notna()]
128
  df = df[df[review_column].str.strip() != '']
129
-
130
- class_names = [name.strip() for name in class_names.split(',')]
131
- for name in class_names:
132
  if name not in df.columns:
133
  df[name] = 0.0
134
-
135
  if review_column in df.columns:
136
  with st.spinner('Performing sentiment analysis...'):
137
  df, df_display = process_reviews(df, review_column, class_names)
138
-
139
- display_ratings(df, review_column)
140
  display_dataframe(df, df_display)
141
  else:
142
  st.write(f'No column named "{review_column}" found in the uploaded file.')
@@ -147,7 +133,6 @@ def main():
147
 
148
 
149
 
150
-
151
  def process_reviews(df, review_column, class_names):
152
  with st.spinner('Classifying reviews...'):
153
  progress_bar = st.progress(0)
 
16
  import matplotlib.pyplot as plt
17
  import numpy as np
18
 
 
19
  stopwords_list = stopwords.words('english') + ['your_additional_stopwords_here']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ # Define the model and tokenizer
22
+ model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'
23
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
24
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
25
+ st.set_page_config(layout="wide")
26
 
27
  # Import the new model and tokenizer
28
 
 
30
 
31
 
32
  #defs
33
+ def classify_reviews(reviews):
34
+ inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
35
+ outputs = model(**inputs)
36
+ probabilities = F.softmax(outputs.logits, dim=1).tolist()
 
 
 
37
  return probabilities
38
+
39
  def top_rating(scores):
40
  return scores.index(max(scores)) + 1
41
 
 
51
  # Return full DataFrame if filter_words is empty or contains only spaces
52
  if not filter_words or all(word.isspace() for word in filter_words):
53
  return df
54
+ filter_scores = df[review_column].apply(lambda x: max([fuzz.token_set_ratio(x, word) for word in filter_words]))
55
+ return df[filter_scores > 70] # Adjust this threshold as necessary
 
 
 
 
 
56
 
57
 
58
 
 
62
 
63
 
64
  # Function for classifying with the new model
65
+ def classify_with_new_classes(reviews, class_names):
66
  class_scores = []
67
+
68
+ for review in reviews:
69
+ result = classifier(review, class_names)
70
  scores_dict = dict(zip(result['labels'], result['scores']))
71
  # Reorder scores to match the original class_names order
72
  scores = [scores_dict[name] for name in class_names]
73
+ class_scores.append(scores)
74
+
75
  return class_scores
76
 
77
 
 
83
  file = st.file_uploader("Upload an excel file", type=['xlsx'])
84
  review_column = None
85
  df = None
86
+ class_names = None # New variable for class names
87
 
88
  if file is not None:
89
  try:
90
+ df = pd.read_excel(file)
91
+ # Drop rows where all columns are NaN
92
  df = df.dropna(how='all')
93
+ # Replace blank spaces with NaN, then drop rows where all columns are NaN again
94
  df = df.replace(r'^\s*$', np.nan, regex=True)
95
  df = df.dropna(how='all')
96
  review_column = st.selectbox('Select the column from your excel file containing text', df.columns)
97
  df[review_column] = df[review_column].astype(str)
98
 
99
+
100
+ filter_words_input = st.text_input('Enter words to filter the data by, separated by comma (or leave empty)') # New input field for filter words
101
+ filter_words = [] if filter_words_input.strip() == "" else process_filter_words(filter_words_input) # Process the filter words
102
+ class_names = st.text_input('Enter the possible class names separated by comma') # New input field for class names
103
+ df = filter_dataframe(df, review_column, filter_words) # Filter the DataFrame
104
  except Exception as e:
105
  st.write("An error occurred while reading the uploaded file. Please make sure it's a valid Excel file.")
106
  return
107
 
108
  start_button = st.button('Start Analysis')
109
 
110
+
111
  if start_button and df is not None:
112
+ # Drop rows with NaN or blank values in the review_column
113
  df = df[df[review_column].notna()]
114
  df = df[df[review_column].str.strip() != '']
115
+
116
+ class_names = [name.strip() for name in class_names.split(',')] # Split class names into a list
117
+ for name in class_names: # Add a new column for each class name
118
  if name not in df.columns:
119
  df[name] = 0.0
120
+
121
  if review_column in df.columns:
122
  with st.spinner('Performing sentiment analysis...'):
123
  df, df_display = process_reviews(df, review_column, class_names)
124
+
125
+ display_ratings(df, review_column) # updated this line
126
  display_dataframe(df, df_display)
127
  else:
128
  st.write(f'No column named "{review_column}" found in the uploaded file.')
 
133
 
134
 
135
 
 
136
  def process_reviews(df, review_column, class_names):
137
  with st.spinner('Classifying reviews...'):
138
  progress_bar = st.progress(0)
requirements.txt CHANGED
@@ -9,5 +9,4 @@ matplotlib
9
  fuzzywuzzy
10
  scikit-learn
11
  nltk
12
- numpy
13
- lime
 
9
  fuzzywuzzy
10
  scikit-learn
11
  nltk
12
+ numpy