Demea9000 committed on
Commit
5c4ad0b
1 Parent(s): a74ddbf

some more minor changes

Browse files
Files changed (2) hide show
  1. app.py +1 -0
  2. textclassifier/TextClassifier.py +17 -12
app.py CHANGED
@@ -79,6 +79,7 @@ def main(from_date,
79
 
80
  ):
81
  save_file_bool = s1, s2, s3, s4, s5, s6, s7, s8
 
82
 
83
  def add_pie_chart(df, leaders, plot_choices):
84
  df_list = []
 
79
 
80
  ):
81
  save_file_bool = s1, s2, s3, s4, s5, s6, s7, s8
82
+ # Describe what save_file_bool is for: if you want to save the dataframe to a file, this is the boolean for that
83
 
84
  def add_pie_chart(df, leaders, plot_choices):
85
  df_list = []
textclassifier/TextClassifier.py CHANGED
@@ -116,16 +116,16 @@ class TextClassifier:
116
  diff = 4 - len(item_control)
117
  if diff < 0: # If response gave more than four predictions
118
  cutout = item_control[diff - 1:] # Cut out the superflous predictions
119
- item_control = item_control[:diff - 1] # Save the rest
120
  new_s = ""
121
  for i in range(len(cutout)):
122
  new_s += cutout[i]
123
  if i < -diff:
124
  new_s += " and " # Merge superflous predictions. E.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
125
  item_control.append(new_s)
126
- elif diff > 0: # If response gave less than four predictions
127
  for i in range(diff):
128
- item_control.append("none") # Fill out tuple with nones
129
  new_item = str(tuple(item_control))
130
  new_item = new_item.replace("'", "")
131
  return new_item
@@ -180,16 +180,19 @@ class TextClassifier:
180
  # Manually add columns to self.df
181
  self.df['main_topic'] = df_topic_split['main_topic'].tolist()
182
  self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none", regex=True)
183
- self.df['main_topic'] = self.df['main_topic'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
 
184
 
185
  self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
186
  # In a few of the outputs from GPT-3 the sub_topic = "sentiment"
187
  self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
188
- self.df['sub_topic'] = self.df['sub_topic'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
 
189
 
190
  self.df['sentiment'] = df_topic_split['sentiment'].tolist()
191
  self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
192
- self.df['sentiment'] = self.df['sentiment'].apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
 
193
 
194
  self.df['target'] = df_topic_split['target'].tolist()
195
  self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none", regex=True)
@@ -246,8 +249,9 @@ class TextClassifier:
246
  main_topic_list = self.clean_party_names(main_topic_list)
247
  sub_topic_list = self.clean_party_names(sub_topic_list)
248
  for i in range(len(main_topic_list)):
249
- if main_topic_list[i].lower() == "none" and sub_topic_list[i].lower() == "none": # If the predictions are faulty
250
- new_list.append("ERROR_496") # Label as ERROR_496 (faulty prediction)
 
251
  elif main_topic_list[i].lower() == "none":
252
  new_list.append(sub_topic_list[i])
253
  elif sub_topic_list[i].lower() == "none":
@@ -276,7 +280,7 @@ class TextClassifier:
276
  if not topic.endswith("####"):
277
  temp_list.append(topic)
278
  else:
279
- temp_list.append(topic[:-4]) # Remove the marker (####)
280
  topic_list.append(temp_list)
281
  temp_list = []
282
 
@@ -441,7 +445,7 @@ class TextClassifier:
441
  model = SentenceTransformer(model_name)
442
  # Encode the topics/targets with the sentence transformer model
443
  old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
444
- convert_to_tensor=True)
445
  # Encode the synonyms with the sentence transformer model
446
  synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
447
  convert_to_tensor=True)
@@ -532,7 +536,8 @@ class TextClassifier:
532
  """
533
  df_topic = df.copy()
534
  df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
535
- columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target', 'cos_sim_target', 'synonym_target'])
 
536
  self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
537
  # Manually add columns to self.df
538
  self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
@@ -609,4 +614,4 @@ if __name__ == "__main__":
609
  to_date = start_date.strftime("%Y-%m-%d")
610
  print("curr_date: ", from_date)
611
  tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
612
- tc.run_main_pipeline()
 
116
  diff = 4 - len(item_control)
117
  if diff < 0: # If response gave more than four predictions
118
  cutout = item_control[diff - 1:] # Cut out the superflous predictions
119
+ item_control = item_control[:diff - 1] # Save the rest
120
  new_s = ""
121
  for i in range(len(cutout)):
122
  new_s += cutout[i]
123
  if i < -diff:
124
  new_s += " and " # Merge superflous predictions. E.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
125
  item_control.append(new_s)
126
+ elif diff > 0: # If response gave less than four predictions
127
  for i in range(diff):
128
+ item_control.append("none") # Fill out tuple with nones
129
  new_item = str(tuple(item_control))
130
  new_item = new_item.replace("'", "")
131
  return new_item
 
180
  # Manually add columns to self.df
181
  self.df['main_topic'] = df_topic_split['main_topic'].tolist()
182
  self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none", regex=True)
183
+ self.df['main_topic'] = self.df['main_topic'].apply(
184
+ lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
185
 
186
  self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
187
  # In a few of the outputs from GPT-3 the sub_topic = "sentiment"
188
  self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
189
+ self.df['sub_topic'] = self.df['sub_topic'].apply(
190
+ lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
191
 
192
  self.df['sentiment'] = df_topic_split['sentiment'].tolist()
193
  self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
194
+ self.df['sentiment'] = self.df['sentiment'].apply(
195
+ lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
196
 
197
  self.df['target'] = df_topic_split['target'].tolist()
198
  self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none", regex=True)
 
249
  main_topic_list = self.clean_party_names(main_topic_list)
250
  sub_topic_list = self.clean_party_names(sub_topic_list)
251
  for i in range(len(main_topic_list)):
252
+ if main_topic_list[i].lower() == "none" and sub_topic_list[
253
+ i].lower() == "none": # If the predictions are faulty
254
+ new_list.append("ERROR_496") # Label as ERROR_496 (faulty prediction)
255
  elif main_topic_list[i].lower() == "none":
256
  new_list.append(sub_topic_list[i])
257
  elif sub_topic_list[i].lower() == "none":
 
280
  if not topic.endswith("####"):
281
  temp_list.append(topic)
282
  else:
283
+ temp_list.append(topic[:-4]) # Remove the marker (####)
284
  topic_list.append(temp_list)
285
  temp_list = []
286
 
 
445
  model = SentenceTransformer(model_name)
446
  # Encode the topics/targets with the sentence transformer model
447
  old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
448
+ convert_to_tensor=True)
449
  # Encode the synonyms with the sentence transformer model
450
  synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
451
  convert_to_tensor=True)
 
536
  """
537
  df_topic = df.copy()
538
  df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
539
+ columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target',
540
+ 'cos_sim_target', 'synonym_target'])
541
  self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
542
  # Manually add columns to self.df
543
  self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
 
614
  to_date = start_date.strftime("%Y-%m-%d")
615
  print("curr_date: ", from_date)
616
  tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
617
+ tc.run_main_pipeline()