some more minor changes
Browse files
- app.py +1 -0
- textclassifier/TextClassifier.py +17 -12
app.py
CHANGED
@@ -79,6 +79,7 @@ def main(from_date,
              ):
     save_file_bool = s1, s2, s3, s4, s5, s6, s7, s8
+    # Describe what save_file_bool is for: if you want to save the dataframe to a file, this is the boolean for that


 def add_pie_chart(df, leaders, plot_choices):
     df_list = []

textclassifier/TextClassifier.py
CHANGED
@@ -116,16 +116,16 @@ class TextClassifier:
         diff = 4 - len(item_control)
         if diff < 0:  # If response gave more than four predictions
             cutout = item_control[diff - 1:]  # Cut out the superflous predictions
-            item_control = item_control[:diff - 1]
+            item_control = item_control[:diff - 1]  # Save the rest
             new_s = ""
             for i in range(len(cutout)):
                 new_s += cutout[i]
                 if i < -diff:
                     new_s += " and "  # Merge superflous predictions. E.g. target = 's', 'mp', 'v' -> target = 's and mp and v'
             item_control.append(new_s)
-        elif diff > 0:
+        elif diff > 0:  # If response gave less than four predictions
             for i in range(diff):
-                item_control.append("none")
+                item_control.append("none")  # Fill out tuple with nones
         new_item = str(tuple(item_control))
         new_item = new_item.replace("'", "")
         return new_item
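
For reference, a minimal standalone sketch of the padding/merging behaviour shown in the hunk above; the helper name normalize_to_four is invented for the example, and the logic simply mirrors these lines.

def normalize_to_four(item_control):
    """Pad or merge a list of predictions so the result always has exactly four entries."""
    diff = 4 - len(item_control)
    if diff < 0:  # more than four predictions
        cutout = item_control[diff - 1:]        # the fourth slot plus the superfluous tail
        item_control = item_control[:diff - 1]  # keep the first three
        new_s = ""
        for i in range(len(cutout)):
            new_s += cutout[i]
            if i < -diff:
                new_s += " and "                # e.g. 's', 'mp', 'v' -> 's and mp and v'
        item_control.append(new_s)
    elif diff > 0:  # fewer than four predictions
        item_control += ["none"] * diff         # same effect as the append loop above
    return str(tuple(item_control)).replace("'", "")

print(normalize_to_four(["s", "m", "v", "mp", "c"]))  # (s, m, v, mp and c)
print(normalize_to_four(["s", "m"]))                  # (s, m, none, none)
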
@@ -180,16 +180,19 @@ class TextClassifier:
         # Manually add columns to self.df
         self.df['main_topic'] = df_topic_split['main_topic'].tolist()
         self.df['main_topic'] = self.df['main_topic'].replace(["n/a", "", " "], "none", regex=True)
-        self.df['main_topic'] = self.df['main_topic'].apply(
+        self.df['main_topic'] = self.df['main_topic'].apply(
+            lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

         self.df['sub_topic'] = df_topic_split['sub_topic'].tolist()
         # In a few of the outputs from GPT-3 the sub_topic = "sentiment"
         self.df['sub_topic'] = self.df['sub_topic'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
-        self.df['sub_topic'] = self.df['sub_topic'].apply(
+        self.df['sub_topic'] = self.df['sub_topic'].apply(
+            lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

         self.df['sentiment'] = df_topic_split['sentiment'].tolist()
         self.df['sentiment'] = self.df['sentiment'].replace(["n/a", "sentiment", "", " "], "none", regex=True)
-        self.df['sentiment'] = self.df['sentiment'].apply(
+        self.df['sentiment'] = self.df['sentiment'].apply(
+            lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")

         self.df['target'] = df_topic_split['target'].tolist()
         self.df['target'] = self.df['target'].replace(["n/a", "", " "], "none", regex=True)
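
As a self-contained illustration of what the added .apply(...) lambda does to one of these columns (the sample values below are invented):

import pandas as pd

s = pd.Series(["  Economy ", "-", "Environment"])
# Lone "-" placeholders become "none"; everything else is just stripped of whitespace.
s = s.apply(lambda x: x.strip() if not (len(x) == 1 and x == "-") else "none")
print(s.tolist())  # ['Economy', 'none', 'Environment']
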
@@ -246,8 +249,9 @@ class TextClassifier:
         main_topic_list = self.clean_party_names(main_topic_list)
         sub_topic_list = self.clean_party_names(sub_topic_list)
         for i in range(len(main_topic_list)):
-            if main_topic_list[i].lower() == "none" and sub_topic_list[
-
+            if main_topic_list[i].lower() == "none" and sub_topic_list[
+                    i].lower() == "none":  # If the predictions are faulty
+                new_list.append("ERROR_496")  # Label as ERROR_496 (faulty prediction)
             elif main_topic_list[i].lower() == "none":
                 new_list.append(sub_topic_list[i])
             elif sub_topic_list[i].lower() == "none":
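
A standalone sketch of the merge rule these added lines implement; the function name and sample lists are invented, and the final else branch is an assumption (the both-present case is handled further down in the real method, outside this hunk):

def merge_topics(main_topic_list, sub_topic_list):
    new_list = []
    for main, sub in zip(main_topic_list, sub_topic_list):
        if main.lower() == "none" and sub.lower() == "none":
            new_list.append("ERROR_496")  # both predictions faulty
        elif main.lower() == "none":
            new_list.append(sub)          # fall back to the sub topic
        elif sub.lower() == "none":
            new_list.append(main)         # fall back to the main topic
        else:
            new_list.append(main)         # assumption: keep the main topic when both exist
    return new_list

print(merge_topics(["none", "none", "economy"], ["none", "climate", "none"]))
# ['ERROR_496', 'climate', 'economy']
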
@@ -276,7 +280,7 @@ class TextClassifier:
             if not topic.endswith("####"):
                 temp_list.append(topic)
             else:
-                temp_list.append(topic[:-4])
+                temp_list.append(topic[:-4])  # Remove the marker (####)
             topic_list.append(temp_list)
             temp_list = []

@@ -441,7 +445,7 @@ class TextClassifier:
         model = SentenceTransformer(model_name)
         # Encode the topics/targets with the sentence transformer model
         old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
-
+                                           convert_to_tensor=True)
         # Encode the synonyms with the sentence transformer model
         synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
                                                convert_to_tensor=True)
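
For context, a minimal sketch of the encode-and-compare pattern around these lines; the model name and word lists are placeholders, and the cosine-similarity step is inferred from the cos_sim_* column names used elsewhere in the file rather than shown in this hunk:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model name
old_list = ["economy", "climate"]                # placeholder topics/targets
synonym_list = ["finance", "environment"]        # placeholder synonyms

old_list_embeddings = model.encode(old_list, batch_size=64, show_progress_bar=True,
                                   convert_to_tensor=True)
synonym_list_embeddings = model.encode(synonym_list, batch_size=64, show_progress_bar=True,
                                       convert_to_tensor=True)

# Cosine-similarity matrix: one row per topic/target, one column per synonym.
cos_scores = util.cos_sim(old_list_embeddings, synonym_list_embeddings)
print(cos_scores.shape)  # torch.Size([2, 2])
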
@@ -532,7 +536,8 @@ class TextClassifier:
         """
         df_topic = df.copy()
         df_topic_split = pd.DataFrame(df_topic['merged_tuple'].tolist(),
-                                      columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target',
+                                      columns=['merged_topic', 'cos_sim_topic', 'synonym_topic', 'merged_target',
+                                               'cos_sim_target', 'synonym_target'])
         self.df['merged_tuple'] = df_topic['merged_tuple'].tolist()
         # Manually add columns to self.df
         self.df['merged_topic'] = df_topic_split['merged_topic'].tolist()
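
A compact illustration of the tuple-splitting pattern used here; the column names come from the hunk above, while the sample tuple is invented:

import pandas as pd

df_topic = pd.DataFrame({
    "merged_tuple": [("economy", 0.91, "finance", "s", 0.87, "socialdemokraterna")],
})
df_topic_split = pd.DataFrame(df_topic["merged_tuple"].tolist(),
                              columns=["merged_topic", "cos_sim_topic", "synonym_topic",
                                       "merged_target", "cos_sim_target", "synonym_target"])
print(df_topic_split["merged_topic"].tolist())  # ['economy']
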
@@ -609,4 +614,4 @@ if __name__ == "__main__":
         to_date = start_date.strftime("%Y-%m-%d")
         print("curr_date: ", from_date)
         tc = TextClassifier(from_date=from_date, to_date=to_date, user_list=USER_LIST, num_tweets=6000)
-        tc.run_main_pipeline()
+        tc.run_main_pipeline()