Demea9000 committed on
Commit
5214b07
·
1 Parent(s): 60390ee

changed topic so that topics are divided into columns

Browse files
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import gradio as gr
2
  from textclassifier import TextClassifier as tc
 
 
3
 
4
 
5
  def main(from_date, to_date, user_name):
@@ -12,18 +14,75 @@ def main(from_date, to_date, user_name):
12
  user_name = user_name
13
  text_classifier = tc.TextClassifier(from_date=from_date, to_date=to_date, user_name=user_name, num_tweets=20)
14
  text_classifier.run_main_pipeline()
15
- return text_classifier.get_dataframe()
 
16
 
17
 
18
- def greet(name):
19
- return "Hello " + name + "!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  if __name__ == "__main__":
23
- demo = gr.Interface(
24
- fn=main,
25
- inputs=['text', 'text', 'text'],
26
- outputs="dataframe",
27
- )
28
 
29
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from textclassifier import TextClassifier as tc
3
+ from functions import *
4
+ import re
5
 
6
 
7
  def main(from_date, to_date, user_name):
 
14
  user_name = user_name
15
  text_classifier = tc.TextClassifier(from_date=from_date, to_date=to_date, user_name=user_name, num_tweets=20)
16
  text_classifier.run_main_pipeline()
17
+ dataframe = text_classifier.dataframe
18
+ return (dataframe, get_summary_statistics(dataframe))
19
 
20
 
21
def get_summary_statistics(dataframe):
    """
    Render the descriptive statistics of a dataframe as plain text.
    :param dataframe: dataframe to summarise
    :return: str, the table from ``dataframe.describe()`` formatted with ``to_string()``
    """
    return dataframe.describe().to_string()
29
+
30
+
31
def separate_string(string):
    """
    Split a numbered enumeration such as '1. swedish 2. nuclear 3. ' into its items.
    Example: separate_string('1. swedish 2. nuclear 3. hello world 4. ')
    returns ['swedish', 'nuclear', 'hello world', '']
    :param string: numbered enumeration, items introduced by '<number>. '
    :return: list of item strings; text before the first '.' is ignored, so a string
             without any '.' yields an empty list
    """
    items = []
    # Everything before the first '.' is the leading number of the first item.
    for part in string.split('.')[1:]:
        head, separator, tail = part.partition(' ')
        # Normally a part looks like ' topic words 2': keep what follows the first space.
        # A part without any space (e.g. input ending in '3.' or written as '1.topic')
        # used to raise IndexError; keep the whole part in that case.
        text = tail if separator else head
        # Replace every character outside a-z with a space. This drops the trailing
        # item number, but also digits, punctuation and uppercase letters.
        items.append(re.sub(r'[^a-z]', ' ', text).strip())
    return items
46
+
47
+
48
def summary_categorical(dataframe):
    """
    Build a text summary of the categorical (object dtype) columns of a dataframe.
    The columns 'tweet', 'date' and 'urls' are skipped. For every other object column
    the output contains '<column>: ' followed by its ``value_counts()`` table.
    Also prints the numeric and categorical column indexes to stdout.
    :param dataframe: dataframe to summarise
    :return: str starting with 'Summary of categorical variables:'
    """
    skipped_columns = {'tweet', 'date', 'urls'}
    # Public replacement for the private ``_get_numeric_data()`` helper; bool is listed
    # explicitly because 'number' alone does not cover boolean columns.
    numeric_columns = dataframe.select_dtypes(include=['number', 'bool']).columns
    categorical_columns = dataframe.select_dtypes(include=['object']).columns
    print("Numeric columns: " + str(numeric_columns) + "\n" + "Categorical columns: " + str(categorical_columns))
    # One entry per summarised column, joined once instead of repeated string +=.
    entries = [
        column + ": " + str(dataframe[column].value_counts()) + "\n"
        for column in categorical_columns
        if column not in skipped_columns
    ]
    return "Summary of categorical variables:\n" + "".join(entries)
69
 
70
 
71
  if __name__ == "__main__":
72
+ from datetime import date
 
 
 
 
73
 
74
+ # demo = gr.Interface(
75
+ # fn=main,
76
+ # inputs=['text', 'text', 'text'],
77
+ # outputs=["dataframe", "text"],
78
+ # )
79
+ # demo.launch()
80
+
81
+ text_classifier = tc.TextClassifier(from_date='2020-01-01', to_date='2020-01-31', user_name="jimmieakesson",
82
+ num_tweets=20)
83
+ text_classifier.run_main_pipeline()
84
+ print(get_summary_statistics(text_classifier.get_dataframe()))
85
+ print(type(get_summary_statistics(text_classifier.get_dataframe())))
86
+ print(summary_categorical(text_classifier.get_dataframe()))
87
+ string = '1. swedish 2. nuclear 3. hello world 4. uha yhd ikv hahd vva 5. '
88
+ print(separate_string(string))
flagged/log.csv CHANGED
@@ -1,2 +1,3 @@
1
  'from_date','to_date','user_name','output','flag','username','timestamp'
2
  '2020-01-01','2020-01-01','jimmieakesson','{"data": [["", "", ""], ["", "", ""], ["", "", ""]], "headers": ["1", "2", "3"]}','','','2022-07-19 14:58:49.268002'
 
 
1
  'from_date','to_date','user_name','output','flag','username','timestamp'
2
  '2020-01-01','2020-01-01','jimmieakesson','{"data": [["", "", ""], ["", "", ""], ["", "", ""]], "headers": ["1", "2", "3"]}','','','2022-07-19 14:58:49.268002'
3
+ '2021-01-01','2021-01-31','jimmieakesson','{"data": [["", "", ""], ["", "", ""], ["", "", ""]], "headers": ["1", "2", "3"]}','','','','2022-07-20 10:01:35.767463'
functions/functions.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from re import sub
2
+
3
+
4
def separate_string(string):
    """
    This function returns a list of strings from a string.
    Example: separate_string('1. swedish 2. nuclear 3. hello world 4. uha yhd ikv hahd vva 5. ')
    returns ['swedish', 'nuclear', 'hello world', 'uha yhd ikv hahd vva', '']
    :param string: string to be separated, items introduced by '<number>. '
    :return: list of string items; text before the first '.' is ignored, so a string
             without any '.' yields an empty list
    """
    items = []
    # Everything before the first '.' is the leading number of the first item.
    for part in string.split('.')[1:]:
        head, separator, tail = part.partition(' ')
        # Normally a part looks like ' topic words 2': keep what follows the first space.
        # A part without any space (e.g. input ending in '3.' or written as '1.topic')
        # used to raise IndexError; keep the whole part in that case.
        text = tail if separator else head
        # Replace every character outside a-z with a space. This drops the trailing
        # item number, but also digits, punctuation and uppercase letters.
        items.append(sub(r'[^a-z]', ' ', text).strip())
    return items
functions/statistics.py ADDED
File without changes
textclassifier/TextClassifier.py CHANGED
@@ -1,16 +1,16 @@
 
1
  import time
 
 
2
 
3
  import openai
4
- import csv
5
  import regex as re
6
- from twitterscraper import TwitterScraper
7
- from datetime import date
8
- import os
9
  from dotenv import find_dotenv, load_dotenv
10
- import pandas as pd
11
- import warnings
12
  from pandas.core.common import SettingWithCopyWarning
13
- import matplotlib.pyplot as plt
 
 
14
 
15
  warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
16
 
@@ -269,12 +269,17 @@ class TextClassifier:
269
  self.df = df
270
  self.df_to_csv(filename)
271
 
272
- def get_tweet_by_id(self, id, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
273
  """
274
- Returns tweet by id.
275
- :param id: id of tweet
276
- :return: tweet
277
  """
 
 
 
 
 
 
278
 
279
  def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
280
  """
@@ -340,8 +345,6 @@ class TextClassifier:
340
 
341
 
342
  if __name__ == "__main__":
343
- text_classifier = TextClassifier(from_date="2020-01-01", to_date="2020-01-31", user_name='dadgostarnooshi', num_tweets=20)
344
  text_classifier.run_main_pipeline()
345
- print(text_classifier.split_topics("1. topic1 2. topic2 3. topic3"))
346
-
347
 
 
1
+ import os
2
  import time
3
+ import warnings
4
+ from datetime import date
5
 
6
  import openai
7
+ import pandas as pd
8
  import regex as re
 
 
 
9
  from dotenv import find_dotenv, load_dotenv
 
 
10
  from pandas.core.common import SettingWithCopyWarning
11
+
12
+ from twitterscraper import TwitterScraper
13
+ from functions import functions as f
14
 
15
  warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
16
 
 
269
  self.df = df
270
  self.df_to_csv(filename)
271
 
272
def split_topics_into_columns(self):
    """
    Splits the 'topic' column of self.df into the columns 'main_topic',
    'sub_topic_1' and 'sub_topic_2' and stores the widened dataframe in self.df.
    Each topic string is parsed with f.separate_string. Rows with fewer than three
    parsed items are padded with None; items beyond the third are discarded.
    :return: None
    """
    topic_columns = ['main_topic', 'sub_topic_1', 'sub_topic_2']
    width = len(topic_columns)

    rows = []
    for raw_topic in self.df['topic']:
        # Force exactly `width` entries per row: the DataFrame constructor raises
        # ValueError when the widest row does not match the number of column names.
        items = f.separate_string(raw_topic)[:width]
        items += [None] * (width - len(items))
        rows.append(items)

    # Reuse the index of self.df so the index-based merge below lines up row by row
    # even when self.df does not carry a default 0..n-1 index.
    # NOTE(review): like the original merge, this assumes the index is unique.
    df_topic_split = pd.DataFrame(rows, columns=topic_columns, index=self.df.index)
    self.df = self.df.merge(df_topic_split, how='left', left_index=True, right_index=True)
283
 
284
  def run_main_pipeline(self, filename="{}/data/twitterdata.csv".format(ROOT_PATH)):
285
  """
 
345
 
346
 
347
  if __name__ == "__main__":
348
+ text_classifier = TextClassifier(from_date="2020-01-01", to_date="2020-01-31", user_name='jimmieakesson', num_tweets=20)
349
  text_classifier.run_main_pipeline()
 
 
350