Yara Kyrychenko committed on
Commit
8bf791d
•
1 Parent(s): 6074363

Add first files

Browse files
Files changed (6) hide show
  1. .DS_Store +0 -0
  2. Home_Page.py +143 -0
  3. pages/.DS_Store +0 -0
  4. pages/Create_Model.py +117 -0
  5. pages/Update_Model.py +91 -0
  6. requirements.txt +6 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
Home_Page.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd, numpy as np
3
+ from bertopic import BERTopic
4
+ from datetime import datetime
5
+ import math
6
+ from helper import visualize_topics_over_time, visualize_topics_per_class
7
+
8
@st.cache_data
def get_df(url):
    """Read the CSV found at ``url`` into a DataFrame (cached via ``st.cache_data``)."""
    table = pd.read_csv(url)
    return table
11
+
12
@st.cache_resource
def get_model(url):
    """Load a saved BERTopic model from ``url`` (cached via ``st.cache_resource``)."""
    loaded_model = BERTopic.load(url)
    return loaded_model
15
+
16
@st.cache_data
def get_topics_over_time(frame, lens):
    """Run the session model's ``topics_over_time`` on the documents in ``frame``.

    Args:
        frame: DataFrame with a ``proc2`` text column and a ``date`` column.
        lens: Not read by the body; it only takes part in the cache key.

    Returns:
        The value returned by ``st.session_state.model.topics_over_time``.
    """
    texts = frame.proc2.apply(str)
    timestamps = pd.to_datetime(frame.date, format=st.session_state.datetime_format)
    # One bin per three distinct date values, rounded down.
    distinct_dates = len(frame.date.unique())
    bin_count = math.floor(distinct_dates / 3)
    return st.session_state.model.topics_over_time(texts, timestamps, nr_bins=bin_count)
21
+
22
@st.cache_data
def get_topics_per_class(frame, colname):
    """Run the session model's ``topics_per_class`` on the documents in ``frame``.

    Args:
        frame: DataFrame with a ``proc2`` text column and a ``colname`` column.
        colname: Name of the column whose values are used as class labels.

    Returns:
        The value returned by ``st.session_state.model.topics_per_class``.
    """
    strings = frame.proc2.apply(str)
    # Take the labels from the same frame as the documents so that texts and
    # classes are always row-aligned (previously read from st.session_state.df).
    classes = frame[colname].apply(str)
    return st.session_state.model.topics_per_class(strings, classes=classes)
27
+
28
# Page chrome. The robot emoji had been mis-decoded into mojibake; it is
# restored here so that page_icon is a single emoji again.
st.set_page_config(
    page_title="BoardTopic",
    page_icon="🤖",
    layout="wide",
)

st.header("🤖 BoardTopic")
st.subheader("Turning your data into insight with behavioral data science")
36
+
37
# No model in the session yet: show the welcome text and the loading form.
if "model" not in st.session_state:
    for intro_line in (
        "Welcome to BoardTopic, a friendly way to understand your big data.",
        "If you do not have a BoardTopic model trained, please go to the 'Create Model' tab.",
        "If you already have a BoardTopic model trained, please enter the information below:",
    ):
        st.markdown(intro_line)

    model_name = st.text_input("Please enter model file name (e.g., 'model')")
    df_name = st.text_input("Please enter dataframe file name (e.g., 'df_small.csv')")
    datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
    # An empty entry is stored as None.
    st.session_state.datetime_format = datetime_format or None

    if st.button("Enter"):
        # Both files are looked up inside the 'models' folder.
        model_path = f'models/{model_name}'
        frame_path = f'models/{df_name}'
        st.session_state.model = get_model(model_path)
        st.session_state.df = get_df(frame_path)
        st.success("Model and dataframe loaded!")
49
# Dashboard: rendered once a model (and its dataframe) is in the session.
if "model" in st.session_state:
    state = st.session_state

    if "datetime_format" not in state:
        # The widget is deliberately created WITHOUT key="datetime_format":
        # Streamlit raises StreamlitAPIException when code assigns to a
        # session-state entry owned by an already-instantiated widget key.
        entered_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
        state.datetime_format = entered_format or None

    # Label the topics, then build one row per document with model output
    # joined to the user's dataframe on a shared positional "id".
    state.model.set_topic_labels(
        state.model.generate_topic_labels(nr_words=6, topic_prefix=False, word_length=10, separator=", ")
    )
    state.model_df = state.model.get_document_info(state.df.proc)
    state.df["id"] = state.model_df.index
    state.model_df["id"] = state.model_df.index
    state.model_df = pd.merge(state.model_df, state.df, how="left", on="id")
    state.model_df["date"] = pd.to_datetime(state.model_df.date, format=state.datetime_format)

    topics_over_time = get_topics_over_time(state.df, len(state.df))
    largest_topics = state.model_df.groupby("Topic").agg("count").sort_values("Document", ascending=False)[0:10]
    st.write(visualize_topics_over_time(state.model, topics_over_time, topics=list(largest_topics.index),
                                        custom_labels=True, title = "10 most popular narratives over time"))

    st.markdown("#### Overall document distribution")

    grouped = state.model_df.groupby("date").agg("count")
    grouped['date'] = pd.to_datetime(grouped.index, format=state.datetime_format)
    st.bar_chart(data=grouped, x='date', y='Document')

    st.markdown("#### Emotions")

    # A document "has" an emotion when its score is above 0.9 (missing scores
    # compare as False and therefore count as 0).
    emotion_flags = {
        emotion: (state.model_df[emotion] > 0.9).astype(int)
        for emotion in ("joy", "sadness", "surprise", "fear", "anger")
    }
    emotions = pd.DataFrame({"date": state.model_df.date, "source": state.model_df.source, **emotion_flags})

    by_platform = emotions.groupby("source").agg("mean")

    st.markdown("##### Percent with emotion by platform")
    st.bar_chart(by_platform.T*100)

    st.markdown("##### Platform breakdown")
    st.bar_chart(by_platform*100)

    emotionsgr = emotions.groupby("date").agg("mean")*100
    # Use this frame's own index for the date axis instead of borrowing the
    # index of the unrelated `grouped` frame.
    emotionsgr['date'] = pd.to_datetime(emotionsgr.index, format=state.datetime_format)

    st.markdown("##### Emotional dynamics over time")
    st.line_chart(emotionsgr, x="date")

    st.markdown("#### Topics per class")
    # NOTE(review): indentation was lost in the source; the per-emotion chart
    # is placed outside this guard because it does not use "source". The
    # emotions table above already requires a "source" column, so the guard
    # is always true by the time this line runs.
    if "source" in state.df.columns:
        topics_per_class1 = get_topics_per_class(state.df, "source")
        st.plotly_chart(visualize_topics_per_class(state.model, topics_per_class1, top_n_topics=20, width = 900, height = 600,
                                                   custom_labels=True, title = "20 most popular narratives per platform"))
    state.df["emotion"] = state.df[["joy","sadness","surprise","fear",'anger','no_emotion']].idxmax(axis=1)
    topics_per_class2 = get_topics_per_class(state.df, "emotion")
    st.plotly_chart(visualize_topics_per_class(state.model, topics_per_class2, top_n_topics=20, width = 900, height = 600,
                                               custom_labels=True, title = "20 most popular narratives per emotion"))

    st.markdown("#### All topics")
    all_docs_df = state.model_df
    topic_table = all_docs_df.groupby("Topic").agg("count").sort_values("Document", ascending=False)
    # Every document of a topic carries the same CustomName; take the first.
    topic_table["Name"] = [list(all_docs_df[all_docs_df.Topic == i]["CustomName"])[0] for i in topic_table.index]
    topic_table["Count"] = topic_table["Document"]
    topic_table["Percent"] = round(100*topic_table["Count"]/len(state.model_df), 3)
    st.table(topic_table[["Name", "Count", "Percent"]])

    # Position in custom_labels_ -> label, used to display topic ids.
    # NOTE(review): assumes topic ids are 0..n-1 (no -1 outlier topic) — confirm.
    dictionary = dict(enumerate(state.model.custom_labels_))
    def mapping(item):
        return dictionary[item]

    st.markdown("#### Explore representative documents")
    st.selectbox("Select topic", list(state.model_df.Topic.unique()), key="selected_topic", format_func=mapping)
    # Relies on a private BERTopic helper; only the indices are used below.
    repr_docs_mappings, repr_docs, repr_docs_indices = state.model._extract_representative_docs(
        state.model.c_tf_idf_, state.model_df, state.model.topic_representations_
    )
    ind = repr_docs_indices[state.selected_topic]
    for j, doc in enumerate(state.model_df.iloc[ind].Document, start=1):
        st.markdown(f"**Representative document {j}**")
        st.text(doc)

    st.markdown("---")
    st.markdown("### Save current model")
    name = st.text_input("Please name this model file (e.g., 'my_cool_model')")
    if st.button("Save this model"):
        state.model.save(f"models/model_{name}")
        state.df.to_csv(f"models/df_{name}.csv")
        st.success(f"Model and dataframe saved in folder 'models'!")
138
# "Restart" wipes both Streamlit caches and then every session-state entry.
if st.button("Restart"):
    for cache in (st.cache_data, st.cache_resource):
        cache.clear()
    for key in st.session_state.keys():
        del st.session_state[key]
143
+
pages/.DS_Store ADDED
Binary file (6.15 kB). View file
 
pages/Create_Model.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd, numpy as np
3
+ from bertopic import BERTopic
4
+ from transformers import pipeline
5
+
6
def make_stopwords():
    """Return the entries of ``dicts/stopwords.txt``, split on newlines.

    Returns:
        list[str]: One item per line of the file, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even when reading fails. UTF-8 is
    # requested explicitly so the result does not depend on the platform's
    # default encoding (assumes the list is stored as UTF-8 — TODO confirm).
    with open("dicts/stopwords.txt", "r", encoding="utf-8") as text_file:
        return text_file.read().split("\n")
11
+ stopwords = make_stopwords()
12
+
13
@st.cache_data
def get_emotions(frame, language):
    """Score every text in ``frame.proc2`` with an emotion classifier.

    Args:
        frame: DataFrame with a ``proc2`` text column and, normally, an
            ``id`` column identifying each row.
        language: ``"Russian/Ukrainian"`` selects the rubert-tiny2 CEDR
            model; any other value selects the English distilroberta model.

    Returns:
        DataFrame with one score column per classifier label plus an ``id``
        column for merging the scores back onto ``frame``.
    """
    clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection" if language == "Russian/Ukrainian" else "j-hartmann/emotion-english-distilroberta-base"
    classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    # NOTE(review): the original stored the pipeline on the streamlit module;
    # kept in case other code reads `st.classifier` — confirm and remove.
    st.classifier = classifier
    scores = classifier(list(frame.proc2))
    # Each result is a list of {"label", "score"} dicts in a fixed label order;
    # guard against an empty frame, which yields no results at all.
    labels = [entry["label"] for entry in scores[0]] if scores else []
    temp = pd.DataFrame({label: [row[j]["score"] for row in scores] for j, label in enumerate(labels)})
    if "id" in frame.columns:
        # Reuse the frame's own ids so a later merge on "id" lines up row by
        # row, whatever the session dataframe currently holds.
        temp['id'] = list(frame['id'])
    else:
        # Fallback: number the rows after those already in the session dataframe.
        offset = len(st.session_state.df)
        temp['id'] = list(range(offset, offset + len(temp)))
    return temp
22
+
23
def preproc(frame):
    """Add the cleaned text columns ``proc`` and ``proc2`` to ``frame`` in place.

    ``proc`` is ``str(text)`` with URLs replaced by ``http``, @-mentions
    replaced by ``@user``, ``#`` replaced by a space, and outer whitespace
    stripped. ``proc2`` is ``proc`` cut to 514 characters and lower-cased.

    Args:
        frame: DataFrame with a ``text`` column; it is modified in place.

    Returns:
        The same ``frame`` object.
    """
    import re

    def clean(text):
        text = re.sub(r"http\S+", "http", str(text))
        text = re.sub(r"@\S+", "@user", text)
        return re.sub(r"#", " ", text).strip()

    # Column-wise apply: one pass per text, and it also works on an empty
    # frame, where the previous row-wise DataFrame.apply did not yield a Series.
    frame["proc"] = frame["text"].apply(clean)
    frame["proc2"] = frame["proc"].apply(lambda text: text[:514].lower())  #2048
    return frame
32
+
33
+
34
# Page chrome. The robot emoji had been mis-decoded into mojibake; it is
# restored here so that page_icon is a single emoji again.
st.set_page_config(
    page_title="Create BERTopic",
    page_icon="🤖",
    layout="wide",
)

st.header("🤖 Create BERTopic")
st.subheader("Use this page to create a model with your data")

# Training inputs; they are read when "Train new model" is pressed.
model_name = st.text_input("Please enter a name for the new model (e.g., 'ukraine_war_jan5')")
df_name = st.text_input("Please enter data file path (e.g., 'data/df.csv')")
language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
text_col = st.text_input("Text column name (exactly as appears in the csv)")
date_col = st.text_input("Date column name (exactly as appears in the csv)")
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
# An empty entry is stored as None.
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
embs_name = st.text_input("Please enter embedding file path if any (e.g., 'data/embs.csv')")
sample = st.slider("Percent of data to use", 0, 100, 25)
emotions_yes = st.checkbox("Get emotions")
53
+
54
# Train an online BERTopic model from the CSV named in the form above.
if st.button("Train new model"):

    # Heavy imports are deferred until training is actually requested.
    from sentence_transformers import SentenceTransformer
    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
    from sklearn.decomposition import IncrementalPCA
    from bertopic.vectorizers import OnlineCountVectorizer
    np.random.seed(123)
    from river import cluster
    from helper import River

    # Incremental components, so the model can be fitted chunk by chunk.
    umap_model = IncrementalPCA(n_components=5)
    cluster_model = River(cluster.DBSTREAM(clustering_threshold = 1.5,
                                           fading_factor = 0.05,
                                           cleanup_interval = 7,
                                           intersection_factor = 0.5,
                                           minimum_weight = 1))
    vectorizer_model = OnlineCountVectorizer(decay=.01, stop_words=stopwords)
    embedding_model = "all-MiniLM-L6-v2" if language=="English" else "paraphrase-multilingual-MiniLM-L12-v2"
    sentence_model = SentenceTransformer(embedding_model)
    topic_model = BERTopic(verbose=True,
                           embedding_model=embedding_model,
                           umap_model=umap_model,
                           hdbscan_model=cluster_model,
                           vectorizer_model=vectorizer_model,
                           calculate_probabilities=True)

    with st.spinner("Preprocessing..."):
        df = pd.read_csv(df_name).sample(frac=sample/100)
        df = df.rename({text_col: 'text', date_col: 'date'}, axis=1)
        df.index = range(len(df))
        new_df = preproc(df)
        new_df['id'] = df.index
        all_docs = list(new_df.proc)
        st.write(len(df))
        st.write(len(new_df))

    with st.spinner("Generating embeddings. This may take a couple of hours..."):
        try:
            embeddings = np.array(pd.read_csv(embs_name).drop("Unnamed: 0",axis=1))
        except (OSError, ValueError, KeyError):
            # No usable embedding file (missing path, unreadable CSV or no
            # "Unnamed: 0" column): compute the embeddings and store them.
            embeddings = sentence_model.encode(all_docs, show_progress_bar=True)
            pd.DataFrame(embeddings).to_csv(f"embs_{model_name}.csv")

    with st.spinner("Creating the model. This may take a couple of minutes..."):
        chunk_size = 1000
        topics = []
        for start in range(0, len(all_docs), chunk_size):
            doc_chunk = all_docs[start:start + chunk_size]
            emb_chunk = embeddings[start:start + chunk_size]
            # Fit on the current chunk only. Previously every iteration was
            # given the full corpus, so `topics` ended up holding one full
            # set of assignments per chunk instead of one entry per document.
            topic_model.partial_fit(doc_chunk, emb_chunk)
            topics.extend(topic_model.topics_)
        topic_model.topics_ = topics

    if emotions_yes:
        with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
            ems = get_emotions(new_df,language)
            new_df = pd.merge(new_df, ems, on='id')

    st.session_state.model = topic_model
    st.session_state.df = new_df
    st.session_state.model_df = st.session_state.model.get_document_info(new_df.proc)

    topic_model.save(f"models/{model_name}")
    st.session_state.df.to_csv(f"models/df_{model_name}.csv")
    st.success(f"New model trained and saved as '{model_name}', dataframe saved as 'df_{model_name}.csv' in folter 'models'.")
pages/Update_Model.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd, numpy as np
3
+ from bertopic import BERTopic
4
+ from transformers import pipeline
5
+
6
@st.cache_data
def get_emotions(frame, language):
    """Score every text in ``frame.proc2`` with an emotion classifier.

    Args:
        frame: DataFrame with a ``proc2`` text column and, normally, an
            ``id`` column identifying each row.
        language: ``"Russian/Ukrainian"`` selects the rubert-tiny2 CEDR
            model; any other value selects the English distilroberta model.

    Returns:
        DataFrame with one score column per classifier label plus an ``id``
        column for merging the scores back onto ``frame``.
    """
    clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection" if language == "Russian/Ukrainian" else "j-hartmann/emotion-english-distilroberta-base"
    classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    # NOTE(review): the original stored the pipeline on the streamlit module;
    # kept in case other code reads `st.classifier` — confirm and remove.
    st.classifier = classifier
    scores = classifier(list(frame.proc2))
    # Each result is a list of {"label", "score"} dicts in a fixed label order;
    # guard against an empty frame, which yields no results at all.
    labels = [entry["label"] for entry in scores[0]] if scores else []
    temp = pd.DataFrame({label: [row[j]["score"] for row in scores] for j, label in enumerate(labels)})
    if "id" in frame.columns:
        # Reuse the frame's own ids: the session dataframe is not refreshed
        # between uploaded files, so its length would give stale ids from the
        # second file on and the merge on "id" would not line up.
        temp['id'] = list(frame['id'])
    else:
        # Fallback: number the rows after those already in the session dataframe.
        offset = len(st.session_state.df)
        temp['id'] = list(range(offset, offset + len(temp)))
    return temp
15
+
16
def preproc(frame):
    """Add the cleaned text columns ``proc`` and ``proc2`` to ``frame`` in place.

    ``proc`` is ``str(text)`` with URLs replaced by ``http``, @-mentions
    replaced by ``@user``, ``#`` replaced by a space, and outer whitespace
    stripped. ``proc2`` is ``proc`` cut to 2048 characters and lower-cased.

    Args:
        frame: DataFrame with a ``text`` column; it is modified in place.

    Returns:
        The same ``frame`` object.
    """
    import re

    def clean(text):
        text = re.sub(r"http\S+", "http", str(text))
        text = re.sub(r"@\S+", "@user", text)
        return re.sub(r"#", " ", text).strip()

    # Column-wise apply: one pass per text, and it also works on an empty
    # frame, where the previous row-wise DataFrame.apply did not yield a Series.
    frame["proc"] = frame["text"].apply(clean)
    frame["proc2"] = frame["proc"].apply(lambda text: text[:2048].lower())
    return frame
25
+
26
# Page chrome. The robot emoji had been mis-decoded into mojibake; it is
# restored here so that page_icon is a single emoji again.
st.set_page_config(
    page_title="Update BERTopic",
    page_icon="🤖",
    layout="wide",
)

st.header("🤖 Update BERTopic")
st.subheader("Use this page to update your model with new data")
34
+
35
# Update workflow: needs a model (and its dataframe) already in the session.
if "model" not in st.session_state:
    st.markdown("**No model detected. Please go to the Home Page and add a model first.**")
else:
    def _date_range(frame, fmt):
        """Return 'first -- last' (YYYY-MM-DD) for the ``date`` column of ``frame``."""
        dates = pd.to_datetime(frame.date, format=fmt)
        return f"{str(min(dates))[:10]} -- {str(max(dates))[:10]}"

    # Format chosen on an earlier page, if any. The dates were previously
    # parsed with a hard-coded '%d.%m.%Y', which fails for any other format.
    previous_format = st.session_state.get("datetime_format")

    old_df = st.session_state.df
    topics = list(st.session_state.model.topics_)
    st.markdown(f"**Current data:** {len(old_df)} rows, {len(st.session_state.model.topic_labels_)} topics.")
    st.markdown(f"**Current date range:** {_date_range(old_df, previous_format)}.")
    st.write(old_df)

    st.markdown("#### Please eneter a name for the updated model and upload files below")
    name = st.text_input("Please enter a model name (e.g., 'my_cool_model')")
    language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
    # Pre-fill with the stored format so that merely opening this page does
    # not reset it to None.
    datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value=previous_format or "")
    st.session_state.datetime_format = None if datetime_format == "" else datetime_format
    uploaded_files = st.file_uploader("Choose a CSV file", accept_multiple_files=True)
    if st.button('All files selected'):
        for uploaded_file in uploaded_files:
            new_df = pd.read_csv(uploaded_file)
            st.write(f"Uploaded file {uploaded_file.name}")

            with st.spinner("Preprocessing..."):
                new_df = preproc(new_df)
                # New rows continue the id sequence of the rows gathered so far.
                new_df['id'] = list(range(len(old_df), len(old_df) + len(new_df)))
                docs = list(new_df.proc)

            with st.spinner("Updating the model. This may take a couple of minutes..."):
                st.session_state.model.partial_fit(docs)
                topics.extend(st.session_state.model.topics_)

            with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
                ems = get_emotions(new_df, language)
                new_df = pd.merge(new_df, ems, on='id')

            # ignore_index keeps row labels unique; duplicated labels would
            # later turn into duplicated "id" values and inflate the merge.
            old_df = pd.concat([old_df, new_df], ignore_index=True)
            # Publish the grown dataframe after every file so that code which
            # derives ids from len(st.session_state.df) stays aligned.
            st.session_state.df = old_df
            st.success(f"Done with file {uploaded_file.name}!")

        # NOTE(review): indentation was lost in the source; everything below
        # is assumed to run once after the last file, inside the button branch.
        st.session_state.df = old_df
        st.session_state.model.topics_ = topics

        st.session_state.model.set_topic_labels(st.session_state.model.generate_topic_labels(nr_words=5, topic_prefix=False, word_length=10, separator=", "))
        st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc)
        st.session_state.df["id"] = st.session_state.model_df.index
        st.session_state.model_df["id"] = st.session_state.model_df.index
        st.session_state.model_df = pd.merge(st.session_state.model_df,st.session_state.df,how="left",on="id")
        st.session_state.model_df["date"] = pd.to_datetime(st.session_state.model_df.date, format=st.session_state.datetime_format)

        st.markdown("---")
        st.markdown(f"**Updated data:** {len(old_df)} rows, {len(st.session_state.model.topic_labels_)} topics.")
        st.markdown(f"**Updated date range:** {_date_range(old_df, st.session_state.datetime_format)}.")

        st.session_state.model.save(f"models/model_{name}")
        st.session_state.df.to_csv(f"models/df_{name}.csv")
        st.success(f"Model and dataframe saved in folder 'model'!")
89
+
90
+
91
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas==1.5.3
3
+ numpy
4
+ bertopic
5
+ river==0.10.0
6
+ scikit-learn