Spaces:
Running
Running
Yara Kyrychenko
committed on
Commit
•
8bf791d
1
Parent(s):
6074363
Add first files
Browse files- .DS_Store +0 -0
- Home_Page.py +143 -0
- pages/.DS_Store +0 -0
- pages/Create_Model.py +117 -0
- pages/Update_Model.py +91 -0
- requirements.txt +6 -0
.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
Home_Page.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd, numpy as np
|
3 |
+
from bertopic import BERTopic
|
4 |
+
from datetime import datetime
|
5 |
+
import math
|
6 |
+
from helper import visualize_topics_over_time, visualize_topics_per_class
|
7 |
+
|
8 |
+
@st.cache_data
def get_df(url):
    """Read the CSV found at ``url`` into a pandas DataFrame.

    The call is wrapped in ``st.cache_data``, so the decorator decides
    when the file is actually re-read.

    Args:
        url: Path or URL handed unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    loaded_frame = pd.read_csv(url)
    return loaded_frame
|
11 |
+
|
12 |
+
@st.cache_resource
def get_model(url):
    """Load a saved BERTopic model from ``url``.

    The call is wrapped in ``st.cache_resource``.

    Args:
        url: Location handed unchanged to ``BERTopic.load``.

    Returns:
        Whatever ``BERTopic.load`` returns for that location.
    """
    loaded_model = BERTopic.load(url)
    return loaded_model
|
15 |
+
|
16 |
+
@st.cache_data
def get_topics_over_time(frame, lens):
    """Compute topic frequencies over time for the documents in ``frame``.

    Uses the model stored in ``st.session_state.model`` and the date format
    stored in ``st.session_state.datetime_format``.

    Args:
        frame: DataFrame providing the ``proc2`` (text) and ``date`` columns.
        lens: Not used in the body. NOTE(review): presumably passed only so
            that it takes part in the ``st.cache_data`` key -- confirm.

    Returns:
        The result of ``topics_over_time`` on the session model.
    """
    documents = frame.proc2.apply(str)
    timestamps = pd.to_datetime(
        frame.date, format=st.session_state.datetime_format
    )
    # One bin per three distinct date values, rounded down.
    distinct_dates = len(frame.date.unique())
    bin_count = math.floor(distinct_dates / 3)
    return st.session_state.model.topics_over_time(
        documents, timestamps, nr_bins=bin_count
    )
|
21 |
+
|
22 |
+
@st.cache_data
def get_topics_per_class(frame, colname):
    """Compute per-class topic representations for the documents in ``frame``.

    Both the documents and the class labels are taken from ``frame`` so the
    two series always describe the same rows. The model itself is read from
    ``st.session_state.model``.

    Args:
        frame: DataFrame providing the ``proc2`` (text) column and the
            ``colname`` column.
        colname: Name of the column whose values (stringified) are used as
            class labels.

    Returns:
        The result of ``topics_per_class`` on the session model.
    """
    strings = frame.proc2.apply(str)
    # Read the labels from the frame that was passed in (it is also the frame
    # st.cache_data keys on), not from whatever st.session_state.df holds.
    classes = frame[colname].apply(str)
    return st.session_state.model.topics_per_class(strings, classes=classes)
|
27 |
+
|
28 |
+
st.set_page_config(
|
29 |
+
page_title="BoardTopic",
|
30 |
+
page_icon="π€",
|
31 |
+
layout="wide"
|
32 |
+
)
|
33 |
+
|
34 |
+
st.header("π€ BoardTopic")
|
35 |
+
st.subheader("Turning your data into insight with behavioral data science")
|
36 |
+
|
37 |
+
if "model" not in st.session_state:
|
38 |
+
st.markdown("Welcome to BoardTopic, a friendly way to understand your big data.")
|
39 |
+
st.markdown("If you do not have a BoardTopic model trained, please go to the 'Create Model' tab.")
|
40 |
+
st.markdown("If you already have a BoardTopic model trained, please enter the information below:")
|
41 |
+
model_name = st.text_input("Please enter model file name (e.g., 'model')")
|
42 |
+
df_name = st.text_input("Please enter dataframe file name (e.g., 'df_small.csv')")
|
43 |
+
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
|
44 |
+
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
|
45 |
+
if st.button("Enter"):
|
46 |
+
st.session_state.model = get_model(f'models/{model_name}')
|
47 |
+
st.session_state.df = get_df(f'models/{df_name}')
|
48 |
+
st.success("Model and dataframe loaded!")
|
49 |
+
if "model" in st.session_state:
|
50 |
+
if "datetime_format" not in st.session_state:
|
51 |
+
st.session_state.datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="", key="datetime_format")
|
52 |
+
st.session_state.datetime_format = None if st.session_state.datetime_format == "" else st.session_state.datetime_format
|
53 |
+
#st.session_state.df = get_df("df_small.csv")
|
54 |
+
st.session_state.model.set_topic_labels(st.session_state.model.generate_topic_labels(nr_words=6, topic_prefix=False, word_length=10, separator=", "))
|
55 |
+
st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc)
|
56 |
+
st.session_state.df["id"] = st.session_state.model_df.index
|
57 |
+
st.session_state.model_df["id"] = st.session_state.model_df.index
|
58 |
+
st.session_state.model_df = pd.merge(st.session_state.model_df,st.session_state.df,how="left",on="id")
|
59 |
+
st.session_state.model_df["date"] = pd.to_datetime(st.session_state.model_df.date,format=st.session_state.datetime_format)
|
60 |
+
|
61 |
+
topics_over_time = get_topics_over_time(st.session_state.df,len(st.session_state.df))
|
62 |
+
largest_topics = st.session_state.model_df.groupby("Topic").agg("count").sort_values("Document",ascending=False)[0:10]
|
63 |
+
st.write(visualize_topics_over_time(st.session_state.model, topics_over_time, topics=list(largest_topics.index),
|
64 |
+
custom_labels=True, title = "10 most popular narratives over time"))
|
65 |
+
|
66 |
+
st.markdown("#### Overall document distribution")
|
67 |
+
|
68 |
+
grouped = st.session_state.model_df.groupby("date").agg("count")
|
69 |
+
grouped['date'] = pd.to_datetime(grouped.index,format=st.session_state.datetime_format)
|
70 |
+
st.bar_chart(data=grouped, x='date', y='Document')
|
71 |
+
|
72 |
+
st.markdown("#### Emotions")
|
73 |
+
|
74 |
+
joy = st.session_state.model_df.joy.apply(lambda x: 1 if x > 0.9 else 0)
|
75 |
+
sadness = st.session_state.model_df.sadness.apply(lambda x: 1 if x > 0.9 else 0)
|
76 |
+
surprise = st.session_state.model_df.surprise.apply(lambda x: 1 if x > 0.9 else 0)
|
77 |
+
fear = st.session_state.model_df.fear.apply(lambda x: 1 if x > 0.9 else 0)
|
78 |
+
anger = st.session_state.model_df.anger.apply(lambda x: 1 if x > 0.9 else 0)
|
79 |
+
|
80 |
+
emotions = pd.DataFrame({"date":st.session_state.model_df.date, "source": st.session_state.model_df.source,
|
81 |
+
"joy":joy, "sadness":sadness, "surprise":surprise, "fear":fear, "anger":anger})
|
82 |
+
#dates = pd.to_datetime(emotions.date.unique(),format="%d.%m.%Y").sort_values()
|
83 |
+
#emotions["date"] = pd.to_datetime(emotions.date,format="%d.%m.%Y")
|
84 |
+
#emnew = emotions[(dates[-7] <= emotions.date) & (emotions.date <= dates[-1])].drop('date',axis=1, inplace=False).mean()
|
85 |
+
#emplot = pd.DataFrame({f"Week of {str(dates[-14])[:10]}": emold, f"Week of {str(dates[-7])[:10]}": emnew}).T
|
86 |
+
|
87 |
+
st.markdown("##### Percent with emotion by platform")
|
88 |
+
st.bar_chart(emotions.groupby("source").agg("mean").T*100)
|
89 |
+
|
90 |
+
st.markdown("##### Platform breakdown")
|
91 |
+
st.bar_chart(emotions.groupby("source").agg("mean")*100)
|
92 |
+
|
93 |
+
emotionsgr = emotions.groupby("date").agg("mean")*100
|
94 |
+
emotionsgr['date'] = pd.to_datetime(grouped.index,format=st.session_state.datetime_format)
|
95 |
+
|
96 |
+
st.markdown("##### Emotional dynamics over time")
|
97 |
+
st.line_chart(emotionsgr,x="date")
|
98 |
+
|
99 |
+
st.markdown("#### Topics per class")
|
100 |
+
if "source" in st.session_state.df.columns:
|
101 |
+
topics_per_class1 = get_topics_per_class(st.session_state.df,"source")
|
102 |
+
st.plotly_chart(visualize_topics_per_class(st.session_state.model, topics_per_class1, top_n_topics=20, width = 900, height = 600,
|
103 |
+
custom_labels=True, title = "20 most popular narratives per platform"))
|
104 |
+
st.session_state.df["emotion"] = st.session_state.df[["joy","sadness","surprise","fear",'anger','no_emotion']].idxmax(axis=1)
|
105 |
+
topics_per_class2 = get_topics_per_class(st.session_state.df,"emotion")
|
106 |
+
st.plotly_chart(visualize_topics_per_class(st.session_state.model, topics_per_class2, top_n_topics=20, width = 900, height = 600,
|
107 |
+
custom_labels=True, title = "20 most popular narratives per emotion"))
|
108 |
+
|
109 |
+
st.markdown("#### All topics")
|
110 |
+
last_week = st.session_state.model_df
|
111 |
+
largest_topics_last_week = last_week.groupby("Topic").agg("count").sort_values("Document",ascending=False)
|
112 |
+
largest_topics_last_week["Name"] = [ list(last_week[last_week.Topic == i]["CustomName"])[0] for i in largest_topics_last_week.index ]
|
113 |
+
largest_topics_last_week["Count"] = largest_topics_last_week["Document"]
|
114 |
+
largest_topics_last_week["Percent"] = round(100*largest_topics_last_week["Count"]/len(st.session_state.model_df),3)
|
115 |
+
st.table(largest_topics_last_week[["Name", "Count","Percent"]])
|
116 |
+
|
117 |
+
dictionary = {i:st.session_state.model.custom_labels_[i] for i in range(len(st.session_state.model.custom_labels_))}
|
118 |
+
def mapping(item):
|
119 |
+
return dictionary[item]
|
120 |
+
|
121 |
+
st.markdown("#### Explore representative documents")
|
122 |
+
st.selectbox("Select topic",list(st.session_state.model_df.Topic.unique()),key="selected_topic",format_func=mapping)
|
123 |
+
repr_docs_mappings, repr_docs, repr_docs_indices = st.session_state.model._extract_representative_docs(st.session_state.model.c_tf_idf_,st.session_state.model_df,st.session_state.model.topic_representations_)
|
124 |
+
ind = repr_docs_indices[st.session_state.selected_topic]
|
125 |
+
j = 1
|
126 |
+
for doc in st.session_state.model_df.iloc[ind].Document:
|
127 |
+
st.markdown(f"**Representative document {j}**")
|
128 |
+
st.text(doc)
|
129 |
+
j+=1
|
130 |
+
|
131 |
+
st.markdown("---")
|
132 |
+
st.markdown("### Save current model")
|
133 |
+
name = st.text_input("Please name this model file (e.g., 'my_cool_model')")
|
134 |
+
if st.button("Save this model"):
|
135 |
+
st.session_state.model.save(f"models/model_{name}")
|
136 |
+
st.session_state.df.to_csv(f"models/df_{name}.csv")
|
137 |
+
st.success(f"Model and dataframe saved in folder 'models'!")
|
138 |
+
if st.button("Restart"):
|
139 |
+
st.cache_data.clear()
|
140 |
+
st.cache_resource.clear()
|
141 |
+
for key in st.session_state.keys():
|
142 |
+
del st.session_state[key]
|
143 |
+
|
pages/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
pages/Create_Model.py
ADDED
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd, numpy as np
|
3 |
+
from bertopic import BERTopic
|
4 |
+
from transformers import pipeline
|
5 |
+
|
6 |
+
def make_stopwords():
    """Load the stop-word list from ``dicts/stopwords.txt``.

    The file is read as UTF-8, one stop word per line. Surrounding
    whitespace is stripped and blank lines are skipped, so a trailing
    newline does not produce an empty stop word.

    Returns:
        list[str]: The stop words in file order.

    Raises:
        OSError: If ``dicts/stopwords.txt`` cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open("dicts/stopwords.txt", "r", encoding="utf-8") as text_file:
        stripped = (line.strip() for line in text_file)
        return [word for word in stripped if word]
|
11 |
+
stopwords = make_stopwords()
|
12 |
+
|
13 |
+
@st.cache_data
def get_emotions(frame, language):
    """Score every text in ``frame.proc2`` with an emotion classifier.

    Args:
        frame: DataFrame providing the ``proc2`` text column. If it also has
            an ``id`` column, those ids are copied to the result so that the
            caller's ``pd.merge(..., on='id')`` lines up row for row.
        language: ``"Russian/Ukrainian"`` selects the rubert-tiny2 CEDR
            model; any other value selects the English distilroberta model.

    Returns:
        pandas.DataFrame: One column per emotion label holding the score for
        each text, plus an ``id`` column.
    """
    if language == "Russian/Ukrainian":
        clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection"
    else:
        clasif = "j-hartmann/emotion-english-distilroberta-base"

    # The pipeline is still stored on the streamlit module, as before, in
    # case other code reads st.classifier.
    st.classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    scored = st.classifier(list(frame.proc2))

    # Each element of `scored` is a list of {"label": ..., "score": ...}
    # dicts for one text; turn that into one row per text.
    temp = pd.DataFrame(
        [{entry["label"]: entry["score"] for entry in row} for row in scored]
    )

    if "id" in frame.columns:
        # Reuse the ids already assigned to the rows being scored. Deriving
        # them from len(st.session_state.df) fails when no dataframe is in
        # the session yet and otherwise yields ids that do not match `frame`.
        temp["id"] = list(frame["id"])
    else:
        offset = len(st.session_state.df) if "df" in st.session_state else 0
        temp["id"] = range(offset, offset + len(temp))
    return temp
|
22 |
+
|
23 |
+
def preproc(frame, max_chars=514):
    """Add cleaned-text columns ``proc`` and ``proc2`` to ``frame`` in place.

    ``proc`` is the stringified ``text`` column with every ``http...`` token
    replaced by ``http``, every ``@...`` token replaced by ``@user``, every
    ``#`` replaced by a space, and surrounding whitespace stripped.
    ``proc2`` is ``proc`` cut to ``max_chars`` characters and lower-cased.

    Args:
        frame: DataFrame with a ``text`` column. It is modified in place.
        max_chars: Maximum number of characters kept in ``proc2``.

    Returns:
        The same ``frame`` object, with ``proc`` and ``proc2`` set.
    """
    import re

    url_pattern = re.compile(r"http\S+")
    mention_pattern = re.compile(r"@\S+")

    # Clean value by value rather than with a row-wise DataFrame.apply: it
    # avoids building a Series per row and also works on an empty frame.
    cleaned = []
    for raw in frame["text"]:
        text = url_pattern.sub("http", str(raw))
        text = mention_pattern.sub("@user", text)
        cleaned.append(text.replace("#", " ").strip())

    frame["proc"] = cleaned
    frame["proc2"] = [text[:max_chars].lower() for text in cleaned]
    return frame
|
32 |
+
|
33 |
+
|
34 |
+
st.set_page_config(
|
35 |
+
page_title="Create BERTopic",
|
36 |
+
page_icon="π€",
|
37 |
+
layout="wide"
|
38 |
+
)
|
39 |
+
|
40 |
+
st.header("π€ Create BERTopic")
|
41 |
+
st.subheader("Use this page to create a model with your data")
|
42 |
+
|
43 |
+
model_name = st.text_input("Please enter a name for the new model (e.g., 'ukraine_war_jan5')")
|
44 |
+
df_name = st.text_input("Please enter data file path (e.g., 'data/df.csv')")
|
45 |
+
language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
|
46 |
+
text_col = st.text_input("Text column name (exactly as appears in the csv)")
|
47 |
+
date_col = st.text_input("Date column name (exactly as appears in the csv)")
|
48 |
+
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
|
49 |
+
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
|
50 |
+
embs_name = st.text_input("Please enter embedding file path if any (e.g., 'data/embs.csv')")
|
51 |
+
sample = st.slider("Percent of data to use", 0, 100, 25)
|
52 |
+
emotions_yes = st.checkbox("Get emotions")
|
53 |
+
|
54 |
+
if st.button("Train new model"):
|
55 |
+
|
56 |
+
from sentence_transformers import SentenceTransformer
|
57 |
+
# https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
|
58 |
+
from sklearn.cluster import MiniBatchKMeans
|
59 |
+
from sklearn.decomposition import IncrementalPCA
|
60 |
+
from bertopic.vectorizers import OnlineCountVectorizer
|
61 |
+
np.random.seed(123)
|
62 |
+
from river import cluster
|
63 |
+
from helper import River
|
64 |
+
|
65 |
+
umap_model = IncrementalPCA(n_components=5)
|
66 |
+
cluster_model = River(cluster.DBSTREAM(clustering_threshold = 1.5,
|
67 |
+
fading_factor = 0.05,
|
68 |
+
cleanup_interval = 7,
|
69 |
+
intersection_factor = 0.5,
|
70 |
+
minimum_weight = 1))
|
71 |
+
vectorizer_model = OnlineCountVectorizer(decay=.01,stop_words=stopwords)
|
72 |
+
embedding_model = "all-MiniLM-L6-v2" if language=="English" else "paraphrase-multilingual-MiniLM-L12-v2"
|
73 |
+
sentence_model = SentenceTransformer(embedding_model)
|
74 |
+
topic_model = BERTopic(verbose=True,
|
75 |
+
embedding_model=embedding_model,
|
76 |
+
umap_model=umap_model,
|
77 |
+
hdbscan_model=cluster_model,
|
78 |
+
vectorizer_model=vectorizer_model,
|
79 |
+
calculate_probabilities=True)
|
80 |
+
|
81 |
+
with st.spinner("Preprocessing..."):
|
82 |
+
df = pd.read_csv(df_name).sample(frac=sample/100)
|
83 |
+
df = df.rename({text_col: 'text', date_col: 'date'}, axis=1)
|
84 |
+
df.index = range(len(df))
|
85 |
+
new_df = preproc(df)
|
86 |
+
new_df['id'] = df.index
|
87 |
+
all_docs = list(new_df.proc)
|
88 |
+
st.write(len(df))
|
89 |
+
st.write(len(new_df))
|
90 |
+
|
91 |
+
with st.spinner("Generating embeddings. This may take a couple of hours..."):
    # Try to reuse precomputed embeddings; fall back to encoding the
    # documents when none are given, the file is unusable, or the file does
    # not have one row per document.
    embeddings = None
    if embs_name:
        try:
            embeddings = np.array(pd.read_csv(embs_name).drop("Unnamed: 0", axis=1))
        except (OSError, ValueError, KeyError) as load_error:
            # Missing/unreadable file, unparsable CSV, or no "Unnamed: 0"
            # index column to drop.
            st.warning(f"Could not load embeddings from '{embs_name}' ({load_error}); computing them instead.")
        else:
            if len(embeddings) != len(new_df):
                st.warning(f"Embedding file has {len(embeddings)} rows but the data has {len(new_df)}; computing embeddings instead.")
                embeddings = None
    if embeddings is None:
        embeddings = sentence_model.encode(new_df.proc, show_progress_bar=True)
        # Save the fresh embeddings so a later run can load them.
        pd.DataFrame(embeddings).to_csv(f"embs_{model_name}.csv")
|
97 |
+
|
98 |
+
with st.spinner("Creating the model. This may take a couple of minutes..."):
    # Online training: feed the model one chunk of documents, with the
    # matching slice of embeddings, at a time.
    chunk_size = 1000
    doc_emb_chunks = [
        (all_docs[start:start + chunk_size], embeddings[start:start + chunk_size])
        for start in range(0, len(all_docs), chunk_size)
    ]
    topics = []
    for doc_chunk, emb_chunk in doc_emb_chunks:
        # Fit on the current chunk only. Passing all_docs/embeddings here
        # refits the whole corpus on every iteration and makes `topics`
        # n_chunks times longer than the document list.
        topic_model.partial_fit(doc_chunk, emb_chunk)
        # topics_ is read after each partial_fit call, so accumulate it.
        topics.extend(topic_model.topics_)
    topic_model.topics_ = topics
|
105 |
+
|
106 |
+
if emotions_yes:
|
107 |
+
with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
|
108 |
+
ems = get_emotions(new_df,language)
|
109 |
+
new_df = pd.merge(new_df, ems, on='id')
|
110 |
+
|
111 |
+
st.session_state.model = topic_model
|
112 |
+
st.session_state.df = new_df
|
113 |
+
st.session_state.model_df = st.session_state.model.get_document_info(new_df.proc)
|
114 |
+
|
115 |
+
topic_model.save(f"models/{model_name}")
|
116 |
+
st.session_state.df.to_csv(f"models/df_{model_name}.csv")
|
117 |
+
st.success(f"New model trained and saved as '{model_name}', dataframe saved as 'df_{model_name}.csv' in folter 'models'.")
|
pages/Update_Model.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd, numpy as np
|
3 |
+
from bertopic import BERTopic
|
4 |
+
from transformers import pipeline
|
5 |
+
|
6 |
+
@st.cache_data
def get_emotions(frame, language):
    """Score every text in ``frame.proc2`` with an emotion classifier.

    Args:
        frame: DataFrame providing the ``proc2`` text column. If it also has
            an ``id`` column, those ids are copied to the result so that the
            caller's ``pd.merge(..., on='id')`` lines up row for row.
        language: ``"Russian/Ukrainian"`` selects the rubert-tiny2 CEDR
            model; any other value selects the English distilroberta model.

    Returns:
        pandas.DataFrame: One column per emotion label holding the score for
        each text, plus an ``id`` column.
    """
    if language == "Russian/Ukrainian":
        clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection"
    else:
        clasif = "j-hartmann/emotion-english-distilroberta-base"

    # The pipeline is still stored on the streamlit module, as before, in
    # case other code reads st.classifier.
    st.classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    scored = st.classifier(list(frame.proc2))

    # Each element of `scored` is a list of {"label": ..., "score": ...}
    # dicts for one text; turn that into one row per text.
    temp = pd.DataFrame(
        [{entry["label"]: entry["score"] for entry in row} for row in scored]
    )

    if "id" in frame.columns:
        # Reuse the ids already assigned to the rows being scored. When
        # several files are uploaded, st.session_state.df is still the old
        # frame while later files are processed, so ids derived from its
        # length would not match the ones the caller gave `frame`.
        temp["id"] = list(frame["id"])
    else:
        offset = len(st.session_state.df) if "df" in st.session_state else 0
        temp["id"] = range(offset, offset + len(temp))
    return temp
|
15 |
+
|
16 |
+
def preproc(frame, max_chars=2048):
    """Add cleaned-text columns ``proc`` and ``proc2`` to ``frame`` in place.

    ``proc`` is the stringified ``text`` column with every ``http...`` token
    replaced by ``http``, every ``@...`` token replaced by ``@user``, every
    ``#`` replaced by a space, and surrounding whitespace stripped.
    ``proc2`` is ``proc`` cut to ``max_chars`` characters and lower-cased.

    Args:
        frame: DataFrame with a ``text`` column. It is modified in place.
        max_chars: Maximum number of characters kept in ``proc2``.

    Returns:
        The same ``frame`` object, with ``proc`` and ``proc2`` set.
    """
    import re

    url_pattern = re.compile(r"http\S+")
    mention_pattern = re.compile(r"@\S+")

    # Clean value by value rather than with a row-wise DataFrame.apply: it
    # avoids building a Series per row and also works on an empty frame.
    cleaned = []
    for raw in frame["text"]:
        text = url_pattern.sub("http", str(raw))
        text = mention_pattern.sub("@user", text)
        cleaned.append(text.replace("#", " ").strip())

    frame["proc"] = cleaned
    frame["proc2"] = [text[:max_chars].lower() for text in cleaned]
    return frame
|
25 |
+
|
26 |
+
st.set_page_config(
|
27 |
+
page_title="Update BERTopic",
|
28 |
+
page_icon="π€",
|
29 |
+
layout="wide"
|
30 |
+
)
|
31 |
+
|
32 |
+
st.header("π€ Update BERTopic")
|
33 |
+
st.subheader("Use this page to update your model with new data")
|
34 |
+
|
35 |
+
if "model" not in st.session_state:
|
36 |
+
st.markdown("**No model detected. Please go to the Home Page and add a model first.**")
|
37 |
+
if "model" in st.session_state:
|
38 |
+
old_df = st.session_state.df
|
39 |
+
topics = list(st.session_state.model.topics_)
|
40 |
+
st.markdown(f"**Current data:** {len(old_df)} rows, {len(st.session_state.model.topic_labels_)} topics.")
|
41 |
+
st.markdown(f"**Current date range:** {str(min(pd.to_datetime(old_df.date, format='%d.%m.%Y')))[:10]} -- {str(max(pd.to_datetime(old_df.date,format='%d.%m.%Y')))[:10]}.")
|
42 |
+
st.write(old_df)
|
43 |
+
|
44 |
+
st.markdown("#### Please eneter a name for the updated model and upload files below")
|
45 |
+
name = st.text_input("Please enter a model name (e.g., 'my_cool_model')")
|
46 |
+
language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
|
47 |
+
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
|
48 |
+
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
|
49 |
+
uploaded_files = st.file_uploader("Choose a CSV file", accept_multiple_files=True)
|
50 |
+
if st.button('All files selected'):
|
51 |
+
for i in range(len(uploaded_files)):
|
52 |
+
uploaded_file = uploaded_files[i]
|
53 |
+
new_df = pd.read_csv(uploaded_file)
|
54 |
+
st.write(f"Uploaded file {uploaded_file.name}")
|
55 |
+
|
56 |
+
with st.spinner("Preprocessing..."):
|
57 |
+
new_df= preproc(new_df)
|
58 |
+
new_df['id'] = [i for i in range(len(old_df),len(new_df)+len(old_df))]
|
59 |
+
docs = list(new_df.proc)
|
60 |
+
|
61 |
+
with st.spinner("Updating the model. This may take a couple of minutes..."):
|
62 |
+
st.session_state.model.partial_fit(docs)
|
63 |
+
topics.extend(st.session_state.model.topics_)
|
64 |
+
|
65 |
+
with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
|
66 |
+
ems = get_emotions(new_df, language)
|
67 |
+
new_df = pd.merge(new_df, ems, on='id')
|
68 |
+
|
69 |
+
old_df = pd.concat([old_df,new_df])
|
70 |
+
st.success(f"Done with file {uploaded_file.name}!")
|
71 |
+
if i == len(uploaded_files)-1:
|
72 |
+
st.session_state.df = old_df
|
73 |
+
st.session_state.model.topics_ = topics
|
74 |
+
|
75 |
+
st.session_state.model.set_topic_labels(st.session_state.model.generate_topic_labels(nr_words=5, topic_prefix=False, word_length=10, separator=", "))
|
76 |
+
st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc)
|
77 |
+
st.session_state.df["id"] = st.session_state.model_df.index
|
78 |
+
st.session_state.model_df["id"] = st.session_state.model_df.index
|
79 |
+
st.session_state.model_df = pd.merge(st.session_state.model_df,st.session_state.df,how="left",on="id")
|
80 |
+
st.session_state.model_df["date"] = pd.to_datetime(st.session_state.model_df.date, format="%d.%m.%Y")
|
81 |
+
|
82 |
+
st.markdown("---")
|
83 |
+
st.markdown(f"**Updated data:** {len(old_df)} rows, {len(st.session_state.model.topic_labels_)} topics.")
|
84 |
+
st.markdown(f"**Updated date range:** {str(min(pd.to_datetime(old_df.date, format='%d.%m.%Y')))[:10]} -- {str(max(pd.to_datetime(old_df.date,format='%d.%m.%Y')))[:10]}.")
|
85 |
+
|
86 |
+
st.session_state.model.save(f"models/model_{name}")
|
87 |
+
st.session_state.df.to_csv(f"models/df_{name}.csv")
|
88 |
+
st.success(f"Model and dataframe saved in folder 'model'!")
|
89 |
+
|
90 |
+
|
91 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
pandas==1.5.3
|
3 |
+
numpy
|
4 |
+
bertopic
|
5 |
+
river==0.10.0
|
6 |
+
scikit-learn
|