Spaces:

YaraKyrychenko
/

BERTinsights

Running

App Files Files Community

Yara Kyrychenko committed on Sep 13, 2023

Commit

8bf791d

•

1 Parent(s): 6074363

Add first files

Browse files

Files changed (6) hide show

.DS_Store +0 -0
Home_Page.py +143 -0
pages/.DS_Store +0 -0
pages/Create_Model.py +117 -0
pages/Update_Model.py +91 -0
requirements.txt +6 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

Home_Page.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import streamlit as st
+import pandas as pd, numpy as np
+from bertopic import BERTopic
+from datetime import datetime
+import math
+from helper import visualize_topics_over_time, visualize_topics_per_class
+@st.cache_data
+def get_df(url):
+    return pd.read_csv(url)
+@st.cache_resource
+def get_model(url):
+    return BERTopic.load(url)
+@st.cache_data
+def get_topics_over_time(frame,lens):
+    strings = frame.proc2.apply(lambda x: str(x))
+    date = pd.to_datetime(frame.date,format=st.session_state.datetime_format)
+    return st.session_state.model.topics_over_time(strings, date, nr_bins=math.floor(len(frame.date.unique())/3))
+@st.cache_data
+def get_topics_per_class(frame,colname):
+    strings = frame.proc2.apply(lambda x: str(x))
+    classes = st.session_state.df[colname].apply(lambda x: str(x))
+    return st.session_state.model.topics_per_class(strings, classes=classes)
+st.set_page_config(
+    page_title="BoardTopic",
+    page_icon="🤖",
+   layout="wide"
+)
+st.header("🤖 BoardTopic")
+st.subheader("Turning your data into insight with behavioral data science")
+if "model" not in st.session_state:
+    st.markdown("Welcome to BoardTopic, a friendly way to understand your big data.")
+    st.markdown("If you do not have a BoardTopic model trained, please go to the 'Create Model' tab.")
+    st.markdown("If you already have a BoardTopic model trained, please enter the information below:")
+    model_name = st.text_input("Please enter model file name (e.g., 'model')")
+    df_name =  st.text_input("Please enter dataframe file name (e.g., 'df_small.csv')")
+    datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
+    st.session_state.datetime_format = None if datetime_format == "" else datetime_format
+    if st.button("Enter"):
+        st.session_state.model = get_model(f'models/{model_name}')
+        st.session_state.df = get_df(f'models/{df_name}')
+        st.success("Model and dataframe loaded!")
+if "model" in st.session_state:
+    if "datetime_format" not in st.session_state:
+        st.session_state.datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="", key="datetime_format")
+        st.session_state.datetime_format = None if st.session_state.datetime_format == "" else st.session_state.datetime_format
+    #st.session_state.df = get_df("df_small.csv")
+    st.session_state.model.set_topic_labels(st.session_state.model.generate_topic_labels(nr_words=6, topic_prefix=False, word_length=10, separator=", "))
+    st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc)
+    st.session_state.df["id"] = st.session_state.model_df.index
+    st.session_state.model_df["id"] = st.session_state.model_df.index
+    st.session_state.model_df = pd.merge(st.session_state.model_df,st.session_state.df,how="left",on="id")
+    st.session_state.model_df["date"] = pd.to_datetime(st.session_state.model_df.date,format=st.session_state.datetime_format)
+    topics_over_time = get_topics_over_time(st.session_state.df,len(st.session_state.df))
+    largest_topics  = st.session_state.model_df.groupby("Topic").agg("count").sort_values("Document",ascending=False)[0:10]
+    st.write(visualize_topics_over_time(st.session_state.model, topics_over_time, topics=list(largest_topics.index),
+                                                           custom_labels=True, title = "10 most popular narratives over time"))
+    st.markdown("#### Overall document distribution")
+    grouped = st.session_state.model_df.groupby("date").agg("count")
+    grouped['date'] = pd.to_datetime(grouped.index,format=st.session_state.datetime_format)
+    st.bar_chart(data=grouped, x='date', y='Document')
+    st.markdown("#### Emotions")
+    joy = st.session_state.model_df.joy.apply(lambda x: 1 if x > 0.9 else 0)
+    sadness = st.session_state.model_df.sadness.apply(lambda x: 1 if x > 0.9 else 0)
+    surprise = st.session_state.model_df.surprise.apply(lambda x: 1 if x > 0.9 else 0)
+    fear = st.session_state.model_df.fear.apply(lambda x: 1 if x > 0.9 else 0)
+    anger = st.session_state.model_df.anger.apply(lambda x: 1 if x > 0.9 else 0)
+    emotions = pd.DataFrame({"date":st.session_state.model_df.date, "source": st.session_state.model_df.source,
+                        "joy":joy, "sadness":sadness, "surprise":surprise, "fear":fear, "anger":anger})
+#dates = pd.to_datetime(emotions.date.unique(),format="%d.%m.%Y").sort_values()
+#emotions["date"] = pd.to_datetime(emotions.date,format="%d.%m.%Y")
+#emnew = emotions[(dates[-7] <= emotions.date) & (emotions.date <= dates[-1])].drop('date',axis=1, inplace=False).mean()
+#emplot = pd.DataFrame({f"Week of {str(dates[-14])[:10]}": emold, f"Week of {str(dates[-7])[:10]}": emnew}).T
+    st.markdown("##### Percent with emotion by platform")
+    st.bar_chart(emotions.groupby("source").agg("mean").T*100)
+    st.markdown("##### Platform breakdown")
+    st.bar_chart(emotions.groupby("source").agg("mean")*100)
+    emotionsgr = emotions.groupby("date").agg("mean")*100
+    emotionsgr['date'] = pd.to_datetime(grouped.index,format=st.session_state.datetime_format)
+    st.markdown("##### Emotional dynamics over time")
+    st.line_chart(emotionsgr,x="date")
+    st.markdown("#### Topics per class")
+    if "source" in st.session_state.df.columns:
+        topics_per_class1 = get_topics_per_class(st.session_state.df,"source")
+        st.plotly_chart(visualize_topics_per_class(st.session_state.model, topics_per_class1, top_n_topics=20, width = 900, height = 600,
+                                               custom_labels=True, title = "20 most popular narratives per platform"))
+    st.session_state.df["emotion"] = st.session_state.df[["joy","sadness","surprise","fear",'anger','no_emotion']].idxmax(axis=1)
+    topics_per_class2 = get_topics_per_class(st.session_state.df,"emotion")
+    st.plotly_chart(visualize_topics_per_class(st.session_state.model, topics_per_class2, top_n_topics=20, width = 900, height = 600,
+                                               custom_labels=True, title = "20 most popular narratives per emotion"))
+    st.markdown("#### All topics")
+    last_week = st.session_state.model_df
+    largest_topics_last_week = last_week.groupby("Topic").agg("count").sort_values("Document",ascending=False)
+    largest_topics_last_week["Name"] = [ list(last_week[last_week.Topic == i]["CustomName"])[0] for i in largest_topics_last_week.index ]
+    largest_topics_last_week["Count"] = largest_topics_last_week["Document"]
+    largest_topics_last_week["Percent"] = round(100*largest_topics_last_week["Count"]/len(st.session_state.model_df),3)
+    st.table(largest_topics_last_week[["Name", "Count","Percent"]])
+    dictionary = {i:st.session_state.model.custom_labels_[i] for i in range(len(st.session_state.model.custom_labels_))}
+    def mapping(item):
+        return dictionary[item]
+    st.markdown("#### Explore representative documents")
+    st.selectbox("Select topic",list(st.session_state.model_df.Topic.unique()),key="selected_topic",format_func=mapping)
+    repr_docs_mappings, repr_docs, repr_docs_indices = st.session_state.model._extract_representative_docs(st.session_state.model.c_tf_idf_,st.session_state.model_df,st.session_state.model.topic_representations_)
+    ind = repr_docs_indices[st.session_state.selected_topic]
+    j = 1
+    for doc in st.session_state.model_df.iloc[ind].Document:
+        st.markdown(f"**Representative document {j}**")
+        st.text(doc)
+        j+=1
+    st.markdown("---")
+    st.markdown("### Save current model")
+    name = st.text_input("Please name this model file (e.g., 'my_cool_model')")
+    if st.button("Save this model"):
+        st.session_state.model.save(f"models/model_{name}")
+        st.session_state.df.to_csv(f"models/df_{name}.csv")
+        st.success(f"Model and dataframe saved in folder 'models'!")
+    if st.button("Restart"):
+        st.cache_data.clear()
+        st.cache_resource.clear()
+        for key in st.session_state.keys():
+            del st.session_state[key]

pages/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

pages/Create_Model.py ADDED Viewed

	@@ -0,0 +1,117 @@

+import streamlit as st
+import pandas as pd, numpy as np
+from bertopic import BERTopic
+from transformers import pipeline
+def make_stopwords():
+    text_file = open("dicts/stopwords.txt", "r")
+    stopwords_list = text_file.read().split("\n")
+    text_file.close()
+    return stopwords_list
+stopwords = make_stopwords()
+@st.cache_data
+def get_emotions(frame, language):
+    clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection" if language == "Russian/Ukrainian" else "j-hartmann/emotion-english-distilroberta-base"
+    st.classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
+    temp = st.classifier(list(frame.proc2))
+    rangelabels = len(temp[0])
+    temp = pd.DataFrame({temp[0][j]["label"]: [ temp[i][j]["score"] for i in range(len(temp)) ] for j in range(rangelabels)})
+    temp['id'] = [i for i in range(len(st.session_state.df),len(temp)+len(st.session_state.df))]
+    return temp
+def preproc(frame):
+    import re
+    frame["proc"] = frame.text.apply(lambda x: str(x))
+    frame.proc = frame.apply(lambda row: re.sub(r"http\S+", "http", row.proc), 1)
+    frame.proc = frame.apply(lambda row: re.sub(r"@\S+", "@user", row.proc), 1)
+    frame.proc = frame.apply(lambda row: re.sub(r"#", " ", row.proc).strip(), 1)
+    frame["proc2"] = frame.proc
+    frame.proc2 = frame.proc2.apply(lambda row: row[:514].lower()) #2048
+    return frame
+st.set_page_config(
+    page_title="Create BERTopic",
+    page_icon="🤖",
+    layout="wide"
+)
+st.header("🤖 Create BERTopic")
+st.subheader("Use this page to create a model with your data")
+model_name = st.text_input("Please enter a name for the new model (e.g., 'ukraine_war_jan5')")
+df_name =  st.text_input("Please enter data file path (e.g., 'data/df.csv')")
+language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
+text_col = st.text_input("Text column name (exactly as appears in the csv)")
+date_col = st.text_input("Date column name (exactly as appears in the csv)")
+datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
+st.session_state.datetime_format = None if datetime_format == "" else datetime_format
+embs_name = st.text_input("Please enter embedding file path if any (e.g., 'data/embs.csv')")
+sample = st.slider("Percent of data to use", 0, 100, 25)
+emotions_yes = st.checkbox("Get emotions")
+if st.button("Train new model"):
+    from sentence_transformers import SentenceTransformer
+    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
+    from sklearn.cluster import MiniBatchKMeans
+    from sklearn.decomposition import IncrementalPCA
+    from bertopic.vectorizers import OnlineCountVectorizer
+    np.random.seed(123)
+    from river import cluster
+    from helper import River
+    umap_model = IncrementalPCA(n_components=5)
+    cluster_model = River(cluster.DBSTREAM(clustering_threshold = 1.5,
+                                fading_factor = 0.05,
+                                cleanup_interval = 7,
+                                intersection_factor = 0.5,
+                                minimum_weight = 1))
+    vectorizer_model = OnlineCountVectorizer(decay=.01,stop_words=stopwords)
+    embedding_model = "all-MiniLM-L6-v2" if language=="English" else "paraphrase-multilingual-MiniLM-L12-v2"
+    sentence_model = SentenceTransformer(embedding_model)
+    topic_model = BERTopic(verbose=True,
+                        embedding_model=embedding_model,
+                        umap_model=umap_model,
+                        hdbscan_model=cluster_model,
+                        vectorizer_model=vectorizer_model,
+                        calculate_probabilities=True)
+    with st.spinner("Preprocessing..."):
+        df = pd.read_csv(df_name).sample(frac=sample/100)
+        df = df.rename({text_col: 'text', date_col: 'date'}, axis=1)
+        df.index = range(len(df))
+        new_df = preproc(df)
+        new_df['id'] = df.index
+        all_docs = list(new_df.proc)
+        st.write(len(df))
+        st.write(len(new_df))
+    with st.spinner("Generating embeddings. This may take a couple of hours..."):
+        try:
+            embeddings = np.array(pd.read_csv(embs_name).drop("Unnamed: 0",axis=1))
+        except:
+            embeddings = sentence_model.encode(new_df.proc, show_progress_bar=True)
+            pd.DataFrame(embeddings).to_csv(f"embs_{model_name}.csv")
+    with st.spinner("Creating the model. This may take a couple of minutes..."):
+        doc_emb_chunks = [(all_docs[i:i+1000],embeddings[i:i+1000]) for i in range(0, len(all_docs), 1000)]
+        topics = []
+        for doc_chunk, emb_chunk in doc_emb_chunks:
+            topic_model.partial_fit(all_docs,embeddings)
+            topics.extend(topic_model.topics_)
+        topic_model.topics_ = topics
+    if emotions_yes:
+        with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
+            ems = get_emotions(new_df,language)
+            new_df = pd.merge(new_df, ems, on='id')
+    st.session_state.model = topic_model
+    st.session_state.df = new_df
+    st.session_state.model_df = st.session_state.model.get_document_info(new_df.proc)
+    topic_model.save(f"models/{model_name}")
+    st.session_state.df.to_csv(f"models/df_{model_name}.csv")
+    st.success(f"New model trained and saved as '{model_name}', dataframe saved as 'df_{model_name}.csv' in folter 'models'.")

pages/Update_Model.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import streamlit as st
+import pandas as pd, numpy as np
+from bertopic import BERTopic
+from transformers import pipeline
+@st.cache_data
+def get_emotions(frame, language):
+    clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection" if language == "Russian/Ukrainian" else "j-hartmann/emotion-english-distilroberta-base"
+    st.classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
+    temp = st.classifier(list(frame.proc2))
+    rangelabels = len(temp[0])
+    temp = pd.DataFrame({temp[0][j]["label"]: [ temp[i][j]["score"] for i in range(len(temp)) ] for j in range(rangelabels)})
+    temp['id'] = [i for i in range(len(st.session_state.df),len(temp)+len(st.session_state.df))]
+    return temp
+def preproc(frame):
+    import re
+    frame["proc"] = frame.text.apply(lambda x: str(x))
+    frame.proc = frame.apply(lambda row: re.sub(r"http\S+", "http", row.proc), 1)
+    frame.proc = frame.apply(lambda row: re.sub(r"@\S+", "@user", row.proc), 1)
+    frame.proc = frame.apply(lambda row: re.sub(r"#", " ", row.proc).strip(), 1)
+    frame["proc2"] = frame.proc
+    frame.proc2 = frame.proc2.apply(lambda row: row[:2048].lower())
+    return frame
+st.set_page_config(
+    page_title="Update BERTopic",
+    page_icon="🤖",
+    layout="wide"
+)
+st.header("🤖 Update BERTopic")
+st.subheader("Use this page to update your model with new data")
+if "model" not in st.session_state:
+    st.markdown("**No model detected. Please go to the Home Page and add a model first.**")
+if "model" in st.session_state:
+    old_df = st.session_state.df
+    topics = list(st.session_state.model.topics_)
+    st.markdown(f"**Current data:** {len(old_df)} rows, {len(st.session_state.model.topic_labels_)} topics.")
+    st.markdown(f"**Current date range:** {str(min(pd.to_datetime(old_df.date, format='%d.%m.%Y')))[:10]} -- {str(max(pd.to_datetime(old_df.date,format='%d.%m.%Y')))[:10]}.")
+    st.write(old_df)
+    st.markdown("#### Please eneter a name for the updated model and upload files below")
+    name = st.text_input("Please enter a model name (e.g., 'my_cool_model')")
+    language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
+    datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
+    st.session_state.datetime_format = None if datetime_format == "" else datetime_format
+    uploaded_files = st.file_uploader("Choose a CSV file", accept_multiple_files=True)
+    if st.button('All files selected'):
+        for i in range(len(uploaded_files)):
+            uploaded_file = uploaded_files[i]
+            new_df = pd.read_csv(uploaded_file)
+            st.write(f"Uploaded file {uploaded_file.name}")
+            with st.spinner("Preprocessing..."):
+                new_df= preproc(new_df)
+                new_df['id'] = [i for i in range(len(old_df),len(new_df)+len(old_df))]
+                docs = list(new_df.proc)
+            with st.spinner("Updating the model. This may take a couple of minutes..."):
+                st.session_state.model.partial_fit(docs)
+                topics.extend(st.session_state.model.topics_)
+            with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
+                ems = get_emotions(new_df, language)
+                new_df = pd.merge(new_df, ems, on='id')
+            old_df = pd.concat([old_df,new_df])
+            st.success(f"Done with file {uploaded_file.name}!")
+            if i == len(uploaded_files)-1:
+                st.session_state.df = old_df
+                st.session_state.model.topics_ = topics
+                st.session_state.model.set_topic_labels(st.session_state.model.generate_topic_labels(nr_words=5, topic_prefix=False, word_length=10, separator=", "))
+                st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc)
+                st.session_state.df["id"] = st.session_state.model_df.index
+                st.session_state.model_df["id"] = st.session_state.model_df.index
+                st.session_state.model_df = pd.merge(st.session_state.model_df,st.session_state.df,how="left",on="id")
+                st.session_state.model_df["date"] =  pd.to_datetime(st.session_state.model_df.date, format="%d.%m.%Y")
+                st.markdown("---")
+                st.markdown(f"**Updated data:** {len(old_df)} rows, {len(st.session_state.model.topic_labels_)} topics.")
+                st.markdown(f"**Updated date range:** {str(min(pd.to_datetime(old_df.date, format='%d.%m.%Y')))[:10]} -- {str(max(pd.to_datetime(old_df.date,format='%d.%m.%Y')))[:10]}.")
+                st.session_state.model.save(f"models/model_{name}")
+                st.session_state.df.to_csv(f"models/df_{name}.csv")
+                st.success(f"Model and dataframe saved in folder 'model'!")

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit
+pandas==1.5.3
+numpy
+bertopic
+river==0.10.0
+scikit-learn