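"""Streamlit page for training a BERTopic model on a CSV of documents.

The user supplies a data file, text and date column names, and a language; the
page preprocesses the text, computes (or loads) sentence embeddings, trains the
topic model incrementally, optionally classifies emotions, and saves the model
and dataframe under 'models/'.
"""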
import re

import streamlit as st
import pandas as pd
import numpy as np
from bertopic import BERTopic
from transformers import pipeline

def make_stopwords():
    # Load one stopword per line from the dictionary file.
    with open("dicts/stopwords.txt", "r") as text_file:
        stopwords_list = text_file.read().split("\n")
    return stopwords_list

stopwords = make_stopwords()

@st.cache_data
def get_emotions(frame, language):
    # Pick an emotion classifier matching the language of the data.
    clasif = ("cointegrated/rubert-tiny2-cedr-emotion-detection"
              if language == "Russian/Ukrainian"
              else "j-hartmann/emotion-english-distilroberta-base")
    classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    temp = classifier(list(frame.proc2))
    # Turn the per-document score lists into one column per emotion label.
    rangelabels = len(temp[0])
    temp = pd.DataFrame({temp[0][j]["label"]: [temp[i][j]["score"] for i in range(len(temp))]
                         for j in range(rangelabels)})
    # Align ids with the input frame so the scores can be merged back on 'id'.
    temp["id"] = frame["id"].values
    return temp

def preproc(frame):
    # Normalise URLs, user mentions, and hashtags before embedding.
    frame["proc"] = frame.text.apply(lambda x: str(x))
    frame.proc = frame.apply(lambda row: re.sub(r"http\S+", "http", row.proc), 1)
    frame.proc = frame.apply(lambda row: re.sub(r"@\S+", "@user", row.proc), 1)
    frame.proc = frame.apply(lambda row: re.sub(r"#", " ", row.proc).strip(), 1)
    # Lower-cased, truncated copy for the emotion classifier, to stay within the model's max input length.
    frame["proc2"] = frame.proc
    frame.proc2 = frame.proc2.apply(lambda row: row[:514].lower())
    return frame


st.set_page_config(
    page_title="Create BERTopic",
    page_icon="🤖",
    layout="wide"
)

st.header("🤖 Create BERTopic")
st.subheader("Use this page to create a model with your data")

model_name = st.text_input("Please enter a name for the new model (e.g., 'ukraine_war_jan5')")
df_name = st.text_input("Please enter data file path (e.g., 'data/df.csv')")
language = st.radio("Please pick one language that best describes your data", ["English", "Russian/Ukrainian", "Other"], horizontal=True)
text_col = st.text_input("Text column name (exactly as appears in the csv)")
date_col = st.text_input("Date column name (exactly as appears in the csv)")
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
embs_name = st.text_input("Please enter embedding file path if any (e.g., 'data/embs.csv')")
sample = st.slider("Percent of data to use", 0, 100, 25)
emotions_yes = st.checkbox("Get emotions")

if st.button("Train new model"):
    
    from sentence_transformers import SentenceTransformer
    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.decomposition import IncrementalPCA
    from bertopic.vectorizers import OnlineCountVectorizer
    np.random.seed(123)
    from river import cluster
    from helper import River

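    # Online topic-modeling pipeline: IncrementalPCA stands in for UMAP,
    # river's DBSTREAM stands in for HDBSCAN, and OnlineCountVectorizer
    # supports partial_fit, so the model can be trained chunk by chunk.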
    umap_model = IncrementalPCA(n_components=5)
    cluster_model = River(cluster.DBSTREAM(clustering_threshold = 1.5,
                                fading_factor = 0.05,
                                cleanup_interval = 7,
                                intersection_factor = 0.5,
                                minimum_weight = 1))
    vectorizer_model = OnlineCountVectorizer(decay=.01,stop_words=stopwords)
    embedding_model = "all-MiniLM-L6-v2" if language=="English" else "paraphrase-multilingual-MiniLM-L12-v2" 
    sentence_model = SentenceTransformer(embedding_model)
    topic_model = BERTopic(verbose=True,
                        embedding_model=embedding_model,
                        umap_model=umap_model,
                        hdbscan_model=cluster_model,
                        vectorizer_model=vectorizer_model,
                        calculate_probabilities=True)

    with st.spinner("Preprocessing..."):
        df = pd.read_csv(df_name).sample(frac=sample/100)
        df = df.rename({text_col: 'text', date_col: 'date'}, axis=1)
        df.index = range(len(df))
        new_df = preproc(df)
        new_df['id'] = df.index
        all_docs = list(new_df.proc)
        st.write(f"Documents sampled: {len(df)}")
        st.write(f"Documents after preprocessing: {len(new_df)}")
    
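    # Reuse precomputed embeddings if a valid path was given; otherwise encode
    # the documents with SentenceTransformer and cache them to a CSV for reuse.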
    with st.spinner("Generating embeddings. This may take a couple of hours..."):
        try:
            embeddings = np.array(pd.read_csv(embs_name).drop("Unnamed: 0", axis=1))
        except Exception:
            embeddings = sentence_model.encode(new_df.proc, show_progress_bar=True)
            pd.DataFrame(embeddings).to_csv(f"embs_{model_name}.csv")

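    # Train incrementally: feed documents and embeddings to partial_fit in
    # chunks of 1,000 and collect the topic assignments of each chunk.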
    with st.spinner("Creating the model. This may take a couple of minutes..."):
        doc_emb_chunks = [(all_docs[i:i+1000], embeddings[i:i+1000]) for i in range(0, len(all_docs), 1000)]
        topics = []
        for doc_chunk, emb_chunk in doc_emb_chunks:
            # Fit on the current chunk only; passing the full data here would defeat the chunking.
            topic_model.partial_fit(doc_chunk, emb_chunk)
            topics.extend(topic_model.topics_)
        topic_model.topics_ = topics
    
    if emotions_yes:
        with st.spinner("Classifiying emotions. This may take a couple of minutes..."):
            ems = get_emotions(new_df,language)
            new_df = pd.merge(new_df, ems, on='id')

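    # Keep the trained model and dataframes in session state so other pages can use them.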
    st.session_state.model = topic_model
    st.session_state.df = new_df
    st.session_state.model_df = st.session_state.model.get_document_info(new_df.proc)

    topic_model.save(f"models/{model_name}")
    st.session_state.df.to_csv(f"models/df_{model_name}.csv")
    st.success(f"New model trained and saved as '{model_name}', dataframe saved as 'df_{model_name}.csv' in folter 'models'.")