# BERTinsights/pages/Create_Model.py
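# Streamlit page for training a new BERTopic model on a CSV of documents,
# with optional transformer-based emotion classification.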
import streamlit as st
import pandas as pd, numpy as np
from bertopic import BERTopic
from transformers import pipeline
def make_stopwords():
    # Load the custom stopword list (one word per line) used by the topic vectorizer
    with open("dicts/stopwords.txt", "r") as text_file:
        stopwords_list = text_file.read().split("\n")
    return stopwords_list
stopwords = make_stopwords()
@st.cache_data
def get_emotions(frame, language):
    # Pick an emotion classifier that matches the language of the data
    clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection" if language == "Russian/Ukrainian" else "j-hartmann/emotion-english-distilroberta-base"
    st.classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    temp = st.classifier(list(frame.proc2))
    # Reshape the per-document label scores into one column per emotion
    rangelabels = len(temp[0])
    temp = pd.DataFrame({temp[0][j]["label"]: [temp[i][j]["score"] for i in range(len(temp))] for j in range(rangelabels)})
    # Keep the ids of the input frame so the scores can be merged back on 'id'
    temp['id'] = list(frame['id'])
    return temp
def preproc(frame):
    import re
    # Normalize the raw text: mask URLs and @-mentions, strip '#' characters
    frame["proc"] = frame.text.apply(lambda x: str(x))
    frame.proc = frame.apply(lambda row: re.sub(r"http\S+", "http", row.proc), 1)
    frame.proc = frame.apply(lambda row: re.sub(r"@\S+", "@user", row.proc), 1)
    frame.proc = frame.apply(lambda row: re.sub(r"#", " ", row.proc).strip(), 1)
    # Lowercased, length-capped copy for the emotion classifier
    frame["proc2"] = frame.proc
    frame.proc2 = frame.proc2.apply(lambda row: row[:514].lower())  # 2048
    return frame
st.set_page_config(
    page_title="Create BERTopic",
    page_icon="🤖",
    layout="wide"
)
st.header("🤖 Create BERTopic")
st.subheader("Use this page to create a model with your data")
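# Model configuration inputs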
model_name = st.text_input("Please enter a name for the new model (e.g., 'ukraine_war_jan5')")
df_name = st.text_input("Please enter data file path (e.g., 'data/df.csv')")
language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
text_col = st.text_input("Text column name (exactly as it appears in the CSV)")
date_col = st.text_input("Date column name (exactly as it appears in the CSV)")
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
embs_name = st.text_input("Please enter an embedding file path, if any (e.g., 'data/embs.csv')")
sample = st.slider("Percent of data to use", 0, 100, 25)
emotions_yes = st.checkbox("Get emotions")
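# When pressed: build an online (incremental) BERTopic pipeline, embed the texts,
# train on the data in chunks, optionally classify emotions, and save the results.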
if st.button("Train new model"):
    from sentence_transformers import SentenceTransformer
    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.decomposition import IncrementalPCA
    from bertopic.vectorizers import OnlineCountVectorizer
    np.random.seed(123)
    from river import cluster
    from helper import River
    # Incremental components so the model can be trained with partial_fit:
    # IncrementalPCA for dimensionality reduction (in place of UMAP),
    # river's DBSTREAM for online clustering (in place of HDBSCAN),
    # and an OnlineCountVectorizer for the topic representations.
    umap_model = IncrementalPCA(n_components=5)
    cluster_model = River(cluster.DBSTREAM(clustering_threshold=1.5,
                                           fading_factor=0.05,
                                           cleanup_interval=7,
                                           intersection_factor=0.5,
                                           minimum_weight=1))
    vectorizer_model = OnlineCountVectorizer(decay=.01, stop_words=stopwords)
    # Sentence-transformer embedding model chosen by language
    embedding_model = "all-MiniLM-L6-v2" if language == "English" else "paraphrase-multilingual-MiniLM-L12-v2"
    sentence_model = SentenceTransformer(embedding_model)
    topic_model = BERTopic(verbose=True,
                           embedding_model=embedding_model,
                           umap_model=umap_model,
                           hdbscan_model=cluster_model,
                           vectorizer_model=vectorizer_model,
                           calculate_probabilities=True)
with st.spinner("Preprocessing..."):
df = pd.read_csv(df_name).sample(frac=sample/100)
df = df.rename({text_col: 'text', date_col: 'date'}, axis=1)
df.index = range(len(df))
new_df = preproc(df)
new_df['id'] = df.index
all_docs = list(new_df.proc)
st.write(len(df))
st.write(len(new_df))
with st.spinner("Generating embeddings. This may take a couple of hours..."):
try:
embeddings = np.array(pd.read_csv(embs_name).drop("Unnamed: 0",axis=1))
except:
embeddings = sentence_model.encode(new_df.proc, show_progress_bar=True)
pd.DataFrame(embeddings).to_csv(f"embs_{model_name}.csv")
with st.spinner("Creating the model. This may take a couple of minutes..."):
doc_emb_chunks = [(all_docs[i:i+1000],embeddings[i:i+1000]) for i in range(0, len(all_docs), 1000)]
topics = []
for doc_chunk, emb_chunk in doc_emb_chunks:
topic_model.partial_fit(all_docs,embeddings)
topics.extend(topic_model.topics_)
topic_model.topics_ = topics
    if emotions_yes:
        with st.spinner("Classifying emotions. This may take a couple of minutes..."):
            # Add per-document emotion scores and merge them into the dataframe
            ems = get_emotions(new_df, language)
            new_df = pd.merge(new_df, ems, on='id')
    # Store results in the session and persist them to disk
    st.session_state.model = topic_model
    st.session_state.df = new_df
    st.session_state.model_df = st.session_state.model.get_document_info(new_df.proc)
    topic_model.save(f"models/{model_name}")
    st.session_state.df.to_csv(f"models/df_{model_name}.csv")
    st.success(f"New model trained and saved as '{model_name}', dataframe saved as 'df_{model_name}.csv' in folder 'models'.")