# BERTinsights/pages/Create_Model.py
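# Streamlit page for training a new BERTopic model on a CSV of documents,
# with optional transformer-based emotion classification.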
import streamlit as st
import pandas as pd, numpy as np
from bertopic import BERTopic
from transformers import pipeline
def make_stopwords():
    # Load the custom stopword list (one word per line) used by the topic vectorizer
    with open("dicts/stopwords.txt", "r") as text_file:
        stopwords_list = text_file.read().split("\n")
    return stopwords_list
stopwords = make_stopwords()
@st.cache_data
def get_emotions(frame, language):
    # Pick an emotion classifier that matches the language of the data
    clasif = "cointegrated/rubert-tiny2-cedr-emotion-detection" if language == "Russian/Ukrainian" else "j-hartmann/emotion-english-distilroberta-base"
    st.classifier = pipeline("text-classification", model=clasif, return_all_scores=True)
    temp = st.classifier(list(frame.proc2))
    # Reshape the per-document label scores into one column per emotion
    rangelabels = len(temp[0])
    temp = pd.DataFrame({temp[0][j]["label"]: [temp[i][j]["score"] for i in range(len(temp))] for j in range(rangelabels)})
    # Keep the ids of the input frame so the scores can be merged back on 'id'
    temp['id'] = list(frame['id'])
    return temp
def preproc(frame):
    import re
    # Normalize the raw text: mask URLs and @-mentions, strip '#' characters
    frame["proc"] = frame.text.apply(lambda x: str(x))
    frame.proc = frame.apply(lambda row: re.sub(r"http\S+", "http", row.proc), 1)
    frame.proc = frame.apply(lambda row: re.sub(r"@\S+", "@user", row.proc), 1)
    frame.proc = frame.apply(lambda row: re.sub(r"#", " ", row.proc).strip(), 1)
    # Lowercased, length-capped copy for the emotion classifier
    frame["proc2"] = frame.proc
    frame.proc2 = frame.proc2.apply(lambda row: row[:514].lower())  # 2048
    return frame
st.set_page_config(
    page_title="Create BERTopic",
    page_icon="🤖",
    layout="wide"
)
st.header("🤖 Create BERTopic")
st.subheader("Use this page to create a model with your data")
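# Model configuration inputs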
model_name = st.text_input("Please enter a name for the new model (e.g., 'ukraine_war_jan5')")
df_name = st.text_input("Please enter data file path (e.g., 'data/df.csv')")
language = st.radio("Please pick one language that best describes your data", ["English","Russian/Ukrainian","Other"],horizontal=True)
text_col = st.text_input("Text column name (exactly as it appears in the CSV)")
date_col = st.text_input("Date column name (exactly as it appears in the CSV)")
datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="")
st.session_state.datetime_format = None if datetime_format == "" else datetime_format
embs_name = st.text_input("Please enter an embedding file path, if any (e.g., 'data/embs.csv')")
sample = st.slider("Percent of data to use", 0, 100, 25)
emotions_yes = st.checkbox("Get emotions")
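# When pressed: build an online (incremental) BERTopic pipeline, embed the texts,
# train on the data in chunks, optionally classify emotions, and save the results.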
if st.button("Train new model"):
    from sentence_transformers import SentenceTransformer
    # https://www.sbert.net/docs/pretrained_models.html#multi-lingual-models
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.decomposition import IncrementalPCA
    from bertopic.vectorizers import OnlineCountVectorizer
    np.random.seed(123)
    from river import cluster
    from helper import River
    # Incremental components so the model can be trained with partial_fit:
    # IncrementalPCA for dimensionality reduction (in place of UMAP),
    # river's DBSTREAM for online clustering (in place of HDBSCAN),
    # and an OnlineCountVectorizer for the topic representations.
    umap_model = IncrementalPCA(n_components=5)
    cluster_model = River(cluster.DBSTREAM(clustering_threshold=1.5,
                                           fading_factor=0.05,
                                           cleanup_interval=7,
                                           intersection_factor=0.5,
                                           minimum_weight=1))
    vectorizer_model = OnlineCountVectorizer(decay=.01, stop_words=stopwords)
    # Sentence-transformer embedding model chosen by language
    embedding_model = "all-MiniLM-L6-v2" if language == "English" else "paraphrase-multilingual-MiniLM-L12-v2"
    sentence_model = SentenceTransformer(embedding_model)
    topic_model = BERTopic(verbose=True,
                           embedding_model=embedding_model,
                           umap_model=umap_model,
                           hdbscan_model=cluster_model,
                           vectorizer_model=vectorizer_model,
                           calculate_probabilities=True)
with st.spinner("Preprocessing..."):
df = pd.read_csv(df_name).sample(frac=sample/100)
df = df.rename({text_col: 'text', date_col: 'date'}, axis=1)
df.index = range(len(df))
new_df = preproc(df)
new_df['id'] = df.index
all_docs = list(new_df.proc)
st.write(len(df))
st.write(len(new_df))
with st.spinner("Generating embeddings. This may take a couple of hours..."):
try:
embeddings = np.array(pd.read_csv(embs_name).drop("Unnamed: 0",axis=1))
except:
embeddings = sentence_model.encode(new_df.proc, show_progress_bar=True)
pd.DataFrame(embeddings).to_csv(f"embs_{model_name}.csv")
with st.spinner("Creating the model. This may take a couple of minutes..."):
doc_emb_chunks = [(all_docs[i:i+1000],embeddings[i:i+1000]) for i in range(0, len(all_docs), 1000)]
topics = []
for doc_chunk, emb_chunk in doc_emb_chunks:
topic_model.partial_fit(all_docs,embeddings)
topics.extend(topic_model.topics_)
topic_model.topics_ = topics
    if emotions_yes:
        with st.spinner("Classifying emotions. This may take a couple of minutes..."):
            # Add per-document emotion scores and merge them into the dataframe
            ems = get_emotions(new_df, language)
            new_df = pd.merge(new_df, ems, on='id')
    # Store results in the session and persist them to disk
    st.session_state.model = topic_model
    st.session_state.df = new_df
    st.session_state.model_df = st.session_state.model.get_document_info(new_df.proc)
    topic_model.save(f"models/{model_name}")
    st.session_state.df.to_csv(f"models/df_{model_name}.csv")
    st.success(f"New model trained and saved as '{model_name}', dataframe saved as 'df_{model_name}.csv' in folder 'models'.")