"""Streamlit demo: topic clustering of ABC news headlines with BERTopic.

The app either loads a previously saved BERTopic model or fits a new one on
a small sample of headlines, then shows the topic table, the top words of a
user-selected topic, the topic hierarchy and (for the saved model only) the
topics-over-time chart.
"""
import json

import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from bertopic import BERTopic
#from datasets import load_dataset
from datasets import load_dataset

# Sidebar choices, named once so the two comparisons below cannot drift apart.
LOAD_EXISTING = 'Load existing model'
BUILD_NEW = 'Build new model'

##Load Dataset from HF Hub
#dataset = load_dataset("rshah/million-headlines")
#news = pd.DataFrame.from_dict(dataset["train"])

#Load dataset locally - faster for demo
news = pd.read_parquet("topic_10000.par")
news['date'] = pd.to_datetime(news['publish_date'], format='%Y%m%d')
timestamps = news.date.to_list()
tweets = news.headline_text.to_list()

#Load topics
# NOTE(review): presumably one topic id per headline in `tweets`, produced by
# the saved model -- it is passed alongside `tweets` to topics_over_time below.
with open("topics", "r") as fp:
    topics = json.load(fp)

st.set_page_config(page_title="News Topic Clustering")
st.title("News Topic Clustering")
st.caption("By Rajiv Shah")
st.caption("")
st.caption(
    "This is a simple example of using identifying topics in the "
    "[one million ABC news headline dataset]"
    "(https://huggingface.co/datasets/rshah/million-headlines). "
    "If you look at the code for this app, you will see how it uses just a "
    "few lines of [BERTopic](https://maartengr.github.io/BERTopic/index.html) to "
    "build the topics and create the visualizations"
)
st.caption(
    "The preloaded existing model provides the more interesting results. "
    "However, this app can be run live by building a new model, but "
    "is limited to a small number of rows. \n"
    "I also limited topics over time to the existing model."
)

# Sidebar form: which model to use and which row of the topic table to inspect.
form = st.sidebar.form("Main Settings")
form.header("Main Settings")
option = form.selectbox(
    'What model would you like to run',
    (LOAD_EXISTING, BUILD_NEW),
    index=0,
)
option_n = form.number_input(
    'What topic would you like to get terms for?',
    min_value=0,
    max_value=10,
    value=5,
)
# The return value (whether the button was pressed) is not needed by the script.
form.form_submit_button(label='Select Model')

if option == LOAD_EXISTING:
    ##Load existing model
    topic_model = BERTopic.load("topic_10000.model")
    #topics, _ = topic_model.transform(tweets)
else:
    ##Builds Topic Model
    #news_sample = news[(news['date'] > '2015-06-01')]
    in_window = (news['date'] > '2017-01-01') & (news['date'] < '2019-01-01')
    # Fixed seed keeps the 200-row sample the same on every rerun.
    news_sample = news[in_window].sample(200, random_state=123)
    tweets = news_sample.headline_text.to_list()
    topic_model = BERTopic(min_topic_size=5, verbose=True)
    topics, _ = topic_model.fit_transform(tweets)

#Get top topics
freq = topic_model.get_topic_info()
# Drop the outlier topic (-1) by label rather than by position: dropping the
# first row would discard a real topic whenever -1 is absent or not listed first.
freq = freq[freq["Topic"] != -1]

st.header("The Main Topic Clusters")
st.write(freq)

st.caption("")
if option_n < len(freq):
    # `option_n` is a row position in the table shown above, not a topic id.
    topic_nr = int(freq.iloc[option_n]["Topic"])
    st.write('Top words in topic cluster: ', option_n)
    #st.caption(option_n)
    # Each entry is indexed so that position 0 is the term to display.
    for item in topic_model.get_topic(topic_nr):
        st.write(str(item[0]))
else:
    # A freshly built model on 200 headlines can yield fewer topics than the
    # selector allows; warn instead of crashing with an IndexError.
    st.warning(
        f"Topic cluster {option_n} is not available: "
        f"the model only has {len(freq)} topic cluster(s)."
    )

st.header("Relationships between clusters ")
st.plotly_chart(topic_model.visualize_hierarchy())

if option == LOAD_EXISTING:
    st.header("Topics over time for Existing Model")
    topics_over_time = topic_model.topics_over_time(
        docs=tweets,
        topics=topics,
        timestamps=timestamps,
        global_tuning=True,
        evolution_tuning=True,
        nr_bins=20,
    )
    st.plotly_chart(
        topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)
    )