Spaces:

awacke1
/

Topic-Wizard-SKlearn

Runtime error

App Files Files Community

awacke1 committed on Mar 20, 2023

Commit

072885d

1 Parent(s): 1112873

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -68

app.py CHANGED Viewed

@@ -1,70 +1,54 @@
 import streamlit as st
-from bertopic import BERTopic
-import streamlit.components.v1 as components
-from sentence_transformers import SentenceTransformer
-from umap import UMAP
-from hdbscan import HDBSCAN
-# Initialize BERTopic model
-model = BERTopic()
-st.subheader("Topic Modeling with Topic-Wizard")
-uploaded_file = st.file_uploader("Choose a text file", type=["txt"])
-if uploaded_file is not None:
-    st.session_state["text"] = uploaded_file.getvalue().decode("utf-8")
-st.write("OR")
-input_text = st.text_area(
-    label="Enter text separated by newlines",
-    value="",
-    key="text",
-    height=150,
-)
-button = st.button("Get Segments")
-if button and (uploaded_file is not None or input_text != ""):
-    if uploaded_file is not None:
-        texts = st.session_state["text"].split("\n")
     else:
-        texts = input_text.split("\n")
-    # Fit BERTopic model
-    topics, probabilities = model.fit_transform(texts)
-    # Create embeddings
-    embeddings_model = SentenceTransformer("distilbert-base-nli-mean-tokens")
-    embeddings = embeddings_model.encode(texts)
-    # Reduce dimensionality of embeddings using UMAP
-    umap_model = UMAP(n_neighbors=15, n_components=2, metric="cosine")
-    umap_embeddings = umap_model.fit_transform(embeddings)
-    # Cluster topics using HDBSCAN
-    cluster = HDBSCAN(
-        min_cluster_size=15, metric="euclidean", cluster_selection_method="eom"
-    ).fit(umap_embeddings)
-    # Visualize BERTopic results with Streamlit
-    st.title("BERTopic Visualization")
-    # Display top N most representative topics and their documents
-    num_topics = st.sidebar.slider("Select number of topics to display", 1, 20, 5, 1)
-    topic_words = model.get_topics()
-    topic_freq = model.get_topic_freq().head(num_topics + 1)  # Add 1 to exclude -1 (outliers topic)
-    for _, row in topic_freq.iterrows():
-        topic_id = row["Topic"]
-        if topic_id == -1:
-            continue  # Skip the outliers topic
-        st.write(f"## Topic {topic_id}")
-        st.write("Keywords:", ", ".join(topic_words[topic_id]))
-        st.write("Documents:")
-        doc_ids = [idx for idx, topic in enumerate(topics) if topic == topic_id][:5]
-        for doc in doc_ids:
-            st.write("-", texts[doc])
-    # Display topic clusters
-    st.write("## Topic Clusters")
-    components.html(cluster.labels_.tolist(), height=500, width=800)

 import streamlit as st
+import pandas as pd
+import bertopic
+import plotly.express as px
+st.set_page_config(page_title="Topic Modeling with Bertopic")
+# Function to read the uploaded file and return a Pandas DataFrame
+def read_file(file):
+    """Load an uploaded file into a DataFrame.
+
+    Args:
+        file: The object returned by ``st.sidebar.file_uploader`` (or
+            ``None`` when nothing has been uploaded yet).
+
+    Returns:
+        A DataFrame, or ``None`` after reporting the problem with
+        ``st.error`` when there is no file or its MIME type is neither
+        ``text/plain`` nor ``text/csv``. For TXT input the DataFrame has a
+        single ``data`` column with one row per non-blank line; for CSV
+        input it is whatever ``pd.read_csv`` parses.
+    """
+    # Nothing uploaded yet: report it instead of failing on `file.type`.
+    if file is None:
+        st.error("No file uploaded. Please upload a TXT or CSV file.")
+        return None
+    if file.type == 'text/plain':
+        # Treat each line as one document. Parsing free text with
+        # pd.read_csv would split lines on commas and drop or misalign
+        # everything before the last comma.
+        lines = file.getvalue().decode('utf-8').splitlines()
+        df = pd.DataFrame({'data': [line for line in lines if line.strip()]})
+    elif file.type == 'text/csv':
+        df = pd.read_csv(file)
     else:
+        st.error("Unsupported file format. Please upload a TXT or CSV file.")
+        return None
+    return df
+# Sidebar to upload the file
+st.sidebar.title("Upload File")
+file = st.sidebar.file_uploader("Choose a file", type=["txt", "csv"])
+# Perform topic modeling when the user clicks the "Visualize" button
+if st.sidebar.button("Visualize"):
+    # The button can be clicked before any file has been chosen
+    if file is None:
+        st.error("Please upload a TXT or CSV file first.")
+        st.stop()
+    # Read the uploaded file
+    df = read_file(file)
+    if df is None:
+        st.stop()
+    # CSV uploads keep their own header, so the expected column may be absent
+    if 'data' not in df.columns:
+        st.error("The uploaded file must contain a 'data' column.")
+        st.stop()
+    # The model is fitted on a plain list of strings, one per document
+    docs = df['data'].dropna().astype(str).tolist()
+    # Perform topic modeling using BERTopic (the class name is `BERTopic`;
+    # `bertopic.Bertopic` does not exist and raises AttributeError)
+    model = bertopic.BERTopic()
+    topics, probabilities = model.fit_transform(docs)
+    # Topic ids start at 0; -1 marks outliers, so this is 0 when every
+    # document is an outlier
+    num_topics = max(topics) + 1
+    # Create a plot of the topic distribution (nbins must stay positive)
+    fig = px.histogram(x=topics, nbins=max(num_topics, 1), color_discrete_sequence=px.colors.qualitative.Pastel)
+    fig.update_layout(
+        title="Distribution of Topics",
+        xaxis_title="Topic",
+        yaxis_title="Count",
+    )
+    st.plotly_chart(fig)
+    # Display the top words in each topic
+    st.write("Top words in each topic:")
+    for topic_id in range(num_topics):
+        st.write(f"Topic {topic_id}: {model.get_topic(topic_id)}")
+    # Display the clusters: group the documents by the topic assigned to
+    # each one (the model object has no `get_clusters` method)
+    st.write("Clusters:")
+    clusters = {}
+    for topic_id, doc in zip(topics, docs):
+        clusters.setdefault(topic_id, []).append(doc)
+    for cluster_id, cluster_docs in sorted(clusters.items()):
+        st.write(f"Cluster {cluster_id}:")
+        for doc in cluster_docs:
+            st.write(f"\t{doc}")