Update app.py
app.py CHANGED
@@ -8,6 +8,9 @@ from nltk import FreqDist
 from graphviz import Digraph
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import linear_kernel
+from io import BytesIO
+import base64
 
 # Set page configuration with a title and favicon
 st.set_page_config(
@@ -66,11 +69,11 @@ def extract_context_words(text, high_information_words):
 def create_context_graph(context_words):
     graph = Digraph()
     for index, (before_word, high_info_word, after_word) in enumerate(context_words):
-
-
+        if before_word:
+            graph.node(f'before{index}', before_word, shape='box')
         graph.node(f'high{index}', high_info_word, shape='ellipse')
-
-
+        if after_word:
+            graph.node(f'after{index}', after_word, shape='diamond')
         if before_word:
             graph.edge(f'before{index}', f'high{index}')
         if after_word:
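For a quick visual check of the reshaped graph, the Digraph built this way can be rendered directly in the app; the sample triples and the st.graphviz_chart call below are illustrative assumptions, not part of this commit.

import streamlit as st
from graphviz import Digraph

# Hypothetical stand-in for the (before, high-information, after) triples
sample_context = [("machine", "learning", "models"), (None, "clustering", None)]

graph = Digraph()
for index, (before_word, high_info_word, after_word) in enumerate(sample_context):
    if before_word:
        graph.node(f'before{index}', before_word, shape='box')
    graph.node(f'high{index}', high_info_word, shape='ellipse')
    if after_word:
        graph.node(f'after{index}', after_word, shape='diamond')
    if before_word:
        graph.edge(f'before{index}', f'high{index}')

st.graphviz_chart(graph)  # boxes before, ellipses for key words, diamonds after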
@@ -87,7 +90,6 @@ def display_context_table(context_words):
         table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
     st.markdown(table)
 
-
 def load_example_files():
     # Exclude specific files
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
@@ -107,10 +109,8 @@ def load_example_files():
     return None
 
 def cluster_sentences(sentences, num_clusters):
-    #
-    if len(
-    # If so, adjust the number of clusters to match the number of sentences
-        num_clusters = len(sentences)
+    # Filter sentences with length over 10 characters
+    sentences = [sentence for sentence in sentences if len(sentence) > 10]
 
     # Vectorize the sentences
     vectorizer = TfidfVectorizer()
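One side effect of swapping the old num_clusters adjustment for a pure length filter: if the filter leaves fewer sentences than the requested cluster count, KMeans raises because n_clusters exceeds the number of samples. A small hypothetical guard (my sketch, not the committed code) that restores the old safety net:

def clamp_num_clusters(sentences, num_clusters, min_length=10):
    # Hypothetical helper: apply the same length filter, then cap the cluster count
    usable = [s for s in sentences if len(s) > min_length]
    return usable, max(1, min(num_clusters, len(usable)))

# Usage inside cluster_sentences would be: sentences, num_clusters = clamp_num_clusters(sentences, num_clusters)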
@@ -120,15 +120,30 @@ def cluster_sentences(sentences, num_clusters):
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     kmeans.fit(X)
 
-    #
-
+    # Calculate the centroid of each cluster
+    cluster_centers = kmeans.cluster_centers_
 
-    # Group sentences by cluster
+    # Group sentences by cluster and calculate similarity to centroid
     clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(
-
-
-
+    for i, label in enumerate(kmeans.labels_):
+        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
+        clustered_sentences[label].append((similarity, sentences[i]))
+
+    # Order sentences within each cluster based on their similarity to the centroid
+    for cluster in clustered_sentences:
+        cluster.sort(reverse=True)  # Sort based on similarity (descending order)
+
+    # Return the ordered clustered sentences without similarity scores for display
+    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
+
+# Function to convert text to a downloadable file
+def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="๐พ Save"):
+    buffer = BytesIO()
+    buffer.write(text_to_download.encode())
+    buffer.seek(0)
+    b64 = base64.b64encode(buffer.read()).decode()
+    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
+    return href
 
 # Main code for UI
 uploaded_file = st.file_uploader("๐ Choose a .txt file", type=['txt'])
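The new ordering step scores each sentence with linear_kernel against its cluster centroid; because TfidfVectorizer L2-normalizes rows by default, that dot product ranks sentences within a cluster the same way cosine similarity to the centroid would, so the most central sentences come first. A minimal, self-contained illustration of this ranking logic on made-up sentences (none of it is in the commit):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

# Toy corpus standing in for the transcript sentences
sentences = [
    "cats purr when they are happy",
    "kittens and cats love to nap",
    "dogs bark at strangers",
    "puppies and dogs chase balls",
]
X = TfidfVectorizer().fit_transform(sentences)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X)

# Rank each sentence by similarity to its cluster centroid, most central first
clusters = [[] for _ in range(2)]
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label + 1], X[i:i + 1]).flatten()[0]
    clusters[label].append((sim, sentences[i]))
for cluster in clusters:
    cluster.sort(reverse=True)
    print([s for _, s in cluster])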
@@ -160,19 +175,23 @@ if file_text:
     with st.expander("๐ Context Table"):
         display_context_table(context_words)
 
-    #with st.expander("Innovation Outlines"):
-    #    showInnovationOutlines()
-
     with st.expander("๐ Sentence Clustering"):
-        sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if sentence.strip()]
+        sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
 
         num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
         clustered_sentences = cluster_sentences(sentences, num_clusters)
 
-        output_text = ""
         for i, cluster in enumerate(clustered_sentences):
-
-
-
-
-
+            st.subheader(f"Cluster {i+1}")
+            cluster_text = "\n".join(cluster)
+            st.text_area(f"Cluster {i+1} Sentences", value=cluster_text, height=200)
+
+            # Input for custom filename
+            default_filename = f"Cluster_{i+1}_Output.txt"
+            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
+
+            # Download button
+            download_link = get_text_file_download_link(cluster_text, filename, f"๐พ Save Cluster {i+1}")
+            st.markdown(download_link, unsafe_allow_html=True)
+
+st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
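The per-cluster save links are rendered as base64 data-URI anchors injected with unsafe_allow_html. On recent Streamlit versions the built-in st.download_button covers the same need without raw HTML; a rough, hypothetical equivalent for a single cluster:

import streamlit as st

cluster_text = "example sentence one.\nexample sentence two."  # placeholder cluster text
st.download_button(
    label="Save Cluster 1",            # plays the role of the rendered <a> link
    data=cluster_text.encode(),        # bytes offered for download
    file_name="Cluster_1_Output.txt",  # mirrors the default_filename pattern
    mime="text/plain",
    key="download_cluster_1",
)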