awacke1 commited on
Commit
38b1c07
1 Parent(s): 2736a95

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import re
3
+ import nltk
4
+ import os
5
+ from nltk.corpus import stopwords
6
+ from nltk import FreqDist
7
+ from graphviz import Digraph
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.cluster import KMeans
10
+ from sklearn.metrics.pairwise import linear_kernel
11
+ from io import BytesIO
12
+ import base64
13
+
14
# Configure the Streamlit page: title, favicon, layout, and help-menu links.
_MENU_LINKS = {
    'Get Help': 'https://huggingface.co/awacke1',
    'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
    'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558",
}
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items=_MENU_LINKS,
)
26
+
27
# Fetch the NLTK data required below: 'punkt' for word_tokenize,
# 'stopwords' for the English stop-word list.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
30
+
31
def remove_timestamps(text):
    """Strip transcript timestamp lines, each together with the line after it.

    A stamp is ``m:ss`` / ``mm:ss`` on its own line — generalized here to
    also accept ``h:mm:ss`` stamps, which long transcripts produce.  The
    stamp line and the caption line immediately following it are removed.

    Args:
        text: Raw transcript text.

    Returns:
        The text with all stamp+caption line pairs deleted.
    """
    # (?::\d{2})? is the backward-compatible generalization: it optionally
    # matches a trailing seconds field so hour-long stamps are caught too.
    return re.sub(r'\d{1,2}:\d{2}(?::\d{2})?\n.*\n', '', text)
33
+
34
def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens in text."""
    stop_words = set(stopwords.words('english'))
    # Lowercase every purely-alphabetic token, then drop English stopwords.
    tokens = [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]
    content_words = [token for token in tokens if token not in stop_words]
    # Rank by frequency and keep only the words themselves.
    return [word for word, _ in FreqDist(content_words).most_common(top_n)]
41
+
42
def cluster_sentences(sentences, num_clusters):
    """Cluster sentences with k-means over TF-IDF vectors.

    Sentences of 10 characters or fewer are dropped before clustering.
    Within each cluster, sentences are ordered by decreasing cosine
    similarity to the cluster centroid.

    Args:
        sentences: Iterable of sentence strings.
        num_clusters: Requested number of clusters.

    Returns:
        Exactly ``num_clusters`` lists of sentences (trailing lists are
        empty when there are fewer usable sentences than clusters).
    """
    # Keep only sentences long enough to carry signal.
    sentences = [sentence for sentence in sentences if len(sentence) > 10]

    # Robustness fix: TfidfVectorizer and KMeans raise when the input is
    # empty or smaller than n_clusters, so clamp the effective cluster
    # count; the returned list is still padded out to num_clusters.
    effective_clusters = min(num_clusters, len(sentences))
    if effective_clusters == 0:
        return [[] for _ in range(num_clusters)]

    # Vectorize the sentences.
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # Perform k-means clustering (fixed seed for reproducible clusters).
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    kmeans.fit(X)
    cluster_centers = kmeans.cluster_centers_

    # Group sentences by cluster, tagging each with its similarity to the
    # cluster centroid (linear kernel == cosine similarity on TF-IDF rows).
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))

    # Order sentences within each cluster by similarity (descending).
    for cluster in clustered_sentences:
        cluster.sort(reverse=True)

    # Drop the similarity scores for display.
    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
69
+
70
# Function to convert text to a downloadable file link.
def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    """Build an HTML anchor that downloads text_to_download as filename.

    The text is base64-encoded into a ``data:`` URI so the link needs no
    server-side file.  Render the returned string with
    ``st.markdown(..., unsafe_allow_html=True)``.

    Args:
        text_to_download: The text content to embed in the link.
        filename: Name the browser saves the file under.
        button_label: Visible link text.

    Returns:
        The anchor element as an HTML string.
    """
    # Encode directly; the original BytesIO round-trip added nothing.
    b64 = base64.b64encode(text_to_download.encode()).decode()
    # Bug fix: the download attribute now uses the caller-supplied filename
    # instead of a hard-coded placeholder, so each cluster saves under the
    # name typed into the UI.
    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
    return href
78
+
79
# Main code for UI.
# Flow: upload a transcript (.txt) -> strip timestamp lines -> split into
# sentences -> k-means cluster them -> show each cluster with a per-cluster
# download link.
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])

# Read the whole upload as UTF-8; empty string when nothing is uploaded.
if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

if file_text:
    # Drop timestamp lines, then split on periods, keeping only trimmed
    # sentences longer than 10 characters (mirrors the filter inside
    # cluster_sentences).
    text_without_timestamps = remove_timestamps(file_text)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]

    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)

        # NOTE(review): nesting below reconstructed from a mangled diff —
        # the per-cluster loop is assumed to sit inside the expander; confirm
        # against the original app.py.
        for i, cluster in enumerate(clustered_sentences):
            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)

            # Input for custom filename; key keeps each widget distinct per cluster.
            default_filename = f"Cluster_{i+1}_Output.txt"
            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")

            # Download button (HTML anchor rendered via markdown).
            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
            st.markdown(download_link, unsafe_allow_html=True)

st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")