Create app.py
app.py
ADDED
@@ -0,0 +1,106 @@
import streamlit as st
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel
from io import BytesIO
import base64
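# Note: os and graphviz.Digraph are imported but unused in this version.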

# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺Transcript📜EDA🔍NLTK",
    page_icon="🌠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
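# Newer NLTK releases split the tokenizer data; if word_tokenize raises a
# LookupError, nltk.download('punkt_tab') may also be needed.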

def remove_timestamps(text):
    # Drop an "H:MM"/"MM:SS" timestamp line together with the line after it.
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
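# For instance, remove_timestamps("Intro.\n0:12\nSpeaker 1\nHello.\n")
# returns "Intro.\nHello.\n" (illustrative input; transcript formats vary).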

def extract_high_information_words(text, top_n=10):
    # Tokenize, lowercase, keep only alphabetic non-stopword tokens,
    # and return the top_n most frequent remaining words.
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
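# Note: extract_high_information_words is defined for word-frequency EDA but
# is not wired into the UI below.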

def cluster_sentences(sentences, num_clusters):
    # Filter sentences with length over 10 characters
    sentences = [sentence for sentence in sentences if len(sentence) > 10]

    # Vectorize the sentences
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # Perform k-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)

    # Calculate the centroid of each cluster
    cluster_centers = kmeans.cluster_centers_

    # Group sentences by cluster and calculate similarity to centroid
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))

    # Order sentences within each cluster based on their similarity to the centroid
    for cluster in clustered_sentences:
        cluster.sort(reverse=True)  # Sort based on similarity (descending order)

    # Return the ordered clustered sentences without similarity scores for display
    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
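# Caveat: KMeans raises a ValueError if num_clusters exceeds the number of
# sentences that survive the length filter.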

# Function to convert text to a downloadable file
def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    buffer = BytesIO()
    buffer.write(text_to_download.encode())
    buffer.seek(0)
    b64 = base64.b64encode(buffer.read()).decode()
    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
    return href
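# Usage: st.markdown(get_text_file_download_link("hello", "hi.txt"), unsafe_allow_html=True)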

# Main code for UI
uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])

if uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]

    with st.expander("📝 Sentence Clustering"):
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)

        for i, cluster in enumerate(clustered_sentences):
            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)

            # Input for custom filename
            default_filename = f"Cluster_{i+1}_Output.txt"
            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")

            # Download button
            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
            st.markdown(download_link, unsafe_allow_html=True)

st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
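
To run the Space locally, install streamlit, nltk, scikit-learn, and graphviz, then launch it with streamlit run app.py. The clustering step can also be sanity-checked outside Streamlit; the sketch below reruns the same TF-IDF, KMeans, and centroid-similarity pipeline on a few made-up sentences (the toy data, cluster count, and printed format are illustrative assumptions, not part of the app):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

# Toy transcript sentences (illustrative assumption, not app data).
toy_sentences = [
    "The model is trained on cleaned transcript text",
    "Training the model requires cleaned transcripts",
    "Streamlit renders the upload and download widgets",
    "The upload widget is rendered by Streamlit",
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(toy_sentences)

kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans.fit(X)

# Score each sentence by its linear-kernel similarity to its cluster
# centroid, mirroring what cluster_sentences does in app.py.
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
    print(f"cluster={label} similarity={sim:.3f} {toy_sentences[i]}")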