Update app.py
app.py CHANGED
@@ -8,6 +8,9 @@ from nltk import FreqDist
 from graphviz import Digraph
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import linear_kernel
+from io import BytesIO
+import base64
 
 # Set page configuration with a title and favicon
 st.set_page_config(
@@ -66,11 +69,11 @@ def extract_context_words(text, high_information_words):
 def create_context_graph(context_words):
     graph = Digraph()
     for index, (before_word, high_info_word, after_word) in enumerate(context_words):
-
-
+        if before_word:
+            graph.node(f'before{index}', before_word, shape='box')
         graph.node(f'high{index}', high_info_word, shape='ellipse')
-
-
+        if after_word:
+            graph.node(f'after{index}', after_word, shape='diamond')
         if before_word:
             graph.edge(f'before{index}', f'high{index}')
         if after_word:
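For a quick visual check of the reshaped graph, the Digraph built this way can be rendered directly in the app; the sample triples and the st.graphviz_chart call below are illustrative assumptions, not part of this commit.

import streamlit as st
from graphviz import Digraph

# Hypothetical stand-in for the (before, high-information, after) triples
sample_context = [("machine", "learning", "models"), (None, "clustering", None)]

graph = Digraph()
for index, (before_word, high_info_word, after_word) in enumerate(sample_context):
    if before_word:
        graph.node(f'before{index}', before_word, shape='box')
    graph.node(f'high{index}', high_info_word, shape='ellipse')
    if after_word:
        graph.node(f'after{index}', after_word, shape='diamond')
    if before_word:
        graph.edge(f'before{index}', f'high{index}')

st.graphviz_chart(graph)  # boxes before, ellipses for key words, diamonds after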
@@ -87,7 +90,6 @@ def display_context_table(context_words):
         table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
     st.markdown(table)
 
-
 def load_example_files():
     # Exclude specific files
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
@@ -107,10 +109,8 @@ def load_example_files():
     return None
 
 def cluster_sentences(sentences, num_clusters):
-    #
-    if len(
-    # If so, adjust the number of clusters to match the number of sentences
-        num_clusters = len(sentences)
+    # Filter sentences with length over 10 characters
+    sentences = [sentence for sentence in sentences if len(sentence) > 10]
 
     # Vectorize the sentences
     vectorizer = TfidfVectorizer()
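One side effect of swapping the old num_clusters adjustment for a pure length filter: if the filter leaves fewer sentences than the requested cluster count, KMeans raises because n_clusters exceeds the number of samples. A small hypothetical guard (my sketch, not the committed code) that restores the old safety net:

def clamp_num_clusters(sentences, num_clusters, min_length=10):
    # Hypothetical helper: apply the same length filter, then cap the cluster count
    usable = [s for s in sentences if len(s) > min_length]
    return usable, max(1, min(num_clusters, len(usable)))

# Usage inside cluster_sentences would be: sentences, num_clusters = clamp_num_clusters(sentences, num_clusters)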
@@ -120,15 +120,30 @@ def cluster_sentences(sentences, num_clusters):
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     kmeans.fit(X)
 
-    #
-
+    # Calculate the centroid of each cluster
+    cluster_centers = kmeans.cluster_centers_
 
-    # Group sentences by cluster
+    # Group sentences by cluster and calculate similarity to centroid
     clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(
-
-
-
+    for i, label in enumerate(kmeans.labels_):
+        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
+        clustered_sentences[label].append((similarity, sentences[i]))
+
+    # Order sentences within each cluster based on their similarity to the centroid
+    for cluster in clustered_sentences:
+        cluster.sort(reverse=True)  # Sort based on similarity (descending order)
+
+    # Return the ordered clustered sentences without similarity scores for display
+    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
+
+# Function to convert text to a downloadable file
+def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="๐พ Save"):
+    buffer = BytesIO()
+    buffer.write(text_to_download.encode())
+    buffer.seek(0)
+    b64 = base64.b64encode(buffer.read()).decode()
+    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
+    return href
 
 # Main code for UI
 uploaded_file = st.file_uploader("๐ Choose a .txt file", type=['txt'])
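The new ordering step scores each sentence with linear_kernel against its cluster centroid; because TfidfVectorizer L2-normalizes rows by default, that dot product ranks sentences within a cluster the same way cosine similarity to the centroid would, so the most central sentences come first. A minimal, self-contained illustration of this ranking logic on made-up sentences (none of it is in the commit):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

# Toy corpus standing in for the transcript sentences
sentences = [
    "cats purr when they are happy",
    "kittens and cats love to nap",
    "dogs bark at strangers",
    "puppies and dogs chase balls",
]
X = TfidfVectorizer().fit_transform(sentences)
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X)

# Rank each sentence by similarity to its cluster centroid, most central first
clusters = [[] for _ in range(2)]
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label + 1], X[i:i + 1]).flatten()[0]
    clusters[label].append((sim, sentences[i]))
for cluster in clusters:
    cluster.sort(reverse=True)
    print([s for _, s in cluster])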
@@ -160,19 +175,23 @@ if file_text:
     with st.expander("๐ Context Table"):
         display_context_table(context_words)
 
-    #with st.expander("Innovation Outlines"):
-    #    showInnovationOutlines()
-
     with st.expander("๐ Sentence Clustering"):
-        sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if sentence.strip()]
+        sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
 
         num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
         clustered_sentences = cluster_sentences(sentences, num_clusters)
 
-        output_text = ""
         for i, cluster in enumerate(clustered_sentences):
-
-
-
-
-
+            st.subheader(f"Cluster {i+1}")
+            cluster_text = "\n".join(cluster)
+            st.text_area(f"Cluster {i+1} Sentences", value=cluster_text, height=200)
+
+            # Input for custom filename
+            default_filename = f"Cluster_{i+1}_Output.txt"
+            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
+
+            # Download button
+            download_link = get_text_file_download_link(cluster_text, filename, f"๐พ Save Cluster {i+1}")
+            st.markdown(download_link, unsafe_allow_html=True)
+
+st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
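The per-cluster save links are rendered as base64 data-URI anchors injected with unsafe_allow_html. On recent Streamlit versions the built-in st.download_button covers the same need without raw HTML; a rough, hypothetical equivalent for a single cluster:

import streamlit as st

cluster_text = "example sentence one.\nexample sentence two."  # placeholder cluster text
st.download_button(
    label="Save Cluster 1",            # plays the role of the rendered <a> link
    data=cluster_text.encode(),        # bytes offered for download
    file_name="Cluster_1_Output.txt",  # mirrors the default_filename pattern
    mime="text/plain",
    key="download_cluster_1",
)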