awacke1 committed on
Commit
6556590
•
1 Parent(s): e227516

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -26
app.py CHANGED
@@ -8,6 +8,9 @@ from nltk import FreqDist
8
  from graphviz import Digraph
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.cluster import KMeans
 
 
 
11
 
12
  # Set page configuration with a title and favicon
13
  st.set_page_config(
@@ -66,11 +69,11 @@ def extract_context_words(text, high_information_words):
66
  def create_context_graph(context_words):
67
  graph = Digraph()
68
  for index, (before_word, high_info_word, after_word) in enumerate(context_words):
69
- #graph.node(f'before{index}', before_word, shape='box') if before_word else None
70
- if before_word: graph.node(f'before{index}', before_word, shape='box') # else None
71
  graph.node(f'high{index}', high_info_word, shape='ellipse')
72
- #graph.node(f'after{index}', after_word, shape='diamond') if after_word else None
73
- if after_word: graph.node(f'after{index}', after_word, shape='diamond') # else None
74
  if before_word:
75
  graph.edge(f'before{index}', f'high{index}')
76
  if after_word:
@@ -87,7 +90,6 @@ def display_context_table(context_words):
87
  table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
88
  st.markdown(table)
89
 
90
-
91
  def load_example_files():
92
  # Exclude specific files
93
  excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
@@ -107,10 +109,8 @@ def load_example_files():
107
  return None
108
 
109
  def cluster_sentences(sentences, num_clusters):
110
- # Check if the number of sentences is less than the desired number of clusters
111
- if len(sentences) < num_clusters:
112
- # If so, adjust the number of clusters to match the number of sentences
113
- num_clusters = len(sentences)
114
 
115
  # Vectorize the sentences
116
  vectorizer = TfidfVectorizer()
@@ -120,15 +120,30 @@ def cluster_sentences(sentences, num_clusters):
120
  kmeans = KMeans(n_clusters=num_clusters, random_state=42)
121
  kmeans.fit(X)
122
 
123
- # Get the cluster labels for each sentence
124
- labels = kmeans.labels_
125
 
126
- # Group sentences by cluster
127
  clustered_sentences = [[] for _ in range(num_clusters)]
128
- for i, label in enumerate(labels):
129
- clustered_sentences[label].append(sentences[i])
130
-
131
- return clustered_sentences
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
  # Main code for UI
134
  uploaded_file = st.file_uploader("๐Ÿ“ Choose a .txt file", type=['txt'])
@@ -160,19 +175,23 @@ if file_text:
160
  with st.expander("📑 Context Table"):
161
  display_context_table(context_words)
162
 
163
- #with st.expander("Innovation Outlines"):
164
- # showInnovationOutlines()
165
-
166
  with st.expander("๐Ÿ“ Sentence Clustering"):
167
- sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if sentence.strip()]
168
 
169
  num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
170
  clustered_sentences = cluster_sentences(sentences, num_clusters)
171
 
172
- output_text = ""
173
  for i, cluster in enumerate(clustered_sentences):
174
- output_text += f"Cluster {i+1}:\n"
175
- output_text += "\n".join(cluster)
176
- output_text += "\n\n"
177
-
178
- st.text_area("Clustered Sentences", value=output_text, height=400)
 
 
 
 
 
 
 
 
 
8
  from graphviz import Digraph
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.cluster import KMeans
11
+ from sklearn.metrics.pairwise import linear_kernel
12
+ from io import BytesIO
13
+ import base64
14
 
15
  # Set page configuration with a title and favicon
16
  st.set_page_config(
 
69
  def create_context_graph(context_words):
70
  graph = Digraph()
71
  for index, (before_word, high_info_word, after_word) in enumerate(context_words):
72
+ if before_word:
73
+ graph.node(f'before{index}', before_word, shape='box')
74
  graph.node(f'high{index}', high_info_word, shape='ellipse')
75
+ if after_word:
76
+ graph.node(f'after{index}', after_word, shape='diamond')
77
  if before_word:
78
  graph.edge(f'before{index}', f'high{index}')
79
  if after_word:
 
90
  table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
91
  st.markdown(table)
92
 
 
93
  def load_example_files():
94
  # Exclude specific files
95
  excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
 
109
  return None
110
 
111
  def cluster_sentences(sentences, num_clusters):
112
+ # Filter sentences with length over 10 characters
113
+ sentences = [sentence for sentence in sentences if len(sentence) > 10]
 
 
114
 
115
  # Vectorize the sentences
116
  vectorizer = TfidfVectorizer()
 
120
  kmeans = KMeans(n_clusters=num_clusters, random_state=42)
121
  kmeans.fit(X)
122
 
123
+ # Calculate the centroid of each cluster
124
+ cluster_centers = kmeans.cluster_centers_
125
 
126
+ # Group sentences by cluster and calculate similarity to centroid
127
  clustered_sentences = [[] for _ in range(num_clusters)]
128
+ for i, label in enumerate(kmeans.labels_):
129
+ similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
130
+ clustered_sentences[label].append((similarity, sentences[i]))
131
+
132
+ # Order sentences within each cluster based on their similarity to the centroid
133
+ for cluster in clustered_sentences:
134
+ cluster.sort(reverse=True) # Sort based on similarity (descending order)
135
+
136
+ # Return the ordered clustered sentences without similarity scores for display
137
+ return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
138
+
139
+ # Function to convert text to a downloadable file
140
+ def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
141
+ buffer = BytesIO()
142
+ buffer.write(text_to_download.encode())
143
+ buffer.seek(0)
144
+ b64 = base64.b64encode(buffer.read()).decode()
145
+ href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
146
+ return href
147
 
148
  # Main code for UI
149
  uploaded_file = st.file_uploader("๐Ÿ“ Choose a .txt file", type=['txt'])
 
175
  with st.expander("📑 Context Table"):
176
  display_context_table(context_words)
177
 
 
 
 
178
  with st.expander("๐Ÿ“ Sentence Clustering"):
179
+ sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
180
 
181
  num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
182
  clustered_sentences = cluster_sentences(sentences, num_clusters)
183
 
 
184
  for i, cluster in enumerate(clustered_sentences):
185
+ st.subheader(f"Cluster {i+1}")
186
+ cluster_text = "\n".join(cluster)
187
+ st.text_area(f"Cluster {i+1} Sentences", value=cluster_text, height=200)
188
+
189
+ # Input for custom filename
190
+ default_filename = f"Cluster_{i+1}_Output.txt"
191
+ filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
192
+
193
+ # Download button
194
+ download_link = get_text_file_download_link(cluster_text, filename, f"💾 Save Cluster {i+1}")
195
+ st.markdown(download_link, unsafe_allow_html=True)
196
+
197
+ st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")