awacke1 committed on
Commit c91ce8d • 1 Parent(s): c48df87

Create app.py

Files changed (1)
  1. app.py +262 -0
app.py ADDED
@@ -0,0 +1,262 @@
+ # Import libraries for the Streamlit UI, TF-IDF/k-means clustering, NLTK text analysis, and graph plotting
+ import streamlit as st
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.cluster import KMeans
+ from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk import FreqDist
+ import re
+ import os
+ import base64
+ from graphviz import Digraph
+ from io import BytesIO
+ import networkx as nx
+ import matplotlib.pyplot as plt
+
+ # Set page configuration with a title and favicon
+ st.set_page_config(
+     page_title="📺Transcript📜EDA🔍NLTK",
+     page_icon="🌠",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     menu_items={
+         'Get Help': 'https://huggingface.co/awacke1',
+         'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
+         'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
+     }
+ )
+
+ st.markdown('''
+ 1. 🔍 **Transcript Insights Using Exploratory Data Analysis (EDA)** 📊 - unveil hidden patterns 🕵️‍♂️ and insights 🧠 in your transcripts 🏆.
+ 2. 📜 **Natural Language Toolkit (NLTK)** 🛠️ - your compass 🧭 in the vast landscape of NLP.
+ 3. 📺 **Transcript Analysis** 📈 - speech recognition 🎙️ and thematic extraction 🌐 turn audiovisual content into actionable insights 🔑.
+ ''')
+
+ # Download NLTK tokenizer and stopword resources
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ def remove_timestamps(text):
+     # Strip transcript timestamps of the form "MM:SS" together with the line that follows them
+     return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
+
+ def extract_high_information_words(text, top_n=10):
+     # Tokenize, lowercase, drop non-alphabetic tokens and stopwords, then keep the top_n most frequent words
+     words = nltk.word_tokenize(text)
+     words = [word.lower() for word in words if word.isalpha()]
+     stop_words = set(stopwords.words('english'))
+     filtered_words = [word for word in words if word not in stop_words]
+     freq_dist = FreqDist(filtered_words)
+     return [word for word, _ in freq_dist.most_common(top_n)]
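+ # For example (hypothetical input):
+ # extract_high_information_words("The cat sat on the mat while the cat slept.", top_n=1)
+ # returns ['cat'], since stopwords are dropped and 'cat' is the most frequent remaining token.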
+
+ def create_relationship_graph(words):
+     # Build a Graphviz chain: one node per word, with an edge from each word to the next
+     graph = Digraph()
+     for index, word in enumerate(words):
+         graph.node(str(index), word)
+         if index > 0:
+             graph.edge(str(index - 1), str(index), label=str(index))
+     return graph
+
+ def display_relationship_graph(words):
+     graph = create_relationship_graph(words)
+     st.graphviz_chart(graph)
+
+ def extract_context_words(text, high_information_words):
+     # For each occurrence of a high-information word, capture the words immediately before and after it
+     words = nltk.word_tokenize(text)
+     context_words = []
+     for index, word in enumerate(words):
+         if word.lower() in high_information_words:
+             before_word = words[index - 1] if index > 0 else None
+             after_word = words[index + 1] if index < len(words) - 1 else None
+             context_words.append((before_word, word, after_word))
+     return context_words
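+ # For example (hypothetical input), with high_information_words = ['fox'],
+ # extract_context_words("The quick brown fox jumps", ['fox']) returns [('brown', 'fox', 'jumps')].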
+
+ def create_context_graph(context_words):
+     # Draw each (before, high-info, after) triple: boxes for preceding words,
+     # ellipses for the high-information words, diamonds for following words
+     graph = Digraph()
+     for index, (before_word, high_info_word, after_word) in enumerate(context_words):
+         if before_word:
+             graph.node(f'before{index}', before_word, shape='box')
+         graph.node(f'high{index}', high_info_word, shape='ellipse')
+         if after_word:
+             graph.node(f'after{index}', after_word, shape='diamond')
+         if before_word:
+             graph.edge(f'before{index}', f'high{index}')
+         if after_word:
+             graph.edge(f'high{index}', f'after{index}')
+     return graph
+
+ def display_context_graph(context_words):
+     graph = create_context_graph(context_words)
+     st.graphviz_chart(graph)
+
+ def display_context_table(context_words):
+     # Render the context triples as a Markdown table
+     table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
+     for before, high, after in context_words:
+         table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
+     st.markdown(table)
+
+ def load_example_files():
+     # Exclude configuration files from the example list
+     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
+
+     # List all .txt files excluding the ones in excluded_files
+     example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
+
+     # Check if there are any files to select from
+     if example_files:
+         selected_file = st.selectbox("📄 Select an example file:", example_files)
+         if st.button(f"📂 Load {selected_file}"):
+             with open(selected_file, 'r', encoding="utf-8") as file:
+                 return file.read()
+     else:
+         st.write("No suitable example files found.")
+
+     return None
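+ # Note: because Streamlit reruns the script on every interaction, this returns the file
+ # contents only on the rerun triggered by the Load button click; otherwise it returns None.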
+
+ def cluster_sentences(sentences, num_clusters):
+     # Filter sentences with length over 10 characters
+     sentences = [sentence for sentence in sentences if len(sentence) > 10]
+     if not sentences:
+         return []
+
+     # If there are fewer sentences than requested clusters, shrink the cluster count to match
+     if len(sentences) < num_clusters:
+         num_clusters = len(sentences)
+
+     # Vectorize the sentences
+     vectorizer = TfidfVectorizer()
+     X = vectorizer.fit_transform(sentences)
+
+     # Perform k-means clustering
+     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+     kmeans.fit(X)
+
+     # Calculate the centroid of each cluster
+     cluster_centers = kmeans.cluster_centers_
+
+     # Group sentences by cluster and calculate similarity to centroid
+     clustered_sentences = [[] for _ in range(num_clusters)]
+     for i, label in enumerate(kmeans.labels_):
+         similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
+         clustered_sentences[label].append((similarity, sentences[i]))
+
+     # Order sentences within each cluster by similarity to the centroid (descending)
+     for cluster in clustered_sentences:
+         cluster.sort(key=lambda pair: pair[0], reverse=True)
+
+     # Return the ordered clustered sentences without similarity scores for display
+     return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
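+ # The result is a list of num_clusters lists, e.g. for num_clusters=2 something like
+ # [['most central sentence of cluster 1', ...], ['most central sentence of cluster 2', ...]],
+ # with each inner list ordered from most to least similar to its cluster centroid.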
+
+ def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
+     # Encode the text as base64 and wrap it in a data-URI download link
+     buffer = BytesIO()
+     buffer.write(text_to_download.encode())
+     buffer.seek(0)
+     b64 = base64.b64encode(buffer.read()).decode()
+     href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
+     return href
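+ # The returned HTML must be rendered with unsafe_allow_html for the link to work, e.g.:
+ # st.markdown(get_text_file_download_link(file_text), unsafe_allow_html=True)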
+
+ def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
+     # Extract the top num_words high-information words from each cluster's combined text
+     cluster_high_info_words = []
+     for cluster in cluster_sentences:
+         cluster_text = " ".join(cluster)
+         high_info_words = extract_high_information_words(cluster_text, num_words)
+         cluster_high_info_words.append(high_info_words)
+     return cluster_high_info_words
+
+ def plot_cluster_words(cluster_sentences):
+     for i, cluster in enumerate(cluster_sentences):
+         cluster_text = " ".join(cluster)
+         words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
+         word_freq = FreqDist(words)
+         top_words = [word for word, _ in word_freq.most_common(20)]
+         if not top_words:
+             continue
+
+         # Vectorize the cluster's sentences restricted to the top words; transposing the
+         # TF-IDF matrix yields one vector per word, so cosine similarity between words
+         # reflects how often they co-occur in the same sentences (vectorizing the bare
+         # word list would give orthogonal vectors and an edgeless graph)
+         vectorizer = TfidfVectorizer(vocabulary=top_words)
+         X = vectorizer.fit_transform(cluster)
+         word_vectors = X.T.toarray()
+
+         similarity_matrix = cosine_similarity(word_vectors)
+
+         # Build the word graph, dropping the trivial self-similarity loops
+         G = nx.from_numpy_array(similarity_matrix)
+         G.remove_edges_from(list(nx.selfloop_edges(G)))
+         labels = dict(enumerate(vectorizer.get_feature_names_out()))
+         pos = nx.spring_layout(G, k=0.5)
+
+         fig = plt.figure(figsize=(8, 6))
+         nx.draw_networkx(G, pos, labels=labels, node_size=500, font_size=12, font_weight='bold', with_labels=True, node_color='skyblue', edge_color='gray')
+         plt.axis('off')
+         plt.title(f"Cluster {i+1} Word Arrangement")
+
+         st.pyplot(fig)
+         plt.close(fig)
+
+         st.markdown(f"**Cluster {i+1} Details:**")
+         st.markdown(f"Top Words: {', '.join(top_words)}")
+         st.markdown(f"Number of Sentences: {len(cluster)}")
+         st.markdown("---")
+
+ # Main code for UI
+ uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])
+
+ example_text = load_example_files()
+
+ # Prefer a loaded example file, then an uploaded file
+ if example_text:
+     file_text = example_text
+ elif uploaded_file:
+     file_text = uploaded_file.read().decode("utf-8")
+ else:
+     file_text = ""
+
+ if file_text:
+     text_without_timestamps = remove_timestamps(file_text)
+     top_words = extract_high_information_words(text_without_timestamps, 10)
+
+     with st.expander("📊 Top 10 High Information Words"):
+         st.write(top_words)
+
+     with st.expander("📈 Relationship Graph"):
+         display_relationship_graph(top_words)
+
+     context_words = extract_context_words(text_without_timestamps, top_words)
+
+     with st.expander("🔗 Context Graph"):
+         display_context_graph(context_words)
+
+     with st.expander("📑 Context Table"):
+         display_context_table(context_words)
+
+     with st.expander("📝 Sentence Clustering", expanded=True):
+         sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+
+         num_sentences = len(sentences)
+         st.write(f"Total Sentences: {num_sentences}")
+
+         num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+         clustered_sentences = cluster_sentences(sentences, num_clusters)
+
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.subheader("Original Text")
+             original_text = "\n".join(sentences)
+             st.text_area("Original Sentences", value=original_text, height=400)
+
+         with col2:
+             st.subheader("Clustered Text")
+             clusters = ""
+             clustered_text = ""
+             cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+
+             for i, cluster in enumerate(clustered_sentences):
+                 cluster_text = "\n".join(cluster)
+                 high_info_words = ", ".join(cluster_high_info_words[i])
+                 clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+                 clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+
+             st.text_area("Clusters", value=clusters, height=200)
+             st.text_area("Clustered Sentences", value=clustered_text, height=200)
+
+         # Verify that all sentences are accounted for in the clustered output
+         clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+         if set(sentences) == set(clustered_sentences_flat):
+             st.write("✅ All sentences are accounted for in the clustered output.")
+         else:
+             st.write("❌ Some sentences are missing in the clustered output.")
+
+     plot_cluster_words(clustered_sentences)
+
+ st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")