awacke1 committed on
Commit e227516 • 1 Parent(s): 38b1c07

Update app.py

Files changed (1):
  1. app.py +112 -40
app.py CHANGED
@@ -1,3 +1,4 @@
+# Import necessary libraries
 import streamlit as st
 import re
 import nltk
@@ -7,9 +8,6 @@ from nltk import FreqDist
 from graphviz import Digraph
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
-from sklearn.metrics.pairwise import linear_kernel
-from io import BytesIO
-import base64
 
 # Set page configuration with a title and favicon
 st.set_page_config(
@@ -24,6 +22,10 @@ st.set_page_config(
     }
 )
 
+st.markdown('''🔍 **Exploratory Data Analysis (EDA)** 📊: - Dive deep into the sea of data with our EDA feature, unveiling hidden patterns 🕵️‍♂️ and insights 🧠 in your transcripts. Transform raw data into a treasure trove of information 🏆.
+📜 **Natural Language Toolkit (NLTK)** 🛠️: - Harness the power of NLTK to process and understand human language 🗣️. From tokenization to sentiment analysis, our toolkit is your compass 🧭 in the vast landscape of natural language processing (NLP).
+📺 **Transcript Analysis** 📈: - Elevate your text analysis with our advanced transcript analysis tools. Whether it's speech recognition 🎙️ or thematic extraction 🌐, turn your audiovisual content into actionable insights 🔑.''')
+
 # Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
@@ -39,9 +41,76 @@ def extract_high_information_words(text, top_n=10):
     freq_dist = FreqDist(filtered_words)
     return [word for word, _ in freq_dist.most_common(top_n)]
 
+def create_relationship_graph(words):
+    graph = Digraph()
+    for index, word in enumerate(words):
+        graph.node(str(index), word)
+        if index > 0:
+            graph.edge(str(index - 1), str(index), label=str(index))
+    return graph
+
+def display_relationship_graph(words):
+    graph = create_relationship_graph(words)
+    st.graphviz_chart(graph)
+
+def extract_context_words(text, high_information_words):
+    words = nltk.word_tokenize(text)
+    context_words = []
+    for index, word in enumerate(words):
+        if word.lower() in high_information_words:
+            before_word = words[index - 1] if index > 0 else None
+            after_word = words[index + 1] if index < len(words) - 1 else None
+            context_words.append((before_word, word, after_word))
+    return context_words
+
+def create_context_graph(context_words):
+    graph = Digraph()
+    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
+        if before_word:
+            graph.node(f'before{index}', before_word, shape='box')
+        graph.node(f'high{index}', high_info_word, shape='ellipse')
+        if after_word:
+            graph.node(f'after{index}', after_word, shape='diamond')
+        if before_word:
+            graph.edge(f'before{index}', f'high{index}')
+        if after_word:
+            graph.edge(f'high{index}', f'after{index}')
+    return graph
+
+def display_context_graph(context_words):
+    graph = create_context_graph(context_words)
+    st.graphviz_chart(graph)
+
+def display_context_table(context_words):
+    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
+    for before, high, after in context_words:
+        table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
+    st.markdown(table)
+
+def load_example_files():
+    # Exclude specific files
+    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
+
+    # List all .txt files excluding the ones in excluded_files
+    example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
+
+    # Check if there are any files to select from
+    if example_files:
+        selected_file = st.selectbox("📄 Select an example file:", example_files)
+        if st.button(f"📂 Load {selected_file}"):
+            with open(selected_file, 'r', encoding="utf-8") as file:
+                return file.read()
+    else:
+        st.write("No suitable example files found.")
+
+    return None
+
 def cluster_sentences(sentences, num_clusters):
-    # Filter sentences with length over 10 characters
-    sentences = [sentence for sentence in sentences if len(sentence) > 10]
+    # Check if the number of sentences is less than the desired number of clusters
+    if len(sentences) < num_clusters:
+        # If so, adjust the number of clusters to match the number of sentences
+        num_clusters = len(sentences)
 
     # Vectorize the sentences
     vectorizer = TfidfVectorizer()
@@ -51,56 +120,59 @@ def cluster_sentences(sentences, num_clusters):
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     kmeans.fit(X)
 
-    # Calculate the centroid of each cluster
-    cluster_centers = kmeans.cluster_centers_
+    # Get the cluster labels for each sentence
+    labels = kmeans.labels_
 
-    # Group sentences by cluster and calculate similarity to centroid
+    # Group sentences by cluster
     clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(kmeans.labels_):
-        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
-        clustered_sentences[label].append((similarity, sentences[i]))
-
-    # Order sentences within each cluster based on their similarity to the centroid
-    for cluster in clustered_sentences:
-        cluster.sort(reverse=True)  # Sort based on similarity (descending order)
-
-    # Return the ordered clustered sentences without similarity scores for display
-    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
-
-# Function to convert text to a downloadable file
-def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
-    buffer = BytesIO()
-    buffer.write(text_to_download.encode())
-    buffer.seek(0)
-    b64 = base64.b64encode(buffer.read()).decode()
-    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
-    return href
+    for i, label in enumerate(labels):
+        clustered_sentences[label].append(sentences[i])
+
+    return clustered_sentences
 
 # Main code for UI
 uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])
 
-if uploaded_file:
+example_text = load_example_files()
+
+if example_text:
+    file_text = example_text
+elif uploaded_file:
     file_text = uploaded_file.read().decode("utf-8")
 else:
    file_text = ""
 
 if file_text:
     text_without_timestamps = remove_timestamps(file_text)
-    sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if len(sentence.strip()) > 10]
+    top_words = extract_high_information_words(text_without_timestamps, 10)
+
+    with st.expander("📊 Top 10 High Information Words"):
+        st.write(top_words)
+
+    with st.expander("📈 Relationship Graph"):
+        display_relationship_graph(top_words)
+
+    context_words = extract_context_words(text_without_timestamps, top_words)
+
+    with st.expander("🔗 Context Graph"):
+        display_context_graph(context_words)
+
+    with st.expander("📑 Context Table"):
+        display_context_table(context_words)
+
+    # with st.expander("Innovation Outlines"):
+    #     showInnovationOutlines()
 
     with st.expander("📝 Sentence Clustering"):
+        sentences = [sentence.strip() for sentence in text_without_timestamps.split('.') if sentence.strip()]
+
         num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
         clustered_sentences = cluster_sentences(sentences, num_clusters)
 
+        output_text = ""
         for i, cluster in enumerate(clustered_sentences):
-            st.text_area(f"Cluster {i+1}", value="\n".join(cluster), height=100)
-
-            # Input for custom filename
-            default_filename = f"Cluster_{i+1}_Output.txt"
-            filename = st.text_input("Enter filename for download:", value=default_filename, key=f"filename_{i}")
-
-            # Download button
-            download_link = get_text_file_download_link("\n".join(cluster), filename, f"💾 Save Cluster {i+1}")
-            st.markdown(download_link, unsafe_allow_html=True)
-
-st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
+            output_text += f"Cluster {i+1}:\n"
+            output_text += "\n".join(cluster)
+            output_text += "\n\n"
+
+        st.text_area("Clustered Sentences", value=output_text, height=400)
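Review notes

The diff shows only the tail of extract_high_information_words() and never shows remove_timestamps(), since neither changes in this commit. For orientation, a minimal sketch of what those helpers plausibly look like — hypothetical reconstructions consistent with the visible lines, not the committed code:

import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist

def remove_timestamps(text):
    # Hypothetical: strip HH:MM:SS / MM:SS transcript timestamps
    return re.sub(r'\b\d{1,2}:\d{2}(?::\d{2})?\b', '', text)

def extract_high_information_words(text, top_n=10):
    words = nltk.word_tokenize(text)
    words = [w.lower() for w in words if w.isalpha()]
    filtered_words = [w for w in words if w not in stopwords.words('english')]
    freq_dist = FreqDist(filtered_words)                       # visible in the diff
    return [word for word, _ in freq_dist.most_common(top_n)]  # visible in the diff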
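create_relationship_graph() chains the top words into a simple linear Digraph. A standalone sketch of the same idea outside Streamlit (the word list is invented):

from graphviz import Digraph

words = ["data", "analysis", "clusters"]  # e.g. output of extract_high_information_words
graph = Digraph()
for index, word in enumerate(words):
    graph.node(str(index), word)  # one node per word, keyed by position
    if index > 0:
        graph.edge(str(index - 1), str(index), label=str(index))  # link to previous word
print(graph.source)  # DOT source; the app renders it with st.graphviz_chart(graph)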
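extract_context_words() collects a (before, word, after) triple for every occurrence of a high-information word; the context graph and the markdown table are two views of the same triples. A quick worked example of the core logic (invented sentence; plain split() stands in for nltk.word_tokenize so it runs without the punkt model):

def context_triples(tokens, high_information_words):
    out = []
    for i, w in enumerate(tokens):
        if w.lower() in high_information_words:
            out.append((tokens[i - 1] if i > 0 else None,
                        w,
                        tokens[i + 1] if i < len(tokens) - 1 else None))
    return out

tokens = "The quick brown fox jumps".split()
print(context_triples(tokens, {"quick", "jumps"}))
# [('The', 'quick', 'brown'), ('fox', 'jumps', None)]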
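One thing to double-check in review: load_example_files() calls os.listdir(), but import os does not appear in any visible hunk. It may well sit in the unchanged import lines the diff hides; if not, the fix is a one-liner:

import os  # required by load_example_files(); only needed if not already among the hidden imports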
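The rewritten cluster_sentences() drops the centroid-similarity ordering (and with it the linear_kernel import removed above) in favor of plain label grouping, and now caps the cluster count at the number of sentences so KMeans cannot be asked for more clusters than samples. A self-contained sketch of the same pipeline (example sentences invented):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

sentences = ["cats purr softly", "dogs bark loudly", "cats nap all day"]
num_clusters = min(5, len(sentences))  # the guard this commit adds

X = TfidfVectorizer().fit_transform(sentences)  # sparse TF-IDF matrix, one row per sentence
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit(X)

groups = [[] for _ in range(num_clusters)]
for i, label in enumerate(kmeans.labels_):
    groups[label].append(sentences[i])
print(groups)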
 
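With get_text_file_download_link() gone, the per-cluster base64 download links disappear and all clusters land in one text area. If downloads are still wanted, Streamlit's built-in st.download_button covers the same need without raw HTML; a possible follow-up, not part of this commit:

st.download_button("💾 Save Clustered Sentences", data=output_text, file_name="Clusters_Output.txt")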