awacke1 committed on
Commit
d9c302e
1 Parent(s): 4886a69

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import re

import nltk
import streamlit as st
from graphviz import Digraph
from nltk import FreqDist
from nltk.corpus import stopwords
7
+
8
# Fetch the NLTK data used further down: the Punkt sentence tokenizer models
# for nltk.word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource rather
# than the pickled 'punkt' models; request both so tokenizing works on old and
# new installs alike.
nltk.download('punkt_tab')
nltk.download('stopwords')
10
+
11
def remove_timestamps(text):
    """Return *text* with line-ending ``M:SS`` / ``MM:SS`` timestamps removed.

    A timestamp is one or two digits, a colon, and two digits, followed by a
    line break or the end of the string. The line break is removed together
    with the timestamp, so a line holding only a timestamp disappears.

    Args:
        text: Transcript text, possibly containing timestamp lines.

    Returns:
        The text with every matching timestamp (and its line break) deleted.
    """
    # Accept "\r\n" as well as "\n" so files with Windows line endings are
    # handled, and "\Z" so a final timestamp without a trailing newline is
    # stripped too.
    return re.sub(r'\d{1,2}:\d{2}(?:\r?\n|\Z)', '', text)
13
+
14
def process_text(text):
    """Convert *text* into an alternating bold / bullet Markdown outline.

    Lines are trimmed of surrounding whitespace, and lines that are empty
    after trimming are dropped. Of the remaining lines, those at even
    positions (0, 2, ...) are emitted as ``**bold**`` lines and those at odd
    positions as ``- bullet`` lines followed by an emoji.

    Args:
        text: Multi-line input text.

    Returns:
        The outline as a single string, one output line per kept input line,
        each terminated by a newline. Empty input yields an empty string.
    """
    # splitlines() handles "\n", "\r\n" and "\r"; trimming keeps whitespace-only
    # lines from counting as content and shifting the bold/bullet alternation.
    trimmed = (raw.strip() for raw in text.splitlines())
    kept = [line for line in trimmed if line]

    return "".join(
        f"**{line}**\n" if position % 2 == 0 else f"- {line} 😄\n"
        for position, line in enumerate(kept)
    )
30
+
31
def unit_test(input_text):
    """Display *input_text* with its timestamps removed, under a caption.

    Despite the name, this performs no assertions: it writes a caption and the
    output of ``remove_timestamps(input_text)`` to the Streamlit page so the
    result can be inspected by eye.

    Args:
        input_text: Raw text to run through ``remove_timestamps``.
    """
    st.write("Test Text without Timestamps:")
    st.write(remove_timestamps(input_text))
35
+
36
def extract_high_information_words(text, top_n=10):
    """Return the most frequent non-stop-word alphabetic tokens in *text*.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic are discarded, the rest are lower-cased, and English
    stop words are removed before counting.

    Args:
        text: Text to analyse.
        top_n: Maximum number of words to return.

    Returns:
        A list of at most ``top_n`` lower-cased words, most frequent first.
    """
    stop_words = set(stopwords.words('english'))

    # Lazily normalise tokens so the text is only walked once before counting.
    alphabetic = (
        token.lower() for token in nltk.word_tokenize(text) if token.isalpha()
    )
    freq_dist = FreqDist(word for word in alphabetic if word not in stop_words)

    return [word for word, _ in freq_dist.most_common(top_n)]
47
+
48
def create_relationship_graph(words):
    """Build a directed chain graph linking each word to the next one.

    Every word becomes a node whose identifier is its position in *words* and
    whose label is the word itself. Each node after the first gets an edge
    from its predecessor, labelled with the successor's position.

    Args:
        words: Ordered sequence of words.

    Returns:
        A ``graphviz.Digraph`` containing the nodes and edges described above.
    """
    graph = Digraph()

    for index, word in enumerate(words):
        node_id = str(index)
        graph.node(node_id, word)

        # The first word has no predecessor to link from.
        if index > 0:
            graph.edge(str(index - 1), node_id, label=node_id)

    return graph
58
+
59
def display_relationship_graph(words):
    """Render the chain graph for *words* on the Streamlit page.

    Args:
        words: Ordered sequence of words passed to
            ``create_relationship_graph``.
    """
    st.graphviz_chart(create_relationship_graph(words))
62
+
63
# Page flow: upload a text file, strip its timestamps, then show the cleaned
# text, a Markdown outline, the most frequent words and a graph chaining them.
uploaded_file = st.file_uploader("Choose a .txt file", type=['txt'])

# The uploader returns None until the user has picked a file.
if uploaded_file is not None:
    # "utf-8-sig" drops a leading byte-order mark so it does not end up in the
    # first line; errors="replace" keeps a file with stray non-UTF-8 bytes
    # from crashing the app with UnicodeDecodeError.
    file_text = uploaded_file.read().decode("utf-8-sig", errors="replace")
    text_without_timestamps = remove_timestamps(file_text)

    st.markdown("**Text without Timestamps:**")
    st.write(text_without_timestamps)

    processed_text = process_text(text_without_timestamps)
    st.markdown("**Markdown Outline with Emojis:**")
    st.markdown(processed_text)

    unit_test(file_text)

    top_words = extract_high_information_words(text_without_timestamps, 10)
    st.markdown("**Top 10 High Information Words:**")
    st.write(top_words)

    st.markdown("**Relationship Graph:**")
    display_relationship_graph(top_words)