awacke1 committed on
Commit
15fa1c4
1 Parent(s): 84a3b75

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +153 -0
app.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import datetime
4
+ import io
5
+ import nltk
6
+ import base64
7
+ import os
8
+ from nltk.tokenize import sent_tokenize, word_tokenize
9
+ from nltk.corpus import stopwords
10
+ from sklearn.feature_extraction.text import CountVectorizer
11
+ from sklearn.decomposition import LatentDirichletAllocation
12
+
13
# Fetch the NLTK data used by sent_tokenize/word_tokenize and the stop-word
# filter. Newer NLTK releases load the tokenizer model from 'punkt_tab'
# rather than 'punkt', so both are requested; a resource that an older
# release does not know about is reported as a failed download, not raised.
# quiet=True keeps download progress out of the log on every Streamlit rerun.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
15
+
16
def generate_file_name(text, file_type):
    """Build a timestamped file name from the most frequent words in *text*.

    The text is tokenized with NLTK, lower-cased, stripped of
    non-alphanumeric tokens and English stop words, and the three most
    frequent remaining words are joined with ``-``.

    Args:
        text: The text to derive the name from.
        file_type: File extension without the leading dot (e.g. ``"md"``).

    Returns:
        ``"<word>-<word>-<word>_<YYYYmmdd_HHMMSS>.<file_type>"``, or
        ``"text_file_<YYYYmmdd_HHMMSS>.<file_type>"`` when keyword extraction
        fails or yields no usable words.
    """
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    fallback_name = f"text_file_{current_time}.{file_type}"

    try:
        stop_words = set(stopwords.words('english'))
        # Tokenize into sentences, then words; keep alphanumeric tokens only
        # so the result is safe to embed in a file name.
        words = [
            word.lower()
            for sentence in sent_tokenize(text)
            for word in word_tokenize(sentence)
            if word.isalnum()
        ]
        filtered_words = [word for word in words if word not in stop_words]
        word_freq = nltk.FreqDist(filtered_words)
        top_words = [word for word, _ in word_freq.most_common(3)]
    except Exception:
        # Naming is best-effort: any tokenizer/corpus problem falls back to
        # the default name. KeyboardInterrupt/SystemExit still propagate.
        return fallback_name

    if not top_words:
        # Without this guard the name would start with a bare underscore.
        return fallback_name
    return f"{'-'.join(top_words)}_{current_time}.{file_type}"
41
+
42
def save_text_as_file(text, file_type):
    """Write *text* to a new file in the current directory and report it.

    Args:
        text: The content to write.
        file_type: File extension without the leading dot; it is passed to
            ``generate_file_name`` to build the file name.

    Returns:
        The name of the file that was written.
    """
    file_name = generate_file_name(text, file_type)
    # Explicit UTF-8 so pasted non-ASCII text is written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)
    st.success(f"Text saved as {file_name}")
    return file_name
48
+
49
def save_list_as_excel(text):
    """Parse a "name - description" list from *text* and save it as .xlsx.

    Each non-blank line is split at the first ``" - "``. The left part goes
    into the "Character" column and the right part into "Description"; a
    line without the separator is stored whole with an empty description.

    Args:
        text: Multi-line text, one list entry per line.

    Returns:
        The name of the Excel file that was written.
    """
    rows = []
    # splitlines() also handles "\r\n" input, which split("\n") would leave
    # with a trailing "\r" in the description cell.
    for line in text.splitlines():
        if not line.strip():
            continue
        name, _, description = line.partition(" - ")
        # Strip both parts so entries with and without a separator are
        # normalised the same way.
        rows.append([name.strip(), description.strip()])

    df = pd.DataFrame(rows, columns=["Character", "Description"])
    file_name = generate_file_name(text, "xlsx")
    df.to_excel(file_name, index=False)
    st.success(f"Character list saved as {file_name}")
    return file_name
64
+
65
def get_download_link(file_path):
    """Return an HTML ``<a>`` tag that downloads *file_path* via a data URI.

    The file is read in binary mode and embedded base64-encoded in the link.
    The MIME type is chosen from the extension (.xlsx, .csv, .md), with
    ``application/octet-stream`` for anything else.

    The function is intentionally not cached: a cache keyed only on the path
    would keep serving old content after the file is rewritten, and would
    keep returning ``''`` for a file that did not exist on the first call.

    Args:
        file_path: Path of the file to embed.

    Returns:
        The anchor tag as a string, or ``''`` if the file cannot be read.
    """
    import html  # local import: the block needs nothing added at file level

    mime_types = {
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.csv': 'text/csv',
        '.md': 'text/markdown',
    }

    try:
        with open(file_path, 'rb') as file:
            data = file.read()
    except (OSError, TypeError, ValueError):
        # Best-effort: a missing/unreadable file or an invalid path yields
        # no link rather than an error in the UI.
        return ''

    b64 = base64.b64encode(data).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]  # extension including the dot
    mime_type = mime_types.get(ext, 'application/octet-stream')
    # The result is rendered with unsafe_allow_html=True, so the file name
    # must not be able to break out of the attribute or inject markup.
    safe_name = html.escape(file_name, quote=True)
    return f'<a href="data:{mime_type};base64,{b64}" download="{safe_name}">{safe_name}</a>'
85
+
86
def perform_nlp(text):
    """Show LDA topics and a word-frequency chart for *text* in the UI.

    The text is split into sentences; each sentence is one document for a
    bag-of-words ``CountVectorizer`` (English stop words removed) and a
    3-topic ``LatentDirichletAllocation`` model. For every topic the five
    highest-weighted vocabulary words are written out. A bar chart of the
    ten most frequent whitespace-separated words follows.

    Args:
        text: The text to analyse.
    """
    sentences = sent_tokenize(text)

    # Topic Modeling
    st.subheader("Topic Modeling")
    vectorizer = CountVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # CountVectorizer raises ValueError when nothing is left after
        # stop-word removal (empty vocabulary); skip topics, keep the chart.
        st.info("Not enough content for topic modeling.")
    else:
        lda = LatentDirichletAllocation(n_components=3, random_state=42)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()
        # components_ has one row of word weights per topic. The output of
        # transform() is per-sentence topic proportions and must not be used
        # to index the vocabulary.
        for topic_number, weights in enumerate(lda.components_, start=1):
            st.write(f"Topic {topic_number}:")
            top_indices = weights.argsort()[:-6:-1]  # five largest, descending
            st.write(", ".join(feature_names[idx] for idx in top_indices))

    # Word Frequency
    word_freq = pd.Series(" ".join(sentences).split()).value_counts()[:10]
    st.subheader("Word Frequency")
    st.bar_chart(word_freq)
104
+
105
def show_files_in_directory():
    """Render an HTML table of the .md, .xlsx and .csv files in the cwd.

    Each row holds a download link for the file, its size in bytes and its
    last-modified time formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    st.subheader("Files in Current Directory")
    wanted_suffixes = (".md", ".xlsx", ".csv")
    rows = []
    for entry in os.listdir("."):
        if not entry.endswith(wanted_suffixes):
            continue
        size_in_bytes = os.path.getsize(entry)
        modified_at = datetime.datetime.fromtimestamp(os.path.getmtime(entry))
        rows.append({
            "File Name": get_download_link(entry),
            "Size (bytes)": size_in_bytes,
            "Last Modified": modified_at.strftime("%Y-%m-%d %H:%M:%S"),
        })
    # escape=False keeps the anchor tags in the first column as live HTML.
    table_html = pd.DataFrame(rows).to_html(escape=False, index=False)
    st.write(table_html, unsafe_allow_html=True)
115
+
116
def main():
    """Run the Streamlit page: take text, save it, analyse it, list files.

    When "Process Text" is pressed with non-empty input:

    * Input that starts with ``1.``, ``1 -`` or ``1 _`` and contains a
      newline is treated as a list and saved as .xlsx; anything else is
      saved as .txt.
    * The text is always saved as .csv and .md as well.
    * Non-list input containing ``.``, ``!`` or ``?`` is also passed to
      ``perform_nlp``.
    * A saved .xlsx file is read back and shown as a table, and download
      links are shown for every file written in this run.

    The directory listing is rendered on every run.
    """
    st.title("AI UI for Text Processing")
    text_input = st.text_area("Paste your text here")

    if st.button("Process Text"):
        stripped = text_input.strip()
        if not stripped:
            st.warning("Please paste some text.")
        else:
            is_numbered_list = (
                stripped.startswith(("1.", "1 -", "1 _")) and "\n" in text_input
            )
            if is_numbered_list:
                file_name = save_list_as_excel(text_input)
            else:
                file_name = save_text_as_file(text_input, "txt")
            # Keep the names the save calls return. Each call stamps its own
            # time, so deriving them from file_name by replacing the
            # extension can point at files that do not exist.
            csv_name = save_text_as_file(text_input, "csv")
            md_name = save_text_as_file(text_input, "md")

            if not is_numbered_list and any(mark in text_input for mark in ".!?"):
                perform_nlp(text_input)

            # Only the Excel output can be read back as a table; calling
            # read_excel on a .txt file would just fail.
            if file_name.endswith(".xlsx"):
                try:
                    df = pd.read_excel(file_name)
                except Exception as exc:
                    # Preview is best-effort, but report the failure instead
                    # of hiding it.
                    st.warning(f"Could not load {file_name} for preview: {exc}")
                else:
                    st.subheader("Saved Data")
                    st.dataframe(df)

            for saved_name in (file_name, csv_name, md_name):
                st.markdown(get_download_link(saved_name), unsafe_allow_html=True)

    show_files_in_directory()


if __name__ == "__main__":
    main()