File size: 5,982 Bytes
15fa1c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import streamlit as st
import pandas as pd
import datetime
import io
import nltk
import base64
import os
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
nltk.download('punkt')  # tokenizer models required by sent_tokenize/word_tokenize below
nltk.download('stopwords')  # English stopword corpus used in generate_file_name
def generate_file_name(text, file_type):
    """Build a descriptive file name from the text's three most frequent words.

    The name has the form ``<word1-word2-word3>_<timestamp>.<file_type>``.
    If tokenization fails or yields no usable keywords, falls back to
    ``text_file_<timestamp>.<file_type>``.

    Args:
        text: Source text to derive keywords from.
        file_type: Extension (without the dot) for the generated name, e.g. "md".

    Returns:
        str: The generated file name.
    """
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    try:
        # Tokenize into sentences, then words; keep only alphanumeric
        # tokens, lowercased, with English stopwords removed.
        sentences = sent_tokenize(text)
        words = [
            word.lower()
            for sentence in sentences
            for word in word_tokenize(sentence)
            if word.isalnum()
        ]
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word not in stop_words]
        # Top 3 most frequent words become the name's keyword prefix.
        word_freq = nltk.FreqDist(filtered_words)
        top_words = [word for word, _ in word_freq.most_common(3)]
        if not top_words:
            # All-stopword/empty input would otherwise yield a name with a
            # bare leading underscore; use the generic fallback instead.
            return f"text_file_{current_time}.{file_type}"
        return f"{'-'.join(top_words)}_{current_time}.{file_type}"
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # still propagate; any tokenizer/corpus failure falls back here.
        return f"text_file_{current_time}.{file_type}"
def save_text_as_file(text, file_type):
    """Write *text* to a generated file name and report success in the UI.

    Args:
        text: Content to write verbatim.
        file_type: Extension (without the dot) used for the file name.

    Returns:
        str: The name of the file that was written.
    """
    file_name = generate_file_name(text, file_type)
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent arbitrary pasted text and would raise UnicodeEncodeError.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)
    st.success(f"Text saved as {file_name}")
    return file_name
def save_list_as_excel(text):
    """Parse a "Character - Description" list and save it as an Excel file.

    Each non-blank line is split on the first " - " into a
    character/description pair; a line without that separator becomes a
    character with an empty description.

    Args:
        text: Newline-separated list text.

    Returns:
        str: The name of the .xlsx file that was written.
    """
    rows = []
    for raw_line in text.split("\n"):
        if not raw_line.strip():
            continue
        pieces = raw_line.split(" - ", 1)
        if len(pieces) == 2:
            rows.append(pieces)
        else:
            rows.append([raw_line.strip(), ""])
    frame = pd.DataFrame(rows, columns=["Character", "Description"])
    file_name = generate_file_name(text, "xlsx")
    frame.to_excel(file_name, index=False)
    st.success(f"Character list saved as {file_name}")
    return file_name
@st.cache_resource
def get_download_link(file_path):
    """Return an HTML anchor with *file_path*'s contents embedded as base64.

    NOTE(review): ``st.cache_resource`` keys on ``file_path`` alone, so a
    file rewritten under the same name would serve stale content. Names
    generated here embed a timestamp, which mitigates this — confirm.

    Args:
        file_path: Path of the file to embed in the data URI.

    Returns:
        str: An ``<a download=...>`` tag, or '' if the file cannot be read.
    """
    # Known extensions map to their MIME type; anything else is served as
    # generic binary data.
    mime_types = {
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.csv': 'text/csv',
        '.md': 'text/markdown',
    }
    try:
        # Keep the try body minimal: only the file read can reasonably fail.
        with open(file_path, 'rb') as file:
            data = file.read()
    except OSError:
        # Narrowed from a bare ``except:`` — only I/O failures (missing
        # file, permissions) should yield an empty link.
        return ''
    b64 = base64.b64encode(data).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]
    mime_type = mime_types.get(ext, 'application/octet-stream')
    return f'<a href="data:{mime_type};base64,{b64}" download="{file_name}">{file_name}</a>'
def perform_nlp(text):
    """Show LDA topic modeling and word-frequency charts for *text* in the UI.

    Args:
        text: Input prose; it is split into sentences, each sentence
            treated as one LDA document.
    """
    sentences = sent_tokenize(text)
    # Topic modeling over sentence "documents".
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(X)
    # BUG FIX: the original iterated ``lda.transform(X)`` (one row per
    # *sentence*, values are topic probabilities) and used those topic
    # indices to index the vocabulary, printing meaningless words. The
    # per-topic word weights live in ``lda.components_``.
    feature_names = vectorizer.get_feature_names_out()
    st.subheader("Topic Modeling")
    for i, component in enumerate(lda.components_):
        st.write(f"Topic {i+1}:")
        # Top five vocabulary words by weight for this topic.
        topic_words = ", ".join(feature_names[j] for j in component.argsort()[:-6:-1])
        st.write(topic_words)
    # Word frequency: top 10 raw whitespace-split tokens across all sentences.
    word_freq = pd.Series(" ".join(sentences).split()).value_counts()[:10]
    st.subheader("Word Frequency")
    st.bar_chart(word_freq)
def show_files_in_directory():
    """Render a table of .md/.xlsx/.csv files in the working directory.

    Each row shows a clickable base64 download link, the size in bytes,
    and the last-modified timestamp.
    """
    st.subheader("Files in Current Directory")
    files = []
    for file in os.listdir("."):
        if file.endswith((".md", ".xlsx", ".csv")):
            file_size = os.path.getsize(file)
            file_modified_time = datetime.datetime.fromtimestamp(
                os.path.getmtime(file)
            ).strftime("%Y-%m-%d %H:%M:%S")
            files.append({
                "File Name": get_download_link(file),
                "Size (bytes)": file_size,
                "Last Modified": file_modified_time,
            })
    if not files:
        # An empty DataFrame would render a headerless, confusing table.
        st.info("No saved files found.")
        return
    files_df = pd.DataFrame(files)
    # escape=False keeps the embedded <a> download links clickable.
    st.write(files_df.to_html(escape=False, index=False), unsafe_allow_html=True)
def main():
    """Streamlit entry point: accept pasted text, save it, and analyze it."""
    st.title("AI UI for Text Processing")
    text_input = st.text_area("Paste your text here")
    if st.button("Process Text"):
        if text_input.strip() == "":
            st.warning("Please paste some text.")
        else:
            file_name = None
            # Routing: a numbered list (e.g. "1. Alice - ...") becomes an
            # Excel character sheet; text with sentence punctuation also
            # gets NLP analysis; anything else is saved as plain text.
            if text_input.strip().startswith(("1.", "1 -", "1 _")) and "\n" in text_input:
                file_name = save_list_as_excel(text_input)
                save_text_as_file(text_input, "csv")
                save_text_as_file(text_input, "md")
            elif "." in text_input or "!" in text_input or "?" in text_input:
                file_name = save_text_as_file(text_input, "txt")
                save_text_as_file(text_input, "csv")
                save_text_as_file(text_input, "md")
                perform_nlp(text_input)
            else:
                file_name = save_text_as_file(text_input, "txt")
                save_text_as_file(text_input, "csv")
                save_text_as_file(text_input, "md")
            if file_name:
                # BUG FIX: the original called pd.read_excel on *every*
                # saved file (including .txt) inside a bare try/except
                # that silently dropped the preview AND the download links
                # for all non-Excel saves. Branch on the extension and
                # narrow the exception instead.
                if file_name.endswith(".xlsx"):
                    try:
                        df = pd.read_excel(file_name)
                        st.subheader("Saved Data")
                        st.dataframe(df)
                    except Exception:
                        st.warning(f"Could not preview {file_name}.")
                    st.markdown(get_download_link(file_name), unsafe_allow_html=True)
                    st.markdown(get_download_link(file_name.replace(".xlsx", ".csv")), unsafe_allow_html=True)
                    st.markdown(get_download_link(file_name.replace(".xlsx", ".md")), unsafe_allow_html=True)
                else:
                    # Plain-text saves previously got no link at all
                    # because read_excel raised first.
                    st.markdown(get_download_link(file_name), unsafe_allow_html=True)
    show_files_in_directory()
# Run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()