# awacke1's picture
# Create app.py
# 15fa1c4 verified
import base64
import datetime
import html
import io
import os

import nltk
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the NLTK data the functions below rely on: the Punkt sentence
# tokenizer (sent_tokenize / word_tokenize) and the English stop-word list.
# Recent NLTK releases load Punkt from the "punkt_tab" package instead of
# "punkt", so both are requested to keep tokenization working across versions.
# NOTE(review): nltk.download() is expected to report a failed download and
# return False rather than raise -- confirm for the pinned NLTK version.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)
def generate_file_name(text, file_type):
    """Build a timestamped file name from the most frequent words in *text*.

    The name has the form ``<w1>-<w2>-<w3>_<YYYYmmdd_HHMMSS>.<file_type>``,
    where the words are the (up to) three most common lower-cased alphanumeric
    tokens of *text* that are not English stop words. When keyword extraction
    fails or yields no words, the prefix ``text_file`` is used instead.

    Args:
        text: Text the keywords are extracted from.
        file_type: File extension without the leading dot (e.g. ``"md"``).

    Returns:
        The generated file name (no directory component).
    """
    try:
        stop_words = set(stopwords.words('english'))
        # Keep purely alphanumeric tokens only, lower-cased for counting.
        tokens = (
            token.lower()
            for sentence in sent_tokenize(text)
            for token in word_tokenize(sentence)
            if token.isalnum()
        )
        word_freq = nltk.FreqDist(t for t in tokens if t not in stop_words)
        top_words = [word for word, _ in word_freq.most_common(3)]
    except Exception:
        # Best effort: a tokenizer problem must not prevent the text from
        # being saved, so fall back to the default prefix below.
        top_words = []

    # Without keywords the joined prefix would be empty and the name would
    # start with "_"; use the default prefix in that case as well.
    prefix = '-'.join(top_words) if top_words else "text_file"
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{current_time}.{file_type}"
def save_text_as_file(text, file_type):
    """Write *text* to a file named by ``generate_file_name`` and report it.

    Args:
        text: Content to write.
        file_type: File extension without the leading dot; it only selects
            the file name's suffix, the content is written unchanged.

    Returns:
        The name of the file that was written (relative to the working
        directory).
    """
    file_name = generate_file_name(text, file_type)
    # Explicit UTF-8: without it the platform's default encoding is used, and
    # pasted non-ASCII text can fail to encode on some systems.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)
    st.success(f"Text saved as {file_name}")
    return file_name
def save_list_as_excel(text):
    """Parse a ``Name - Description`` list and save it as an Excel workbook.

    Every non-blank line of *text* is split at its first ``" - "``: the left
    part becomes the "Character" column and the right part the "Description"
    column. A line without that separator is stored whole as the character
    with an empty description. Both cells are stripped of surrounding
    whitespace (including a trailing carriage return).

    Args:
        text: Newline-separated list to parse.

    Returns:
        The name of the ``.xlsx`` file that was written.
    """
    rows = []
    for line in text.split("\n"):
        if not line.strip():
            continue
        # partition() yields an empty description when " - " is absent, so
        # both line shapes share one code path and are stripped consistently.
        name, _, description = line.partition(" - ")
        rows.append([name.strip(), description.strip()])

    df = pd.DataFrame(rows, columns=["Character", "Description"])
    file_name = generate_file_name(text, "xlsx")
    df.to_excel(file_name, index=False)
    st.success(f"Character list saved as {file_name}")
    return file_name
# NOTE(review): st.cache_resource memoizes the result per file_path, so a file
# rewritten under the same name (or created after a failed lookup) presumably
# keeps returning the first result -- confirm this is intended.
@st.cache_resource
def get_download_link(file_path):
    """Return an HTML ``<a>`` tag that downloads *file_path* via a data URI.

    The file's bytes are base64-encoded and embedded in the link's ``href``;
    the MIME type is chosen from the file extension, defaulting to
    ``application/octet-stream``.

    Args:
        file_path: Path of the file to expose.

    Returns:
        The anchor markup, or ``''`` when the file cannot be opened or read.
    """
    mime_types = {
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.csv': 'text/csv',
        '.md': 'text/markdown',
        '.txt': 'text/plain',
    }
    try:
        with open(file_path, 'rb') as file:
            data = file.read()
    except OSError:
        # Missing or unreadable file: callers render the empty string as
        # "no link" instead of failing the whole page.
        return ''

    b64 = base64.b64encode(data).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]  # get the file extension
    mime_type = mime_types.get(ext, 'application/octet-stream')
    # The name is placed into an attribute and into element text that callers
    # render as raw HTML, so escape it (quotes included).
    safe_name = html.escape(file_name, quote=True)
    return f'<a href="data:{mime_type};base64,{b64}" download="{safe_name}">{safe_name}</a>'
def perform_nlp(text):
    """Render topic-modeling and word-frequency results for *text*.

    The text is split into sentences, each sentence is treated as one
    document for a 3-topic LDA model, and the five highest-weighted
    vocabulary terms of every topic are written to the page. A bar chart of
    the ten most frequent whitespace-separated words follows.

    Args:
        text: Text to analyse.
    """
    sentences = sent_tokenize(text)

    # Topic Modeling
    st.subheader("Topic Modeling")
    vectorizer = CountVectorizer(stop_words='english')
    try:
        X = vectorizer.fit_transform(sentences)
    except ValueError:
        # CountVectorizer raises ValueError when no usable term remains
        # (e.g. the text consists of stop words or punctuation only).
        st.warning("Not enough distinct words for topic modeling.")
    else:
        lda = LatentDirichletAllocation(n_components=3, random_state=42)
        lda.fit(X)
        feature_names = vectorizer.get_feature_names_out()
        # components_ holds one row of per-term weights for each topic; the
        # output of transform() is per sentence and must not be used here.
        for topic_idx, term_weights in enumerate(lda.components_):
            st.write(f"Topic {topic_idx + 1}:")
            # argsort is ascending, so [:-6:-1] takes the five largest
            # weights in descending order.
            top_terms = [feature_names[j] for j in term_weights.argsort()[:-6:-1]]
            st.write(", ".join(top_terms))

    # Word Frequency
    word_freq = pd.Series(" ".join(sentences).split()).value_counts()[:10]
    st.subheader("Word Frequency")
    st.bar_chart(word_freq)
def show_files_in_directory():
    """Show the .md/.xlsx/.csv files of the working directory as an HTML table.

    Each row holds a download link for the file, its size in bytes and its
    last-modified time formatted as ``YYYY-mm-dd HH:MM:SS``.
    """
    st.subheader("Files in Current Directory")
    listed_extensions = (".md", ".xlsx", ".csv")
    rows = []
    for entry in os.listdir("."):
        if not entry.endswith(listed_extensions):
            continue
        size_in_bytes = os.path.getsize(entry)
        modified_at = datetime.datetime.fromtimestamp(os.path.getmtime(entry))
        rows.append(
            {
                "File Name": get_download_link(entry),
                "Size (bytes)": size_in_bytes,
                "Last Modified": modified_at.strftime("%Y-%m-%d %H:%M:%S"),
            }
        )
    # escape=False keeps the anchor markup in the "File Name" column intact.
    table_html = pd.DataFrame(rows).to_html(escape=False, index=False)
    st.write(table_html, unsafe_allow_html=True)
def main():
    """Run the Streamlit page: take pasted text, save it, and show results.

    When "Process Text" is pressed with non-blank input:

    * input that starts with ``1.``, ``1 -`` or ``1 _`` and spans several
      lines is saved as an Excel character list;
    * any other input is saved as ``.txt``, and ``perform_nlp`` is run when
      it contains ``.``, ``!`` or ``?``;
    * in both cases the raw text is also saved with ``.csv`` and ``.md``
      extensions.

    A saved workbook is previewed as a table, and a download link is shown
    for every file that was written. The directory listing is rendered on
    every run.
    """
    st.title("AI UI for Text Processing")
    text_input = st.text_area("Paste your text here")
    if st.button("Process Text"):
        if not text_input.strip():
            st.warning("Please paste some text.")
        else:
            is_numbered_list = (
                text_input.strip().startswith(("1.", "1 -", "1 _"))
                and "\n" in text_input
            )
            has_sentences = any(mark in text_input for mark in ".!?")

            # Keep the names the save helpers actually return: each call
            # stamps its own time, so deriving the .csv/.md names from the
            # first name can point at files that do not exist.
            if is_numbered_list:
                saved_files = [save_list_as_excel(text_input)]
            else:
                saved_files = [save_text_as_file(text_input, "txt")]
            saved_files.append(save_text_as_file(text_input, "csv"))
            saved_files.append(save_text_as_file(text_input, "md"))

            if has_sentences and not is_numbered_list:
                perform_nlp(text_input)

            primary_file = saved_files[0]
            # Only a workbook can be previewed with read_excel.
            if primary_file.endswith(".xlsx"):
                try:
                    df = pd.read_excel(primary_file)
                except Exception as exc:
                    # Report the problem instead of silently showing nothing.
                    st.error(f"Could not load {primary_file}: {exc}")
                else:
                    st.subheader("Saved Data")
                    st.dataframe(df)

            for saved_path in saved_files:
                st.markdown(get_download_link(saved_path), unsafe_allow_html=True)
    show_files_in_directory()


# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()