|
import streamlit as st |
|
import pandas as pd |
|
import datetime |
|
import io |
|
import nltk |
|
import base64 |
|
import os |
|
from nltk.tokenize import sent_tokenize, word_tokenize |
|
from nltk.corpus import stopwords |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.decomposition import LatentDirichletAllocation |
|
|
|
# Fetch NLTK tokenizer models and stopword lists at startup.  quiet=True
# suppresses the per-rerun download log spam in the Streamlit console
# (Streamlit re-executes the whole script on every interaction).
nltk.download('punkt', quiet=True)

nltk.download('stopwords', quiet=True)
|
|
|
def generate_file_name(text, file_type):
    """Build a descriptive file name from the text's most frequent keywords.

    Tokenizes *text*, drops non-alphanumeric tokens and English stopwords,
    and joins the three most common remaining words with a timestamp, e.g.
    ``alpha-beta-gamma_20240101_120000.md``.  Falls back to
    ``text_file_<timestamp>.<file_type>`` when tokenization fails or yields
    no usable words.

    Args:
        text: Source text used to derive keywords.
        file_type: File extension to append (without the leading dot).

    Returns:
        str: The generated file name.
    """
    # Timestamp computed once so both the keyword and fallback names agree.
    current_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    try:
        sentences = sent_tokenize(text)
        words = [
            word.lower()
            for sentence in sentences
            for word in word_tokenize(sentence)
            if word.isalnum()
        ]
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word not in stop_words]

        word_freq = nltk.FreqDist(filtered_words)
        top_words = [word for word, _ in word_freq.most_common(3)]

        if not top_words:
            # Empty or stopword-only input would otherwise produce a name
            # starting with a bare "_"; use the generic fallback instead.
            return f"text_file_{current_time}.{file_type}"
        return f"{'-'.join(top_words)}_{current_time}.{file_type}"
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed SystemExit/
        # KeyboardInterrupt).  Tokenization can fail on odd input or missing
        # NLTK data; fall back to a generic, still timestamp-unique name.
        return f"text_file_{current_time}.{file_type}"
|
|
|
def save_text_as_file(text, file_type):
    """Write *text* to a keyword-derived file name and report via Streamlit.

    Args:
        text: Content to write.
        file_type: Extension used when generating the file name.

    Returns:
        str: The name of the file that was written.
    """
    file_name = generate_file_name(text, file_type)
    # Explicit UTF-8: the platform-default encoding (e.g. cp1252 on Windows)
    # can raise UnicodeEncodeError or garble non-ASCII pasted text.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)
    st.success(f"Text saved as {file_name}")
    return file_name
|
|
|
def save_list_as_excel(text):
    """Convert a newline-separated character list into a two-column Excel file.

    Each non-empty line is split on the first ``" - "`` into a character name
    and its description; lines lacking the separator get an empty description.

    Args:
        text: Raw list text, one entry per line.

    Returns:
        str: The name of the Excel file that was written.
    """
    rows = []
    for raw_line in text.split("\n"):
        if not raw_line.strip():
            continue
        pieces = raw_line.split(" - ", 1)
        rows.append(pieces if len(pieces) == 2 else [raw_line.strip(), ""])

    frame = pd.DataFrame(rows, columns=["Character", "Description"])
    file_name = generate_file_name(text, "xlsx")
    frame.to_excel(file_name, index=False)
    st.success(f"Character list saved as {file_name}")
    return file_name
|
|
|
@st.cache_resource
def get_download_link(file_path):
    """Build an HTML anchor that downloads *file_path* as a base64 data URI.

    Args:
        file_path: Path of the file to embed in the link.

    Returns:
        str: An ``<a>`` tag string, or ``''`` when the file cannot be read.
    """
    # NOTE(review): st.cache_resource keys on file_path only, so a file
    # rewritten under the same name may serve stale bytes -- confirm intended.
    mime_types = {
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.csv': 'text/csv',
        '.md': 'text/markdown',
    }
    try:
        # Only the file read can legitimately fail; keep the try body minimal
        # (the original bare `except:` also hid programming errors below).
        with open(file_path, 'rb') as file:
            data = file.read()
    except OSError:
        # Missing/unreadable file: degrade to an empty link, keep the UI alive.
        return ''
    b64 = base64.b64encode(data).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]
    mime_type = mime_types.get(ext, 'application/octet-stream')
    return f'<a href="data:{mime_type};base64,{b64}" download="{file_name}">{file_name}</a>'
|
|
|
def perform_nlp(text):
    """Run LDA topic modeling and a word-frequency chart over *text* in Streamlit.

    Args:
        text: Prose to analyze; split into sentences, each treated as a document.
    """
    sentences = sent_tokenize(text)

    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(X)

    st.subheader("Topic Modeling")
    feature_names = vectorizer.get_feature_names_out()
    # BUG FIX: the original iterated lda.transform(X) -- the per-SENTENCE
    # topic distributions -- and indexed feature_names with *topic* indices,
    # printing one meaningless "topic" per sentence.  The per-topic word
    # weights live in lda.components_ (shape: n_topics x n_features).
    for topic_idx, component in enumerate(lda.components_):
        st.write(f"Topic {topic_idx + 1}:")
        # argsort()[:-6:-1] -> indices of the 5 highest-weight words.
        top_words = ", ".join(feature_names[i] for i in component.argsort()[:-6:-1])
        st.write(top_words)

    # Raw token frequency of the 10 most common words (stopwords included).
    word_freq = pd.Series(" ".join(sentences).split()).value_counts()[:10]
    st.subheader("Word Frequency")
    st.bar_chart(word_freq)
|
|
|
def show_files_in_directory():
    """Render an HTML table of downloadable .md/.xlsx/.csv files in the CWD."""
    st.subheader("Files in Current Directory")
    listing = []
    for entry in os.listdir("."):
        if not entry.endswith((".md", ".xlsx", ".csv")):
            continue
        modified = datetime.datetime.fromtimestamp(os.path.getmtime(entry))
        listing.append({
            "File Name": get_download_link(entry),
            "Size (bytes)": os.path.getsize(entry),
            "Last Modified": modified.strftime("%Y-%m-%d %H:%M:%S"),
        })
    table = pd.DataFrame(listing)
    # escape=False so the <a> download links render as real links, not text.
    st.write(table.to_html(escape=False, index=False), unsafe_allow_html=True)
|
|
|
def main():
    """Streamlit entry point: accept pasted text, persist it, and show results."""
    st.title("AI UI for Text Processing")
    text_input = st.text_area("Paste your text here")

    if st.button("Process Text"):
        if text_input.strip() == "":
            st.warning("Please paste some text.")
        else:
            file_name = _save_inputs(text_input)
            if file_name:
                _show_saved_excel(file_name)

    show_files_in_directory()


def _save_inputs(text_input):
    """Save *text_input* in the formats its shape calls for; return the primary file name."""
    # A leading "1." / "1 -" / "1 _" plus a newline is treated as a numbered list.
    is_list = (
        text_input.strip().startswith(("1.", "1 -", "1 _"))
        and "\n" in text_input
    )
    if is_list:
        file_name = save_list_as_excel(text_input)
    else:
        file_name = save_text_as_file(text_input, "txt")
    # Every input is additionally mirrored to CSV and Markdown (the original
    # repeated these two calls verbatim in all three branches).
    save_text_as_file(text_input, "csv")
    save_text_as_file(text_input, "md")
    # Sentence-like prose (contains terminal punctuation) also gets analysis.
    if not is_list and any(mark in text_input for mark in ".!?"):
        perform_nlp(text_input)
    return file_name


def _show_saved_excel(file_name):
    """Preview a saved .xlsx file and emit download links for its sibling formats."""
    if not file_name.endswith(".xlsx"):
        # Non-Excel primaries (.txt): the original always attempted
        # pd.read_excel here, which raised and was silently swallowed by a
        # bare `except: pass` -- skip explicitly instead.
        return
    try:
        df = pd.read_excel(file_name)
    except Exception:
        # Deliberate best-effort: a corrupt/locked file should not crash the UI.
        return
    st.subheader("Saved Data")
    st.dataframe(df)
    # Link the Excel file plus its CSV and Markdown siblings, in that order.
    for sibling in (
        file_name,
        file_name.replace(".xlsx", ".csv"),
        file_name.replace(".xlsx", ".md"),
    ):
        st.markdown(get_download_link(sibling), unsafe_allow_html=True)
|
|
|
# Script entry point: runs the app when executed directly
# (e.g. `streamlit run <this file>`), not when imported.
if __name__ == "__main__":

    main()