# Streamlit text-processing app (source recovered from a Hugging Face Spaces
# paste; "Build error" banner lines from the Spaces UI removed).
"""Streamlit app: save pasted text in multiple formats and run light NLP analysis."""

import base64
import datetime
import io
import os

import nltk
import pandas as pd
import streamlit as st
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Fetch tokenizer models and stopword lists once at startup. quiet=True keeps
# the download progress bars out of the app logs; already-present resources
# are skipped automatically.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
def generate_file_name(text, file_type):
    """Build a descriptive file name from the text's most frequent words.

    Args:
        text: Raw text to summarize into a name stem.
        file_type: Extension without the dot, e.g. "txt", "csv", "md", "xlsx".

    Returns:
        "<word1-word2-word3>_<timestamp>.<file_type>", falling back to
        "text_file_<timestamp>.<file_type>" when no usable words remain
        (empty/all-stopword input) or tokenization fails.
    """
    # Single timestamp for both the normal and fallback paths.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    try:
        # Tokenize into sentences, then words; keep alphanumeric tokens only.
        sentences = sent_tokenize(text)
        words = [
            word.lower()
            for sentence in sentences
            for word in word_tokenize(sentence)
            if word.isalnum()
        ]
        # Drop English stopwords so the name reflects meaningful content.
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in words if word not in stop_words]
        # Use the three most frequent remaining words as the name stem.
        top_words = [word for word, _ in nltk.FreqDist(filtered_words).most_common(3)]
        if not top_words:
            # Previously produced a name starting with "_" for empty input.
            return f"text_file_{timestamp}.{file_type}"
        return f"{'-'.join(top_words)}_{timestamp}.{file_type}"
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # propagate; any tokenization failure falls back to a generic name.
        return f"text_file_{timestamp}.{file_type}"
def save_text_as_file(text, file_type):
    """Write *text* to an auto-named file and report success in the UI.

    Args:
        text: Content to write.
        file_type: Extension (without dot) used when generating the name.

    Returns:
        The generated file name.
    """
    file_name = generate_file_name(text, file_type)
    # Explicit UTF-8 avoids the platform-dependent default text encoding.
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)
    st.success(f"Text saved as {file_name}")
    return file_name
def save_list_as_excel(text):
    """Parse a "Name - Description" list and save it as an Excel file.

    Each non-blank line is split on the first " - " into a character name
    and description; lines without the separator become a row with an empty
    description. The table is written to an auto-named .xlsx file.

    Args:
        text: Newline-separated list text.

    Returns:
        The generated .xlsx file name.
    """
    rows = []
    for raw_line in text.split("\n"):
        stripped = raw_line.strip()
        if not stripped:
            continue  # skip blank lines entirely
        name, sep, description = raw_line.partition(" - ")
        if sep:
            rows.append([name, description])
        else:
            rows.append([stripped, ""])
    df = pd.DataFrame(rows, columns=["Character", "Description"])
    file_name = generate_file_name(text, "xlsx")
    df.to_excel(file_name, index=False)
    st.success(f"Character list saved as {file_name}")
    return file_name
# Known extensions and their MIME types; anything else is served as generic
# binary data.
_MIME_TYPES = {
    '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.csv': 'text/csv',
    '.md': 'text/markdown',
}


def get_download_link(file_path):
    """Return an HTML anchor tag with the file embedded as a base64 data URI.

    Args:
        file_path: Path of the file to embed.

    Returns:
        An ``<a ... download=...>`` string, or '' when the file cannot be read.
    """
    try:
        with open(file_path, 'rb') as file:
            data = file.read()
    except OSError:
        # Narrowed from a bare except: only I/O failures (missing/unreadable
        # file) are expected here, and callers render nothing in that case.
        return ''
    b64 = base64.b64encode(data).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]
    mime_type = _MIME_TYPES.get(ext, 'application/octet-stream')
    return f'<a href="data:{mime_type};base64,{b64}" download="{file_name}">{file_name}</a>'
def perform_nlp(text):
    """Run topic modeling and word-frequency analysis, rendering via Streamlit.

    Args:
        text: Text to analyze; its sentences act as the "documents" for LDA.
    """
    sentences = sent_tokenize(text)
    # Topic modeling: each sentence is treated as one document.
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=3, random_state=42)
    lda.fit(X)
    st.subheader("Topic Modeling")
    feature_names = vectorizer.get_feature_names_out()
    # BUG FIX: iterate the topic-word matrix (lda.components_), not the
    # document-topic matrix from transform(). The old loop walked one row per
    # sentence (3 topic weights each) and used those argsort indices to index
    # the vocabulary, printing meaningless words and one "topic" per sentence.
    for i, topic in enumerate(lda.components_):
        st.write(f"Topic {i+1}:")
        top_indices = topic.argsort()[:-6:-1]  # five highest-weight words
        st.write(", ".join(feature_names[j] for j in top_indices))
    # Word frequency: top 10 raw whitespace-separated tokens.
    word_freq = pd.Series(" ".join(sentences).split()).value_counts()[:10]
    st.subheader("Word Frequency")
    st.bar_chart(word_freq)
def show_files_in_directory():
    """List saved .md/.xlsx/.csv files in the working directory as a table.

    Each row carries a clickable download link (raw HTML), the file size in
    bytes, and the last-modification time.
    """
    st.subheader("Files in Current Directory")
    rows = []
    for entry in os.listdir("."):
        if not entry.endswith((".md", ".xlsx", ".csv")):
            continue
        modified = datetime.datetime.fromtimestamp(os.path.getmtime(entry))
        rows.append(
            {
                "File Name": get_download_link(entry),
                "Size (bytes)": os.path.getsize(entry),
                "Last Modified": modified.strftime("%Y-%m-%d %H:%M:%S"),
            }
        )
    table = pd.DataFrame(rows)
    # escape=False keeps the embedded <a> tags clickable.
    st.write(table.to_html(escape=False, index=False), unsafe_allow_html=True)
def _save_companion_formats(text):
    """Save *text* as .csv and .md copies (every processed input gets both)."""
    save_text_as_file(text, "csv")
    save_text_as_file(text, "md")


def main():
    """Streamlit entry point: accept pasted text, save it, and show results."""
    st.title("AI UI for Text Processing")
    text_input = st.text_area("Paste your text here")
    if st.button("Process Text"):
        if text_input.strip() == "":
            st.warning("Please paste some text.")
        else:
            file_name = None
            if text_input.strip().startswith(("1.", "1 -", "1 _")) and "\n" in text_input:
                # Numbered multi-line input: treat as a character list.
                file_name = save_list_as_excel(text_input)
                _save_companion_formats(text_input)
            elif "." in text_input or "!" in text_input or "?" in text_input:
                # Sentence-like prose: save it and run the NLP analysis.
                file_name = save_text_as_file(text_input, "txt")
                _save_companion_formats(text_input)
                perform_nlp(text_input)
            else:
                # Fallback: plain fragment with no sentence punctuation.
                file_name = save_text_as_file(text_input, "txt")
                _save_companion_formats(text_input)
            if file_name:
                try:
                    # Only the .xlsx branch yields a file read_excel accepts;
                    # for .txt names this raises and the preview is skipped.
                    df = pd.read_excel(file_name)
                    st.subheader("Saved Data")
                    st.dataframe(df)
                    st.markdown(get_download_link(file_name), unsafe_allow_html=True)
                    # NOTE(review): the .csv/.md names below are derived by
                    # string replace, but each save generates its own
                    # timestamped name, so these paths may not exist — in
                    # which case get_download_link returns '' and nothing is
                    # rendered. Verify whether the real saved names should be
                    # threaded through instead.
                    st.markdown(get_download_link(file_name.replace(".xlsx", ".csv")), unsafe_allow_html=True)
                    st.markdown(get_download_link(file_name.replace(".xlsx", ".md")), unsafe_allow_html=True)
                except Exception:
                    # Narrowed from a bare except: the preview is best-effort.
                    pass
    show_files_in_directory()


if __name__ == "__main__":
    main()