"""Streamlit app that counts how often keywords occur in an article.

Keywords come from a comma-separated text box or from the first column of an
uploaded CSV/Excel file.  Matching is either exact (n-gram equality) or fuzzy
(fuzzywuzzy ``partial_ratio`` on a stopword-stripped article).
"""

import io
import string
from functools import lru_cache

import nltk
import pandas as pd
import streamlit as st
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from nltk.corpus import stopwords

# Streamlit re-executes this script on every interaction, so only download
# the stopword corpus when it is not already available locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# A fuzzy match is counted only when its score is strictly above this value.
FUZZY_THRESHOLD = 90

_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Helper functions
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the English stopword set, built once and then reused."""
    return frozenset(stopwords.words('english'))


def _ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens, joined by single spaces."""
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def clean_text(text):
    """Return ``text`` lower-cased with all ASCII punctuation removed."""
    return text.lower().translate(_PUNCTUATION_TABLE)


def clean_text_fuzzy(text):
    """Return ``text`` cleaned like ``clean_text`` and stripped of stopwords.

    Whitespace is collapsed: the surviving words are joined by single spaces.
    """
    stop_words = _english_stop_words()
    return " ".join(
        word for word in clean_text(text).split() if word not in stop_words
    )


def process_matching(keywords, article, fuzzy, mode):
    """Count occurrences of each keyword in ``article``.

    Args:
        keywords: Iterable of keyword phrases.  Values are coerced to ``str``;
            duplicates and keywords that are empty after cleaning are ignored.
        article: The text to search.
        fuzzy: When true, use fuzzy matching against the stopword-stripped
            article; otherwise count exact n-gram matches.
        mode: ``"filter"`` drops keywords with a zero count; any other value
            keeps every keyword.

    Returns:
        A ``(results, total_count)`` tuple where ``results`` maps each cleaned
        keyword to its count and ``total_count`` is the sum of those counts.
    """
    # Normalise whitespace so punctuation removal ("a - b" -> "a  b") cannot
    # leave a keyword that no single-space-joined n-gram will ever equal.
    # dict.fromkeys dedupes while keeping first-seen order, so a repeated
    # keyword is counted once instead of once per repetition.
    normalised = (" ".join(clean_text(str(k)).split()) for k in keywords)
    unique_keywords = list(dict.fromkeys(k for k in normalised if k))

    if fuzzy:
        tokens = clean_text_fuzzy(article).split()
    else:
        tokens = clean_text(article).split()

    ngram_cache = {}  # word count -> n-grams of the article with that length
    results = {}
    for keyword in unique_keywords:
        # The article has stopwords removed in fuzzy mode, so the keyword must
        # be reduced the same way for the comparison to be like-for-like.
        needle = clean_text_fuzzy(keyword) if fuzzy else keyword
        size = len(needle.split())
        if size == 0:
            # Keyword consisted solely of stopwords; nothing left to match.
            results[keyword] = 0
            continue

        # Compare only against n-grams of the keyword's own length.  Scanning
        # every size up to the longest keyword made a single occurrence count
        # once per overlapping n-gram under partial_ratio.
        if size not in ngram_cache:
            ngram_cache[size] = _ngrams(tokens, size)
        candidates = ngram_cache[size]

        if not fuzzy:
            results[keyword] = candidates.count(needle)
        elif candidates:
            matches = process.extract(
                needle, candidates, scorer=fuzz.partial_ratio, limit=None
            )
            results[keyword] = sum(
                1 for _, score in matches if score > FUZZY_THRESHOLD
            )
        else:
            results[keyword] = 0

    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}

    total_count = sum(results.values())
    return results, total_count


# Streamlit app
st.title("Keyword Matcher")

# Mode selection
mode = st.radio(
    "Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True
)
mode = "frequency" if mode == "Keyword Frequency" else "filter"

# Keyword input
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader(
    "Or upload a CSV/Excel file with keywords (first column):",
    type=["csv", "xlsx"],
)

keywords = []
if uploaded_file:
    # Compare the extension case-insensitively so "KEYWORDS.CSV" is not
    # handed to the Excel reader.
    # NOTE: with pandas defaults the first row of the file is taken as the
    # header, so it is not included in the keyword list.
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # Cells may be numeric; coerce to str so text cleaning cannot fail.
        keywords = [
            str(value).strip()
            for value in df.iloc[:, 0].dropna()
            if str(value).strip()
        ]
else:
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching checkbox
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button
if st.button("Process"):
    if not keywords:
        st.error("Please provide keywords.")
    elif not article.strip():
        st.error("Please provide an article.")
    else:
        results, total_count = process_matching(keywords, article, fuzzy, mode)

        st.subheader("Results")
        for keyword, count in results.items():
            st.write(f"{keyword}: {count}")
        st.write(f"**Total Count:** {total_count}")

        # Save to Excel
        st.subheader("Download Results")
        df = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Results")
        output.seek(0)  # rewind so the download starts at the first byte

        st.download_button(
            label="Download Results as Excel",
            data=output,
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )