# Streamlit app ("Keyword Matcher") — originally hosted as a Hugging Face Space.
import streamlit as st | |
import pandas as pd | |
from fuzzywuzzy import fuzz | |
from fuzzywuzzy import process | |
import string | |
from nltk.corpus import stopwords | |
import nltk | |
import io | |
# Fetch the NLTK stopword corpus once; quiet=True suppresses the download
# progress log, which would otherwise be re-printed on every Streamlit rerun.
nltk.download('stopwords', quiet=True)
# Helper functions | |
def clean_text(text):
    """Return *text* lowercased with all ASCII punctuation removed."""
    lowered = text.lower()
    # str.translate with a deletion table strips every punctuation char
    # in a single C-level pass.
    return lowered.translate(str.maketrans('', '', string.punctuation))
def clean_text_fuzzy(text):
    """Lowercase *text*, strip punctuation, and drop English stopwords.

    Used to normalise the article before fuzzy matching so that filler
    words do not dilute the n-gram similarity scores.
    """
    # Build the stopword set once and cache it on the function object:
    # stopwords.words('english') re-reads the corpus file on every call,
    # which is needlessly slow when this runs on each Streamlit rerun.
    stop_words = getattr(clean_text_fuzzy, "_stop_words", None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        clean_text_fuzzy._stop_words = stop_words
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = [word for word in text.split() if word not in stop_words]
    return " ".join(words)
def process_matching(keywords, article, fuzzy, mode):
    """Count how often each keyword occurs in *article*.

    Args:
        keywords: iterable of raw keyword strings (may contain duplicates).
        article: raw article text.
        fuzzy: if True, score n-grams with fuzz.partial_ratio (>90 counts
            as a match) instead of exact n-gram equality.
        mode: "filter" drops zero-count keywords from the result; any
            other value ("frequency") keeps them.

    Returns:
        (results, total_count): results maps cleaned keyword -> count,
        total_count is the sum of all counts.
    """
    # Normalise keywords and dedupe while preserving order: the old code
    # initialised each entry once but accumulated counts once per
    # duplicate, so a keyword supplied twice was double-counted.
    cleaned = list(dict.fromkeys(clean_text(k) for k in keywords))
    if not cleaned:
        # Guard: nothing to match (old code raised ValueError via max()).
        return {}, 0
    exact_words = clean_text(article).split()
    fuzzy_words = clean_text_fuzzy(article).split()
    results = {}
    for keyword in cleaned:
        # Only n-grams with the same word count as the keyword can be a
        # real match. Scanning every n from 1..max (as before) was wasted
        # work in exact mode and over-counted in fuzzy mode: partial_ratio
        # scored e.g. both "machine" (n=1) and "machine learning" (n=2)
        # as hits for the keyword "machine learning".
        n = len(keyword.split())
        if n == 0:
            # Keyword cleaned down to nothing (e.g. pure punctuation).
            results[keyword] = 0
            continue
        if fuzzy:
            grams = [" ".join(fuzzy_words[i:i + n])
                     for i in range(len(fuzzy_words) - n + 1)]
            matches = process.extract(keyword, grams,
                                      scorer=fuzz.partial_ratio, limit=None)
            results[keyword] = sum(1 for _gram, score in matches if score > 90)
        else:
            grams = [" ".join(exact_words[i:i + n])
                     for i in range(len(exact_words) - n + 1)]
            results[keyword] = grams.count(keyword)
    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}
    total_count = sum(results.values())
    return results, total_count
# Streamlit app | |
# ----------------------------------------------------------------- UI --
st.title("Keyword Matcher")

# Mode selection: map the friendly label onto the internal token that
# process_matching() understands.
mode = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True)
mode = "frequency" if mode == "Keyword Frequency" else "filter"

# Keyword input: comma-separated text and/or a spreadsheet (first column).
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader("Or upload a CSV/Excel file with keywords (first column):", type=["csv", "xlsx"])

keywords = []
if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # Coerce to str: a numeric spreadsheet column would otherwise
        # crash clean_text() (numbers have no .lower()).
        keywords = [str(k).strip() for k in df.iloc[:, 0].dropna().tolist()]
if not keywords:
    # No file, or the file yielded no usable rows: fall back to the text
    # box. (The old code silently ignored typed keywords whenever a file
    # was uploaded, even an empty one.)
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching toggle
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button
if st.button("Process"):
    if not keywords:
        st.error("Please provide keywords.")
    elif not article:
        st.error("Please provide an article.")
    else:
        results, total_count = process_matching(keywords, article, fuzzy, mode)

        st.subheader("Results")
        for keyword, count in results.items():
            st.write(f"{keyword}: {count}")
        st.write(f"**Total Count:** {total_count}")

        # Offer the per-keyword counts as an Excel download.
        st.subheader("Download Results")
        df = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Results")
        output.seek(0)
        st.download_button(
            label="Download Results as Excel",
            data=output,
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )