Spaces:
Running
Running
File size: 3,758 Bytes
e8d4fbe 9ead40f e8d4fbe 336a973 cd2e815 336a973 cd2e815 336a973 e8d4fbe 336a973 e8d4fbe 0f5c3da e8d4fbe 0f5c3da e8d4fbe 0f5c3da e8d4fbe 9ead40f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
import streamlit as st
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
from nltk.corpus import stopwords
import nltk
import io
nltk.download('stopwords')
# Helper functions
def clean_text(text):
    """Return *text* lowercased with all ASCII punctuation removed."""
    lowered = text.lower()
    # str.translate does the strip in one C-level pass.
    return lowered.translate(str.maketrans("", "", string.punctuation))
def clean_text_fuzzy(text):
    """Lowercase *text*, strip punctuation, and drop English stopwords.

    Returns the surviving words re-joined with single spaces.
    """
    # NOTE: the stopword set is rebuilt on every call; acceptable for a UI app.
    stop = set(stopwords.words('english'))
    normalized = text.lower().translate(str.maketrans("", "", string.punctuation))
    kept = (word for word in normalized.split() if word not in stop)
    return " ".join(kept)
def process_matching(keywords, article, fuzzy, mode):
    """Count how often each keyword occurs in *article*.

    Parameters
    ----------
    keywords : list[str]  raw keywords; cleaned with ``clean_text`` before matching.
    article : str         raw article text.
    fuzzy : bool          True -> fuzzywuzzy partial-ratio matching (score > 90),
                          False -> exact n-gram equality.
    mode : str            "filter" drops zero-count keywords from the result.

    Returns
    -------
    (dict[str, int], int)  per-keyword counts and their grand total.
    """
    keywords = [clean_text(k) for k in keywords]
    # Guard: max() below raises ValueError on an empty sequence.
    if not keywords:
        return {}, 0

    exact_words = clean_text(article).split()
    fuzzy_words = clean_text_fuzzy(article).split()
    max_keyword_length = max(len(k.split()) for k in keywords)

    def _ngrams(words, n):
        # All contiguous n-word windows of *words*, joined with spaces.
        return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

    results = {k: 0 for k in keywords}
    if fuzzy:
        # Hoisted out of the keyword loop: the n-gram lists are keyword-invariant.
        grams_by_size = {
            n: _ngrams(fuzzy_words, n) for n in range(1, max_keyword_length + 1)
        }
        for keyword in keywords:
            for grams in grams_by_size.values():
                matches = process.extract(
                    keyword, grams, scorer=fuzz.partial_ratio, limit=None
                )
                results[keyword] += sum(1 for _g, score in matches if score > 90)
    else:
        for keyword in keywords:
            n = len(keyword.split())
            # A keyword of n words can only equal n-grams of size n; a keyword
            # that cleaned down to nothing matches no n-gram at all.
            if n == 0:
                continue
            results[keyword] += _ngrams(exact_words, n).count(keyword)

    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}
    return results, sum(results.values())
# Streamlit app
# ---- Streamlit UI (flat script; runs top to bottom on every interaction) ----
st.title("Keyword Matcher")

# Mode selection: internal mode string drives filtering in process_matching.
mode = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True)
mode = "frequency" if mode == "Keyword Frequency" else "filter"

# Keyword input: uploaded file (first column) takes precedence over the text area.
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader("Or upload a CSV/Excel file with keywords (first column):", type=["csv", "xlsx"])
keywords = []
if uploaded_file:
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # Cast to str: numeric spreadsheet cells would otherwise crash
        # clean_text (.lower() on int/float). Strip stray whitespace too.
        keywords = [str(k).strip() for k in df.iloc[:, 0].dropna().tolist()]
else:
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching checkbox
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button: validate inputs, run the matcher, show and export results.
if st.button("Process"):
    if not keywords:
        st.error("Please provide keywords.")
    elif not article:
        st.error("Please provide an article.")
    else:
        results, total_count = process_matching(keywords, article, fuzzy, mode)
        st.subheader("Results")
        for keyword, count in results.items():
            st.write(f"{keyword}: {count}")
        st.write(f"**Total Count:** {total_count}")

        # Offer the per-keyword counts as a downloadable Excel workbook.
        st.subheader("Download Results")
        df = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Results")
        output.seek(0)
        st.download_button(
            label="Download Results as Excel",
            data=output,
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
|