File size: 3,758 Bytes
e8d4fbe
 
 
 
 
 
 
9ead40f
e8d4fbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336a973
cd2e815
336a973
cd2e815
 
336a973
 
e8d4fbe
 
336a973
 
 
e8d4fbe
 
 
 
 
0f5c3da
 
e8d4fbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f5c3da
e8d4fbe
 
 
 
0f5c3da
e8d4fbe
 
9ead40f
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import streamlit as st
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
from nltk.corpus import stopwords
import nltk
import io

# Fetch the NLTK stop-word corpus at startup; a no-op when already cached locally.
nltk.download('stopwords')

# Helper functions
def clean_text(text):
    """Return *text* lowercased with all ASCII punctuation removed."""
    # One translate() pass deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_table)

def clean_text_fuzzy(text):
    """Normalize *text* for fuzzy matching.

    Lowercases the text, strips ASCII punctuation, and drops English
    stop words.

    Args:
        text: Raw input string.

    Returns:
        The remaining words joined by single spaces.
    """
    # stopwords.words('english') re-reads the NLTK corpus from disk on
    # every call; cache the set on the function object so it is built once.
    stop_words = getattr(clean_text_fuzzy, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        clean_text_fuzzy._stop_words = stop_words
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return " ".join(word for word in text.split() if word not in stop_words)

def process_matching(keywords, article, fuzzy, mode):
    """Count occurrences of each keyword in *article*.

    Args:
        keywords: Iterable of keyword strings (may be multi-word phrases).
        article: Article text to search.
        fuzzy: When True, match with fuzzywuzzy's partial_ratio and count
            n-grams scoring above 90; otherwise count exact n-gram matches.
        mode: "filter" drops keywords with zero matches from the result;
            any other value keeps all keywords.

    Returns:
        Tuple of (dict mapping cleaned keyword -> count, total count).
    """
    # Clean and de-duplicate keywords while preserving order; the original
    # implementation double-counted a keyword that appeared twice in the input.
    keywords = list(dict.fromkeys(clean_text(k) for k in keywords))

    # Robustness: an empty keyword list previously raised ValueError via max().
    if not keywords:
        return {}, 0

    article_exact = clean_text(article)
    article_fuzzy = clean_text_fuzzy(article)

    max_keyword_length = max(len(k.split()) for k in keywords)

    # Build each n-gram list once per n. The original rebuilt them inside
    # the keyword loop, re-splitting the article for every keyword.
    words_exact = article_exact.split()
    words_fuzzy = article_fuzzy.split()
    ngrams_exact = {
        n: [" ".join(words_exact[i:i + n]) for i in range(len(words_exact) - n + 1)]
        for n in range(1, max_keyword_length + 1)
    }
    ngrams_fuzzy = {
        n: [" ".join(words_fuzzy[i:i + n]) for i in range(len(words_fuzzy) - n + 1)]
        for n in range(1, max_keyword_length + 1)
    }

    results = {}
    for keyword in keywords:
        count = 0
        for n in range(1, max_keyword_length + 1):
            if fuzzy:
                # Count every n-gram that fuzzily matches above the threshold.
                matches = process.extract(keyword, ngrams_fuzzy[n], scorer=fuzz.partial_ratio, limit=None)
                count += sum(1 for match, score in matches if score > 90)
            else:
                count += ngrams_exact[n].count(keyword)
        results[keyword] = count

    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}

    total_count = sum(results.values())
    return results, total_count

# Streamlit app — top-level script; Streamlit re-runs it on every UI interaction,
# so statement order here is the page layout order.
st.title("Keyword Matcher")

# Mode selection: map the user-facing label to the internal token that
# process_matching() expects ("frequency" or "filter").
mode = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True)
mode = "frequency" if mode == "Keyword Frequency" else "filter"

# Keyword input: either a comma-separated text field or an uploaded
# CSV/Excel file (keywords taken from the first column).
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader("Or upload a CSV/Excel file with keywords (first column):", type=["csv", "xlsx"])

keywords = []
if uploaded_file:
    # File takes precedence over the text area when both are provided.
    if uploaded_file.name.endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # First column only; NaN cells (empty rows) are discarded.
        keywords = df.iloc[:, 0].dropna().tolist()
else:
    # Split on commas and drop empty fragments (e.g. trailing commas).
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching checkbox — forwarded to process_matching() as the `fuzzy` flag.
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button: validate inputs, run the matcher, render counts, and
# offer the results as a downloadable Excel file.
if st.button("Process"):
    if not keywords:
        st.error("Please provide keywords.")
    elif not article:
        st.error("Please provide an article.")
    else:
        results, total_count = process_matching(keywords, article, fuzzy, mode)

        st.subheader("Results")
        for keyword, count in results.items():
            st.write(f"{keyword}: {count}")
        st.write(f"**Total Count:** {total_count}")

        # Save to Excel: write the results into an in-memory buffer so no
        # temp file is needed for the download button.
        st.subheader("Download Results")
        df = pd.DataFrame(list(results.items()), columns=["Keyword", "Count"])
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name="Results")
        # Rewind the buffer so the download reads from the start.
        output.seek(0)

        st.download_button(
            label="Download Results as Excel",
            data=output,
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )