import streamlit as st
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
from nltk.corpus import stopwords
import nltk
import io

# This statement sits at module level, so it executes every time the script
# is run. Look for the stop-word corpus locally first and only fall back to
# a (quiet) download when it is missing.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

# Helper functions
def clean_text(text):
    """Lower-case ``text``, remove ASCII punctuation and collapse whitespace.

    Deleting punctuation can leave doubled or dangling spaces behind
    (``"rock & roll"`` becomes ``"rock  roll"``), and such a string can never
    equal an n-gram built by joining words with single spaces. Runs of
    whitespace are therefore collapsed to one space and the ends trimmed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string, with words separated by single spaces.
    """
    without_punctuation = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    # split() with no argument discards every run of whitespace, including
    # leading/trailing whitespace and newlines.
    return " ".join(without_punctuation.split())

def clean_text_fuzzy(text):
    """Normalise ``text`` for fuzzy matching and drop English stop words.

    The text is lower-cased, split on whitespace, stripped of ASCII
    punctuation and filtered against NLTK's English stop-word list.

    Each token is checked against the stop-word list twice: once before its
    inner punctuation is removed and once after. The first check is needed
    because the list contains apostrophised entries such as ``"don't"``;
    once the apostrophe is stripped the token reads ``"dont"`` and would no
    longer be recognised as a stop word.

    Args:
        text: The string to normalise.

    Returns:
        The remaining words joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    strip_punctuation = str.maketrans('', '', string.punctuation)

    kept = []
    for token in text.lower().split():
        # Only surrounding punctuation is trimmed here so that contractions
        # keep their apostrophe for the stop-word lookup.
        if token.strip(string.punctuation) in stop_words:
            continue
        word = token.translate(strip_punctuation)
        # A token made only of punctuation becomes empty and is dropped.
        if word and word not in stop_words:
            kept.append(word)
    return " ".join(kept)

def _ngrams(tokens, n):
    """Return every run of ``n`` consecutive tokens joined by single spaces."""
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def process_matching(keywords, article, fuzzy, mode):
    """Count how often each keyword occurs in ``article``.

    Every keyword is normalised with ``clean_text`` and compared against the
    article's n-grams whose word count equals the keyword's own word count.

    * Exact mode counts the n-grams that are equal to the keyword.
    * Fuzzy mode counts the n-grams whose ``fuzz.partial_ratio`` score against
      the keyword is above 90. The article is prepared with
      ``clean_text_fuzzy`` (stop words removed), so stop words are removed
      from the keyword as well before it is compared.

    Restricting the comparison to n-grams of the keyword's own length means a
    single occurrence is counted once, independent of how long the other
    keywords are. Keywords that normalise to the same string are counted
    once rather than once per duplicate.

    Args:
        keywords: Iterable of keywords; each value is converted with ``str``
            before normalisation.
        article: The article text to search.
        fuzzy: Truthy to use fuzzy matching, falsy for exact matching.
        mode: ``"filter"`` drops keywords with a zero count from the result;
            any other value keeps every keyword.

    Returns:
        A ``(results, total_count)`` tuple: ``results`` maps each normalised
        keyword to its count (in first-seen order) and ``total_count`` is the
        sum of those counts. An empty ``keywords`` yields ``({}, 0)``.
    """
    # dict.fromkeys removes duplicates while preserving first-seen order.
    # The split/join collapses gaps that punctuation removal leaves behind
    # ("rock & roll" -> "rock roll") so multi-word keywords can match.
    cleaned_keywords = list(dict.fromkeys(
        " ".join(clean_text(str(keyword)).split()) for keyword in keywords
    ))

    # Only the article variant that the selected mode needs is prepared.
    if fuzzy:
        tokens = clean_text_fuzzy(article).split()
    else:
        tokens = clean_text(article).split()

    ngrams_by_size = {}  # n -> n-grams of the article, built once per size
    results = {}

    for keyword in cleaned_keywords:
        if fuzzy:
            # Fall back to the plain keyword when it consists solely of
            # stop words and stripping them would leave nothing to match.
            query = clean_text_fuzzy(keyword) or keyword
        else:
            query = keyword

        size = len(query.split())
        if size == 0:
            # The keyword contained nothing but punctuation/whitespace.
            results[keyword] = 0
            continue

        if size not in ngrams_by_size:
            ngrams_by_size[size] = _ngrams(tokens, size)
        candidates = ngrams_by_size[size]

        if not candidates:
            # The article is shorter than the keyword.
            results[keyword] = 0
        elif fuzzy:
            matches = process.extract(
                query, candidates, scorer=fuzz.partial_ratio, limit=None
            )
            results[keyword] = sum(1 for _, score in matches if score > 90)
        else:
            results[keyword] = candidates.count(query)

    if mode == "filter":
        results = {k: v for k, v in results.items() if v > 0}

    total_count = sum(results.values())
    return results, total_count

# Streamlit app
st.title("Keyword Matcher")

# Mode selection: the radio label is translated into the short mode name
# that is later passed to process_matching.
mode = st.radio("Select Mode:", ["Keyword Frequency", "Keyword Filter"], horizontal=True)
mode = "frequency" if mode == "Keyword Frequency" else "filter"

# Keyword input: either typed as a comma-separated list or uploaded as a
# file. When a file is uploaded it takes precedence over the text area.
st.subheader("Keywords")
keywords_input = st.text_area("Enter keywords (comma separated):")
uploaded_file = st.file_uploader("Or upload a CSV/Excel file with keywords (first column):", type=["csv", "xlsx"])

keywords = []
if uploaded_file:
    # NOTE(review): read_csv/read_excel treat the first row as a header by
    # default, so a file without a header row loses its first keyword.
    # Confirm the expected file layout.
    # Compare the extension case-insensitively so "KEYWORDS.CSV" is not
    # handed to the Excel reader.
    if uploaded_file.name.lower().endswith(".csv"):
        try:
            df = pd.read_csv(uploaded_file)
        except pd.errors.EmptyDataError:
            # A completely empty file is treated like a file with no keywords.
            df = pd.DataFrame()
    else:
        df = pd.read_excel(uploaded_file)
    if not df.empty:
        # Cells may be numbers; convert them to text so the matching helpers
        # can call string methods on them, and drop cells that are blank.
        cells = (str(value).strip() for value in df.iloc[:, 0].dropna())
        keywords = [cell for cell in cells if cell]
else:
    keywords = [k.strip() for k in keywords_input.split(",") if k.strip()]

# Article input
st.subheader("Article")
article = st.text_area("Paste the article text here:")

# Fuzzy matching checkbox
fuzzy = st.checkbox("Enable Fuzzy Matching")

# Process button
if st.button("Process"):
    if keywords and article:
        results, total_count = process_matching(keywords, article, fuzzy, mode)

        # On-screen summary: one line per keyword, then the grand total.
        st.subheader("Results")
        for name, hits in results.items():
            st.write(f"{name}: {hits}")
        st.write(f"**Total Count:** {total_count}")

        # Build the same table as an in-memory Excel workbook for download.
        st.subheader("Download Results")
        table = pd.DataFrame(
            {"Keyword": list(results), "Count": list(results.values())}
        )
        buffer = io.BytesIO()
        with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
            table.to_excel(writer, index=False, sheet_name="Results")
        # Rewind so the download starts at the beginning of the workbook.
        buffer.seek(0)

        st.download_button(
            label="Download Results as Excel",
            data=buffer,
            file_name="results.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    elif not keywords:
        st.error("Please provide keywords.")
    else:
        # Keywords were supplied, so the article must be the missing input.
        st.error("Please provide an article.")