import io
import re
import tempfile
from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd

def process_excel(file_obj):
    df = pd.read_excel(file_obj)

    # D열에서 4번째 행(D4)부터 끝까지 추출
    if len(df.columns) < 4:
        return "엑셀 파일에 D열이 존재하지 않습니다."

    product_col = df.iloc[3:, 3]  # 3은 0-based로 D열

    freq_counter = Counter()
    for cell_value in product_col:
        if pd.isna(cell_value):
            continue
        text = str(cell_value)
        text_cleaned = re.sub(r'[^0-9a-zA-Z가-힣\s]', '', text)
        keywords = text_cleaned.split()
        keywords_unique = set(keywords)
        freq_counter.update(keywords_unique)

    sorted_freq = sorted(freq_counter.items(), key=lambda x: x[1], reverse=True)

    result_data = {
        "키워드": [item[0] for item in sorted_freq],
        "빈도": [item[1] for item in sorted_freq]
    }
    result_df = pd.DataFrame(result_data)

    # 메모리에 엑셀 작성
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        result_df.to_excel(writer, index=False, startrow=3, header=False)
    output.seek(0)

    # Gradio가 인식할 수 있도록 딕셔너리 반환
    return {
        "name": "keyword_frequency.xlsx",
        "mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "data": output.getvalue()
    }

def on_submit(excel_file):
    """Gradio click handler: analyse the uploaded workbook and return a file path.

    Args:
        excel_file: Value of the upload component. Either a filesystem path
            string or an object exposing the path as ``.name``; ``None``
            when nothing has been uploaded.

    Returns:
        Path (as ``str``) of a temporary .xlsx file containing the result,
        which is the form a ``gr.File`` output component accepts.

    Raises:
        gr.Error: If no file was uploaded, or if ``process_excel`` reports
            a problem with the workbook.
    """
    if excel_file is None:
        raise gr.Error("엑셀 파일을 업로드해 주세요.")

    # The upload value is a plain path or a file wrapper depending on how the
    # component is configured; accept both.
    source_path = excel_file if isinstance(excel_file, str) else excel_file.name
    result = process_excel(source_path)

    # process_excel signals failure by returning a message string; surface it
    # to the user instead of handing it to the file output as if it were a path.
    if isinstance(result, str):
        raise gr.Error(result)

    # The file output needs a path on disk, not raw bytes. A fresh directory
    # lets the download keep the intended file name.
    out_path = Path(tempfile.mkdtemp(prefix="keyword_freq_")) / result["name"]
    out_path.write_bytes(result["data"])
    return str(out_path)

# Assemble the page: title, upload slot, result slot and the trigger button.
with gr.Blocks() as demo:
    gr.Markdown("## 상품명에서 키워드 추출하기")

    excel_input = gr.File(label="엑셀 파일 업로드")
    download_button = gr.File(label="결과 파일 다운로드")
    run_button = gr.Button("분석 및 결과 생성")

    # A click passes the uploaded file to on_submit and routes its return
    # value to the result slot.
    run_button.click(on_submit, inputs=excel_input, outputs=download_button)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()