Spaces:

SG34
/

test3

Sleeping

App Files Community

SG34 committed on Dec 26, 2024

Commit

c14f346

verified ·

1 Parent(s): 8878ba1

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -62

app.py CHANGED Viewed

@@ -1,71 +1,96 @@
-# app.py
-import gradio as gr
-import pandas as pd
 import re
-def extract_keywords(file):
-    # Count, for each keyword, how many cells of the column named 'D' contain it,
-    # write the counts to an Excel file and return that file's name.
-    # On any exception an "Error: ..." string is returned instead.
-    try:
-        # Read the Excel file
-        df = pd.read_excel(file, engine='openpyxl')
-        print("Excel file loaded successfully")
-        print("DataFrame columns:", df.columns)  # Check the column names
-        # Match the column name leniently (ignores surrounding spaces and case)
-        target_column = None
-        for column in df.columns:
-            if column.strip().lower() == 'd':
-                target_column = column
-                break
-        if not target_column:
-            raise ValueError("No column resembling 'D' found in the uploaded file.")
-        product_names = df[target_column]
-        print(f"Column '{target_column}' data extracted:", product_names.head())
-        # Keyword extraction helper
-        def process_text(text):
-            # Remove special characters, then split on whitespace
-            words = re.sub(r'[^\w\s]', '', str(text)).split()
-            return words
-        # Compute keywords and their frequencies
-        keyword_counts = {}
-        for product_name in product_names.dropna():
-            keywords = process_text(product_name)
-            for keyword in set(keywords):  # De-duplicate within one cell
-                keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
-        print("Keyword counts calculated:", keyword_counts)
-        # Convert the result to a DataFrame, most frequent keyword first
-        result_df = pd.DataFrame(keyword_counts.items(), columns=['Keyword', 'Frequency'])
-        result_df.sort_values(by='Frequency', ascending=False, inplace=True)
-        print("Result DataFrame:")
-        print(result_df.head())
-        # Save the result as an Excel file (relative path, fixed file name)
-        output_file = "keyword_analysis_result.xlsx"
-        result_df.to_excel(output_file, index=False, engine='openpyxl')
-        print(f"Results saved to {output_file}")
-        return output_file
-    except Exception as e:
-        # Every failure is logged and reported as a string rather than raised.
-        print(f"Error occurred: {str(e)}")
-        return f"Error: {str(e)}"
-# Define the Gradio interface
-def main():
-    # Build a single-function interface: one file upload in, one file out,
-    # handled by extract_keywords, then start the Gradio server.
-    interface = gr.Interface(
-        fn=extract_keywords,
-        inputs=gr.File(label="Upload Excel File (.xlsx)"),
-        outputs="file",
-        title="Excel Keyword Extractor",
-        description="Upload an Excel file to extract keywords from a column resembling 'D' and calculate their frequencies. The result will be sorted in descending order and saved to a new Excel file."
     )
-    interface.launch()
 if __name__ == "__main__":
-    main()

 import re
+import io
+import pandas as pd
+from collections import Counter
+import gradio as gr
+def process_excel(file_obj):
+    """Count keyword frequencies in column D (cell D4 downwards) of a workbook.
+
+    1. Read the uploaded Excel file without treating any row as a header.
+    2. For every non-empty cell from D4 to the end of column D, strip special
+       characters, split on whitespace and count each distinct keyword once
+       per cell.
+    3. Write keywords (column A) and counts (column B) into a new in-memory
+       workbook starting at row 4, most frequent keyword first.
+
+    Args:
+        file_obj: Anything ``pandas.read_excel`` accepts (path or file-like).
+
+    Returns:
+        ``("keyword_frequency.xlsx", xlsx_bytes)`` on success, or a message
+        string when the sheet has fewer than four columns.
+    """
+    # header=None keeps positional indices aligned with sheet rows. With the
+    # default header=0 the first sheet row is consumed as column names, so
+    # iloc[3:] would start at D5 and silently skip D4.
+    # NOTE(review): assumes the reader keeps leading blank rows/columns in
+    # place - confirm for sheets whose first rows or column A are empty.
+    df = pd.read_excel(file_obj, header=None)
+    if len(df.columns) < 4:
+        return "엑셀 파일에 D열이 존재하지 않습니다."
+    product_col = df.iloc[3:, 3]  # row 4 onwards (0-based 3), column D (0-based 3)
+    # Keep only digits, ASCII letters, Hangul syllables and whitespace.
+    special_chars = re.compile(r'[^0-9a-zA-Z가-힣\s]')
+    freq_counter = Counter()
+    for cell_value in product_col.dropna():
+        keywords = special_chars.sub('', str(cell_value)).split()
+        # set(): a keyword repeated inside one cell is counted once.
+        freq_counter.update(set(keywords))
+    # most_common() orders by count, highest first.
+    sorted_freq = freq_counter.most_common()
+    result_df = pd.DataFrame(
+        {
+            "키워드": [keyword for keyword, _ in sorted_freq],
+            "빈도": [count for _, count in sorted_freq],
+        }
+    )
+    output = io.BytesIO()
+    with pd.ExcelWriter(output, engine='openpyxl') as writer:
+        # startrow=3 (0-based) with header=False puts the first keyword in A4
+        # and its count in B4; no column titles are written.
+        result_df.to_excel(writer, index=False, startrow=3, header=False)
+    return ("keyword_frequency.xlsx", output.getvalue())
+# Build the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("## 상품명에서 키워드 추출하기")
+    excel_input = gr.File(label="엑셀 파일 업로드")
+    download_button = gr.File(label="결과 파일 다운로드")
+    def on_submit(excel_file):
+        """Run the analysis on the uploaded file and return a downloadable path.
+
+        The gr.File output component expects a file path, so the
+        (filename, bytes) result of process_excel is written to a temporary
+        directory first. A plain message string from process_excel is shown
+        to the user as an error instead of being treated as a path.
+        """
+        import os
+        import tempfile
+        if excel_file is None:
+            raise gr.Error("엑셀 파일을 업로드해 주세요.")
+        # The upload may arrive as an object exposing .name or as a plain
+        # path string, depending on the Gradio version; accept both.
+        source_path = getattr(excel_file, "name", excel_file)
+        result = process_excel(source_path)
+        if isinstance(result, str):
+            raise gr.Error(result)
+        filename, payload = result
+        # A fresh directory per request keeps the download's file name intact.
+        out_path = os.path.join(tempfile.mkdtemp(), filename)
+        with open(out_path, "wb") as fh:
+            fh.write(payload)
+        return out_path
+    run_button = gr.Button("분석 및 결과 생성")
+    run_button.click(
+        fn=on_submit,
+        inputs=[excel_input],
+        outputs=[download_button]
     )
+# Start the Gradio server when this script is executed directly
 if __name__ == "__main__":
+    demo.launch()