Spaces:

ssboost
/

word_b

Running

App Files Files Community

ssboost committed on Dec 23, 2024

Commit

7dad958

verified ·

1 Parent(s): 9691e07

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -133

app.py CHANGED Viewed

@@ -3,181 +3,164 @@ import pandas as pd
 import tempfile
 import re
# ---------------------------------------------------------------
# 1) Body preprocessing: keep Hangul syllables, drop everything else
# ---------------------------------------------------------------
def preprocess_text(text: str) -> str:
    """Return only the Hangul syllables of *text*, joined together.

    Commas, periods, whitespace, digits, Latin letters and every other
    character outside the precomposed Hangul syllable range (가-힣) are
    removed, leaving one uninterrupted run of Hangul.

    The text is normalised to NFC first. Decomposed Hangul (NFD, where a
    syllable is stored as separate jamo code points) lies outside the
    가-힣 range and would otherwise be deleted entirely.

    Args:
        text: Raw body text. ``None`` or an empty string yields ``""``.

    Returns:
        The Hangul-only string.
    """
    # Function-scope import so the block does not rely on a file-level import.
    import unicodedata

    if not text:
        return ""
    return re.sub(r'[^가-힣]', '', unicodedata.normalize("NFC", text))
# ---------------------------------------------------------------
# 2) Main function
# ---------------------------------------------------------------
# Zero-based sheet coordinates used by the workbook path of count_keywords.
_KEYWORD_COL = 0        # column A
_COUNT_COL = 13         # column N
_FIRST_KEYWORD_ROW = 4  # sheet row 5
_ROW_LIMIT = 10000      # rows below sheet row 10000 are ignored


def _count_occurrences(cleaned_text, keyword):
    """Count non-overlapping occurrences of *keyword* in *cleaned_text*."""
    # The body has had all whitespace removed, so a multi-word keyword can
    # only match once its own whitespace is removed as well.
    needle = "".join(keyword.split())
    # str.count("") returns len(text) + 1, which is never a meaningful count.
    return cleaned_text.count(needle) if needle else 0


def _markdown_table(rows):
    """Render ``(keyword, count)`` pairs as a two-column Markdown table."""
    lines = ["| 키워드 | 등장 횟수 |", "|---|---|"]
    for keyword, count in rows:
        # A literal pipe inside a keyword would start a new table cell.
        safe = keyword.replace("|", "\\|")
        lines.append(f"| {safe} | {count} |")
    return "\n".join(lines)


def _write_temp_xlsx(frame, header=False):
    """Write *frame* to a new temporary ``.xlsx`` file and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
        path = tmp.name
    # Write only after the handle is closed: pandas reopens the file by
    # name, which fails on Windows while the handle is still open.
    frame.to_excel(path, index=False, header=header)
    return path


def _ensure_column(frame, position):
    """Append empty columns to *frame* in place until *position* exists."""
    while frame.shape[1] <= position:
        label = frame.shape[1]
        # Never reuse an existing label: that would overwrite data without
        # adding a column, and the loop would not terminate.
        while label in frame.columns:
            label += 1
        frame[label] = None


def count_keywords(main_text, excel_file, direct_input):
    """Count how often each keyword occurs in the Hangul-only body text.

    Keyword source, in priority order:

    1. ``direct_input`` - one keyword per line. The result workbook has
       the keywords in column ``A`` and the counts in column ``B``.
    2. ``excel_file`` - keywords are read from cells A5..A10000 and each
       count is written to column N of the *same row* as its keyword, so
       blank cells in column A do not shift the counts. The sheet is read
       and written without a header so row 1 is kept as data and sheet
       row 5 really is the first keyword row.

    Only keywords found at least once are listed in the Markdown table;
    the workbook always contains every keyword.

    Args:
        main_text: Body text; reduced to Hangul by ``preprocess_text``.
        excel_file: Uploaded workbook, either an object exposing the path
            as ``.name`` or the path itself. May be ``None``.
        direct_input: Newline-separated keywords. May be empty or ``None``.

    Returns:
        ``(message, path)``. ``message`` is the Markdown table or a
        user-facing notice. ``path`` is the generated ``.xlsx`` file, or
        ``None`` when no keywords were available.
    """
    cleaned_text = preprocess_text(main_text or "")
    typed = (direct_input or "").strip()

    if typed:
        # ----- keywords typed directly: they take priority -----
        keywords = [line.strip() for line in typed.split('\n') if line.strip()]
        counts = [_count_occurrences(cleaned_text, kw) for kw in keywords]
        result_df = pd.DataFrame({"A": keywords, "B": counts})
        keep_header = True  # the direct-input workbook keeps its A/B header row
    else:
        # ----- keywords taken from the uploaded workbook -----
        if not excel_file:
            return ("키워드를 직접 입력하거나 엑셀을 업로드하세요.", None)
        # Accept both an upload object with ``.name`` and a bare path.
        excel_path = getattr(excel_file, "name", excel_file)
        # header=None keeps row 1 as data so positions map 1:1 to sheet rows.
        result_df = pd.read_excel(excel_path, header=None)

        stop = min(result_df.shape[0], _ROW_LIMIT)
        located = []  # (row position, keyword)
        if result_df.shape[1] > _KEYWORD_COL:
            for row in range(_FIRST_KEYWORD_ROW, stop):
                cell = result_df.iat[row, _KEYWORD_COL]
                if pd.isna(cell):
                    continue
                text = str(cell).strip()
                if text:
                    located.append((row, text))
        if not located:
            return ("A5~A10000 범위에 키워드가 없습니다.", None)

        keywords = [kw for _, kw in located]
        counts = [_count_occurrences(cleaned_text, kw) for kw in keywords]
        _ensure_column(result_df, _COUNT_COL)
        for (row, _), count in zip(located, counts):
            result_df.iloc[row, _COUNT_COL] = count
        keep_header = False

    hits = [(kw, n) for kw, n in zip(keywords, counts) if n > 0]
    if hits:
        message = _markdown_table(hits)
    else:
        message = "본문에 해당 키워드가 전혀 등장하지 않았습니다 (0회)."
    # The workbook is produced even when nothing matched.
    return (message, _write_temp_xlsx(result_df, header=keep_header))
# ---------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 본문 & 키워드 분석 (특정 셀 범위 사용)")

    with gr.Row():
        # Left column: the body text to analyse.
        with gr.Column():
            main_text = gr.Textbox(
                lines=18,
                placeholder="여기에 본문을 복사해서 붙여넣으세요\n(한글 이외는 자동 제거됩니다)",
                label="본문 텍스트",
            )
        # Right column: the two keyword sources and the trigger button.
        with gr.Column():
            keyword_direct = gr.Textbox(
                lines=6,
                placeholder="가습기\n가습\n초음파\n...",
                label="직접 입력 키워드(엔터로 구분)",
            )
            keyword_excel = gr.File(
                file_types=[".xlsx"],
                label="엑셀 업로드(A5~A10000: 키워드, N5~N10000: 카운트)",
            )
            submit_btn = gr.Button("카운트하기")

    # Result area below the row: Markdown table plus downloadable workbook.
    output_md = gr.Markdown(label="결과 (마크다운 표)")
    output_file = gr.File(label="엑셀 다운로드 (카운트 기록)")

    # Wire the button; the input order matches count_keywords' parameters.
    submit_btn.click(
        count_keywords,
        inputs=[main_text, keyword_excel, keyword_direct],
        outputs=[output_md, output_file],
    )

 import tempfile
 import re
 def preprocess_text(text: str) -> str:
     """
+    쉼표, 마침표, 공백, 숫자, 영어 등
+    한글(가-힣) 이외의 문자를 모두 제거하고
     한글만 연속으로 남긴다.
     """
     return re.sub(r'[^가-힣]', '', text)
def expand_columns_if_needed(df, needed_index: int):
    """Grow *df* in place until column position *needed_index* exists.

    Empty (``None``) columns are appended at the right edge until
    ``df.shape[1] > needed_index``. For example, ``needed_index=13`` makes
    sure the 14th column (Excel column N) can be addressed positionally.

    Args:
        df: DataFrame to widen; it is modified in place.
        needed_index: Zero-based column position that must exist.

    Returns:
        None. Existing columns and their data are left untouched.
    """
    while df.shape[1] <= needed_index:
        label = df.shape[1]
        # Column labels need not be 0..n-1. Assigning to a label that
        # already exists would overwrite that column's data without adding
        # a column, so the loop would never terminate. Take the next free
        # integer label instead.
        while label in df.columns:
            label += 1
        df[label] = None
 def count_keywords(main_text, excel_file, direct_input):
     """
+    - 직접 입력 키워드(줄바꿈 구분)가 있으면 우선 사용(A열=키워드, B열=카운트)
+    - 없으면 엑셀 사용:
+      * 헤더를 사용하지 않음(header=None) → 1행 그대로 보존
+      * A5~A10000: 키워드
+      * N5~N10000: 카운트 기록(열 인덱스 13)
+    - 본문은 한글만 남기고 .count(키워드)로 등장 횟수를 계산
+    - 1회 이상인 키워드만 결과 표(Markdown)에 표시
     """
+    # 본문 전처리
     cleaned_text = preprocess_text(main_text)
+    direct_input = direct_input.strip()
     if direct_input:
+        # ===== 직접 입력 키워드 사용 =====
         keywords = [kw.strip() for kw in direct_input.split('\n') if kw.strip()]
         if not keywords:
+            return ("직접 입력 키워드가 없습니다.", None)
+        # counts
+        counts = [cleaned_text.count(k) for k in keywords]
+        # 1회 이상 필터
+        filtered = [(k, c) for k, c in zip(keywords, counts) if c > 0]
+        if not filtered:
+            # 전부 0회
+            msg = "본문에 해당 키워드가 전혀 등장하지 않았습니다."
+            # 그래도 결과 엑셀(A,B) 만들어서 반환
+            tmp_df = pd.DataFrame({"A": keywords, "B": counts})
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
+                tmp_df.to_excel(tmp.name, index=False, header=False)
+                # header=False → 1행에 "A,B" 같은 열이름 안 쓰도록
+                tmp_path = tmp.name
+            return (msg, tmp_path)
+        # 1회 이상 표(Markdown)
+        lines = ["| 키워드 | 등장 횟수 |", "|---|---|"]
+        for (k, c) in filtered:
+            lines.append(f"| {k} | {c} |")
+        md_table = "\n".join(lines)
+        # 엑셀(A,B) 저장
+        tmp_df = pd.DataFrame({"A": keywords, "B": counts})
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
+            tmp_df.to_excel(tmp.name, index=False, header=False)
+            tmp_path = tmp.name
+        return (md_table, tmp_path)
     else:
+        # ===== 엑셀 파일 사용 =====
         if not excel_file:
+            return ("엑셀 파일을 업로드하거나 키워드를 직접 입력하세요.", None)
+        # 1) 엑셀 전체를 header=None로 읽음 → 1행 그대로 보존
+        df = pd.read_excel(excel_file.name, header=None)
+        # 2) A5~A10000 → (인덱스 4~9999) 키워드
+        max_row = min(df.shape[0], 10000)  # 실제 행 개수 vs 10000 중 더 작은 것
+        sub_df = df.iloc[4:max_row, 0]     # 첫 번째 열(인덱스=0)
+        # strip + NaN 제거
+        keywords = sub_df.dropna().astype(str).apply(lambda x: x.strip()).tolist()
         if not keywords:
             return ("A5~A10000 범위에 키워드가 없습니다.", None)
+        # counts
+        counts = [cleaned_text.count(k) for k in keywords]
+        # 1회 이상 필터
+        filtered = [(k, c) for k, c in zip(keywords, counts) if c > 0]
+        if not filtered:
+            msg = "본문에 해당 키워드가 전혀 등장하지 않았습니다(0회)."
+            # 그래도 N5~N10000에 기록
+            expand_columns_if_needed(df, 13)  # N열=13
             for i, cnt_val in enumerate(counts):
+                row_idx = 4 + i
                 if row_idx < df.shape[0]:
+                    df.iloc[row_idx, 13] = cnt_val
             with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
+                df.to_excel(tmp.name, index=False, header=False)
                 tmp_path = tmp.name
+            return (msg, tmp_path)
+        # 1회 이상 표(Markdown)
+        lines = ["| 키워드 | 등장 횟수 |", "|---|---|"]
+        for (k, c) in filtered:
+            lines.append(f"| {k} | {c} |")
+        md_table = "\n".join(lines)
+        # N5~N10000에 기록
+        expand_columns_if_needed(df, 13)  # 열이 14개 미만이면 N열(13)까지 확장
         for i, cnt_val in enumerate(counts):
+            row_idx = 4 + i
             if row_idx < df.shape[0]:
+                df.iloc[row_idx, 13] = cnt_val
         with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp:
+            df.to_excel(tmp.name, index=False, header=False)
             tmp_path = tmp.name
+        return (md_table, tmp_path)
# ---------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 본문 & 키워드 분석 - (A5~A10000, N5~N10000)")

    with gr.Row():
        # Left column: the body text to analyse.
        with gr.Column():
            main_textbox = gr.Textbox(
                placeholder="여기에 긴 본문을 붙여넣으세요. 한글만 남기고 나머지는 제거됩니다.",
                lines=16,
                label="본문 텍스트",
            )
        # Right column: the two optional keyword sources and the trigger button.
        with gr.Column():
            keyword_input = gr.Textbox(
                placeholder="예)\n초음파가습기\n가습기\n...",
                lines=6,
                label="(선택) 직접 입력 키워드 - 엔터로 구분",
            )
            excel_input = gr.File(
                file_types=[".xlsx"],
                label="(선택) 엑셀 업로드 (A5~A10000=키워드, N5~N10000=카운트)",
            )
            run_button = gr.Button("카운트하기")

    # Result area below the row: Markdown table plus downloadable workbook.
    output_md = gr.Markdown(label="결과 표")
    output_file = gr.File(label="결과 엑셀 다운로드")

    # Wire the button; the input order matches count_keywords' parameters.
    run_button.click(
        count_keywords,
        inputs=[main_textbox, excel_input, keyword_input],
        outputs=[output_md, output_file],
    )