Spaces:

kevineen
/

tanuki_annotation_phase2

Sleeping

App Files Files Community

kevineen committed 27 days ago

Commit

7fcccde

•

1 Parent(s): edbc65c

for team

Browse files

Files changed (5) hide show

.gitignore +1 -9
README.md +6 -6
app.py +441 -0
note.txt +1 -1
requirements.txt +1 -2

.gitignore CHANGED Viewed

@@ -161,12 +161,4 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-user_annotation/*
-run_2.py
-run_3.py
-run_4.py
-backup.py
-idea.txt
-dataclass.py

 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+user_annotation/*

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
 ---
-title: tanuki_annotation_phase2
-emoji: 💬
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
-app_file: run.py # HotReloadデバッグのため、app.pyから変更中 gradio run.pyで開発中は変更の監視が可能
 pinned: false
-license: apache-2.0
 hf_oauth: true
 # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.

 ---
+title: Tanuki Annotation Phase2
+emoji: 📊
+colorFrom: red
+colorTo: red
 sdk: gradio
+app_file: app.py
 pinned: false
+license: unknown
 hf_oauth: true
 # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.

app.py ADDED Viewed

	@@ -0,0 +1,441 @@

+import os
+import json
+import datetime
+from pathlib import Path
+import uuid
+from typing import Tuple
+import pandas as pd
+import gradio as gr
+from datasets import load_dataset
+from huggingface_hub import CommitScheduler
+from huggingface_hub import HfFolder
+# Environment variable used to access private resources from an HF Space.
+# Set the secret key in the Space's Settings.
+HF_TOKEN = os.getenv("HF_TOKEN")
+if HF_TOKEN:
+    # Persist the token through HfFolder.save_token.
+    HfFolder.save_token(HF_TOKEN)
+else:
+    print("Warning: HF_TOKEN not found. Please set it in your Space secrets.")
+# HF dataset repositories that annotations are uploaded to.
+# (Kept as a list so the target can be switched for testing.)
+output_dataset = [
+    "team-hatakeyama-phase2/annotation_tanuki_phase2",
+    "kevineen/Tanuki-Phase2-annotation-dataset", # test
+]
+# Datasets to be annotated.
+annotation_dataset_list = [
+    "hatakeyama-llm-team/AutoGeneratedJapaneseQA",
+    "hatakeyama-llm-team/AutoGeneratedJapaneseQA-other",
+    "kanhatakeyama/ChatbotArenaJaMixtral8x22b",
+    "kanhatakeyama/OrcaJaMixtral8x22b",
+    "kanhatakeyama/LogicalDatasetsByMixtral8x22b",
+    # Data format not supported yet (support planned).
+    # "hatakeyama-llm-team/WikiBookJa",
+    # "kanhatakeyama/AutoWikiQA",
+    # "susumuota/SyntheticTextWikiTranslate-askllm-v1", # Ask-LLM translation
+]
+multi_turn_annotation_dataset_list = [
+    # Multi-turn: not supported yet.
+    "kanhatakeyama/AutoMultiTurnByMixtral8x22b",
+]
+# Session State: (variables meant to be managed per browser session) ===========================
+# NOTE(review): these gr.State objects are created at module level and are
+# read/written through `.value` by the handlers in this file instead of being
+# passed as event inputs/outputs, so they look like process-wide globals shared
+# by every user rather than per-session values — confirm against the Gradio
+# State documentation.
+# State used to enable/disable UI elements
+is_selected_dataset = gr.State(False)
+is_loaded_dataset = gr.State(False)
+# List of datasets offered for selection
+dropdown_dataset_list = gr.State(value=annotation_dataset_list)
+# Currently targeted dataset; the initial value is "hatakeyama-llm-team/AutoGeneratedJapaneseQA"
+select_dropdown_dataset = gr.State(dropdown_dataset_list.value[0])
+select_dataset = gr.State(None)  # currently loaded dataset
+select_dataset_total_len = gr.State(0)  # length of the current dataset
+select_idx = gr.State(0)  # current index (unnecessary if a random mode is adopted?)
+random_mode = gr.State(False)
+# Dataset of the annotations made by the annotator
+annotated_dataset = gr.State(
+    pd.DataFrame({
+        'dataset_name': [],
+        'dataset_id': [],
+        'who': [],
+        'good': [],
+        'bad': [],
+        'score': [],
+        'is_proofreading_1': [],
+        'answer_text_1': [],
+        'is_proofreading_2': [],  # for multi-turn
+        'answer_text_2': [],  # for multi-turn
+    })
+)
+initial_answer_text_1 = gr.State("")  # used to check whether answer 1 was edited
+initial_answer_text_2 = gr.State("")  # used to check whether answer 2 was edited
+# NOTE(review): is_loaded_dataset above looks like a duplicate of this flag;
+# only is_dataset_loaded is written by dataset_load_fn.
+is_dataset_loaded = gr.State(False)
+you_dataset_id = gr.State(0)  # ID of the data the annotator is annotating
+dataset_name = gr.State("")  # name of the dataset used for editing
+dataset_id = gr.State(0)  # index in the source dataset
+who = gr.State("")  # annotator name
+good = gr.State(False)  # good
+bad = gr.State(False)  # bad
+score = gr.State(3)  # score; the initial value is 3
+is_proofreading_1 = gr.State(False)  # whether answer 1 was edited
+answer_text_1 = gr.State("")  # answer_1 text
+is_proofreading_2 = gr.State(False)  # whether answer 2 was edited
+answer_text_2 = gr.State("")  # answer_2 text
+# Not yet organized
+# Data loading ========================================
+def dataset_load_fn() -> Tuple[
+        str,
+        str,
+        str,
+        str,
+        gr.update,
+        gr.update,
+        gr.update,
+        gr.update,
+        gr.update,
+        gr.update,
+        gr.update]:
+    """Load the dataset chosen in the dropdown, shuffle it and show row 0.
+
+    The "train" split is converted to a pandas DataFrame, shuffled, and stored
+    back under ``select_dataset.value["train"]``. The pre-shuffle row number is
+    kept in an ``index`` column. The module-level state for the current index,
+    the dataset length, the loaded flag and the initial answer texts is reset.
+
+    Returns:
+        An 11-tuple: question, answer, question, answer (for the two tabs'
+        textboxes) followed by seven ``gr.update(interactive=True)`` values
+        that enable the rating buttons.
+    """
+    is_dataset_loaded.value = False  # loading in progress
+    loaded = load_dataset(select_dropdown_dataset.value)
+    select_dataset.value = loaded
+    # Dataset -> DataFrame; reset_index(drop=False) preserves the original
+    # row number as an "index" column before the rows are shuffled.
+    shuffled = (
+        loaded["train"]
+        .to_pandas()
+        .reset_index(drop=False)
+        .sample(frac=1)
+        .reset_index(drop=True)
+    )
+    loaded["train"] = shuffled  # keep the shuffled DataFrame for later handlers
+    select_idx.value = 0
+    select_dataset_total_len.value = len(shuffled)
+    is_dataset_loaded.value = True  # loading finished
+    # Seed the "initial answer" values used later to detect user edits.
+    first_row = shuffled.iloc[select_idx.value]
+    question, answer = first_row["question"], first_row["answer"]
+    initial_answer_text_1.value = answer
+    initial_answer_text_2.value = answer
+    enable_buttons = tuple(gr.update(interactive=True) for _ in range(7))
+    return (question, answer, question, answer, *enable_buttons)
+# Saving data ========================================
+# On a Space, the data is saved to the CommitScheduler's path_in_repo folder.
+# (In local development a json file is created in the ./user_annotation folder.)
+annotation_file = Path("user_annotation/") / f"data_{uuid.uuid4()}.json"
+annotated_folder = annotation_file.parent
+scheduler = CommitScheduler(
+    repo_id=output_dataset[0],
+    repo_type="dataset",
+    folder_path=annotated_folder,
+    path_in_repo="data",  # destination folder when running on a Space
+    private=True,
+    every=5,  # upload every 5 minutes (original note: the minimum value recommended by the Hugging Face docs)
+)
+# CommitScheduler (uploading the data to HF)
+def save_annotation(
+        dataset_name: str,
+        dataset_id: int,
+        who: str,
+        good: bool,
+        bad: bool,
+        score: int,
+        is_proofreading_1: bool,
+        answer_text_1: str,
+        is_proofreading_2: bool,
+        answer_text_2: str) -> None:
+    """Record one annotation in memory and append it to the local JSON-lines file.
+
+    The annotation is appended as a new row to ``annotated_dataset.value`` and
+    written as one JSON object per line to ``annotation_file`` while holding
+    ``scheduler.lock``.
+
+    Args:
+        dataset_name: Name of the dataset the annotated row came from.
+        dataset_id: Row identifier; written to the file as ``int(dataset_id)``.
+        who: Annotator name.
+        good: Good flag.
+        bad: Bad flag.
+        score: Score value.
+        is_proofreading_1: Whether answer 1 was edited.
+        answer_text_1: Text stored for answer 1.
+        is_proofreading_2: Whether answer 2 was edited.
+        answer_text_2: Text stored for answer 2.
+    """
+    record = {
+        'dataset_name': dataset_name,
+        'dataset_id': dataset_id,
+        'who': who,
+        'good': good,
+        'bad': bad,
+        'score': score,
+        'is_proofreading_1': is_proofreading_1,
+        'answer_text_1': answer_text_1,
+        'is_proofreading_2': is_proofreading_2,
+        'answer_text_2': answer_text_2,
+    }
+    new_row = pd.DataFrame({column: [value] for column, value in record.items()})
+    annotated_dataset.value = pd.concat(
+        [annotated_dataset.value, new_row],
+        ignore_index=True,
+    ).reset_index(drop=True)
+    # Write
+    with scheduler.lock, annotation_file.open("a", encoding='utf-8') as f:
+        data_to_write = {
+            # "id": , not possible to fetch and append a trailing id with CommitScheduler?
+            "datetime": str(datetime.datetime.now().isoformat()),
+            **record,
+            # Overrides the raw value in place so the file gets a plain int.
+            "dataset_id": int(dataset_id),
+        }
+        f.write(json.dumps(data_to_write, ensure_ascii=False) + "\n")
+# Adding annotations ========================================
+# UI handlers ========================================
+# Show the user name
+def hello(profile: gr.OAuthProfile | None) -> Tuple[str, str]:
+    """Build the greeting for the login row and record the annotator name.
+
+    Args:
+        profile: OAuth profile of the logged-in user, or None when nobody is
+            logged in.
+
+    Returns:
+        A pair ``(message, who.value)``. When a profile is given,
+        ``who.value`` is first set to ``profile.username``.
+    """
+    if profile is not None:
+        who.value = profile.username
+        message = f'{profile.username} さん、よろしくお願いいたします。'
+    else:
+        message = "プライベートデータセット取得のためにログインしてください。"
+    return message, who.value
+# Theme used by the Blocks app
+theme_ = gr.themes.Default()
+# For later CSS design changes
+def load_css():
+    """Return the contents of ``style.css`` as a string.
+
+    The path is relative, so the file is looked up in the current working
+    directory.
+
+    Raises:
+        FileNotFoundError: If ``style.css`` does not exist.
+    """
+    # Read explicitly as UTF-8 so the result does not depend on the
+    # platform's default locale encoding.
+    with open("style.css", "r", encoding="utf-8") as file:
+        return file.read()
+# Gradio screen ============================================
+with gr.Blocks(theme=theme_, css=load_css()) as demo:
+    gr.Markdown("# データセット アノテーション for Tanuki (Phase2)")
+    with gr.Tab("アノテーション (シングルターン)"):
+        with gr.Row(equal_height=True):
+            gr.LoginButton(value="HuggingFace ログイン",
+                           logout_value="HuggingFace ログアウト", scale=1)
+            # User name
+            gr_profile_name = gr.Markdown()
+            # On page load, hello() fills the greeting and the `who` state.
+            demo.load(hello, inputs=None, outputs=[gr_profile_name, who])
+        with gr.Row():
+            # Remember the dataset chosen in the dropdown; read later by dataset_load_fn.
+            def dropdown_select(select_value) -> None:
+                select_dropdown_dataset.value = select_value
+            # Selection of the target dataset
+            gr_dropdown_dataset = gr.Dropdown(
+                label="データセット選択 ①",
+                choices=dropdown_dataset_list.value,
+                value=select_dropdown_dataset.value,
+                elem_id="dataset_sel",
+                scale=2)
+            gr_dropdown_dataset.change(
+                dropdown_select,
+                inputs=[gr_dropdown_dataset]
+            )
+            gr_data_load_btn = gr.Button("② データセットを読み込む")
+        with gr.Column() as content_column:
+            # Tab with the simple good/bad rating.
+            with gr.Tab("③ シンプル(良・悪)"):
+                with gr.Column():
+                    with gr.Row(equal_height=True):
+                        # Disabled until a dataset has been loaded.
+                        good_btn = gr.Button("良い", interactive=False)
+                        bad_btn = gr.Button("悪い", interactive=False)
+                gr_question_text_1 = gr.Textbox(
+                    label="質問: ", lines=5, interactive=False)
+                # Editable so the annotator can correct the answer.
+                # The label previously contained mis-encoded characters
+                # in place of the character restored here.
+                gr_answer_text_1 = gr.Textbox(
+                    label="回答: 訂正頂けると品質が上がります。",
+                    lines=20,
+                    interactive=True)
+            # Tab with the 5-level rating.
+            with gr.Tab("③ ５段階評価"):
+                gr_question_text_3 = gr.Textbox(
+                    label="質問: ",
+                    lines=5,
+                    interactive=False,
+                )
+                with gr.Row() as score_btn:
+                    # One button per score, created in order 1..5 and
+                    # disabled until a dataset has been loaded.
+                    (gr_score_1,
+                     gr_score_2,
+                     gr_score_3,
+                     gr_score_4,
+                     gr_score_5) = [
+                        gr.Button(caption, interactive=False)
+                        for caption in (
+                            "1: 低品質",
+                            "2: 悪い",
+                            "3: 普通",
+                            "4: 良い",
+                            "5: 高品質",
+                        )
+                    ]
+                gr_answer_text_3 = gr.Textbox(
+                    label="回答: 訂正して頂けると品質が上がります。",
+                    lines=20,
+                    interactive=True,
+                )
+            # Click handler for the 5-level rating buttons.
+            # NOTE(review): no event in the visible source is wired to this
+            # function; the score buttons call update_annotation instead.
+            def score_button_clicked(button_value):
+                good.value = False
+                bad.value = False
+                score.value = button_value
+            # Loading a dataset fills the question/answer boxes of both tabs
+            # and enables the good/bad and score buttons (the output order
+            # matches the tuple returned by dataset_load_fn).
+            gr_data_load_btn.click(
+                dataset_load_fn,
+                inputs=None,
+                outputs=[gr_question_text_1,
+                         gr_answer_text_1,
+                         gr_question_text_3,
+                         gr_answer_text_3,
+                         good_btn,
+                         bad_btn,
+                         gr_score_1,
+                         gr_score_2,
+                         gr_score_3,
+                         gr_score_4,
+                         gr_score_5,
+                         ]
+            )
+            def update_annotation(
+                input_ans_1: str = None,
+                input_ans_3: str = None,
+                is_good: bool = None,  # good/bad flag; only used when score_value is None
+                score_value: int = None  # 5-level rating value; None for a good/bad click
+            ) -> Tuple[gr.update, gr.update, gr.update, gr.update]:
+                """Save the rating for the displayed row, then show the next row.
+
+                Args:
+                    input_ans_1: Current text of the answer box on the good/bad tab.
+                    input_ans_3: Current text of the answer box on the 5-level tab.
+                    is_good: True for a "good" click, False for a "bad" click.
+                    score_value: Score of the clicked 5-level button, or None for
+                        a good/bad click.
+
+                Returns:
+                    Four ``gr.update`` values (question, answer, question, answer)
+                    carrying the next row for the textboxes of both tabs.
+                """
+                # Update the good/bad and score state.
+                if score_value is not None:  # 5-level rating
+                    good.value = False
+                    bad.value = False
+                    score.value = score_value
+                else:  # good/bad rating
+                    # NOTE(review): score.value keeps whatever it held before
+                    # (initially 3) and is saved as-is for good/bad clicks.
+                    good.value = is_good
+                    bad.value = not is_good
+                # Detect edits: an answer text is stored only when it differs
+                # from the text originally shown; otherwise "" is stored.
+                if input_ans_1 is not None and initial_answer_text_1.value != input_ans_1:
+                    is_proofreading_1.value = True
+                    answer_text_1.value = input_ans_1
+                else:
+                    answer_text_1.value = ""
+                if input_ans_3 is not None and initial_answer_text_2.value != input_ans_3:
+                    is_proofreading_2.value = True
+                    answer_text_2.value = input_ans_3
+                else:
+                    answer_text_2.value = ""
+                df = select_dataset.value["train"]
+                # The rating belongs to the row that is currently displayed, so
+                # read its original index BEFORE advancing. Previously the index
+                # was incremented first, which stored every annotation under the
+                # dataset_id of the following row.
+                annotated_row = df.iloc[select_idx.value]
+                save_annotation(
+                    select_dropdown_dataset.value,
+                    # dataset_id is the row's original (pre-shuffle) index
+                    annotated_row['index'],
+                    who.value,
+                    good.value,
+                    bad.value,
+                    score.value,
+                    is_proofreading_1.value,
+                    answer_text_1.value,
+                    is_proofreading_2.value,
+                    answer_text_2.value
+                )
+                # Advance to the next row, wrapping around at the end.
+                select_idx.value += 1
+                if select_idx.value >= len(df):
+                    select_idx.value = 0
+                # Reset the per-row state for the next item.
+                next_row = df.iloc[select_idx.value]
+                is_proofreading_1.value = False
+                is_proofreading_2.value = False
+                initial_answer_text_1.value = next_row["answer"]
+                initial_answer_text_2.value = next_row["answer"]
+                return gr.update(value=next_row["question"]), \
+                    gr.update(value=next_row["answer"]), \
+                    gr.update(value=next_row["question"]), \
+                    gr.update(value=next_row["answer"])
+            def good_click(input_ans_1, input_ans_3):
+                """Handle the "good" button: save with is_good=True and advance."""
+                return update_annotation(input_ans_1, input_ans_3, True)
+            def bad_click(input_ans_1, input_ans_3):
+                """Handle the "bad" button: save with is_good=False and advance."""
+                return update_annotation(input_ans_1, input_ans_3, False)
+            # Both buttons read the two answer boxes and refresh the
+            # question/answer boxes of both tabs.
+            for _verdict_btn, _verdict_handler in (
+                    (good_btn, good_click),
+                    (bad_btn, bad_click)):
+                _verdict_btn.click(
+                    _verdict_handler,
+                    inputs=[gr_answer_text_1, gr_answer_text_3],
+                    outputs=[
+                        gr_question_text_1,
+                        gr_answer_text_1,
+                        gr_question_text_3,
+                        gr_answer_text_3,
+                    ],
+                )
+            # Click events of the 5-level rating buttons
+            def _score_handler(score_value):
+                """Build the click handler that saves ``score_value`` for the current row."""
+                return lambda x: update_annotation(input_ans_1=x, input_ans_3=x, score_value=score_value)
+            for _score_value, _score_btn in enumerate(
+                    (gr_score_1, gr_score_2, gr_score_3, gr_score_4, gr_score_5),
+                    start=1):
+                # Each button passes the 5-level tab's answer text as both answers.
+                _score_btn.click(
+                    _score_handler(_score_value),
+                    inputs=[gr_answer_text_3],
+                    outputs=[
+                        gr_question_text_1,
+                        gr_answer_text_1,
+                        gr_question_text_3,
+                        gr_answer_text_3,
+                    ],
+                )
+            # TODO: show the list of already-annotated items when switching tabs
+            # with gr.Tab("アノテ済みデータセット(管理画面)"):
+            # Refresh the displayed data when the tab is switched
+# Launch the Gradio app only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

note.txt CHANGED Viewed

@@ -5,4 +5,4 @@ Secretsに作成したTokenを
 HF_TOKENに設定して頂けますでしょうか？
 - team-hatakeyama-phase2/annotation_tanuki_phase2
-側も設定が必要？

 HF_TOKENに設定して頂けますでしょうか？
 - team-hatakeyama-phase2/annotation_tanuki_phase2
+側も設定が必要？

requirements.txt CHANGED Viewed

@@ -1,4 +1,3 @@
 huggingface_hub==0.22.2
-minijinja
-transformers
 datasets

 huggingface_hub==0.22.2
+gradio
 datasets