Spark Chou committed on
Commit 88c9820 · 1 Parent(s): acceb2a

add app.py for latest English version

Files changed (1)
  1. app.py +282 -190
app.py CHANGED
@@ -13,6 +13,7 @@ from huggingface_hub import HfApi, hf_hub_download
13
  from multiprocessing import TimeoutError
14
  from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
15
 
 
16
  dataset = load_dataset("intersteller2887/Turing-test-dataset-en", split="train")
17
  dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent newer versions of 'datasets' from calling 'torchcodec'
18
 
@@ -20,7 +21,6 @@ dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent calling 't
20
  target_audio_dir = "/home/user/app/audio"
21
  os.makedirs(target_audio_dir, exist_ok=True)
22
  COUNT_JSON_PATH = "/home/user/app/count.json"
23
-
24
  COUNT_JSON_REPO_PATH = "submissions/count.json" # Output directory (Huggingface dataset directory)
25
 
26
  # Copy recordings to the working directory
@@ -42,90 +42,84 @@ sample1_audio_path = local_audio_paths[0]
42
  print(sample1_audio_path)
43
 
44
  # ==============================================================================
45
- # 数据定义 (Data Definition)
46
  # ==============================================================================
47
 
48
  DIMENSIONS_DATA = [
49
  {
50
  "title": "Semantic and Pragmatic Features",
51
- "audio": "sample1_audio_path",
52
  "sub_dims": [
53
- "Memory Consistency: Human memory in short contexts is usually consistent and self-correcting (e.g., by asking questions); machines may show inconsistent context memory and fail to notice or correct errors (e.g., forgetting key information and persisting in wrong answers).",
54
- "Logical Coherence: Human logic is naturally coherent and allows reasonable leaps; machine logic is abrupt or self-contradictory (e.g., sudden topic shifts without transitions).",
55
- "Pronunciation Accuracy: Humans generally pronounce words correctly and naturally, distinguishing polyphonic characters based on context; machines often mispronounce or lack contextual judgment for polyphonic words.",
56
- "Multilingual Mixing: Humans mix multiple languages fluently and contextually; machines mix languages rigidly, lacking logical language switching.",
57
- "Linguistic Vagueness: Human speech tends to include vague expressions (e.g., more or less,” “I guess”) and self-corrections; machine responses are typically precise and assertive.",
58
- "Filler Word Usage: Human filler words (e.g., 'uh', 'like') appear randomly and show signs of thinking; machine fillers are either repetitive and patterned or completely absent.",
59
- "Metaphor and Pragmatic Intent: Humans use metaphors, irony, and euphemisms to express layered meanings; machines interpret literally or use rhetorical devices awkwardly, lacking semantic richness."
60
  ],
61
- "reference_scores": [5, 5, 3, 3, 5, 5, 3]
62
  },
63
  {
64
  "title": "Non-Physiological Paralinguistic Features",
65
- "audio": "sample1_audio_path",
66
  "sub_dims": [
67
- "Rhythm: Human speech rate varies with meaning, occasionally hesitating or pausing; machine rhythm is uniform, with little or mechanical pauses.",
68
- "Intonation: Humans naturally raise or lower pitch to express questions, surprise, or emphasis; machine intonation is monotonous or overly patterned, mismatching the context.",
69
- "Emphasis: Humans consciously stress key words to highlight important information; machines have uniform word emphasis or stress incorrect parts.",
70
- "Auxiliary Vocalizations: Humans produce context-appropriate non-verbal sounds (e.g., laughter, sighs); machine non-verbal sounds are contextually incorrect, mechanical, or absent."
71
  ],
72
- "reference_scores": [4, 5, 4, 3]
73
  },
74
  {
75
  "title": "Physiological Paralinguistic Features",
76
- "audio": "sample1_audio_path",
77
  "sub_dims": [
78
- "Micro-physiological Noise: Human speech includes unconscious physiological sounds like breathing, saliva, or bubbling, naturally woven into rhythm; machine speech is overly clean or adds unnatural noises.",
79
- "Pronunciation Instability: Human pronunciation includes irregularities (e.g., linking, tremors, slurring, nasal sounds); machine pronunciation is overly standard and uniform, lacking personality.",
80
- "Accent: Humans naturally exhibit regional accents or speech traits; machine accents sound forced or unnatural."
81
  ],
82
- "reference_scores": [3, 3, 4]
83
  },
84
  {
85
  "title": "Mechanical Persona",
86
- "audio": "sample1_audio_path",
87
  "sub_dims": [
88
- "Flattery: Humans assess context to agree or disagree, sometimes offering differing opinions; machines excessively agree, thank, or apologize, over-validating the other party and lacking authentic interaction.",
89
- "Formalized Expression: Human speech is flexible; machine responses are formally structured, overly written, and use vague wording."
90
  ],
91
  "reference_scores": [5, 5]
92
  },
93
  {
94
  "title": "Emotional Expression",
95
- "audio": "sample1_audio_path",
96
  "sub_dims": [
97
- "Semantic Level: Humans show appropriate emotional responses to contexts like sadness or joy; machines are emotionally flat, or use emotional words vaguely and out of context.",
98
- "Acoustic Level: Human pitch, volume, and rhythm change dynamically with emotion; machine emotional tone is formulaic or mismatched with the context."
99
  ],
100
- "reference_scores": [3, 3]
101
  }
102
  ]
103
 
104
 
105
  DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
 
106
  MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
 
107
 
108
- """
109
- # Issue: this is initialized at Space startup, so it might somehow not be covered
110
- count_data = load_or_initialize_count_json(all_data_audio_paths)
111
- selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
112
 
113
- QUESTION_SET = [
114
- {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
115
- for path in selected_audio_paths
116
- ]"""
117
 
118
  # ==============================================================================
119
- # 功能函数定义 (Function Definitions)
120
  # ==============================================================================
121
 
122
- # Function that load or initialize count.json
123
- def load_or_initialize_count_json(audio_paths):
124
  try:
125
  # Only try downloading if file doesn't exist yet
126
  if not os.path.exists(COUNT_JSON_PATH):
127
  downloaded_path = hf_hub_download(
128
- repo_id="intersteller2887/Turing-test-dataset-en",
129
  repo_type="dataset",
130
  filename=COUNT_JSON_REPO_PATH,
131
  token=os.getenv("HF_TOKEN")
@@ -134,7 +128,7 @@ def load_or_initialize_count_json(audio_paths):
134
  with open(downloaded_path, "rb") as src, open(COUNT_JSON_PATH, "wb") as dst:
135
  dst.write(src.read())
136
  except Exception as e:
137
- print(f"Could not download or save count.json from HuggingFace dataset: {e}")
138
 
139
  # Add filelock to /workspace/count.json
140
  lock_path = COUNT_JSON_PATH + ".lock"
@@ -168,75 +162,139 @@ def load_or_initialize_count_json(audio_paths):
168
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
169
  json.dump(count_data, f, indent=4, ensure_ascii=False)
170
 
171
- return count_data
172
 
173
- # Shorten the time of playing previous audio when reached next question
174
- def append_cache_buster(audio_path):
175
- return f"{audio_path}?t={int(time.time() * 1000)}"
 
 
 
176
 
177
- """def sample_audio_paths(audio_paths, count_data, k=5, max_count=1):
178
- eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
179
-
180
- if len(eligible_paths) < k:
181
- raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
 
182
 
183
- eligible_paths_copy = eligible_paths.copy()
184
-
185
- random.seed(int(time.time()))
186
-
187
- selected = random.sample(eligible_paths_copy, k)
188
 
189
- for path in selected:
190
- filename = os.path.basename(path)
191
- count_data[filename] = count_data.get(filename, 0) + 1
 
 
192
 
193
- with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
194
- json.dump(count_data, f, indent=4, ensure_ascii=False)
 
 
195
 
196
- return selected, count_data"""
197
 
198
- def sample_audio_paths(audio_paths, count_data, k=5, max_count=1): # k for questions per test; max_count for question limit in total
 
199
  eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
200
 
201
  if len(eligible_paths) < k:
202
  raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
203
 
 
204
  selected = random.sample(eligible_paths, k)
205
 
 
206
  for path in selected:
207
  filename = os.path.basename(path)
208
  count_data[filename] = count_data.get(filename, 0) + 1
209
 
 
210
  lock_path = COUNT_JSON_PATH + ".lock"
211
  with FileLock(lock_path, timeout=10):
212
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
213
  json.dump(count_data, f, indent=4, ensure_ascii=False)
214
 
215
- return selected, count_data
216
 
217
- """def start_challenge(user_data_state):
218
-
219
- # global QUESTION_SET, updated_count_data
220
- # Issue: global variables in a Hugging Face Space are shared by all threads
221
 
222
- # 每次点击“开始挑战”时重新抽题
223
- count_data = load_or_initialize_count_json(all_data_audio_paths)
224
- selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
225
-
226
- QUESTION_SET = [
227
- {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
228
- for path in selected_audio_paths
229
- ]
 
230
 
231
- # 重置 user_data 中的状态(也可以留空)
232
- user_data_state.clear()
233
- return gr.update(visible=False), gr.update(visible=True), user_data_state"""
 
 
234
 
235
  # Save question_set in each user_data_state, preventing global sharing
236
  def start_challenge(user_data_state):
237
 
238
- count_data = load_or_initialize_count_json(all_data_audio_paths)
239
- selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, count_data, k=5)
 
 
240
 
241
  question_set = [
242
  {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
@@ -244,13 +302,18 @@ def start_challenge(user_data_state):
244
  ]
245
 
246
  user_data_state["question_set"] = question_set
247
- user_data_state["updated_count_data"] = updated_count_data
248
- return gr.update(visible=False), gr.update(visible=True), user_data_state
249
 
250
  def toggle_education_other(choice):
251
  is_other = (choice == "其他(请注明)")
252
  return gr.update(visible=is_other, interactive=is_other, value="")
253
 
 
254
  def check_info_complete(username, age, gender, education, education_other, ai_experience):
255
  if username.strip() and age and gender and education and ai_experience:
256
  if education == "其他(请注明)" and not education_other.strip():
@@ -258,6 +321,7 @@ def check_info_complete(username, age, gender, education, education_other, ai_ex
258
  return gr.update(interactive=True)
259
  return gr.update(interactive=False)
260
 
 
261
  def show_sample_page_and_init(username, age, gender, education, education_other, ai_experience, user_data):
262
  final_edu = education_other if education == "其他(请注明)" else education
263
  user_data.update({
@@ -282,7 +346,7 @@ def update_sample_view(dimension_title):
282
  # audio_up = gr.update(value=append_cache_buster(dim_data["audio"]))
283
  interactive_view_up = gr.update(visible=True)
284
  reference_view_up = gr.update(visible=False)
285
- reference_btn_up = gr.update(value="Reference Answer")
286
  sample_slider_ups = []
287
  ref_slider_ups = []
288
  scores = dim_data.get("reference_scores", [])
@@ -302,23 +366,53 @@ def update_sample_view(dimension_title):
302
  return empty_updates + slider_empty_updates
303
 
304
  def update_test_dimension_view(d_idx, selections):
305
- dimension = DIMENSIONS_DATA[d_idx]
306
- progress_d = f"Dimension {d_idx + 1} / {len(DIMENSIONS_DATA)}: **{dimension['title']}**"
307
-
308
- existing_scores = selections.get(dimension['title'], {})
309
-
310
  slider_updates = []
 
311
  for i in range(MAX_SUB_DIMS):
312
- if i < len(dimension['sub_dims']):
313
- sub_dim_label = dimension['sub_dims'][i]
314
- value = existing_scores.get(sub_dim_label, 0)
315
- slider_updates.append(gr.update(visible=True, label=sub_dim_label, value=value))
 
316
  else:
317
- slider_updates.append(gr.update(visible=False, value=0))
 
318
 
319
  prev_btn_update = gr.update(interactive=(d_idx > 0))
320
  next_btn_update = gr.update(
321
- value="Proceed to Final Judgement" if d_idx == len(DIMENSIONS_DATA) - 1 else "Next Dimension",
322
  interactive=True
323
  )
324
 
@@ -326,10 +420,8 @@ def update_test_dimension_view(d_idx, selections):
326
 
327
  def init_test_question(user_data, q_idx):
328
  d_idx = 0
329
- # question = QUESTION_SET[q_idx]
330
- # progress_q = f"第 {q_idx + 1} / {len(QUESTION_SET)} 题"
331
  question = user_data["question_set"][q_idx]
332
- progress_q = f"Question {q_idx + 1} / {len(user_data['question_set'])}"
333
 
334
  initial_updates = update_test_dimension_view(d_idx, {})
335
  dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
@@ -390,11 +482,33 @@ def navigate_dimensions(direction, q_idx, d_idx, selections, *slider_values):
390
  next_btn_update,
391
  ) + tuple(slider_updates)
392
 
 
393
  # ==============================================================================
394
- # 重连函数定义 (Retry Function Definitions)
395
  # ==============================================================================
396
 
397
- # Function for handling connection error
398
  def retry_with_timeout(max_retries=3, timeout=10, backoff=1):
399
  def decorator(func):
400
  @wraps(func)
@@ -423,12 +537,12 @@ def retry_with_timeout(max_retries=3, timeout=10, backoff=1):
423
  return wrapper
424
  return decorator
425
 
426
- def save_with_retry(all_results, user_data, count_data):
427
  # 尝试上传到Hugging Face Hub
428
  try:
429
  # 使用线程安全的保存方式
430
  with ThreadPoolExecutor(max_workers=1) as executor:
431
- future = executor.submit(save_all_results_to_file, all_results, user_data, count_data)
432
  try:
433
  future.result(timeout=30) # 设置30秒超时
434
  return True
@@ -525,6 +639,7 @@ def update_count_with_retry(count_data, question_set, max_retries=3):
525
  gr.update(), gr.update(),
526
  ) + (gr.update(),) * MAX_SUB_DIMS + (all_results, result_str)"""
527
 
 
528
  def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
529
  try:
530
  # 准备数据
@@ -550,7 +665,7 @@ def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_resu
550
  return init_q_updates + (all_results, gr.update(value=""))
551
  else:
552
  # 准备完整结果数据
553
- result_str = "### Test Finished!\n\nOverview of the submission:\n"
554
  for res in all_results:
555
  result_str += f"##### 最终判断: **{res['selections'].get('final_choice', '未选择')}**\n"
556
  for dim_title, dim_data in res['selections'].items():
@@ -561,7 +676,8 @@ def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_resu
561
 
562
  # 尝试上传(带重试)
563
  try:
564
- success = save_with_retry(all_results, user_data, user_data.get("updated_count_data"))
 
565
  except Exception as e:
566
  print(f"上传过程中发生错误: {e}")
567
  success = False
@@ -574,7 +690,7 @@ def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_resu
574
 
575
  # 准备数据包
576
  user_info_clean = {
577
- k: v for k, v in user_data.items() if k not in ["question_set", "updated_count_data"]
578
  }
579
  final_data_package = {
580
  "user_info": user_info_clean,
@@ -591,10 +707,10 @@ def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_resu
591
 
592
  # 更新count.json(剔除未完成的题目)
593
  try:
594
- count_update_success = update_count_with_retry(
595
- user_data.get("updated_count_data", {}),
596
- user_data["question_set"]
597
- )
598
  except Exception as e:
599
  print(f"更新count.json失败: {e}")
600
  count_update_success = False
@@ -671,14 +787,14 @@ def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_resu
671
  except Exception as e:
672
  print(f"上传出错: {e}")"""
673
 
674
- def save_all_results_to_file(all_results, user_data, count_data=None):
675
  repo_id = "intersteller2887/Turing-test-dataset-en"
676
  username = user_data.get("username", "user")
677
  timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
678
  submission_filename = f"submissions_{username}_{timestamp}.json"
679
 
680
  user_info_clean = {
681
- k: v for k, v in user_data.items() if k not in ["question_set", "updated_count_data"]
682
  }
683
 
684
  final_data_package = {
@@ -703,41 +819,21 @@ def save_all_results_to_file(all_results, user_data, count_data=None):
703
  commit_message=f"Add new submission from {username}"
704
  )
705
 
706
- if count_data:
707
  with FileLock(COUNT_JSON_PATH + ".lock", timeout=5):
708
- with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
709
- json.dump(count_data, f, indent=4, ensure_ascii=False)
710
-
711
  api.upload_file(
712
- path_or_fileobj=COUNT_JSON_PATH,
713
  path_in_repo=COUNT_JSON_REPO_PATH,
714
  repo_id=repo_id,
715
  repo_type="dataset",
716
  token=hf_token,
717
  commit_message=f"Update count.json after submission by {username}"
718
  )
719
-
720
- def toggle_reference_view(current):
721
- if current == "参考":
722
- return gr.update(visible=False), gr.update(visible=True), gr.update(value="返回")
723
- else:
724
- return gr.update(visible=True), gr.update(visible=False), gr.update(value="参考")
725
-
726
- def back_to_welcome():
727
- return (
728
- gr.update(visible=True), # welcome_page
729
- gr.update(visible=False), # info_page
730
- gr.update(visible=False), # sample_page
731
- gr.update(visible=False), # pretest_page
732
- gr.update(visible=False), # test_page
733
- gr.update(visible=False), # final_judgment_page
734
- gr.update(visible=False), # result_page
735
- {}, # user_data_state
736
- 0, # current_question_index
737
- 0, # current_test_dimension_index
738
- {}, # current_question_selections
739
- [] # test_results
740
- )
741
 
742
  # ==============================================================================
743
  # Gradio 界面定义 (Gradio UI Definition)
@@ -763,82 +859,78 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
763
  }
764
 
765
  with welcome_page:
766
- gr.Markdown("# AI Detective\nListen to the following conversations. Please determine which respondent is an AI.")
767
- start_btn = gr.Button("Start", variant="primary")
768
 
769
  with info_page:
770
- gr.Markdown("## Basic Information")
771
- username_input = gr.Textbox(label="Username", placeholder="Please enter your nickname")
772
- age_input = gr.Radio(["Under 18", "18-25", "26-35", "36-50", "Over 50"], label="Age")
773
- gender_input = gr.Radio(["Male", "Female", "Other"], label="Gender")
774
- education_input = gr.Radio(["High school or below", "Bachelor", "Master", "PhD", "Other (please specify)"], label="Education Level")
775
- education_other_input = gr.Textbox(label="Please enter your education", visible=False, interactive=False)
776
- ai_experience_input = gr.Radio([
777
- "Never used",
778
- "Occasionally exposed (e.g., watching others use)",
779
- "Used a few times, understand basic functions",
780
- "Use frequently, have some experience",
781
- "Very familiar, have in-depth experience with multiple AI tools"
782
- ], label="Familiarity with AI Tools")
783
- submit_info_btn = gr.Button("Submit and Start Learning Sample", variant="primary", interactive=False)
784
-
785
 
786
  with sample_page:
787
- gr.Markdown("## Sample Analysis\nPlease select a dimension to study and practice scoring. All dimensions share the same sample audio.")
788
- sample_dimension_selector = gr.Radio(DIMENSION_TITLES, label="Select Learning Dimension", value=DIMENSION_TITLES[0])
 
789
  with gr.Row():
790
  with gr.Column(scale=1):
791
- sample_audio = gr.Audio(label="Sample Audio", value=DIMENSIONS_DATA[0]["audio"])
792
  with gr.Column(scale=2):
793
  with gr.Column(visible=True) as interactive_view:
794
- gr.Markdown("#### Please rate the following features (0-5 points. 0 - Feature not present; 1 - Machine; 3 - Neutral; 5 - Human)")
795
  sample_sliders = [gr.Slider(minimum=0, maximum=5, step=1, label=f"Sub-dim {i+1}", visible=False, interactive=True) for i in range(MAX_SUB_DIMS)]
796
  with gr.Column(visible=False) as reference_view:
797
- gr.Markdown("### Reference Answer Explanation (1-5 points. 1 = Machine, 5 = Human)")
798
  reference_sliders = [gr.Slider(minimum=0, maximum=5, step=1, label=f"Sub-dim {i+1}", visible=False, interactive=False) for i in range(MAX_SUB_DIMS)]
799
  with gr.Row():
800
- reference_btn = gr.Button("Reference")
801
- go_to_pretest_btn = gr.Button("Got it, start the test", variant="primary")
802
 
803
  with pretest_page:
804
- gr.Markdown("## Test Instructions\n"
805
- "- For each question, you need to evaluate **all 5 dimensions**.\n"
806
- "- Within each dimension, please rate each appearing feature **from 1 to 5**.\n"
807
- "- **Scoring Guide:**\n"
808
- " - **0 points: Feature not present**;\n"
809
- " - **1 point: Strongly machine-like**;\n"
810
- " - **2 points: Somewhat machine-like**;\n"
811
- " - **3 points: Neutral**;\n"
812
- " - **4 points: Somewhat human-like**;\n"
813
- " - **5 points: Strongly human-like**.\n"
814
- "- After completing all dimensions, make a **final judgment** on whether the respondent is “Human” or “AI” based on your overall impression.\n"
815
- "- You can use the “Previous Dimension” and “Next Dimension” buttons to freely switch and modify scores across the 5 dimensions.")
816
- go_to_test_btn = gr.Button("Start Test", variant="primary")
 
 
 
817
 
818
-
819
-
820
-
821
  with test_page:
822
- gr.Markdown("## Formal Test")
823
  question_progress_text = gr.Markdown()
824
  test_dimension_title = gr.Markdown()
825
- test_audio = gr.Audio(label="Test Audio")
826
- gr.Markdown("--- \n ### Please rate the respondent (not the initiator) in the conversation based on the following features (0-5 points. 0 - Feature not present; 1 - Machine; 3 - Neutral; 5 - Human)")
827
- test_sliders = [gr.Slider(minimum=0, maximum=5, step=1, label=f"Sub-dim {i+1}", visible=False, interactive=True) for i in range(MAX_SUB_DIMS)]
 
 
828
  with gr.Row():
829
- prev_dim_btn = gr.Button("Previous Dimension")
830
- next_dim_btn = gr.Button("Next Dimension", variant="primary")
831
 
832
  with final_judgment_page:
833
- gr.Markdown("## Final Judgment")
834
- gr.Markdown("You have completed scoring for all dimensions. Please make a final judgment based on your overall impression.")
835
- final_human_robot_radio = gr.Radio(["👤 Human", "🤖 AI"], label="Please determine the respondent type (required)")
836
- submit_final_answer_btn = gr.Button("Submit Answer for This Question", variant="primary", interactive=False)
837
 
838
  with result_page:
839
- gr.Markdown("## Test Completed")
840
  result_text = gr.Markdown()
841
- back_to_welcome_btn = gr.Button("Back to Main Page", variant="primary")
842
 
843
  # ==============================================================================
844
  # 事件绑定 (Event Binding) & IO 列表定义
 
13
  from multiprocessing import TimeoutError
14
  from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
15
 
16
+ # Load dataset from HuggingFace
17
  dataset = load_dataset("intersteller2887/Turing-test-dataset-en", split="train")
18
  dataset = dataset.cast_column("audio", Audio(decode=False)) # Prevent newer versions of 'datasets' from calling 'torchcodec'
19
 
 
21
  target_audio_dir = "/home/user/app/audio"
22
  os.makedirs(target_audio_dir, exist_ok=True)
23
  COUNT_JSON_PATH = "/home/user/app/count.json"
 
24
  COUNT_JSON_REPO_PATH = "submissions/count.json" # Output directory (Huggingface dataset directory)
25
 
26
  # Copy recordings to the working directory
 
42
  print(sample1_audio_path)
43
 
44
  # ==============================================================================
45
+ # Data Definition
46
  # ==============================================================================
47
 
48
  DIMENSIONS_DATA = [
49
  {
50
  "title": "Semantic and Pragmatic Features",
51
+ "audio": sample1_audio_path,
52
  "sub_dims": [
53
+ "Memory Consistency: Human-like: Consistent memory in short contexts, and asks for clarification when memory deviations occur; Machine-like: Inconsistent memory across contexts and unable to detect or correct errors (e.g., forgetting key information and insisting on incorrect answers)",
54
+ "Logical Coherence: Human-like: Natural and smooth logic; Machine-like: Abrupt logical transitions or self-contradictions (e.g., suddenly changing topics without transition)",
55
+ "Pronunciation Accuracy: Human-like: Correct and natural pronunciation of words, with proper usage of polyphonic characters based on context; Machine-like: Unnatural pronunciation errors, mispronunciation of common polyphonic characters",
56
+ "Multilingual Mixing: Human-like: Multilingual mixing is often context-dependent (e.g., proper nouns, idiomatic expressions), with awkward or unnatural language switching; Machine-like: Rigid multilingual mixing without logical language switching",
57
+ "Imprecision in Language: Human-like: Uses vague expressions like 'more or less', 'probably', and may self-correct (e.g., 'no, no'); Machine-like: Rarely uses vague expressions, responses are precise and affirmative",
58
+ "Use of Fillers: Human-like: Frequently uses fillers (e.g., 'um', 'like') while thinking; Machine-like: Rare use of fillers or unnatural usage",
59
+ "Metaphor and Pragmatic Intent: Human-like: Uses metaphor, irony, and euphemism to convey layered meanings; Machine-like: Literal and direct, lacking semantic diversity, only capable of surface-level interpretation"
60
  ],
61
+ "reference_scores": [5, 5, 5, 0, 5, 5, 0]
62
  },
63
  {
64
  "title": "Non-Physiological Paralinguistic Features",
65
+ "audio": sample1_audio_path,
66
  "sub_dims": [
67
+ "Rhythm: Human-like: Speaking rate varies with semantic flow, occasional pauses or hesitations; Machine-like: Almost no pauses or mechanical pauses",
68
+ "Intonation: Human-like: Natural pitch rise or fall when expressing questions, surprise, or emphasis; Machine-like: Monotonous or overly regular pitch changes, inappropriate to the context",
69
+ "Stress: Human-like: Consciously emphasizes key words to highlight focus; Machine-like: No emphasis on words or abnormal emphasis placement",
70
+ "Auxiliary Vocalizations: Human-like: Produces context-appropriate non-verbal sounds, such as laughter or sighs; Machine-like: Contextually incorrect or mechanical auxiliary sounds, or completely absent"
71
  ],
72
+ "reference_scores": [5, 5, 5, 5]
73
  },
74
  {
75
  "title": "Physiological Paralinguistic Features",
76
+ "audio": sample1_audio_path,
77
  "sub_dims": [
78
+ "Micro-physiological Noise: Human-like: Presence of breathing sounds, saliva sounds, bubble noise, etc., naturally occurring during speech; Machine-like: Speech is overly clean or emits unnatural noises (e.g., electrical static)",
79
+ "Instability in Pronunciation: Human-like: Some irregularities in pronunciation (e.g., liaison, tremolo, slurred speech, nasal sounds); Machine-like: Pronunciation is overly clear and regular",
80
+ "Accent: Human-like: Natural regional accent or vocal traits; Machine-like: Stiff or unnatural accent"
81
  ],
82
+ "reference_scores": [5, 4, 4]
83
  },
84
  {
85
  "title": "Mechanical Persona",
86
+ "audio": sample1_audio_path,
87
  "sub_dims": [
88
+ "Sycophancy: Human-like: Judges whether to agree with requests or opinions based on context, doesn't always agree or echo; Machine-like: Frequently agrees, thanks, apologizes, excessively aligns with the other’s opinion, lacking genuine interaction",
89
+ "Written-style Expression: Human-like: Conversational, flexible, and varied expression; Machine-like: Responses are well-structured and formal, overly formal wording, frequent listing, and vague word choice"
90
  ],
91
  "reference_scores": [5, 5]
92
  },
93
  {
94
  "title": "Emotional Expression",
95
+ "audio": sample1_audio_path,
96
  "sub_dims": [
97
+ "Semantic Level: Human-like: Displays human-like emotional responses to contexts such as sadness or joy; Machine-like: Fails to respond emotionally to the other’s feelings, or uses vague and context-inappropriate emotional language",
98
+ "Acoustic Level: Human-like: Pitch, volume, and rhythm dynamically change with emotion; Machine-like: Emotional tone is patterned or context-inappropriate"
99
  ],
100
+ "reference_scores": [5, 5]
101
  }
102
  ]
103
 
104
 
105
  DIMENSION_TITLES = [d["title"] for d in DIMENSIONS_DATA]
106
+ SPECIAL_KEYWORDS = ["Multilingual Mixing", "Metaphor and Pragmatic Intent", "Auxiliary Vocalizations", "Accent"]
107
  MAX_SUB_DIMS = max(len(d['sub_dims']) for d in DIMENSIONS_DATA)
108
+ THE_SUB_DIMS = [d['sub_dims'] for d in DIMENSIONS_DATA]
109
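# SPECIAL_KEYWORDS appears to mark sub-dimensions that can legitimately be
# absent from a clip (no language mixing, no figurative speech, no auxiliary
# sounds, no accent): in update_test_dimension_view below, their sliders start
# at and allow a minimum of 0 ("feature not present"), all others start at 1.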
 
110
 
111
 
112
  # ==============================================================================
113
+ # Backend Function Definitions
114
  # ==============================================================================
115
 
116
+ # This version did not place the file read inside the file lock, so concurrent reads could happen
117
+ """def load_or_initialize_count_json(audio_paths):
118
  try:
119
  # Only try downloading if file doesn't exist yet
120
  if not os.path.exists(COUNT_JSON_PATH):
121
  downloaded_path = hf_hub_download(
122
+ repo_id="intersteller2887/Turing-test-dataset",
123
  repo_type="dataset",
124
  filename=COUNT_JSON_REPO_PATH,
125
  token=os.getenv("HF_TOKEN")
 
128
  with open(downloaded_path, "rb") as src, open(COUNT_JSON_PATH, "wb") as dst:
129
  dst.write(src.read())
130
  except Exception as e:
131
+ print(f"Could not download count.json from HuggingFace dataset: {e}")
132
 
133
  # Add filelock to /workspace/count.json
134
  lock_path = COUNT_JSON_PATH + ".lock"
 
162
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
163
  json.dump(count_data, f, indent=4, ensure_ascii=False)
164
 
165
+ return count_data"""
166
 
167
+ # Function that loads or initializes count.json
168
+ # Called when a user starts a challenge; it loads or initializes count.json in the working directory
169
+ # Initialization happens when count.json exists in neither the working directory nor the HuggingFace dataset
170
+ # Loading happens when count.json exists in the HuggingFace dataset but has not yet been copied to the working directory
171
+ # After loading/initialization, all newly added audio files are added to count.json with an initial count of 0
172
+ # Loading/initialization generates count.json in the working directory, shared by all users of this Space
173
 
174
+ # This version also places the file read inside the file lock, and is modified accordingly
175
+ def load_or_initialize_count_json(audio_paths):
176
+ # Add filelock to /workspace/count.json
177
+ lock_path = COUNT_JSON_PATH + ".lock"
178
+ with FileLock(lock_path, timeout=10):
179
+ # If count.json does not exist in the working directory, try to download it from HuggingFace dataset
180
+ if not os.path.exists(COUNT_JSON_PATH):
181
+ try:
182
+ # Save latest count.json to working directory
183
+ downloaded_path = hf_hub_download(
184
+ repo_id="intersteller2887/Turing-test-dataset-en",
185
+ repo_type="dataset",
186
+ filename=COUNT_JSON_REPO_PATH,
187
+ token=os.getenv("HF_TOKEN")
188
+ )
189
+ with open(downloaded_path, "rb") as src, open(COUNT_JSON_PATH, "wb") as dst:
190
+ dst.write(src.read())
191
+ except Exception:
192
+ pass
193
+
194
+ # If count.json exists in the working directory: load into count_data for potential update
195
+ if os.path.exists(COUNT_JSON_PATH):
196
+ with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
197
+ count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
198
+ # Otherwise initialize count_data as an OrderedDict
199
+ # This happens when count.json exists in neither the working directory nor the HuggingFace dataset
200
+ else:
201
+ count_data = collections.OrderedDict()
202
 
203
+ updated = False
204
+ sample_audio_files = {os.path.basename(d["audio"]) for d in DIMENSIONS_DATA}
 
 
 
205
 
206
+ # Guarantee that the sample recordings won't be taken into the pool
207
+ # Add newly added recordings to count.json
208
+ for path in audio_paths:
209
+ filename = os.path.basename(path)
210
+ if filename not in count_data:
211
+ if filename in sample_audio_files:
212
+ count_data[filename] = 999
213
+ else:
214
+ count_data[filename] = 0
215
+ updated = True
216
 
217
+ # Write updated count_data to /home/user/app/count.json
218
+ if updated or not os.path.exists(COUNT_JSON_PATH):
219
+ with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
220
+ json.dump(count_data, f, indent=4, ensure_ascii=False)
221
 
222
+ return
223
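# A hypothetical count.json after initialization (filenames illustrative):
# sample clips are pinned at 999 so they can never be sampled, new clips
# start at 0, and each serving increments the count.
#
# {
#     "sample1.wav": 999,
#     "clip_0001.wav": 0,
#     "clip_0002.wav": 1
# }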
 
224
+ # Avoid briefly replaying the previous (cached) audio when reaching the next question
225
+ def append_cache_buster(audio_path):
226
+ return f"{audio_path}?t={int(time.time() * 1000)}"
227
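# Hypothetical usage with a Gradio audio component (mirrors the commented-out
# call in update_sample_view), so the browser re-fetches the file instead of
# briefly replaying the cached previous clip:
#
# audio_up = gr.update(value=append_cache_buster(question["audio"]))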
+
228
+ # Function that samples questions from the available question set
229
+
230
+ # This version utilizes a given count_data to sample audio paths
231
+ """def sample_audio_paths(audio_paths, count_data, k=5, max_count=1): # k for questions per test; max_count for question limit in total
232
  eligible_paths = [p for p in audio_paths if count_data.get(os.path.basename(p), 0) < max_count]
233
 
234
  if len(eligible_paths) < k:
235
  raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
236
 
237
+ # Shuffle to avoid fixed selections resulting from the directory structure
238
  selected = random.sample(eligible_paths, k)
239
 
240
+ # Once a test is sampled, update the counts for these questions immediately
241
  for path in selected:
242
  filename = os.path.basename(path)
243
  count_data[filename] = count_data.get(filename, 0) + 1
244
 
245
+ # Add filelock to /workspace/count.json
246
  lock_path = COUNT_JSON_PATH + ".lock"
247
  with FileLock(lock_path, timeout=10):
248
  with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
249
  json.dump(count_data, f, indent=4, ensure_ascii=False)
250
 
251
+ return selected, count_data"""
252
 
253
+ # This version places the file read inside the file lock to guarantee correct updates of count.json
254
+ def sample_audio_paths(audio_paths, k=5, max_count=1):
255
+ # Add filelock to /workspace/count.json
256
+ lock_path = COUNT_JSON_PATH + ".lock"
257
 
258
+ # Load newest count.json
259
+ with FileLock(lock_path, timeout=10):
260
+ with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
261
+ count_data = json.load(f)
262
+
263
+ eligible_paths = [
264
+ p for p in audio_paths
265
+ if count_data.get(os.path.basename(p), 0) < max_count
266
+ ]
267
+
268
+ if len(eligible_paths) < k:
269
+ raise ValueError(f"可用音频数量不足(只剩 {len(eligible_paths)} 条 count<{max_count} 的音频),无法抽取 {k} 条")
270
+
271
+ selected = random.sample(eligible_paths, k)
272
+
273
+ # Update count_data
274
+ for path in selected:
275
+ filename = os.path.basename(path)
276
+ count_data[filename] = count_data.get(filename, 0) + 1
277
+
278
+ # Update count.json
279
+ with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
280
+ json.dump(count_data, f, indent=4, ensure_ascii=False)
281
+
282
+ # return selected, count_data
283
+ # Keep count_data atomic
284
 
285
+ return selected
286
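# The whole read-modify-write above sits inside one FileLock: if the read
# happened outside the lock, two concurrent sessions could both observe a
# count of 0 for the same clip and over-serve it. Minimal sketch of the
# pattern, with `name` standing for any sampled filename:
#
# with FileLock(COUNT_JSON_PATH + ".lock", timeout=10):
#     with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
#         counts = json.load(f)
#     counts[name] = counts.get(name, 0) + 1
#     with open(COUNT_JSON_PATH, "w", encoding="utf-8") as f:
#         json.dump(counts, f, indent=4, ensure_ascii=False)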
+
287
+ # ==============================================================================
288
+ # Frontend Function Definitions
289
+ # ==============================================================================
290
 
291
  # Save question_set in each user_data_state, preventing global sharing
292
  def start_challenge(user_data_state):
293
 
294
+ load_or_initialize_count_json(all_data_audio_paths)
295
+ # selected_audio_paths, updated_count_data = sample_audio_paths(all_data_audio_paths, k=5)
296
+ # Keep count_data atomic
297
+ selected_audio_paths = sample_audio_paths(all_data_audio_paths, k=5)
298
 
299
  question_set = [
300
  {"audio": path, "desc": f"这是音频文件 {os.path.basename(path)} 的描述"}
 
302
  ]
303
 
304
  user_data_state["question_set"] = question_set
 
 
305
 
306
+ # count_data is not needed in the user data
307
+ # user_data_state["updated_count_data"] = updated_count_data
308
+
309
+ return gr.update(visible=False), gr.update(visible=True), user_data_state
310
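# Since each Gradio session gets its own copy of a gr.State value, keeping
# question_set inside user_data_state (rather than in a module-level
# QUESTION_SET global, as an earlier version did) prevents concurrent users
# from overwriting each other's sampled questions.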
+
311
+ # This function toggles the visibility of the "其他(请注明)" input field based on the selected education choice
312
  def toggle_education_other(choice):
313
  is_other = (choice == "其他(请注明)")
314
  return gr.update(visible=is_other, interactive=is_other, value="")
315
 
316
+ # This function checks if the user information is complete
317
  def check_info_complete(username, age, gender, education, education_other, ai_experience):
318
  if username.strip() and age and gender and education and ai_experience:
319
  if education == "其他(请注明)" and not education_other.strip():
 
321
  return gr.update(interactive=True)
322
  return gr.update(interactive=False)
323
 
324
+ # This function updates user_data and initializes the sample page (called when the user submits their info)
325
  def show_sample_page_and_init(username, age, gender, education, education_other, ai_experience, user_data):
326
  final_edu = education_other if education == "其他(请注明)" else education
327
  user_data.update({
 
346
  # audio_up = gr.update(value=append_cache_buster(dim_data["audio"]))
347
  interactive_view_up = gr.update(visible=True)
348
  reference_view_up = gr.update(visible=False)
349
+ reference_btn_up = gr.update(value="参考")
350
  sample_slider_ups = []
351
  ref_slider_ups = []
352
  scores = dim_data.get("reference_scores", [])
 
366
  return empty_updates + slider_empty_updates
367
 
368
  def update_test_dimension_view(d_idx, selections):
369
+ # dimension = DIMENSIONS_DATA[d_idx]
 
 
 
 
370
  slider_updates = []
371
+ dim_data = DIMENSIONS_DATA[d_idx]
372
+ sub_dims = dim_data["sub_dims"]
373
+ dim_title = dim_data["title"]
374
+ existing_scores = selections.get(dim_data['title'], {})
375
+ progress_d = f"维度 {d_idx + 1} / {len(DIMENSIONS_DATA)}: **{dim_data['title']}**"
376
+
377
  for i in range(MAX_SUB_DIMS):
378
+ if i < len(sub_dims):
379
+ desc = sub_dims[i]
380
+ print(f"{desc} -> default value: {existing_scores.get(desc, 0)}")
381
+ name = desc.split(":")[0].strip()
382
+ default_value = 0 if name in SPECIAL_KEYWORDS else 1
383
+ value = existing_scores.get(desc, default_value)
384
+
385
+ slider_updates.append(gr.update(
386
+ visible=True,
387
+ label=desc,
388
+ minimum=default_value,
389
+ maximum=5,
390
+ step=1,
391
+ value=value,
392
+ interactive=True,
393
+ ))
394
+ # slider_updates.append(gr.update(
395
+ # visible=True,
396
+ # label=desc,
397
+ # minimum=0 if name in SPECIAL_KEYWORDS else 1,
398
+ # maximum=5,
399
+ # value = existing_scores.get(desc, 0),
400
+ # interactive=True,
401
+ # ))
402
  else:
403
+ slider_updates.append(gr.update(visible=False))
404
+ print(f"{desc} -> default value: {existing_scores.get(desc, 0)}")
405
+ # for i in range(MAX_SUB_DIMS):
406
+ # if i < len(dimension['sub_dims']):
407
+ # sub_dim_label = dimension['sub_dims'][i]
408
+ # value = existing_scores.get(sub_dim_label, 0)
409
+ # slider_updates.append(gr.update(visible=True, label=sub_dim_label, value=value))
410
+ # else:
411
+ # slider_updates.append(gr.update(visible=False, value=0))
412
 
413
  prev_btn_update = gr.update(interactive=(d_idx > 0))
414
  next_btn_update = gr.update(
415
+ value="进入最终判断" if d_idx == len(DIMENSIONS_DATA) - 1 else "下一维度",
416
  interactive=True
417
  )
418
 
 
420
 
421
  def init_test_question(user_data, q_idx):
422
  d_idx = 0
 
 
423
  question = user_data["question_set"][q_idx]
424
+ progress_q = f" {q_idx + 1} / {len(user_data['question_set'])}"
425
 
426
  initial_updates = update_test_dimension_view(d_idx, {})
427
  dim_title_update, prev_btn_update, next_btn_update = initial_updates[:3]
 
482
  next_btn_update,
483
  ) + tuple(slider_updates)
484
 
485
+ def toggle_reference_view(current):
486
+ if current == "参考":
487
+ return gr.update(visible=False), gr.update(visible=True), gr.update(value="返回")
488
+ else:
489
+ return gr.update(visible=True), gr.update(visible=False), gr.update(value="参考")
490
+
491
+ def back_to_welcome():
492
+ return (
493
+ gr.update(visible=True), # welcome_page
494
+ gr.update(visible=False), # info_page
495
+ gr.update(visible=False), # sample_page
496
+ gr.update(visible=False), # pretest_page
497
+ gr.update(visible=False), # test_page
498
+ gr.update(visible=False), # final_judgment_page
499
+ gr.update(visible=False), # result_page
500
+ {}, # user_data_state
501
+ 0, # current_question_index
502
+ 0, # current_test_dimension_index
503
+ {}, # current_question_selections
504
+ [] # test_results
505
+ )
506
+
507
  # ==============================================================================
508
+ # Retry Function Definitions
509
  # ==============================================================================
510
 
511
+ # Decorator that uses a ThreadPoolExecutor to retry a function with a timeout
512
  def retry_with_timeout(max_retries=3, timeout=10, backoff=1):
513
  def decorator(func):
514
  @wraps(func)
 
537
  return wrapper
538
  return decorator
539
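# A hypothetical application of the decorator (the decorated function is
# illustrative): each attempt runs in a worker thread with a 10 s limit,
# retried up to 3 times with a backoff pause between attempts.
#
# @retry_with_timeout(max_retries=3, timeout=10, backoff=1)
# def upload_submission(payload):
#     ...  # any Hub call that may hang or fail transiently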
 
540
+ def save_with_retry(all_results, user_data):
541
  # 尝试上传到Hugging Face Hub
542
  try:
543
  # 使用线程安全的保存方式
544
  with ThreadPoolExecutor(max_workers=1) as executor:
545
+ future = executor.submit(save_all_results_to_file, all_results, user_data)
546
  try:
547
  future.result(timeout=30) # 设置30秒超时
548
  return True
 
639
  gr.update(), gr.update(),
640
  ) + (gr.update(),) * MAX_SUB_DIMS + (all_results, result_str)"""
641
 
642
+ # user_data no longer contains "updated_count_data"; count data is read/written under a file lock, directly from the working directory
643
  def submit_question_and_advance(q_idx, d_idx, selections, final_choice, all_results, user_data):
644
  try:
645
  # 准备数据
 
665
  return init_q_updates + (all_results, gr.update(value=""))
666
  else:
667
  # 准备完整结果数据
668
+ result_str = "### 测试全部完成!\n\n你的提交结果概览:\n"
669
  for res in all_results:
670
  result_str += f"##### 最终判断: **{res['selections'].get('final_choice', '未选择')}**\n"
671
  for dim_title, dim_data in res['selections'].items():
 
676
 
677
  # 尝试上传(带重试)
678
  try:
679
+ # success = save_with_retry(all_results, user_data, user_data.get("updated_count_data"))
680
+ success = save_with_retry(all_results, user_data)
681
  except Exception as e:
682
  print(f"上传过程中发生错误: {e}")
683
  success = False
 
690
 
691
  # 准备数据包
692
  user_info_clean = {
693
+ k: v for k, v in user_data.items() if k not in ["question_set"]
694
  }
695
  final_data_package = {
696
  "user_info": user_info_clean,
 
707
 
708
  # 更新count.json(剔除未完成的题目)
709
  try:
710
+ with FileLock(COUNT_JSON_PATH + ".lock", timeout=5):
711
+ with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
712
+ count_data = json.load(f, object_pairs_hook=collections.OrderedDict)
713
+ count_update_success = update_count_with_retry(count_data, user_data["question_set"])
714
  except Exception as e:
715
  print(f"更新count.json失败: {e}")
716
  count_update_success = False
 
787
  except Exception as e:
788
  print(f"上传出错: {e}")"""
789
 
790
+ def save_all_results_to_file(all_results, user_data):
791
  repo_id = "intersteller2887/Turing-test-dataset-en"
792
  username = user_data.get("username", "user")
793
  timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
794
  submission_filename = f"submissions_{username}_{timestamp}.json"
795
 
796
  user_info_clean = {
797
+ k: v for k, v in user_data.items() if k not in ["question_set"]
798
  }
799
 
800
  final_data_package = {
 
819
  commit_message=f"Add new submission from {username}"
820
  )
821
 
822
+ try:
823
  with FileLock(COUNT_JSON_PATH + ".lock", timeout=5):
824
+ with open(COUNT_JSON_PATH, "r", encoding="utf-8") as f:
825
+ count_data_str = f.read()
826
+
827
  api.upload_file(
828
+ path_or_fileobj=bytes(count_data_str, "utf-8"),
829
  path_in_repo=COUNT_JSON_REPO_PATH,
830
  repo_id=repo_id,
831
  repo_type="dataset",
832
  token=hf_token,
833
  commit_message=f"Update count.json after submission by {username}"
834
  )
835
+ except Exception as e:
836
+ print(f"上传 count.json 失败: {e}")
 
837
 
838
  # ==============================================================================
839
  # Gradio 界面定义 (Gradio UI Definition)
 
859
  }
860
 
861
  with welcome_page:
862
+ gr.Markdown("# AI 识破者\n你将听到一系列对话,请判断哪个回应者是 AI")
863
+ start_btn = gr.Button("开始挑战", variant="primary")
864
 
865
  with info_page:
866
+ gr.Markdown("## 请提供一些基本信息")
867
+ username_input = gr.Textbox(label="用户名", placeholder="请输入你的昵称")
868
+ age_input = gr.Radio(["18岁以下", "18-25", "26-35", "36-50", "50岁以上"], label="年龄")
869
+ gender_input = gr.Radio(["", "", "其他"], label="性别")
870
+ education_input = gr.Radio(["高中及以下", "本科", "硕士", "博士", "其他"], label="学历")
871
+ education_other_input = gr.Textbox(label="请填写你的学历", visible=False, interactive=False)
872
+ ai_experience_input = gr.Radio(["从未使用过", "偶尔接触(如看别人用)", "使用过几次,了解基本功能", "经常使用,有一定操作经验", "非常熟悉,深入使用过多个 AI 工具"], label="对 AI 工具的熟悉程度")
873
+ submit_info_btn = gr.Button("提交并开始学习样例", variant="primary", interactive=False)
 
874
 
875
  with sample_page:
876
+
877
+ gr.Markdown("## 样例分析\n请选择一个维度进行学习和打分练习。所有维度共用同一个样例音频。")
878
+ sample_dimension_selector = gr.Radio(DIMENSION_TITLES, label="选择学习维度", value=DIMENSION_TITLES[0])
879
  with gr.Row():
880
  with gr.Column(scale=1):
881
+ sample_audio = gr.Audio(label="样例音频", value=DIMENSIONS_DATA[0]["audio"])
882
  with gr.Column(scale=2):
883
  with gr.Column(visible=True) as interactive_view:
884
+ gr.Markdown("#### 请为以下特征打分 (0-5分。0-特征无体现;1-机器;3-特征无偏向;5-人类)")
885
  sample_sliders = [gr.Slider(minimum=0, maximum=5, step=1, label=f"Sub-dim {i+1}", visible=False, interactive=True) for i in range(MAX_SUB_DIMS)]
886
  with gr.Column(visible=False) as reference_view:
887
+ gr.Markdown("### 参考答案解析")
888
  reference_sliders = [gr.Slider(minimum=0, maximum=5, step=1, label=f"Sub-dim {i+1}", visible=False, interactive=False) for i in range(MAX_SUB_DIMS)]
889
  with gr.Row():
890
+ reference_btn = gr.Button("参考")
891
+ go_to_pretest_btn = gr.Button("我明白了,开始测试", variant="primary")
892
 
893
  with pretest_page:
894
+ gr.Markdown("## 测试说明\n"
895
+ "- 对于每一道题,你都需要对全部 **5 个维度** 进行评估。\n"
896
+ "- 在每个维度下,请为出现的每个特征 **从0到5打分**。\n"
897
+ "- **评分解释如下:**\n"
898
+ " - **0 分:特征未体现** (有些特征一定会体现,所以按1到5打分);\n"
899
+ " - **1 分:极度符合机器特征**;\n"
900
+ " - **2 分:较为符合机器特征**;\n"
901
+ " - **3 分:无明显人类或机器倾向**;\n"
902
+ " - **4 分:较为符合人类特征**;\n"
903
+ " - **5 分:极度符合人类特征**。\n"
904
+ "- 完成所有维度后,请根据整体印象对回应方的身份做出做出“人类”或“机器人”的 **最终判断**。\n"
905
+ "- 你可以使用“上一维度”和“下一维度”按钮在5个维度间自由切换和修改分数。\n"
906
+ "## 特别注意\n"
907
+ "- 我们希望您���判断每个维度上**回应者**的表现是**偏向人还是机器**,分数的大小反映回应者的语音类人的程度,而**不是**这个维度体现的程度多少\n(如读音正确也不代表是人类,读音错误也不代表是机器,您应当判断的是“听到的发音更偏向机器还是人类”)\n"
908
+ "- 即使您一开始就已经很肯定回应方的身份,同样应当**独立地**对每个维度上回应方的表现进行细致的评判。比如您很肯定回应方是机器,也需要独立地对每个维度判断,而非简单地将每个维度归为偏机器。")
909
+ go_to_test_btn = gr.Button("开始测试", variant="primary")
910
 
 
 
 
911
  with test_page:
912
+ gr.Markdown("## 正式测试")
913
  question_progress_text = gr.Markdown()
914
  test_dimension_title = gr.Markdown()
915
+ test_audio = gr.Audio(label="测试音频")
916
+ gr.Markdown("--- \n ### 请为对话中的回应者(非发起者)针对以下特征打分 (0-5分。0-特征无体现;1-机器;3-特征无偏向;5-人类)")
917
+
918
+ test_sliders = [gr.Slider(minimum=0, maximum=5, step=1, label=f"Sub-dim {i+1}", visible=False, interactive=True, show_label=True) for i in range(MAX_SUB_DIMS)]
919
+
920
  with gr.Row():
921
+ prev_dim_btn = gr.Button("上一维度")
922
+ next_dim_btn = gr.Button("下一维度", variant="primary")
923
 
924
  with final_judgment_page:
925
+ gr.Markdown("## 最终判断")
926
+ gr.Markdown("您已完成对所有维度的评分。请根据您的综合印象,做出最终判断。")
927
+ final_human_robot_radio = gr.Radio(["👤 人类", "🤖 机器人"], label="请判断回应者类型 (必填)")
928
+ submit_final_answer_btn = gr.Button("提交本题答案", variant="primary", interactive=False)
929
 
930
  with result_page:
931
+ gr.Markdown("## 测试完成")
932
  result_text = gr.Markdown()
933
+ back_to_welcome_btn = gr.Button("返回主界面", variant="primary")
934
 
935
  # ==============================================================================
936
  # 事件绑定 (Event Binding) & IO 列表定义