import random

# Seed the module-level RNG once at import time so that few-shot example
# sampling (random.sample in _render_example_section) is reproducible.
random.seed(42)

# Few-shot example pool for multiple-choice (MCQA) prompts.
# NOTE(review): the 'Answer' field deliberately mixes formats (bare letter
# such as 'A' vs. letter plus option text) — presumably to show the model
# both acceptable answer styles; confirm this is intentional.
mcqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Options": ["A. A shield.", "B. A sword.", "C. An Axe.", "D. A spear."],
        "Answer": "A. A shield"
    },
    {
        "Q": "What card does the male judge pick?",
        "Options": ["A. 2 of spades.", "B. 2 of diamonds.", "C. 2 of hearts.", "D. 2 of clubs."],
        "Answer": "A"
    },
    {
        "Q": "Who finally find the lost city?",
        "Options": ["A. Terra preta.", "B. Fawcett.", "C. European expeditions.", "D. Dr.Michael Heckenberger."],
        "Answer": "D. Dr.Michael Heckenberger."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Options": ["A. Ice hockey.", "B. Soccer.", "C. Rugby.", "D. Basketball."],
        "Answer": "C"
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Options": ["A. Red balls.", "B. Lights.", "C. Green stars.", "D. Icicles."],
        "Answer": "C. Green stars."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Options": ["A. Audible app.", "B. Music listening app.", "C. Shopping app.", "D. Video online playing app."],
        "Answer": "A"
    },
    {
        "Q": "What country's practice game is this?",
        "Options": ["A. UK.", "B. USA.", "C. Canada.", "D. Australia."],
        "Answer": "B. USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Options": ["A. China.", "B. Italy.", "C. USA.", "D. France."],
        "Answer": "A"
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Options": ["A. Golgi apparatus (Golgi body).", "B. Nucleus.", "C. Ribosome.", "D. Mitochondrion."],
        "Answer": "A. Golgi apparatus (Golgi body)."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Options": ["A. Third.", "B. First.", "C. Second.", "D. Last."],
        "Answer": "D"
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Options": ["A. USA team.", "B. Canadian team.", "C. Ghana team.", "D. South Africa team."],
        "Answer": "B"
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Options": ["A. He is an athlete of the Chinese team.", "B. He is an athlete of the Jamaican team.", "C. He is a neutral individual athlete.", "D. It is not mentioned in the video."],
        "Answer": "C. He is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Options": ["A. Lunar Ridge.", "B. Collapsed lava tubes.", "C. Rift valley systems.", "D. Scratch marks."],
        "Answer": "B"
    },
    {
        "Q": "Which woman works as a chef?",
        "Options": ["A. Diamante.", "B. Carola Ordenes.", "C. Amina.", "D. Ghizlane."],
        "Answer": "A"
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Options": ["A. Mahjong.", "B. Go.", "C. Chinese chess.", "D. Five-in-a-row."],
        "Answer": "C. Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Options": ["A. Hot glue.", "B. Pieces of burlap.", "C. Florals.", "D. Plastic bottles."],
        "Answer": "D"
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Options": ["A. Harlow Shapley.", "B. Walter Baade.", "C. William Herschel.", "D. Henrietta Swan Levitt."],
        "Answer": "A"
    }
]

# Few-shot example pool for open-ended (OpenQA) prompts; answers are
# complete sentences instead of option letters.
openqa_example_pool = [
    {
        "Q": "What does Jon Snow use to fight with Ramsay Bolton?",
        "Answer": "Jon Snow uses a shield to fight with Ramsay Bolton."
    },
    {
        "Q": "What card does the male judge pick?",
        "Answer": "The male judge picks the 2 of spades."
    },
    {
        "Q": "Who finally finds the lost city?",
        "Answer": "Dr. Michael Heckenberger is the person who finally finds the lost city."
    },
    {
        "Q": "What sport are the two teams of athletes playing?",
        "Answer": "The two teams of athletes are playing rugby."
    },
    {
        "Q": "What item is not used to decorate the Christmas tree?",
        "Answer": "Green stars are not used to decorate the Christmas tree."
    },
    {
        "Q": "What is the main subject matter of the advertisement featured in the video?",
        "Answer": "The main subject matter of the advertisement featured in the video is the Audible app."
    },
    {
        "Q": "What country's practice game is this?",
        "Answer": "This is a practice game from the USA."
    },
    {
        "Q": "According to the video, which team ultimately won?",
        "Answer": "According to the video, the team that ultimately won is China."
    },
    {
        "Q": "Which cellular structure is responsible for receiving proteins according to the video?",
        "Answer": "According to the video, the Golgi apparatus (Golgi body) is responsible for receiving proteins."
    },
    {
        "Q": "At the beginning, what is the player's rank?",
        "Answer": "At the beginning, the player's rank is last."
    },
    {
        "Q": "Which team in the video reached the finish line first?",
        "Answer": "In the video, the Canadian team reached the finish line first."
    },
    {
        "Q": "What is the identity of the athlete in the video who committed fouls on all attempts except the first one?",
        "Answer": "The athlete in the video who committed fouls on all attempts except the first one is a neutral individual athlete."
    },
    {
        "Q": "The main character of the video is observing the surface of the moon when he notices a straight line, what is it?",
        "Answer": "The straight line that the main character notices on the surface of the moon is collapsed lava tubes."
    },
    {
        "Q": "Which woman works as a chef?",
        "Answer": "The woman who works as a chef is Diamante."
    },
    {
        "Q": "What kind of chess are the old people in the video playing?",
        "Answer": "The old people in the video are playing Chinese chess."
    },
    {
        "Q": "Which ingredient is not used in the video?",
        "Answer": "Plastic bottles are not used in the video."
    },
    {
        "Q": "Who does the video focus on regarding their work with globular clusters?",
        "Answer": "The video focuses on Harlow Shapley regarding his work with globular clusters."
    }
]


def _render_example_section(example_pool, include_options):
    """Sample three examples from *example_pool* and render a '## EXAMPLES:' section.

    Each example is rendered as a numbered pseudo-dict literal. When
    *include_options* is true the MCQA-style ``'Options'`` list is included;
    otherwise only ``'Q'``/``'Answer'`` (OpenQA style) are emitted.

    Returns the section as a single string ready to append to a system prompt.
    """
    chosen_examples = random.sample(example_pool, 3)
    parts = ["## EXAMPLES:\n"]
    for number, example in enumerate(chosen_examples, start=1):
        if include_options:
            options = example["Options"]
            parts.append(
                f"{number}. {{'Q': '{example['Q']}',\n"
                "    'Options': [\n"
                f"        '{options[0]}',\n"
                f"        '{options[1]}',\n"
                f"        '{options[2]}',\n"
                f"        '{options[3]}'\n"
                "    ],\n"
                f"    'Answer': '{example['Answer']}'}}\n"
                "\n"
            )
        else:
            parts.append(
                f"{number}. {{'Q': '{example['Q']}',\n"
                f"    'Answer': '{example['Answer']}'}}\n"
                "\n"
            )
    return "".join(parts)


def prompt_miradata_based_text_constraint_mcqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for a constrained MCQA generation task.

    The "constraint" variant asks the model to anchor each question to a
    specific action, event, or composite feature so the answer is unique
    within a long video.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        Tuple ``(system_prompt, user_prompt)`` of prompt strings.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        # BUGFIX: the header, rule 1, and its sub-bullets previously lacked
        # trailing '\n', so they all concatenated onto one run-on line.
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        " - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        " - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        " - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        # BUGFIX: was the garbled "**Description DescrGroundingiption**";
        # corrected to match the OpenQA constraint variant's rule 3.
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        " - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )
    example_part = _render_example_section(mcqa_example_pool, include_options=True)
    system_prompt = task_inst_part + example_part
    user_prompt = (
        "I have provided you with three different aspect description of a specific clip from a long video. Below is these description:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these description and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt


def prompt_miradata_based_text_mcqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for an unconstrained MCQA generation task.

    Unlike the constraint variant, no uniqueness-anchoring (action/event/
    composite feature) is required; extra question-writing guidelines are
    appended to the system prompt instead.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        Tuple ``(system_prompt, user_prompt)`` of prompt strings.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n"
        "You must also provide **4 answer options (A–D)**, with only one correct answer, which is clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Options'`: A list of four answer options labeled 'A', 'B', 'C', and 'D'.\n"
        " - `'Answer'`: The correct answer (e.g., `'A'`, `'B'`, etc.).\n"
        "\n"
    )
    example_part = _render_example_section(mcqa_example_pool, include_options=True)
    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined object.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **Plausible Distractors**: Wrong options should be visually similar (e.g., other kitchen tools if asking about a pan).\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]"
    )
    system_prompt = task_inst_part + example_part + guidelines_part
    user_prompt = (
        "I have provided you with three different aspect description of a specific clip in a video. Below is these description:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these description and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Options': ['A. ...', 'B. ...', 'C. ...', 'D. ...'], 'Answer': 'Correct answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt


def prompt_miradata_based_text_constraint_openqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for a constrained open-ended QA task.

    Open-ended counterpart of the constraint MCQA prompt: the answer is a
    complete sentence and the question must be anchored to a specific
    action, event, or composite feature for uniqueness.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        Tuple ``(system_prompt, user_prompt)`` of prompt strings.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc. **while using an event, action or composite feature to constrain the question, thereby ensuring answer uniqueness** in long videos.\n\n"
        # BUGFIX: the header, rule 1, and its sub-bullets previously lacked
        # trailing '\n', so they all concatenated onto one run-on line.
        "## CRITICAL RULES:\n"
        "1. **Uniqueness Guarantee**: Each question must include either:\n"
        " - A **specific action** (e.g., 'What does the woman use to cut the ribbon?'), OR\n"
        " - A **specific event** (e.g., 'What falls off the table when the dog bumps into it?'), OR\n"
        " - A **composite feature** (e.g., 'What does the girl in the red dress hold in her hand?').\n"
        "2. **Visual Grounding**: Answers must be verifiable from a single frame or short clip.\n"
        "3. **Description Grounding**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "4. **No Temporal Reasoning**: Avoid questions requiring comparing frames (e.g., 'what happened next?').\n"
        "5. **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "6. **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "7. **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n\n"
        "## OUTPUT FORMAT: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )
    # Use the OpenQA example pool (answers rendered without options).
    example_part = _render_example_section(openqa_example_pool, include_options=False)
    system_prompt = task_inst_part + example_part
    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip from a long video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- You must use an action, event or composite feature in the question to constrain the question, thereby ensuring answer uniqueness.\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt


def prompt_miradata_based_text_openqa(dense_caption, background_caption, main_object_caption):
    """Build (system_prompt, user_prompt) for an unconstrained open-ended QA task.

    Open-ended counterpart of the plain MCQA prompt: answers are complete
    sentences and additional question-writing guidelines are appended to
    the system prompt.

    Args:
        dense_caption: Dense description of the video clip.
        background_caption: Background description of the clip.
        main_object_caption: Main-object description of the clip.

    Returns:
        Tuple ``(system_prompt, user_prompt)`` of prompt strings.
    """
    task_inst_part = (
        "You are an AI assistant tasked with generating **high-quality object recognition questions** based on a video snippet description from a long video.\n\n"
        "## TASK:\n"
        "Generate **one** high-quality **object recognition question** that requires identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.\n\n"
        "The answer must be provided as a complete sentence, clearly supported by the visual or narrative content of the video description.\n\n"
        "## INSTRUCTIONS:\n"
        "- **Focus on Visual Entities**: The question must test the model’s ability to recognize **objects**.\n"
        "- **Ground in Visuals**: All answers must be verifiable by pausing a single frame or short clip. Avoid actions, motivations, or temporal reasoning.\n"
        "- **Ground in the Description**: Ensure that the answer is grounded in the video's description, not general knowledge or external information.\n"
        "- **Avoid Extraneous Information**: Do not rely on subtitles, voiceovers, or audio cues unless explicitly mentioned in the description.\n"
        "- **Clear and Logical Phrasing**: Keep the question clear, specific, and logically phrased to avoid ambiguity.\n"
        "- **Output Format**: Format the output as a list of dictionaries with the following keys:\n"
        " - `'Q'`: The question.\n"
        " - `'Answer'`: The correct answer as a complete sentence.\n"
        "\n"
    )
    # Use the OpenQA example pool (answers rendered without options).
    example_part = _render_example_section(openqa_example_pool, include_options=False)
    guidelines_part = (
        "## GUIDELINES FOR CREATING QUESTIONS:\n"
        "- **Specificity**: Ask about singular, clearly defined objects.\n"
        "- **Visual Certainty**: Ensure the correct answer is unambiguous and directly observable in the description.\n"
        "- **Description Grounding**: Base all questions and answers on the video description.\n"
        "- **No Implicit Knowledge**: Avoid questions requiring domain knowledge (e.g., 'What brand is the car?' is invalid unless the logo is visible).\n"
        "- **Complete Sentence Answers**: Always provide the answer as a grammatically correct, complete sentence.\n"
        "\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]"
    )
    system_prompt = task_inst_part + example_part + guidelines_part
    user_prompt = (
        "I have provided you with three different aspect descriptions of a specific clip in a video. Below are these descriptions:\n\n"
        "**Dense Description:**\n"
        f"{dense_caption}\n\n"
        "**Background Description:**\n"
        f"{background_caption}\n\n"
        "**Main Object Description:**\n"
        f"{main_object_caption}\n\n"
        "Based on these descriptions and the system instructions, generate **one** high-quality object recognition question-and-answer pair.\n\n"
        "## REQUIREMENTS:\n"
        "- The question must focus on **identifying visible objects, such as people, vehicles, animals, furniture, tools, electronic devices, clothing, food, household items, etc.**\n"
        "- The answer must be directly observable in the description without any reasoning or inference.\n\n"
        "## OUTPUT FORMAT:\n"
        "[{'Q': 'Your question here...', 'Answer': 'Your complete sentence answer here...'}]\n\n"
        "**Only return the QA pair in the specified JSON list format.**"
    )
    return system_prompt, user_prompt