"""
Extract keywords from the given question and options.

Sample run:
python3 extractKeyword.py --output-dir ego_base_link --question questions/500questions.jsonl --gptmodel "gpt-4-1106-preview"
"""
import argparse
import json
import os

from tqdm import tqdm

from src.run_gpt import run_gpt

# You may add multiple keys if you want parallel calls
dict_api = {
    "api_key": "ADD",
}
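# A minimal sketch of a multi-key setup (the extra key name is a hypothetical
# placeholder; all values are passed to run_gpt via list(dict_api.values()),
# and how run_gpt spreads requests across them is assumed, not shown here):
# dict_api = {
#     "api_key": "ADD",
#     "api_key_2": "ADD",
# }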

PROMPT = (
    "Think step-by-step and for each option, identify all the specified activities. "
    "Each description of an activity should use the active voice with plain verbs, contain fewer than six words, "
    "and retain as many original terms from the options as possible.\n"
    "Here are the options:\n\n"
    "option 0: {Putop0}\n"
    "option 1: {Putop1}\n"
    "option 2: {Putop2}\n"
    "option 3: {Putop3}\n"
    "option 4: {Putop4}\n"
    "option 5: {Putquestion}.\n"
    "All of the options have now been introduced. 'C' represents the camera operator in the options. "
    "Your answer should follow the JSON format shown below and should only include the JSON result. "
    "Do not output any warnings or notes under any circumstances. Instead, adhere strictly to the provided JSON format example.\n"
    "Here is one example of the output format:\n"
    "{\"option 0\": [\"plays soccer\", \"go to school\"], \"option 1\": [\"go to the gym\", \"go to school\"], "
    "\"option 2\": [\"go to school\", \"dry hair\"], \"option 3\": [\"plays basketball\", \"look at the tree\"], "
    "\"option 4\": [\"plays soccer\", \"drop the ball\"], \"option 5\": [\"turn the table\", \"go to school\"]}"
)


def main(args):
    # 1. Create output directories
    os.makedirs(args.output_dir, exist_ok=True)
    job_dir = os.path.join(args.output_dir, "extractedKeywords")
    os.makedirs(job_dir, exist_ok=True)

    # 2. Build the output file name (based on --question)
    question_file_name = os.path.basename(args.question).replace(".jsonl", "")
    output_summary_path = os.path.join(job_dir, f"{question_file_name}.jsonl")
    print(f"Saving outputs to: {output_summary_path}")

    # 3. Read the question file
    with open(os.path.expanduser(args.question), "r") as f:
        question_data = [json.loads(line) for line in f]
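    # Each loaded entry is expected to provide the keys accessed below:
    # "q_uid", "question", "option 0" through "option 4", and optionally "CA"
    # (the ground-truth label). An illustrative line (values are made up):
    # {"q_uid": "abc123", "question": "What does C do next?", "option 0": "...", ..., "CA": 2}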

    # 4. Construct final prompts
    final_prompts = []
    final_info = []
    for entry in tqdm(question_data, desc="Building prompts"):
        q_uid = entry["q_uid"]

        # Insert each option + question into the embedded prompt
        cur_prompt = (
            PROMPT
            .replace("{Putop0}", entry["option 0"])
            .replace("{Putop1}", entry["option 1"])
            .replace("{Putop2}", entry["option 2"])
            .replace("{Putop3}", entry["option 3"])
            .replace("{Putop4}", entry["option 4"])
            .replace("{Putquestion}", entry["question"])
        )
        final_prompts.append(cur_prompt)

        # Track data for JSON output
        info = {
            "q_uid": q_uid,
            "prompt": cur_prompt,
            "option 0": entry["option 0"],
            "option 1": entry["option 1"],
            "option 2": entry["option 2"],
            "option 3": entry["option 3"],
            "option 4": entry["option 4"],
            "question": entry["question"],
        }
        # Include ground-truth label if present
        if "CA" in entry:
            info["CA"] = entry["CA"]
        final_info.append(info)

    # 5. Call GPT
    print("Sending prompts to GPT. This may take a while...")
    output_results = run_gpt(
        texts=final_prompts,
        api_keys=list(dict_api.values()),
        max_tokens=2000,
        model=args.gptmodel,
        temperature=args.temperature,
        num_threads=5,    # Adjust as needed
        backoff_time=60,  # Adjust as needed
        silent=False,
        dataset="extractKeyword",
    )
    output_results = list(output_results)

    # 6. Save results
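    # Each output line mirrors its input entry plus an "answer" field holding the
    # raw model response. An illustrative line (values are made up; the response
    # should be the JSON string requested by PROMPT):
    # {"q_uid": "abc123", "prompt": "...", "question": "...", "answer": "{\"option 0\": [...], ...}"}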
    with open(output_summary_path, "w") as outfile:
        for i, info in enumerate(final_info):
            info["answer"] = output_results[i]
            outfile.write(json.dumps(info) + "\n")

    print(f"Done! Results written to {output_summary_path}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=str, required=True,
                        help="Directory to store the resulting JSONL file.")
    parser.add_argument("--question", type=str, required=True,
                        help="Path to the JSONL file with question data (e.g., 500questions.jsonl).")
    parser.add_argument("--gptmodel", type=str, default="gpt-4-1106-preview",
                        help="The GPT model to call.")
    parser.add_argument("--temperature", type=float, default=None,
                        help="Temperature parameter for GPT.")
    args = parser.parse_args()
    main(args)