| import json | |
| from openai import OpenAI | |
| from transformers import GPT2Tokenizer | |
def openai_complete_if_cache(
    model="gpt-4o", prompt=None, system_prompt=None, history_messages=None, **kwargs
) -> str:
    """Send one chat-completion request to OpenAI and return the reply text.

    Despite the name, nothing is cached here: every call constructs a new
    client and issues a fresh request.

    Args:
        model: Name of the chat model to query.
        prompt: Content of the final ``user`` message.
        system_prompt: Optional ``system`` message placed first; omitted when
            falsy.
        history_messages: Optional iterable of earlier chat messages, inserted
            between the system message and the user prompt. ``None`` (the
            default) means no history.
        **kwargs: Forwarded unchanged to ``chat.completions.create``.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # Constructed without arguments, so the client uses its own default
    # configuration.
    openai_client = OpenAI()

    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    # ``None`` replaces the former mutable ``[]`` default, which would have
    # been a single list object shared by every call.
    if history_messages is not None:
        messages.extend(history_messages)
    messages.append({"role": "user", "content": prompt})

    response = openai_client.chat.completions.create(
        model=model, messages=messages, **kwargs
    )
    return response.choices[0].message.content
# Module-level GPT-2 tokenizer, created once when this module is executed and
# used by get_summary to split contexts into tokens and join them back.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
def get_summary(context, tot_tokens=2000):
    """Build an excerpt of *context* from one window near each end.

    The text is tokenized with the module-level ``tokenizer``. Two windows of
    ``tot_tokens // 2`` tokens each are taken: one starting 1000 tokens after
    the beginning, and one ending 1000 tokens before the end. The windows are
    concatenated (head first) and converted back to a string.

    Args:
        context: Text to excerpt.
        tot_tokens: Target total number of tokens across both windows.

    Returns:
        The excerpt as a string. Contexts of 1000 tokens or fewer yield an
        empty string, and contexts shorter than ``2000 + tot_tokens`` tokens
        may yield windows that are truncated or overlap.
    """
    # Number of tokens skipped at each end of the document.
    edge_skip = 1000

    tokens = tokenizer.tokenize(context)
    half_tokens = tot_tokens // 2

    start_tokens = tokens[edge_skip : edge_skip + half_tokens]
    # The stop index must be negative so the tail window mirrors the head
    # window; a positive stop (the previous code) made this slice empty for
    # long contexts instead of selecting tokens near the end.
    end_tokens = tokens[-(edge_skip + half_tokens) : -edge_skip]

    return tokenizer.convert_tokens_to_string(start_tokens + end_tokens)
# Dataset classes to process. Each class has one input file of unique contexts
# and produces one output file of generated questions.
clses = ["agriculture"]

for cls in clses:
    # Read explicitly as UTF-8 rather than relying on the platform's default
    # text encoding.
    with open(
        f"../datasets/unique_contexts/{cls}_unique_contexts.json",
        mode="r",
        encoding="utf-8",
    ) as f:
        unique_contexts = json.load(f)

    # Condense every context to a short excerpt and join them into a single
    # description of the whole dataset.
    summaries = [get_summary(context) for context in unique_contexts]
    total_description = "\n\n".join(summaries)

    # The prompt lines are kept flush-left so that no extra indentation is
    # introduced into the text sent to the model.
    prompt = f"""
Given the following description of a dataset:
{total_description}
Please identify 5 potential users who would engage with this dataset. For each user, list 5 tasks they would perform with this dataset. Then, for each (user, task) combination, generate 5 questions that require a high-level understanding of the entire dataset.
Output the results in the following structure:
- User 1: [user description]
- Task 1: [task description]
- Question 1:
- Question 2:
- Question 3:
- Question 4:
- Question 5:
- Task 2: [task description]
...
- Task 5: [task description]
- User 2: [user description]
...
- User 5: [user description]
...
"""

    result = openai_complete_if_cache(model="gpt-4o", prompt=prompt)

    file_path = f"../datasets/questions/{cls}_questions.txt"
    # Model output can contain non-ASCII characters; write UTF-8 explicitly so
    # the write does not fail under a narrower default encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(result)

    print(f"{cls}_questions written to {file_path}")