"""Build a GPT-captioned audio dataset from the ``Chr0my/Epidemic_sounds`` tags.

For each selected row, the row's ``metadataTags`` are sent to
``gpt-3.5-turbo-1106`` with a forced ``write_caption`` tool call. The returned
caption becomes the assistant turn of a two-message conversation whose user
turn is a random entry of ``PRETRAIN_PHRASES``. Every example is appended to a
JSONL cache file as it is produced, and the resulting dataset is saved to disk.
"""

from typing import Dict, List
import argparse
import functools
import json
import os
import random

import openai
from datasets import Dataset, load_dataset

from multi_token.constants import ROLE_ASSISTANT, ROLE_USER

# System prompt sent to the model; ``{captions}`` is filled with the tag lines.
# The value is stripped before use (see ``_build_convo``).
PROMPT = (
    " You are helping write captions for audio clips. "
    "Here are the tags for the audio clip you are captioning: "
    "{captions} "
    "Write a brief caption for the audio clip. "
)

# User-side questions; one is picked at random per example.
# NOTE(review): several phrases read as if a placeholder for the audio clip
# was lost (e.g. "What is ?"), and "audo" looks like a typo for "audio". The
# strings are left untouched because they are written verbatim into the
# dataset -- confirm against the intended templates before changing them.
PRETRAIN_PHRASES = [
    "What is happening in ?",
    "Describe the sound. ",
    " Provide a description of the audio.",
    "Can you interpret ?",
    "Please explain what's happening in ",
    "What does represent?",
    "Could you describe for me?",
    "What's the content of ?",
    "Can you depict ?",
    "What is ?",
    "In the audo clip, , what is happening?",
    "Provide a description of the sound. ",
    "Provide a caption for the sound. ",
]

# Tool schema used to force the model to answer as ``{"caption": "<text>"}``.
OPENAI_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "write_caption",
            "description": "Write a caption for an audio clip",
            "parameters": {
                "type": "object",
                "properties": {
                    "caption": {
                        "type": "string",
                    },
                },
                "required": ["caption"],
            },
        },
    }
]


@functools.lru_cache(maxsize=1)
def _get_client() -> openai.Client:
    """Return one OpenAI client per process, created on first use."""
    return openai.Client()


def _build_convo(row) -> Dict:
    """Ask the model for a caption of ``row`` and wrap it as a conversation.

    Args:
        row: A dataset row providing ``"metadataTags"`` (the tags to caption)
            and ``"url"`` (stored as the example's single sound).

    Returns:
        A dict with ``"sounds"`` (a one-element list holding the row's url) and
        ``"messages"`` (a user question followed by the assistant caption).

    Raises:
        Exception: Anything raised by the OpenAI call or by decoding the tool
            call arguments (missing tool call, invalid JSON, missing
            ``"caption"`` key) propagates to the caller.
    """
    client = _get_client()

    captions = [row["metadataTags"]]
    sounds = [row["url"]]

    captions_text = "\n".join(f'Tags: "{cap}"' for cap in captions)
    prompt = PROMPT.format(captions=captions_text).strip()

    # tool_choice forces a ``write_caption`` call, so the reply is structured
    # JSON arguments rather than free text.
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[{"role": "system", "content": prompt}],
        tools=OPENAI_TOOLS,
        tool_choice={"type": "function", "function": {"name": "write_caption"}},
    )
    resp = json.loads(completion.choices[0].message.tool_calls[0].function.arguments)
    caption = resp["caption"]

    q = random.choice(PRETRAIN_PHRASES)

    example = {
        "sounds": sounds,
        "messages": [
            {
                "role": ROLE_USER,
                "content": q,
            },
            {
                "role": ROLE_ASSISTANT,
                "content": caption,
            },
        ],
    }
    return example


def main(args):
    """Generate captioned examples and save them as a dataset.

    Args:
        args: Parsed CLI namespace. Reads ``cache_folder``, ``output_folder``,
            ``num_proc`` and, when present, ``num_examples`` (upper bound on
            the number of rows that are attempted).
    """
    data = load_dataset("Chr0my/Epidemic_sounds", split="train")

    os.makedirs(args.cache_folder, exist_ok=True)

    def gen(seeds: List[int]):
        """Yield one example per index in ``seeds``, skipping rows that fail."""
        if not seeds:
            # Nothing to do; also avoids an IndexError on seeds[0] below.
            return
        cache_path = os.path.join(args.cache_folder, f"gpt-cache.{seeds[0]}.jsonl")
        # The context manager closes the cache file even if the generator is
        # abandoned early or an unexpected error escapes the loop.
        with open(cache_path, "a", encoding="utf-8") as cache:
            for s in seeds:
                selected_row = data[s]
                try:
                    example = _build_convo(selected_row)
                    cache.write(json.dumps(example) + "\n")
                except Exception as e:
                    # Best effort: report the failure and move on to the next row.
                    print(e)
                    continue
                yield example

    idxs = list(range(len(data)))
    random.shuffle(idxs)

    # Honour --num_examples, which was previously parsed but ignored. It caps
    # the rows attempted, so rows that fail still count against the limit.
    limit = getattr(args, "num_examples", None)
    if limit is not None:
        idxs = idxs[: max(limit, 0)]

    ds = Dataset.from_generator(
        gen,
        num_proc=args.num_proc,
        gen_kwargs={"seeds": idxs},
    )
    ds.save_to_disk(args.output_folder)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o",
        "--output_folder",
        type=str,
        default="/data/clap-gpt-pretrain",
    )
    parser.add_argument(
        "-c",
        "--cache_folder",
        type=str,
        default="/data/clap-gpt-pretrain-cache",
    )
    parser.add_argument(
        "-n",
        "--num_examples",
        type=int,
        default=500_000,
        help="maximum number of dataset rows to attempt to caption",
    )
    parser.add_argument("-p", "--num_proc", type=int, default=10)
    args = parser.parse_args()
    main(args)