Spaces:
Runtime error
Runtime error
File size: 3,711 Bytes
12f2e48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
from typing import List
import argparse
import json
import os
import random
import openai
from datasets import Dataset, load_dataset
from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
# Prompt template sent to the chat model as the system message.
# `_build_convo` fills the `{captions}` placeholder via `str.format` with the
# clip's tag lines and strips the surrounding whitespace before sending it.
PROMPT = """
You are helping write captions for audio clips.
Here are the tags for the audio clip you are captioning:
{captions}
Write a brief caption for the audio clip.
"""
# User-side questions for the pretraining examples. One is picked at random per
# clip; every phrase contains exactly one `<sound>` placeholder.
PRETRAIN_PHRASES = [
    "What is happening in <sound>?",
    "Describe the sound. <sound>",
    "<sound> Provide a description of the audio.",
    "Can you interpret <sound>?",
    "Please explain what's happening in <sound>",
    "What does <sound> represent?",
    "Could you describe <sound> for me?",
    "What's the content of <sound>?",
    "Can you depict <sound>?",
    "What is <sound>?",
    # Fixed typo: this phrase previously read "audo clip".
    "In the audio clip, <sound>, what is happening?",
    "Provide a description of the sound. <sound>",
    "Provide a caption for the sound. <sound>",
]
# JSON schema for the arguments of the `write_caption` tool: an object with a
# single required string field, `caption`.
_WRITE_CAPTION_PARAMETERS = {
    "type": "object",
    "properties": {"caption": {"type": "string"}},
    "required": ["caption"],
}

# Tool list passed to the chat completion call. It declares one function,
# `write_caption`, so the model's answer arrives as structured JSON arguments.
OPENAI_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "write_caption",
            "description": "Write a caption for an audio clip",
            "parameters": _WRITE_CAPTION_PARAMETERS,
        },
    }
]
def _build_convo(row) -> dict:
    """Build one pretraining example for an audio clip using a GPT-written caption.

    The clip's tags are inserted into ``PROMPT`` and sent to the chat model as
    the system message. The request forces the ``write_caption`` tool, so the
    caption is read from the tool call's JSON arguments. It is then paired
    with a question drawn at random from ``PRETRAIN_PHRASES``.

    Args:
        row: A dataset row; only ``row["metadataTags"]`` and ``row["url"]``
            are read.

    Returns:
        A dict with ``"sounds"`` (a one-element list holding the row's URL) and
        ``"messages"`` (the user question followed by the assistant caption).

    Raises:
        Nothing is caught here: errors from the OpenAI client, ``json.loads``,
        or a missing ``"caption"`` key / tool call propagate to the caller.
    """
    # NOTE(review): a new client is constructed on every call; consider
    # sharing one per worker process if this shows up as overhead.
    client = openai.Client()

    # The whole tags value is treated as a single "caption" entry and rendered
    # with its default string formatting.
    captions = [row["metadataTags"]]
    sounds = [row["url"]]

    captions_text = "\n".join(f'Tags: "{cap}"' for cap in captions)
    prompt = PROMPT.format(captions=captions_text).strip()

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[{"role": "system", "content": prompt}],
        tools=OPENAI_TOOLS,
        # Force the model to answer through the tool instead of free text.
        tool_choice={"type": "function", "function": {"name": "write_caption"}},
    )
    resp = json.loads(completion.choices[0].message.tool_calls[0].function.arguments)
    caption = resp["caption"]

    q = random.choice(PRETRAIN_PHRASES)
    return {
        "sounds": sounds,
        "messages": [
            {"role": ROLE_USER, "content": q},
            {"role": ROLE_ASSISTANT, "content": caption},
        ],
    }
def main(args):
    """Caption rows of the Epidemic Sounds dataset with GPT and save the result.

    Loads the ``train`` split of ``Chr0my/Epidemic_sounds``, shuffles the row
    indices, builds one example per index with ``_build_convo`` and saves the
    resulting ``Dataset`` to ``args.output_folder``. Each generated example is
    also appended to a JSONL cache file in ``args.cache_folder``.

    Args:
        args: Parsed command-line namespace. Reads ``output_folder``,
            ``cache_folder``, ``num_proc`` and, when present, ``num_examples``
            (an upper bound on how many rows are processed).
    """
    data = load_dataset("Chr0my/Epidemic_sounds", split="train")
    os.makedirs(args.cache_folder, exist_ok=True)

    def gen(seeds):
        """Yield an example for each row index in *seeds*, caching each one."""
        if not seeds:
            # Nothing to do; also avoids indexing seeds[0] on an empty list.
            return

        # The first index names the cache file, so each distinct list of
        # indices gets its own file.
        cache_path = os.path.join(
            args.cache_folder, f"gpt-cache.{seeds[0]}.jsonl"
        )
        # `with` closes the file even if the generator is closed early or an
        # unexpected error escapes the loop.
        with open(cache_path, "a", encoding="utf-8") as cache:
            for s in seeds:
                selected_row = data[s]
                try:
                    example = _build_convo(selected_row)
                    cache.write(json.dumps(example) + "\n")
                except Exception as e:
                    # Best effort: report the failure and skip this row.
                    print(e)
                else:
                    yield example

    idxs = list(range(len(data)))
    random.shuffle(idxs)

    # Honour --num_examples as a cap on how many rows are sent to the API.
    limit = getattr(args, "num_examples", None)
    if limit is not None:
        idxs = idxs[:limit]

    # Per the `datasets` documentation, a list passed in gen_kwargs is split
    # across workers when num_proc > 1.
    ds = Dataset.from_generator(
        gen,
        num_proc=args.num_proc,
        gen_kwargs={"seeds": idxs},
    )
    ds.save_to_disk(args.output_folder)
if __name__ == "__main__":
    # Script entry point: declare the command-line flags, then hand the parsed
    # namespace straight to main().
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-o", "--output_folder", type=str, default="/data/clap-gpt-pretrain"
    )
    cli.add_argument(
        "-c", "--cache_folder", type=str, default="/data/clap-gpt-pretrain-cache"
    )
    cli.add_argument("-n", "--num_examples", type=int, default=500_000)
    cli.add_argument("-p", "--num_proc", type=int, default=10)
    main(cli.parse_args())
|