File size: 3,711 Bytes
12f2e48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from typing import List
import argparse
import json
import os
import random
import openai

from datasets import Dataset, load_dataset

from multi_token.constants import ROLE_ASSISTANT, ROLE_USER

# System-prompt template sent to the chat model by _build_convo. The
# `{captions}` placeholder is filled via str.format with one `Tags: "..."` line
# per tag string, and the result is stripped of surrounding whitespace before
# it is sent.
PROMPT = """
You are helping write captions for audio clips.

Here are the tags for the audio clip you are captioning:
{captions}

Write a brief caption for the audio clip.
"""

# Pool of user-side questions; one is picked at random for every generated
# example. Each phrase contains exactly one `<sound>` placeholder marking where
# the audio clip is referenced in the user message.
PRETRAIN_PHRASES = [
    "What is happening in <sound>?",
    "Describe the sound. <sound>",
    "<sound> Provide a description of the audio.",
    "Can you interpret <sound>?",
    "Please explain what's happening in <sound>",
    "What does <sound> represent?",
    "Could you describe <sound> for me?",
    "What's the content of <sound>?",
    "Can you depict <sound>?",
    "What is <sound>?",
    "In the audio clip, <sound>, what is happening?",
    "Provide a description of the sound. <sound>",
    "Provide a caption for the sound. <sound>",
]

# JSON schema for the arguments of the `write_caption` function: an object
# with a single required string field, "caption".
_WRITE_CAPTION_PARAMETERS = {
    "type": "object",
    "properties": {"caption": {"type": "string"}},
    "required": ["caption"],
}

# Tool definitions handed to the chat-completions call as `tools=`; the only
# tool is `write_caption`, through which the model returns its caption.
OPENAI_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "write_caption",
            "description": "Write a caption for an audio clip",
            "parameters": _WRITE_CAPTION_PARAMETERS,
        },
    }
]


def _build_convo(row) -> dict:
    """Turn one dataset row into a single-turn captioning conversation.

    The row's tags are formatted into ``PROMPT`` and sent to the OpenAI chat
    API as a system message. The request forces the ``write_caption`` tool, so
    the caption is read from the tool call's JSON arguments. That caption
    becomes the assistant reply to a question drawn at random from
    ``PRETRAIN_PHRASES``.

    Args:
        row: Mapping that provides the ``"metadataTags"`` and ``"url"`` keys.

    Returns:
        A dict with ``"sounds"`` (a one-element list holding the row's URL)
        and ``"messages"`` (the user question followed by the assistant
        caption).

    Raises:
        KeyError: If the row lacks a required key, or the tool arguments have
            no ``"caption"`` field.
        ValueError: If the response carries no tool call (this includes
            ``json.JSONDecodeError`` for malformed tool arguments).
        Exception: Whatever the OpenAI client raises for a failed request.
    """
    client = openai.Client()

    captions = [row["metadataTags"]]
    sounds = [row["url"]]

    captions_text = "\n".join(f'Tags: "{cap}"' for cap in captions)
    prompt = PROMPT.format(captions=captions_text).strip()

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[{"role": "system", "content": prompt}],
        tools=OPENAI_TOOLS,
        # Force the model to answer through the tool so the reply is JSON.
        tool_choice={"type": "function", "function": {"name": "write_caption"}},
    )

    tool_calls = completion.choices[0].message.tool_calls
    if not tool_calls:
        # Fail with a clear message instead of an opaque "NoneType is not
        # subscriptable" when the model returns no tool call.
        raise ValueError("OpenAI response did not contain a write_caption tool call")
    resp = json.loads(tool_calls[0].function.arguments)
    caption = resp["caption"]

    q = random.choice(PRETRAIN_PHRASES)

    example = {
        "sounds": sounds,
        "messages": [
            {
                "role": ROLE_USER,
                "content": q,
            },
            {
                "role": ROLE_ASSISTANT,
                "content": caption,
            },
        ],
    }
    return example


def main(args):
    """Build the captioning dataset and save it to ``args.output_folder``.

    Loads the ``Chr0my/Epidemic_sounds`` train split, visits its rows in a
    random order, asks ``_build_convo`` for a conversation per row, and writes
    the resulting ``Dataset`` to disk. Every successfully built example is
    also appended to a JSONL cache file inside ``args.cache_folder``.

    Args:
        args: Namespace providing ``cache_folder``, ``output_folder`` and
            ``num_proc``. If it also has ``num_examples``, at most that many
            rows are attempted (rows that fail are skipped, not replaced).
    """
    data = load_dataset("Chr0my/Epidemic_sounds", split="train")

    os.makedirs(args.cache_folder, exist_ok=True)

    def gen(seeds):
        """Yield one example per row index in ``seeds``, caching each to disk."""
        # An empty index list has no first element to name the cache file after.
        if not seeds:
            return

        # The cache file is named after the first index this call received;
        # presumably each worker gets its own slice of `seeds` and therefore
        # its own file -- confirm against Dataset.from_generator's sharding.
        cache_path = os.path.join(args.cache_folder, f"gpt-cache.{seeds[0]}.jsonl")

        # `with` closes the cache even if the consumer stops iterating early.
        with open(cache_path, "a") as cache:
            for s in seeds:
                selected_row = data[s]
                try:
                    example = _build_convo(selected_row)
                    cache.write(json.dumps(example) + "\n")
                    # Persist immediately so a crash does not lose results
                    # that were already obtained from the API.
                    cache.flush()
                except Exception as e:
                    # Best effort: report the failure and move on to the next row.
                    print(e)
                    continue
                else:
                    yield example

    idxs = list(range(len(data)))
    random.shuffle(idxs)

    # Cap the number of rows sent to the API at --num_examples. getattr keeps
    # callers that pass a namespace without this attribute working.
    num_examples = getattr(args, "num_examples", None)
    if num_examples is not None:
        idxs = idxs[: max(num_examples, 0)]

    ds = Dataset.from_generator(
        gen,
        num_proc=args.num_proc,
        gen_kwargs={"seeds": idxs},
    )
    ds.save_to_disk(args.output_folder)


if __name__ == "__main__":
    # Command-line entry point: declare the options from a table, parse them,
    # and hand the resulting namespace to main().
    parser = argparse.ArgumentParser()
    option_table = (
        # (short flag, long flag, value type, default)
        ("-o", "--output_folder", str, "/data/clap-gpt-pretrain"),
        ("-c", "--cache_folder", str, "/data/clap-gpt-pretrain-cache"),
        ("-n", "--num_examples", int, 500_000),
        ("-p", "--num_proc", int, 10),
    )
    for short_flag, long_flag, value_type, default_value in option_table:
        parser.add_argument(
            short_flag, long_flag, type=value_type, default=default_value
        )
    args = parser.parse_args()
    main(args)