File size: 3,316 Bytes
12f2e48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import json
import os
import random
from datasets import Dataset

# Prompt templates for the "user" turn of each training example.
# Every phrase contains exactly one "<sound>" placeholder, which the
# downstream model/tokenizer substitutes with the audio token(s) for the
# clip listed in the example's "sounds" field. One phrase is sampled
# uniformly at random per dataset entry.
PRETRAIN_PHRASES = [
    "What is happening in the given music <sound>?",
    "Describe the sound. <sound>",
    "Describe the music. <sound>",
    "<sound> Provide a description of the music.",
    "<sound> Provide a description of the sound.",
    "Can you interpret <sound>?",
    "Please explain what's happening in <sound>",
    "What does <sound> represent?",
    "Could you describe <sound> for me?",
    "What's the content of <sound>?",
    "Can you depict <sound>?",
    "What is <sound>?",
    "In the music clip, <sound>, what is happening?",
    "Provide a description of the music. <sound>",
    "Provide a description of the sound. <sound>",
    "Provide a caption for the sound. <sound>",
    "Provide a caption for the music. <sound>",
]

def convert_json_to_dataset(input_file, output_folder, train_ratio=0.8):
    """Convert a JSONL caption file into train/val HuggingFace datasets.

    Each line of ``input_file`` must be a JSON object with "location",
    "main_caption" and "alt_caption" keys. Every entry produces TWO
    examples (one per caption), both paired with the same randomly chosen
    prompt from PRETRAIN_PHRASES. All generated examples are also appended
    to ``gpt-cache.jsonl`` in the output folder.

    Args:
        input_file: Path to the input JSONL file.
        output_folder: Destination folder; receives "train/", "val/" and
            the ``gpt-cache.jsonl`` cache file. Created if missing.
        train_ratio: Fraction of the (shuffled) entries used for training;
            the remainder becomes the validation split.
    """
    # Skip blank lines (e.g. a trailing newline) instead of crashing on
    # json.loads(""); pin the encoding so results don't depend on locale.
    with open(input_file, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    os.makedirs(output_folder, exist_ok=True)
    cache_path = os.path.join(output_folder, "gpt-cache.jsonl")

    # Shuffle so the train/val split is random rather than file-order.
    random.shuffle(data)

    train_size = int(len(data) * train_ratio)
    train_data = data[:train_size]
    val_data = data[train_size:]

    # Append mode preserves cache entries from prior runs. The context
    # manager guarantees the handle is closed even if dataset generation
    # or save_to_disk raises (the original leaked it on error).
    with open(cache_path, "a", encoding="utf-8") as cache:

        def gen(entries):
            for idx, entry in enumerate(entries):
                sound_location = entry["location"]

                # One prompt is sampled per entry and reused for both the
                # main and the alt caption example.
                user_content = random.choice(PRETRAIN_PHRASES)

                captions = (entry["main_caption"], entry["alt_caption"])
                for offset, caption in enumerate(captions, start=1):
                    example = {
                        # Zero-padded 1-based id, unique across the two
                        # examples emitted per entry.
                        "id": f"{2 * idx + offset:07}",
                        "sounds": [sound_location],
                        "messages": [
                            {"role": "user", "content": user_content},
                            {"role": "assistant", "content": caption},
                        ],
                    }
                    cache.write(json.dumps(example) + "\n")
                    yield example

        train_ds = Dataset.from_generator(
            gen,
            num_proc=1,  # NOTE: cache writes are not multi-process safe; keep at 1 unless the cache is removed
            gen_kwargs={"entries": train_data},
        )
        train_ds.save_to_disk(os.path.join(output_folder, "train"))

        val_ds = Dataset.from_generator(
            gen,
            num_proc=1,
            gen_kwargs={"entries": val_data},
        )
        val_ds.save_to_disk(os.path.join(output_folder, "val"))

if __name__ == "__main__":
    # Edit these two paths for your environment before running.
    source_jsonl = "/Users/anuradhachopra/Downloads/MusicBench_train.json"
    destination = "/Users/anuradhachopra/data/musicbench_multitoken"

    convert_json_to_dataset(source_jsonl, destination, train_ratio=0.8)