# Convert MusicBench caption JSONL into Hugging Face train/val datasets.
import json
import os
import random

from datasets import Dataset
# Prompt templates for the user turn of each generated chat example.
# The "<sound>" placeholder is substituted downstream with the audio token(s).
PRETRAIN_PHRASES = [
    "What is happening in the given music <sound>?",
    "Describe the sound. <sound>",
    "Describe the music. <sound>",
    "<sound> Provide a description of the music.",
    "<sound> Provide a description of the sound.",
    "Can you interpret <sound>?",
    "Please explain what's happening in <sound>",
    "What does <sound> represent?",
    "Could you describe <sound> for me?",
    "What's the content of <sound>?",
    "Can you depict <sound>?",
    "What is <sound>?",
    "In the music clip, <sound>, what is happening?",
    "Provide a description of the music. <sound>",
    "Provide a description of the sound. <sound>",
    "Provide a caption for the sound. <sound>",
    "Provide a caption for the music. <sound>",
]
def convert_json_to_dataset(input_file, output_folder, train_ratio=0.8):
    """Convert a JSONL caption file into Hugging Face train/val datasets.

    Each input line must be a JSON object with "location", "main_caption",
    and "alt_caption" keys.  Every entry yields two chat-style examples
    (one per caption) that share a single randomly chosen user prompt.
    The splits are saved to <output_folder>/train and <output_folder>/val,
    and every generated example is also appended to a JSONL cache file.

    Args:
        input_file: Path to the input JSONL file (one JSON object per line).
        output_folder: Destination directory; created if it does not exist.
        train_ratio: Fraction of shuffled entries placed in the train split.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip blank lines so a trailing newline in the file does not crash
        # json.loads (the original raised on empty lines).
        data = [json.loads(line) for line in f if line.strip()]

    os.makedirs(output_folder, exist_ok=True)
    cache_path = os.path.join(output_folder, "gpt-cache.jsonl")

    # Shuffle before splitting so train/val are drawn from the same distribution.
    random.shuffle(data)
    train_size = int(len(data) * train_ratio)
    train_data = data[:train_size]
    val_data = data[train_size:]

    # Context manager guarantees the cache handle is closed even if dataset
    # generation or save_to_disk raises (the original leaked it on errors).
    with open(cache_path, "a", encoding="utf-8") as cache:

        def gen(entries):
            """Yield two chat examples (main + alt caption) per entry."""
            for idx, entry in enumerate(entries):
                sound_location = entry["location"]
                # One prompt drawn per entry, shared by both of its examples.
                user_content = random.choice(PRETRAIN_PHRASES)
                captions = (entry["main_caption"], entry["alt_caption"])
                for offset, caption in enumerate(captions, start=1):
                    example = {
                        # Ids are 1-based and zero-padded to 7 digits:
                        # entry idx -> ids 2*idx+1 (main) and 2*idx+2 (alt).
                        "id": f"{2 * idx + offset:07}",
                        "sounds": [sound_location],
                        "messages": [
                            {"role": "user", "content": user_content},
                            {"role": "assistant", "content": caption},
                        ],
                    }
                    cache.write(json.dumps(example) + "\n")
                    yield example

        for split_name, split_entries in (("train", train_data), ("val", val_data)):
            split_ds = Dataset.from_generator(
                gen,
                num_proc=1,  # keep 1: gen writes to a single shared cache handle
                gen_kwargs={"entries": split_entries},
            )
            split_ds.save_to_disk(os.path.join(output_folder, split_name))
if __name__ == "__main__":
    # Edit these two paths to point at your input file and output directory.
    source_path = "/Users/anuradhachopra/Downloads/MusicBench_train.json"
    destination_dir = "/Users/anuradhachopra/data/musicbench_multitoken"
    convert_json_to_dataset(source_path, destination_dir, train_ratio=0.8)