Phương committed on
Commit 0dc226d · 1 Parent(s): 099442b

Upload folder using huggingface_hub

jsonl.py ADDED
@@ -0,0 +1,19 @@
+ import sentencepiece as spm
+
+ input_file = "output.txt"
+ model_prefix = "my_tokenizer"
+ vocab_size = 18816
+ model_type = "word"
+ input_sentence_size = 1000000
+ shuffle_input_sentence = True
+
+ pad_token = '<pad>'
+ bos_token = '<start>'
+ eos_token = '<end>'
+ unk_token = '<unk>'
+
+ spm.SentencePieceTrainer.train(
+     f"--input={input_file} --model_prefix={model_prefix} --vocab_size={vocab_size} --model_type={model_type} --input_sentence_size={input_sentence_size} --shuffle_input_sentence={shuffle_input_sentence} --max_sentence_length=40000 --pad_id=0 --pad_piece={pad_token} --unk_id=1 --unk_piece={unk_token} --bos_id=2 --bos_piece={bos_token} --eos_id=3 --eos_piece={eos_token}"
+ )
+
+ tokenizer = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
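
After this script runs, the trained model can be loaded and exercised directly. A minimal usage sketch (not part of the commit; the sample string is illustrative, and encode/decode are standard SentencePieceProcessor methods):

import sentencepiece as spm

tokenizer = spm.SentencePieceProcessor(model_file="my_tokenizer.model")

# Encode a line of text into token ids, then decode back to a string.
ids = tokenizer.encode("a line taken from output.txt", out_type=int)
print(ids)
print(tokenizer.decode(ids))

# Special-token ids should match the flags passed to the trainer above:
# pad=0, unk=1, bos=2, eos=3.
print(tokenizer.pad_id(), tokenizer.unk_id(), tokenizer.bos_id(), tokenizer.eos_id())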
jsonl_to_parquet.py ADDED
@@ -0,0 +1,10 @@
+ from datasets import load_dataset
+
+ # Load the dataset with only the "text" column
+ dataset = load_dataset("nRuaif/MusicLM", split="train", columns=["text"], streaming=True)
+
+ # Open a file to write the text data to
+ with open("output.txt", "w") as f:
+     # Stream through the dataset and write the "text" column to the file, separated by newline
+     for example in dataset:
+         f.write(example["text"] + "\n")
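
The dump written here is the output.txt that jsonl.py above feeds into SentencePieceTrainer. A quick sanity check of the dump before training (a sketch, not part of the commit; islice is only used to preview a few lines):

from itertools import islice

with open("output.txt") as f:
    preview = list(islice(f, 3))   # first three dumped examples
    remaining = sum(1 for _ in f)  # count the rest of the lines
print(f"{len(preview) + remaining} lines dumped; first examples: {preview}")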
my_tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b9a99c4bc4789fe99b2d681e4580277310b6afb7c5879dd06542461054a2cad2
+ size 578324
my_tokenizer.vocab ADDED
The diff for this file is too large to render.
 
tmp_tf_gcs_fs_pointer_53013 ADDED
Binary file (8 Bytes).