Phương
committed on
Commit
·
0dc226d
1
Parent(s):
099442b
Upload folder using huggingface_hub
Browse files
- jsonl.py +19 -0
- jsonl_to_parquet.py +10 -0
- my_tokenizer.model +3 -0
- my_tokenizer.vocab +0 -0
- tmp_tf_gcs_fs_pointer_53013 +0 -0
jsonl.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sentencepiece as spm
|
2 |
+
|
3 |
+
# Training configuration for the SentencePiece tokenizer.
input_file = "output.txt"  # text corpus handed to the trainer
model_prefix = "my_tokenizer"  # "<model_prefix>.model" is loaded at the end of the script
vocab_size = 18816
model_type = "word"
input_sentence_size = 1000000
shuffle_input_sentence = True

# Special pieces; their ids (0-3) are pinned in the train() call below.
pad_token = '<pad>'
bos_token = '<start>'
eos_token = '<end>'
unk_token = '<unk>'

# Options are passed as keyword arguments instead of one interpolated
# "--flag=value ..." string, so a value containing whitespace (for example a
# path with a space) is not split into separate, invalid flags.
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type=model_type,
    input_sentence_size=input_sentence_size,
    shuffle_input_sentence=shuffle_input_sentence,
    max_sentence_length=40000,
    pad_id=0,
    pad_piece=pad_token,
    unk_id=1,
    unk_piece=unk_token,
    bos_id=2,
    bos_piece=bos_token,
    eos_id=3,
    eos_piece=eos_token,
)

# Load "<model_prefix>.model" into a processor for later use.
tokenizer = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
|
jsonl_to_parquet.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
|
3 |
+
# Load the dataset with only the "text" column, streamed rather than
# downloaded up front (streaming=True).
dataset = load_dataset("nRuaif/MusicLM", split="train", columns=["text"], streaming=True)

# Write every example's "text" value to output.txt, each followed by a newline.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default (which can fail on non-ASCII text).
with open("output.txt", "w", encoding="utf-8") as f:
    for example in dataset:
        f.write(example["text"] + "\n")
|
my_tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b9a99c4bc4789fe99b2d681e4580277310b6afb7c5879dd06542461054a2cad2
|
3 |
+
size 578324
|
my_tokenizer.vocab
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tmp_tf_gcs_fs_pointer_53013
ADDED
Binary file (8 Bytes). View file
|
|