Yuchan committed
Update AlphaS2S.py

AlphaS2S.py CHANGED (+8 -6)
@@ -51,22 +51,25 @@ TOKENIZER_PATH = "ko_unigram.model"
 
 if not os.path.exists(DATA_PATH):
     download_file(
-        "https://huggingface.co/datasets/Yuchan5386/
+        "https://huggingface.co/datasets/Yuchan5386/Multiturn/resolve/main/dataset_shuffled.jsonl?download=true",
         DATA_PATH
     )
 
 if not os.path.exists(TOKENIZER_PATH):
     download_file(
-        "https://huggingface.co/datasets/Yuchan5386/
+        "https://huggingface.co/datasets/Yuchan5386/Multiturn/resolve/main/unigram.model?download=true",
         TOKENIZER_PATH
     )
 
 sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
 
 pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
-start_id = sp.piece_to_id("<
-
-
+start_id = sp.piece_to_id("<sos>")
+context_s_id = sp.piece_to_id("<context>")
+context_e_id = sp.piece_to_id("</context>")
+user_s_id = sp.piece_to_id("<user>")
+user_e_id = sp.piece_to_id("</user>")
+end_id = sp.piece_to_id("<eos>")
 unk_id = sp.piece_to_id("<unk>")
 vocab_size = sp.get_piece_size()
 print(f"✅ Vocabulary size: {vocab_size}")
@@ -77,7 +80,6 @@ def text_to_ids(text):
 def ids_to_text(ids):
     return sp.decode(ids)
 
-
 # =======================
 # 2) Dataset generation function (same as the existing code)
 # =======================
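The snippet calls a download_file(url, path) helper that is defined elsewhere in AlphaS2S.py and is not part of this diff. A minimal sketch of what such a helper could look like, assuming a plain streaming HTTP download (the name and argument order come from the call sites above; everything else is an assumption):

    import urllib.request

    def download_file(url: str, path: str, chunk_size: int = 1 << 20) -> None:
        # Stream the response to disk in 1 MiB chunks so a large JSONL
        # dataset never has to fit in memory at once.
        with urllib.request.urlopen(url) as resp, open(path, "wb") as out:
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                out.write(chunk)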
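The change swaps the old start-token lookup for six explicit special-token ids. One caveat worth knowing: SentencePiece's piece_to_id() maps an out-of-vocabulary piece to the unk id rather than returning -1, so the `!= -1` guard on <pad> never fires; comparing against sp.unk_id() is a more reliable existence check. A small validation sketch (the token names are taken from the diff; the fail-fast behavior is an assumption):

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor("ko_unigram.model")
    SPECIAL_TOKENS = ["<pad>", "<sos>", "<context>", "</context>",
                      "<user>", "</user>", "<eos>", "<unk>"]

    for tok in SPECIAL_TOKENS:
        tok_id = sp.piece_to_id(tok)
        # piece_to_id() returns unk_id for unknown pieces, so an id equal
        # to unk_id means the piece is missing (except for <unk> itself).
        if tok != "<unk>" and tok_id == sp.unk_id():
            raise ValueError(f"special token {tok!r} is not in the tokenizer")
        print(f"{tok:>10} -> {tok_id}")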
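The new <context>/<user> tag ids suggest that each training example frames the conversation history and the latest user turn ahead of the model's reply. The dataset-generation function itself is untouched by this diff ("same as the existing code"), so the layout below is purely an illustrative assumption about how these ids could be combined, reusing the variable names introduced above:

    def build_example(context: str, user: str, reply: str,
                      max_len: int = 128) -> list[int]:
        # Assumed layout:
        #   <sos> <context> ... </context> <user> ... </user> reply <eos>
        ids = (
            [start_id, context_s_id] + sp.encode(context) + [context_e_id]
            + [user_s_id] + sp.encode(user) + [user_e_id]
            + sp.encode(reply) + [end_id]
        )
        # Truncate, then right-pad with pad_id to a fixed length.
        ids = ids[:max_len]
        return ids + [pad_id] * (max_len - len(ids))

Whatever the real layout is, padding with pad_id (resolved above with a fallback to 0) keeps batches rectangular without injecting real vocabulary ids.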