Yuchan committed
Update AlphaS2S.py

AlphaS2S.py CHANGED (+8 -6)
@@ -51,22 +51,25 @@ TOKENIZER_PATH = "ko_unigram.model"
 
 if not os.path.exists(DATA_PATH):
     download_file(
-        "https://huggingface.co/datasets/Yuchan5386/
+        "https://huggingface.co/datasets/Yuchan5386/Multiturn/resolve/main/dataset_shuffled.jsonl?download=true",
         DATA_PATH
     )
 
 if not os.path.exists(TOKENIZER_PATH):
     download_file(
-        "https://huggingface.co/datasets/Yuchan5386/
+        "https://huggingface.co/datasets/Yuchan5386/Multiturn/resolve/main/unigram.model?download=true",
         TOKENIZER_PATH
     )
 
 sp = spm.SentencePieceProcessor(TOKENIZER_PATH)
 
 pad_id = sp.piece_to_id("<pad>") if sp.piece_to_id("<pad>") != -1 else 0
-start_id = sp.piece_to_id("<
-
-
+start_id = sp.piece_to_id("<sos>")
+context_s_id = sp.piece_to_id("<context>")
+context_e_id = sp.piece_to_id("</context>")
+user_s_id = sp.piece_to_id("<user>")
+user_e_id = sp.piece_to_id("</user>")
+end_id = sp.piece_to_id("<eos>")
 unk_id = sp.piece_to_id("<unk>")
 vocab_size = sp.get_piece_size()
 print(f"✅ Vocabulary size: {vocab_size}")
@@ -77,7 +80,6 @@ def text_to_ids(text):
 def ids_to_text(ids):
     return sp.decode(ids)
 
-
 # =======================
 # 2) Dataset generation function (same as the existing code)
 # =======================
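The snippet calls a download_file(url, path) helper that is defined elsewhere in AlphaS2S.py and is not part of this diff. A minimal sketch of what such a helper could look like, assuming a plain streaming HTTP download (the name and argument order come from the call sites above; everything else is an assumption):

    import urllib.request

    def download_file(url: str, path: str, chunk_size: int = 1 << 20) -> None:
        # Stream the response to disk in 1 MiB chunks so a large JSONL
        # dataset never has to fit in memory at once.
        with urllib.request.urlopen(url) as resp, open(path, "wb") as out:
            while True:
                chunk = resp.read(chunk_size)
                if not chunk:
                    break
                out.write(chunk)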
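The change swaps the old start-token lookup for six explicit special-token ids. One caveat worth knowing: SentencePiece's piece_to_id() maps an out-of-vocabulary piece to the unk id rather than returning -1, so the `!= -1` guard on <pad> never fires; comparing against sp.unk_id() is a more reliable existence check. A small validation sketch (the token names are taken from the diff; the fail-fast behavior is an assumption):

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor("ko_unigram.model")
    SPECIAL_TOKENS = ["<pad>", "<sos>", "<context>", "</context>",
                      "<user>", "</user>", "<eos>", "<unk>"]

    for tok in SPECIAL_TOKENS:
        tok_id = sp.piece_to_id(tok)
        # piece_to_id() returns unk_id for unknown pieces, so an id equal
        # to unk_id means the piece is missing (except for <unk> itself).
        if tok != "<unk>" and tok_id == sp.unk_id():
            raise ValueError(f"special token {tok!r} is not in the tokenizer")
        print(f"{tok:>10} -> {tok_id}")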
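The new <context>/<user> tag ids suggest that each training example frames the conversation history and the latest user turn ahead of the model's reply. The dataset-generation function itself is untouched by this diff ("same as the existing code"), so the layout below is purely an illustrative assumption about how these ids could be combined, reusing the variable names introduced above:

    def build_example(context: str, user: str, reply: str,
                      max_len: int = 128) -> list[int]:
        # Assumed layout:
        #   <sos> <context> ... </context> <user> ... </user> reply <eos>
        ids = (
            [start_id, context_s_id] + sp.encode(context) + [context_e_id]
            + [user_s_id] + sp.encode(user) + [user_e_id]
            + sp.encode(reply) + [end_id]
        )
        # Truncate, then right-pad with pad_id to a fixed length.
        ids = ids[:max_len]
        return ids + [pad_id] * (max_len - len(ids))

Whatever the real layout is, padding with pad_id (resolved above with a fallback to 0) keeps batches rectangular without injecting real vocabulary ids.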