OpenLab-NLP
/

model-prototype

Model card Files Files and versions

xet

Community

Yuchan committed on Nov 24

Commit

a638654

verified ·

1 Parent(s): 19949b0

Update AlphaS2S.py

Browse files

Files changed (1) hide show

AlphaS2S.py +46 -70

AlphaS2S.py CHANGED Viewed

@@ -81,87 +81,61 @@ def ids_to_text(ids):
     return sp.decode(ids)
 # =======================
-# 2) Dataset creation function (same as the existing code)
 # =======================
 def jsonl_stream(file_path):
     with open(file_path, "r", encoding="utf-8") as f:
         for line in f:
             data = json.loads(line)
-            conversations = data.get("conversations", [])
-            for i in range(0, len(conversations) - 1, 2):
-                human_msg = conversations[i]
-                gpt_msg   = conversations[i + 1]
-                if human_msg.get("from") != "human" or gpt_msg.get("from") != "gpt":
-                    continue
-                prompt   = human_msg.get("value", "").strip()
-                response = gpt_msg.get("value", "").strip()
-                full = f"<start> {prompt} <sep> {response} <end>"
-                if "<sep>" not in full:
-                    continue
-                sep_index  = full.index("<sep>")
-                # Encoder input is the "<start> prompt <sep>" part; decoder input is the "<sep> response <end>" part
-                # (Unified Input: both encoder and decoder inputs use full_input)
-                input_text = full
-                # The target sequence runs from the start of the response through <end>, shifted one position relative to the input
-                # Here target_text extracts only the response part and is used for target masking.
-                target_text_raw = full[sep_index + len("<sep>"):]
-                input_ids  = text_to_ids(input_text) # full sequence
-                target_ids_raw = text_to_ids(target_text_raw) # response part only
-                # Length handling and masking logic are kept unchanged from the existing code
-                full_input = input_ids[:max_len]
-                target_ids = target_ids_raw[:max_len - len(input_ids)]
-                available_len = max_len - len(input_ids)
-                if available_len <= 0:
-                    input_ids = input_ids[-max_len:]
-                    target_ids = []
-                    target_mask = [0] * len(input_ids)
-                else:
-                    target_ids = target_ids[:available_len]
-                    target_mask = [0] * len(input_ids) + [1] * len(target_ids)
-                full_input = input_ids + target_ids
-                pad_len = max_len - len(full_input)
-                full_input += [pad_id] * pad_len
-                target_mask += [0] * pad_len
-                # The target sequence is the input sequence shifted by one position
-                target_seq = full_input[1:] + [end_id]
-                target_seq = target_seq[:max_len]
-                # Build the masked target (prompt/padding positions are replaced with pad_id)
-                masked_target = [
-                    t if m == 1 else pad_id
-                    for t, m in zip(target_seq, target_mask)
-                ]
-                # AlphaS2S uses the same sequence as both encoder and decoder input
-                # Input sequence = full_input
-                # Target sequence = masked_target
-                yield (
-                    tf.convert_to_tensor(full_input, dtype=tf.int32),
-                    tf.convert_to_tensor(full_input, dtype=tf.int32), # decoder input is passed identically
-                    tf.convert_to_tensor(masked_target, dtype=tf.int32) # actual target
-                )
 dataset = tf.data.Dataset.from_generator(
     lambda: jsonl_stream(DATA_PATH),
     output_signature=(
-        tf.TensorSpec(shape=(max_len,), dtype=tf.int32), # enc_inputs: int32 vector of length max_len
-        tf.TensorSpec(shape=(max_len,), dtype=tf.int32), # dec_inputs: int32 vector of length max_len
-        tf.TensorSpec(shape=(max_len,), dtype=tf.int32), # target: int32 vector of length max_len
-    ),
 )
-# Map each example to dictionary form for training
 def map_fn(enc_input, dec_input, dec_target):
     return {"enc_inputs": enc_input, "dec_inputs": dec_input}, dec_target
@@ -171,6 +145,8 @@ dataset = dataset.shuffle(1000, seed=SEED).batch(batch_size, drop_remainder=True
 with strategy.scope():
     dist_dataset = strategy.experimental_distribute_dataset(dataset)
 # =======================
 # 3) Model layers (existing code kept)
 # =======================

     return sp.decode(ids)
 # =======================
+# Load JSONL → TF Dataset (with ID-level special tokens)
 # =======================
 def jsonl_stream(file_path):
     with open(file_path, "r", encoding="utf-8") as f:
         for line in f:
             data = json.loads(line)
+            context = data["context"]  # NOTE(review): direct indexing raises KeyError if a record lacks this field
+            prompt = data["prompt"]
+            answer = data["answer"]
+            # =======================
+            # Encoder input: special tokens are inserted explicitly at the ID level
+            # =======================
+            enc_ids = [context_s_id] + text_to_ids(context) + [context_e_id] + \
+                      [user_s_id] + text_to_ids(prompt) + [user_e_id]
+            enc_ids = enc_ids[:max_len]  # cap at max_len; NOTE(review): truncation can drop the trailing user_e_id
+            # =======================
+            # Decoder input: <sos> + answer
+            # =======================
+            dec_input_ids = [start_id] + text_to_ids(answer)
+            dec_input_ids = dec_input_ids[:max_len]
+            # =======================
+            # Target: answer + <eos>
+            # =======================
+            target_ids = text_to_ids(answer) + [end_id]
+            target_ids = target_ids[:max_len]  # NOTE(review): truncation can cut off the trailing end_id
+            # =======================
+            # Padding: right-pad all three sequences with pad_id up to max_len
+            # =======================
+            enc_ids += [pad_id] * (max_len - len(enc_ids))
+            dec_input_ids += [pad_id] * (max_len - len(dec_input_ids))
+            target_ids += [pad_id] * (max_len - len(target_ids))
+            yield (
+                tf.convert_to_tensor(enc_ids, dtype=tf.int32),
+                tf.convert_to_tensor(dec_input_ids, dtype=tf.int32),
+                tf.convert_to_tensor(target_ids, dtype=tf.int32),
+            )
+# =======================
+# Create the TF Dataset
+# =======================
 dataset = tf.data.Dataset.from_generator(
     lambda: jsonl_stream(DATA_PATH),
     output_signature=(
+        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),  # enc_inputs: int32 vector of length max_len
+        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),  # dec_inputs: int32 vector of length max_len
+        tf.TensorSpec(shape=(max_len,), dtype=tf.int32),  # target: int32 vector of length max_len
+    )
 )
+# Map each example to dictionary form for training
 def map_fn(enc_input, dec_input, dec_target):
     return {"enc_inputs": enc_input, "dec_inputs": dec_input}, dec_target
 with strategy.scope():
     dist_dataset = strategy.experimental_distribute_dataset(dataset)
+print("✅ ID 레벨 특수 토큰 적용 Dataset 로드 완료:", dist_dataset)
 # =======================
 # 3) Model layers (existing code kept)
 # =======================