#!/usr/bin/env python3
import json
import os
import re

import torch
import torch._dynamo
from transformers import AutoTokenizer, Gemma3ForCausalLM
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from tqdm import tqdm

torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision("high")  # silence the float32 matmul precision warning

# ─── CONFIG ─────────────────────────────────────────────────────────
SRC_LANG, TGT_LANG = "en", "kk"
MODEL_PATH = "/raid/srp_base_model_training/abai_workspace/models/sync_kk_en/checkpoint-final"
TEST_FILE = "/raid/srp_base_model_training/abai_workspace/data/flores/en_to_kk_formatted.jsonl"  # JSONL with fields system, user, assistant
OUTPUT_JSON = f"eval_sync_KKEN_data_{SRC_LANG}_to_{TGT_LANG}.json"
MAX_NEW_TOKS = 64
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4,5"  # specify your GPU IDs here (set before the first CUDA call)
DEVICE = "cuda"  # or "cpu"
# ────────────────────────────────────────────────────────────────────


def clean_user_field(user_str: str) -> str:
    """
    Remove leading <...> tags and any whitespace/newlines after them.
    """
    # Matches any run of leading angle-bracket tags plus trailing whitespace/newlines.
    return re.sub(r"^(?:<[^>]+>\s*)+", "", user_str)


def load_model_and_tokenizer():
    print(f"Loading model/tokenizer from {MODEL_PATH} …")
    tok = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = Gemma3ForCausalLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map="auto",  # auto-shard if you have multiple GPUs
    )
    model.eval()
    return tok, model


def build_prompt(system: str, user: str) -> str:
    # Assumes Gemma-style <start_of_turn>/<end_of_turn> markers; adjust if your
    # fine-tuning used a different chat template.
    return (
        f"<start_of_turn>system\n{system}<end_of_turn>\n"
        f"<start_of_turn>user\n{user}<end_of_turn>\n"
        f"<start_of_turn>assistant\n"
    )


def run_inference(tok, model, system: str, user: str) -> str:
    prompt = build_prompt(system, user)
    inputs = tok(prompt, return_tensors="pt", truncation=True).to(model.device)
    input_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKS,
            do_sample=False,
            eos_token_id=tok.convert_tokens_to_ids("<end_of_turn>"),
            pad_token_id=tok.eos_token_id,
        )
    gen_ids = out[0][input_len:]
    return tok.decode(gen_ids, skip_special_tokens=True).strip()


def load_test_examples(path: str):
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            examples.append(
                (obj["system"].strip(), obj["user"].strip(), obj["assistant"].strip())
            )
    return examples


def evaluate_bleu_nltk(hyps, refs):
    """
    Compute corpus-level 4-gram BLEU using NLTK.
    - hyps: list of hypothesis strings
    - refs: list of reference strings
    Returns BLEU as a percentage (e.g. 27.53).
""" # 1) tokenize on whitespace tokenized_hyps = [hyp.split() for hyp in hyps] # NLTK expects a list of references per hypothesis tokenized_refs = [[ref.split()] for ref in refs] # 2) smoothing to avoid zero scores on short examples smoothing = SmoothingFunction().method1 # 3) compute corpus_bleu with uniform 4-gram weights score = corpus_bleu( tokenized_refs, tokenized_hyps, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing, ) # convert to percentage return round(score, 4) def main(): tok, model = load_model_and_tokenizer() examples = load_test_examples(TEST_FILE) hyps, refs, users = [], [], [] for system, user, assistant in tqdm(examples, desc="Translating"): clean_user = clean_user_field(user) hyp = run_inference(tok, model, system, clean_user) hyps.append(hyp) refs.append(assistant) users.append(clean_user) # collect cleaned user texts bleu_score = evaluate_bleu_nltk(hyps, refs) # Save results, using cleaned user strings out = { "model": MODEL_PATH, "bleu": bleu_score, "examples": [] } for (s, _, r), u_clean, h in zip(examples, users, hyps): out["examples"].append({ "system": s, "user": u_clean, # no tags here "reference": r, "hypothesis": h }) with open(OUTPUT_JSON, "w", encoding="utf-8") as f: json.dump(out, f, ensure_ascii=False, indent=2) print(f"✅ Saved cleaned evaluation to {OUTPUT_JSON}") if __name__ == "__main__": main()