| | import argparse |
| | import os |
| | import json |
| | import sys |
| | sys.path.append(os.path.abspath('/home/mshahidul/')) |
| | from gpu_selection import _gpu_selection_ |
| | |
# Command-line interface: input JSON path plus an optional CUDA device pin.
cli = argparse.ArgumentParser(description="Translation Evaluation")
cli.add_argument(
    "--path",
    type=str,
    default="/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json",
    help="Path to the JSON file",
)
cli.add_argument(
    "--cuda",
    type=str,
    default="3",
    help="CUDA device id, e.g., '0' or '0,1' for multiple GPUs",
)
args = cli.parse_args()

# Pin this process to the requested GPU(s) before any CUDA-aware library is
# imported; otherwise fall back to the project's automatic GPU picker.
# NOTE(review): --cuda defaults to "3", so args.cuda is never None unless a
# caller strips the default — confirm the _gpu_selection_ branch is reachable.
if args.cuda is None:
    _gpu_selection_()
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
| |
|
| | |
# Results live alongside the other experiment outputs, keyed by the input
# file's basename so a rerun on the same dataset resumes the same file.
out_dir = "/home/mshahidul/readctrl/results/"
# Create the output directory itself. The original passed
# os.path.dirname(out_dir), which only resolved to the results directory
# because of the trailing "/" (dirname("/a/b/") == "/a/b"); passing out_dir
# directly is correct regardless of a trailing slash.
os.makedirs(out_dir, exist_ok=True)
file_name = os.path.basename(args.path)
out_path = os.path.join(out_dir, file_name)
| |
|
| | |
# Resume support: reload previously saved results and remember which
# (article, gold_summary) pairs have already been processed.
# NOTE(review): an item counts as done if *any* band for it was saved, so a
# run interrupted mid-item skips that item's remaining bands on resume —
# confirm this granularity is acceptable.
results = []
completed_keys = set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    completed_keys = {(entry["article"], entry["gold_summary"]) for entry in results}
| |
|
| | |
# Load the evaluation dataset, then bring up the fine-tuned model.
with open(args.path, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Imported here, after CUDA_VISIBLE_DEVICES is set above, so unsloth/torch
# only ever see the selected GPU(s).
from unsloth import FastLanguageModel
import torch

_MODEL_DIR = "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v1"

# NOTE(review): max_seq_length=4092 looks like a typo for 4096 — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=_MODEL_DIR,
    max_seq_length=4092,
    load_in_4bit=True,  # 4-bit quantized weights to fit the 14B model
    load_in_8bit=False,
    full_finetuning=False,
)
from prompt_generate import generate_prompt

import tqdm

# Main generation loop: for every dataset item, produce one synthetic summary
# per readability band, appending to `results` and checkpointing to disk
# periodically so an interrupted run can resume from the saved file.
for item in tqdm.tqdm(dataset):
    key = (item["article"], item["gold_summary"])
    if key in completed_keys:
        continue

    for band in ["B1", "B2", "B3"]:
        prompt = generate_prompt(item['article'], item['gold_summary'], band, "es")

        messages = [{"role": "user", "content": prompt + "\n"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        # NOTE(review): temperature/top_p/top_k have no effect unless
        # do_sample=True is also passed (HF generate defaults to greedy
        # decoding) — confirm whether sampling was actually intended.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            temperature=0.1,
            top_p=0.8,
            top_k=5,
        )
        # Decode only the newly generated continuation. The original decoded
        # output_ids[0] in full, which for a decoder-only model prepends the
        # entire chat-formatted prompt to every stored summary.
        prompt_len = inputs["input_ids"].shape[1]
        output_text = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

        results.append({
            "article": item["article"],
            "gold_summary": item["gold_summary"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
        })
    completed_keys.add(key)

    # Checkpoint every 10 items (3 bands per item -> 30 results).
    if len(results) % 30 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
| |
|
| | |
# Persist the complete result set one final time, covering the tail of the
# run that the periodic in-loop checkpoint may not have flushed.
with open(out_path, "w", encoding="utf-8") as fh:
    json.dump(results, fh, ensure_ascii=False, indent=2)