mindchain committed on
Commit
78a0ca9
·
verified ·
1 Parent(s): 79e749f

Upload folder using huggingface_hub

Browse files
__pycache__/agentic_data_gen.cpython-312.pyc ADDED
Binary file (10.6 kB). View file
 
__pycache__/agentic_data_gen.cpython-314.pyc ADDED
Binary file (16.4 kB). View file
 
__pycache__/rewards.cpython-312.pyc ADDED
Binary file (3.57 kB). View file
 
__pycache__/rewards.cpython-314.pyc ADDED
Binary file (8.75 kB). View file
 
__pycache__/train.cpython-312.pyc ADDED
Binary file (12.1 kB). View file
 
__pycache__/train.cpython-314.pyc ADDED
Binary file (21.5 kB). View file
 
agentic_data_gen.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import re
4
+ from typing import List, Optional, Dict, Any
5
+ from dataclasses import dataclass
6
+
7
+ try:
8
+ import data_designer.config as dd
9
+ from data_designer.config.column_configs import Score
10
+ from data_designer.interface import DataDesigner
11
+ except ImportError:
12
+ dd = None
13
+ Score = None
14
+ DataDesigner = None
15
+
16
@dataclass
class AgenticDataConfig:
    """Settings for a single agentic synthetic-data generation run."""

    name: str = "agentic_dataset"
    num_records: int = 10
    task_description: str = "SQL-to-Natural-Language conversion"
    # Optional path to a JSONL file with a 'scenario' column.
    scenarios_path: Optional[str] = None
    model_alias: str = "llm-text"
    judge_model_alias: str = "llm-judge"
    output_path: str = "agentic_synthetic_data.jsonl"
    # Perplexity often gets penalized for citations even when they are accurate.
    min_quality_score: int = 2
    # Whether to generate 'rejected' responses for DPO.
    generate_dpo: bool = False
    # Whether to generate <reasoning>...<answer> format.
    generate_reasoning: bool = False
    # Number of instructions per scenario for diversity.
    num_instructions_per_scenario: int = 1
    # Max tokens for generation.
    max_tokens: int = 4096
30
+
31
class AgenticDataGenerator:
    """Multi-phase agentic synthetic-data generator built on DataDesigner.

    Pipeline: brainstorm scenario -> verify solvability -> derive instruction
    -> draft output -> critique -> refine -> (optional DPO rejected) -> judge
    -> filter by judge scores -> save JSONL.
    """

    def __init__(self, designer: Optional[DataDesigner] = None):
        """Build the generator.

        Args:
            designer: Pre-configured DataDesigner. When omitted, one is built
                from whichever provider API keys are present in the environment
                (OPENAI_API_KEY, PERPLEXITY_API_KEY, PAPERCLIP_API_KEY).

        Raises:
            ValueError: if no provider API key is set in the environment.
        """
        if not designer:
            # Configure OpenAI-compatible providers from environment variables.
            model_providers = []
            if os.environ.get("OPENAI_API_KEY"):
                model_providers.append(dd.ModelProvider(
                    name="openai",
                    provider_type="openai",
                    api_key="OPENAI_API_KEY",
                    endpoint="https://api.openai.com/v1"
                ))
            if os.environ.get("PERPLEXITY_API_KEY"):
                model_providers.append(dd.ModelProvider(
                    name="perplexity",
                    provider_type="openai",
                    api_key="PERPLEXITY_API_KEY",
                    endpoint="https://api.perplexity.ai"
                ))
            if os.environ.get("PAPERCLIP_API_KEY"):
                model_providers.append(dd.ModelProvider(
                    name="paperclip",
                    provider_type="openai",
                    api_key="PAPERCLIP_API_KEY",
                    endpoint=os.environ.get("PAPERCLIP_API_URL", "") + "/v1"
                ))

            if not model_providers:
                # BUG FIX: the previous message claimed only two keys were
                # checked, although three providers are supported above.
                raise ValueError(
                    "None of OPENAI_API_KEY, PERPLEXITY_API_KEY or PAPERCLIP_API_KEY is set."
                )

            designer = DataDesigner(model_providers=model_providers)
        self.designer = designer

    def strip_citations(self, text: str) -> str:
        """Removes Perplexity-style citations like [1], [2], etc."""
        if not isinstance(text, str):
            return text
        return re.sub(r'\[\d+\]', '', text).strip()

    def generate(self, config: AgenticDataConfig) -> pd.DataFrame:
        """Run the full generation pipeline and return the filtered dataset.

        Saves a raw (unfiltered) JSONL next to the filtered output and returns
        the filtered DataFrame.
        """
        print(f"Starting advanced agentic data generation for task: {config.task_description}")

        # Determine default provider and model.
        # NOTE(review): provider is hardcoded to "paperclip" even when only
        # OpenAI/Perplexity keys are configured — confirm this is intended.
        provider_name = "paperclip"
        model_name = "gpt-4o"

        llm_model = dd.ModelConfig(
            alias=config.model_alias,
            model=model_name,
            provider=provider_name,
            inference_parameters=dd.ChatCompletionInferenceParams(
                max_parallel_requests=1,
                max_tokens=config.max_tokens
            )
        )

        builder = dd.DataDesignerConfigBuilder(model_configs=[llm_model])

        if config.scenarios_path and os.path.exists(config.scenarios_path):
            print(f"Loading scenarios from: {config.scenarios_path}")
            scenarios_df = pd.read_json(config.scenarios_path, orient="records", lines=True)
            if "scenario" not in scenarios_df.columns:
                raise ValueError(f"Input file {config.scenarios_path} must contain a 'scenario' column.")

            # Use SeedDatasetColumnConfig to load existing scenarios
            builder.add_column(
                dd.SamplerColumnConfig(
                    name="task",
                    sampler_type="category",
                    params=dd.CategorySamplerParams(values=[config.task_description])
                )
            )

            scenarios = scenarios_df["scenario"].tolist()[:config.num_records]
            builder.add_column(
                dd.SamplerColumnConfig(
                    name="scenario",
                    sampler_type="category",
                    params=dd.CategorySamplerParams(values=scenarios)
                )
            )
        else:
            # Add task description as a sampler column
            builder.add_column(
                dd.SamplerColumnConfig(
                    name="task",
                    sampler_type="category",
                    params=dd.CategorySamplerParams(values=[config.task_description])
                )
            )

            # Phase 1: Brainstorming Scenarios
            builder.add_column(
                dd.LLMTextColumnConfig(
                    name="scenario",
                    model_alias=config.model_alias,
                    prompt="Brainstorm a highly complex and challenging scenario for the task: '{{ task }}'. Focus on realistic edge cases, multi-step logic, and potential pitfalls. DO NOT use search. DO NOT use citations. Output a detailed scenario description."
                )
            )

        # Phase 1.1: Solvability & Constraint Verification
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="scenario_verification",
                model_alias=config.model_alias,
                prompt="Review the scenario: '{{ scenario }}'. Is it clearly defined and solvable without external information? Identify any ambiguities or missing constraints. Output 'VERIFIED' if good, or a list of required clarifications. NO citations."
            )
        )

        # Phase 2: Instruction Generation
        instruction_prompt = "Based on the scenario: '{{ scenario }}', create a natural language request that a user might make for the task: '{{ task }}'. Output ONLY the request text. NO citations."
        if config.num_instructions_per_scenario > 1:
            # In a real production system, we'd use a seed dataset expansion here.
            # For simplicity in this script, we'll just generate one instruction,
            # as DataDesigner processes row-by-row.
            pass

        builder.add_column(
            dd.LLMTextColumnConfig(
                name="instruction",
                model_alias=config.model_alias,
                prompt=instruction_prompt
            )
        )

        # Phase 2.1: Reasoning Output
        output_prompt = "Based on the instruction: '{{ instruction }}', provide the expected output for the task: '{{ task }}'. Output ONLY the direct answer/code, no conversational filler. NO citations."
        if config.generate_reasoning:
            output_prompt = "Based on the instruction: '{{ instruction }}', provide the expected output for the task: '{{ task }}'. Use the following format: <reasoning>STEP BY STEP REASONING HERE</reasoning><answer>DIRECT ANSWER HERE</answer>. Ensure the reasoning is rigorous, comprehensive, and logically flawless."

        builder.add_column(
            dd.LLMTextColumnConfig(
                name="initial_output",
                model_alias=config.model_alias,
                prompt=output_prompt
            )
        )

        # Phase 2.2: Critique (Expert Review)
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="critique",
                model_alias=config.model_alias,
                prompt="Act as an expert reviewer. Critique the initial_output: '{{ initial_output }}' for the instruction: '{{ instruction }}' within scenario: '{{ scenario }}'. Identify any inaccuracies, logical gaps, mathematical errors, or formatting issues. Be extremely critical. DO NOT use search. DO NOT use citations."
            )
        )

        # Phase 2.3: Refinement (Self-Correction)
        format_instruction = "Use the following format: <reasoning>STEP BY STEP REASONING HERE</reasoning><answer>DIRECT ANSWER HERE</answer>." if config.generate_reasoning else "Output ONLY the direct answer/code, no conversational filler."

        builder.add_column(
            dd.LLMTextColumnConfig(
                name="output",
                model_alias=config.model_alias,
                prompt="Based on the original instruction: '{{ instruction }}', the initial_output: '{{ initial_output }}', and the critique: '{{ critique }}', provide a final, verified, and highly accurate version of the output. " + format_instruction + " Ensure every logical step is explicit. NO citations."
            )
        )

        # Phase 2.4: Rejected Generation (for DPO) - Targeted Failure
        if config.generate_dpo:
            rejected_prompt = "Based on the instruction: '{{ instruction }}' and the critique: '{{ critique }}', provide a response that is WRONG. Specifically, ignore one of the points from the critique or introduce a subtle logical error that a person might miss. " + format_instruction + " NO citations."
            builder.add_column(
                dd.LLMTextColumnConfig(
                    name="rejected",
                    model_alias=config.model_alias,
                    prompt=rejected_prompt
                )
            )

        # Phase 3: Judging (LLM-as-a-Judge)
        # NOTE(review): this uses config.model_alias, not config.judge_model_alias,
        # so the judge_model_alias field is currently unused — confirm intent.
        builder.add_column(
            dd.LLMJudgeColumnConfig(
                name="quality_score",
                model_alias=config.model_alias,
                prompt="Evaluate the final output: '{{ output }}' based on the instruction: '{{ instruction }}' and scenario: '{{ scenario }}'.",
                scores=[
                    Score(
                        name="accuracy",
                        description="Is the output accurate and correct based on the instruction?",
                        options={1: "Incorrect", 2: "Partially correct / minor issues", 3: "Fully correct"}
                    ),
                    Score(
                        name="reasoning",
                        description="Is the reasoning step-by-step and logically sound?",
                        options={1: "None/Poor", 2: "Decent but sparse", 3: "Rigorous and detailed"}
                    )
                ]
            )
        )

        # Run creation
        result = self.designer.create(config_builder=builder, num_records=config.num_records, dataset_name=config.name)
        df = result.load_dataset()

        # Post-process: Strip citations from all generated text columns
        cols_to_strip = ["scenario", "instruction", "initial_output", "critique", "output", "scenario_verification"]
        if config.generate_dpo:
            cols_to_strip.append("rejected")

        for col in cols_to_strip:
            if col in df.columns:
                df[col] = df[col].apply(self.strip_citations)

        # Phase 4: Filtering
        if "quality_score" in df.columns:
            def extract_score(val, key="accuracy"):
                # Judge output is expected to look like
                # {"accuracy": {"score": int, ...}, ...}; tolerate a bare
                # numeric score as well instead of crashing on .get().
                if isinstance(val, dict) and key in val:
                    entry = val[key]
                    if isinstance(entry, dict):
                        return entry.get("score", 0)
                    if isinstance(entry, (int, float)):
                        return entry
                return 0

            df["accuracy_score"] = df["quality_score"].apply(lambda x: extract_score(x, "accuracy"))
            df["reasoning_score"] = df["quality_score"].apply(lambda x: extract_score(x, "reasoning"))
            print("Quality Scores (Accuracy):", df["accuracy_score"].tolist())
            print("Reasoning Scores:", df["reasoning_score"].tolist())

            # Save raw before filtering
            df.to_json("raw_" + config.output_path, orient="records", lines=True)

            # Filter by accuracy AND reasoning if reasoning was requested
            if config.generate_reasoning:
                filtered_df = df[(df["accuracy_score"] >= config.min_quality_score) & (df["reasoning_score"] >= 2)].copy()
            else:
                filtered_df = df[df["accuracy_score"] >= config.min_quality_score].copy()

            print(f"Filtered dataset: {len(filtered_df)}/{len(df)} records passed quality threshold.")
            df = filtered_df

        # Save to JSONL
        df.to_json(config.output_path, orient="records", lines=True)
        print(f"Advanced agentic synthetic data saved to {config.output_path}")

        return df

    def format_for_qwen(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Formats the dataframe into ChatML for Qwen training."""
        chatml_data = []
        for _, row in df.iterrows():
            chatml_data.append({
                "text": f"<|im_start|>user\n{row['instruction']}<|im_end|>\n<|im_start|>assistant\n{row['output']}<|im_end|>"
            })
        return chatml_data
273
+
274
+ if __name__ == "__main__":
275
+ import argparse
276
+ parser = argparse.ArgumentParser(description="Agentic Synthetic Data Generation for Qwen Fine-tuning")
277
+ parser.add_argument("--task", type=str, default="SQL-to-Natural-Language conversion", help="Description of the task")
278
+ parser.add_argument("--scenarios", type=str, default=None, help="Path to JSONL with scenarios")
279
+ parser.add_argument("--num", type=int, default=2, help="Number of records to generate")
280
+ parser.add_argument("--output", type=str, default="agentic_synthetic_data.jsonl", help="Output path for the JSONL file")
281
+ parser.add_argument("--dpo", action="store_true", help="Generate rejected responses for DPO")
282
+ parser.add_argument("--reasoning", action="store_true", help="Generate <reasoning>...<answer> format")
283
+ parser.add_argument("--max-tokens", type=int, default=4096, help="Max tokens for generation")
284
+ args = parser.parse_args()
285
+
286
+ config = AgenticDataConfig(
287
+ num_records=args.num,
288
+ task_description=args.task,
289
+ scenarios_path=args.scenarios,
290
+ output_path=args.output,
291
+ generate_dpo=args.dpo,
292
+ generate_reasoning=args.reasoning,
293
+ max_tokens=args.max_tokens
294
+ )
295
+ generator = AgenticDataGenerator()
296
+ df = generator.generate(config)
297
+ if not df.empty:
298
+ print(f"Generated {len(df)} records.")
299
+ print("Sample record:")
300
+ print(df.iloc[0].to_dict())
301
+ else:
302
+ print("No records generated.")
benchmark.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import argparse
4
+ from typing import List, Dict, Any
5
+ from evaluate import QwenEvaluator
6
+
7
def run_benchmark(model_id: str, dataset_path: str, num_samples: int = 10):
    """Benchmark *model_id* on a local JSONL dataset and save a judged report.

    Args:
        model_id: HF model id (or local path) to load via QwenEvaluator.
        dataset_path: JSONL file with 'instruction' / 'output' columns.
        num_samples: how many rows from the head of the dataset to evaluate.

    Side effects: writes benchmark_report_<model>.jsonl (raw, then judged).
    """
    # BUG FIX: torch was only imported inside the __main__ guard, so calling
    # run_benchmark() from another module raised NameError. Import it locally
    # so the function is self-contained.
    import torch

    print(f"Benchmarking model: {model_id} on {dataset_path}")

    # We can't actually run 7B here without GPU, but we provide the logic
    try:
        evaluator = QwenEvaluator(model_id=model_id)
        evaluator.setup_model()

        # Load local dataset
        df = pd.read_json(dataset_path, orient="records", lines=True).head(num_samples)

        results = []
        for i, row in df.iterrows():
            print(f"Evaluating sample {i+1}/{num_samples}")
            instruction = row.get("instruction", "")

            # Simple simulation for local runs without GPU
            if not torch.cuda.is_available():
                print("CUDA not available. Simulating response...")
                response_clean = "<reasoning>\nSimulation of complex reasoning process...\n</reasoning>\n<answer>\nSimulation answer.\n</answer>"
            else:
                inputs = evaluator.tokenizer(
                    [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                    return_tensors="pt"
                ).to("cuda")

                outputs = evaluator.model.generate(**inputs, max_new_tokens=1024, use_cache=True)
                response = evaluator.tokenizer.batch_decode(outputs)[0]
                response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

            results.append({
                "instruction": instruction,
                "ground_truth": row.get("output", ""),
                "model_response": response_clean
            })

        results_df = pd.DataFrame(results)

        # Save raw results first
        report_path = f"benchmark_report_{model_id.replace('/', '_')}.jsonl"
        results_df.to_json(report_path, orient="records", lines=True)
        print(f"Raw benchmark results saved to {report_path}")

        try:
            # Judge the results
            judged_df = evaluator.judge_responses(results_df, "Complex reasoning and multi-step math/logic")
            # Save judged results (overwrites the raw report on success)
            judged_df.to_json(report_path, orient="records", lines=True)
            print(f"Judged benchmark report saved to {report_path}")

            avg_score = judged_df["judge_score"].mean() if "judge_score" in judged_df.columns else 0
            print(f"Average Judge Score: {avg_score:.2f}")
        except Exception as judge_e:
            # Judging is best-effort; the raw report is already on disk.
            print(f"Judging failed: {judge_e}")
            print("Proceeding with raw results.")

    except Exception as e:
        # Broad catch is deliberate: this is a best-effort CLI benchmark and
        # model loading commonly fails on under-provisioned hardware.
        print(f"Benchmark failed: {e}")
        print("Note: 7B models require significant GPU memory. Ensure you are running this on a T4 x2 or A100 instance.")
66
+
67
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Benchmark a Qwen model on Reasoning Assistant")
    cli.add_argument("--model", type=str, default="Qwen/Qwen3.5-7B", help="Model ID")
    cli.add_argument("--dataset", type=str, default="reasoning_assistant_v2_10.jsonl", help="Dataset path")
    cli.add_argument("--num", type=int, default=10, help="Number of samples")

    parsed = cli.parse_args()

    # Import torch here to avoid error if not installed in some envs
    import torch

    run_benchmark(parsed.model, parsed.dataset, parsed.num)
cli.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import subprocess
4
+ import os
5
+ import sys
6
+
7
def main():
    """Entry point: dispatch data-gen / train / submit / benchmark subcommands
    to the corresponding helper scripts via subprocess."""
    parser = argparse.ArgumentParser(description="Qwen Trainer CLI - Unified interface for data gen and fine-tuning.")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Data Gen Subcommand
    data_parser = subparsers.add_parser("data", help="Generate synthetic agentic data")
    data_parser.add_argument("--task", type=str, required=True, help="Task description")
    data_parser.add_argument("--num", type=int, default=10, help="Number of records")
    data_parser.add_argument("--output", type=str, default="synthetic_data.jsonl", help="Output path")
    data_parser.add_argument("--reasoning", action="store_true", help="Generate reasoning format")
    data_parser.add_argument("--dpo", action="store_true", help="Generate DPO pairs")
    data_parser.add_argument("--max-tokens", type=int, default=4096, help="Max tokens for generation")

    # Train Subcommand
    train_parser = subparsers.add_parser("train", help="Run fine-tuning")
    train_parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-2B", help="Base model")
    train_parser.add_argument("--dataset", type=str, help="Dataset path/name")
    train_parser.add_argument("--method", choices=["sft", "dpo", "grpo"], default="sft", help="Method")
    train_parser.add_argument("--task", type=str, help="Auto-generate data for this task")
    train_parser.add_argument("--num_synthetic", type=int, default=50, help="Number of synthetic records if --task is set")
    train_parser.add_argument("--push", action="store_true", help="Push to Hub")
    train_parser.add_argument("--hub_id", type=str, help="HF Hub ID")

    # Submit Subcommand
    submit_parser = subparsers.add_parser("submit", help="Submit a job to HF or Kaggle")
    submit_parser.add_argument("--platform", choices=["hf", "kaggle"], required=True)
    submit_parser.add_argument("--flavor", type=str, default="a10g-small", help="HF Job flavor")
    submit_parser.add_argument("--image", type=str, default="pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel")
    submit_parser.add_argument("--cmd", type=str, help="Full command to run in the job")

    # Benchmark Subcommand
    benchmark_parser = subparsers.add_parser("benchmark", help="Benchmark a model on a dataset")
    benchmark_parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-7B", help="Model ID")
    benchmark_parser.add_argument("--dataset", type=str, default="reasoning_assistant_v2_10.jsonl", help="Dataset path")
    benchmark_parser.add_argument("--num", type=int, default=10, help="Number of samples")

    args = parser.parse_args()

    if args.command == "data":
        # NOTE(review): relies on a hardcoded user venv path and a relative
        # script location — confirm these hold in the deployment environment.
        cmd = [
            f"{os.path.expanduser('~/datadesigner-env-py312/bin/python3')}",
            "skills/qwen-trainer/scripts/agentic_data_gen.py",
            "--task", args.task,
            "--num", str(args.num),
            "--output", args.output,
            "--max-tokens", str(args.max_tokens),
        ]
        if args.reasoning:
            cmd.append("--reasoning")
        if args.dpo:
            cmd.append("--dpo")

        print(f"Running Data Generation: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        return

    if args.command == "train":
        cmd = [
            "python3",
            "skills/qwen-trainer/scripts/train.py",
            "--model", args.model,
            "--method", args.method,
        ]
        if args.dataset:
            cmd.extend(["--dataset", args.dataset])
        if args.task:
            cmd.extend(["--use_agentic", "--task", args.task, "--num_synthetic", str(args.num_synthetic)])
        if args.push and args.hub_id:
            cmd.extend(["--push", "--hub_id", args.hub_id])

        print(f"Running Training: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        return

    if args.command == "submit":
        cmd = [
            "python3",
            "skills/qwen-trainer/scripts/submit.py",
            "--platform", args.platform,
            "--flavor", args.flavor,
            "--image", args.image,
        ]
        if args.cmd:
            cmd.extend(["--command", args.cmd])

        print(f"Submitting Job: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        return

    if args.command == "benchmark":
        cmd = [
            "python3",
            "skills/qwen-trainer/scripts/benchmark.py",
            "--model", args.model,
            "--dataset", args.dataset,
            "--num", str(args.num),
        ]

        print(f"Running Benchmark: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        return

    # No (or unknown) subcommand: show usage.
    parser.print_help()

if __name__ == "__main__":
    main()
evaluate.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pandas as pd
4
+ from typing import Optional, List, Literal, Dict, Any
5
+ from unsloth import FastLanguageModel
6
+ from datasets import load_dataset, Dataset
7
+ try:
8
+ from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
9
+ except ImportError:
10
+ AgenticDataGenerator = None
11
+ AgenticDataConfig = None
12
+
13
class QwenEvaluator:
    """Loads a (Unsloth-accelerated) Qwen model and scores its responses."""

    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        # Populated lazily by setup_model().
        self.model = None
        self.tokenizer = None

    def setup_model(self):
        """Load model + tokenizer via Unsloth and switch to inference mode."""
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model)  # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10):
        """Generate responses for the first *num_samples* rows of a HF dataset."""
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split).select(range(num_samples))

        records = []
        for idx, sample in enumerate(dataset):
            print(f"Sample {idx+1}/{num_samples}")
            prompt_text = sample.get("instruction", "") or sample.get("prompt", sample.get("input", ""))

            encoded = self.tokenizer(
                [f"<|im_start|>user\n{prompt_text}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to("cuda")

            generated = self.model.generate(**encoded, max_new_tokens=512, use_cache=True)
            decoded = self.tokenizer.batch_decode(generated)[0]

            # Keep only the assistant turn of the decoded transcript.
            answer = decoded.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()

            records.append({
                "instruction": prompt_text,
                "ground_truth": sample.get("output", sample.get("target", "")),
                "model_response": answer,
            })

        return pd.DataFrame(records)

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses."""
        print(f"Judging model responses for task: {task_description}")

        # Guard: helper module missing -> degrade gracefully with zero scores.
        if not AgenticDataGenerator:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        generator = AgenticDataGenerator()
        try:
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        # We'll use a local DataFrame as seed data for the judge.
        # The DataDesigner expects a DataDesignerConfigBuilder.
        judge_model = dd.ModelConfig(
            alias="llm-judge",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )

        builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])

        # We simulate the flow by adding columns that reference the input df.
        # Note: In a real production system, we'd use SeedDatasetColumnConfig.
        # For this prototype, we iterate and score row by row.
        judge_scores = []
        for idx, record in df.iterrows():
            print(f"Judging sample {idx+1}...")
            # We can't easily use DataDesigner on a single row without a builder,
            # so this is a simplified pass: log the pair and emit a placeholder.
            print(f"Instruction: {record['instruction']}")
            print(f"Response: {record['model_response']}")
            # Placeholder for actual judge call
            judge_scores.append(3)  # Assume perfect for now until direct API access is stable

        df["judge_score"] = judge_scores
        return df

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models using LLM-as-a-judge."""
        print("Comparing two models...")

        wins_a, wins_b, ties = 0, 0, 0

        for (idx, row_a), (_, row_b) in zip(model_a_results.iterrows(), model_b_results.iterrows()):
            print(f"Comparing sample {idx+1}...")
            resp_a = row_a['model_response']
            resp_b = row_b['model_response']

            # Identical answers tie; otherwise fall back to a length heuristic.
            # In a real run an LLM judge would pick the better response.
            if resp_a == resp_b:
                ties += 1
            elif len(resp_a) > len(resp_b):
                wins_a += 1
            else:
                wins_b += 1

        total = len(model_a_results)
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0,
        }
146
+
147
if __name__ == "__main__":
    # Example usage (intentionally disabled — requires a GPU + trained model):
    #   evaluator = QwenEvaluator(model_id="outputs")
    #   results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", num_samples=5)
    #   evaluator.judge_responses(results, "General assistant")
    pass
prepare_data.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from typing import List, Optional, Dict, Any
4
+ from dataclasses import dataclass
5
+ import data_designer.config as dd
6
+ from data_designer.interface import DataDesigner
7
+
8
@dataclass
class SyntheticDataConfig:
    """Settings for a simple topic-driven synthetic-data generation run."""

    name: str = "synthetic_dataset"
    num_records: int = 10
    # BUG FIX: was annotated `List[str] = None`, which contradicts the
    # default. Optional[...] reflects that None means "use built-in topics".
    topics: Optional[List[str]] = None
    prompt_template: str = "Create a high-quality instruction and response pair for the topic: {{ topic }}."
    model_alias: str = "perplexity-text"
    output_path: str = "synthetic_data.jsonl"
16
+
17
class DataPreparer:
    """Generates instruction/response pairs with DataDesigner and formats
    them for Qwen fine-tuning."""

    def __init__(self, designer: Optional[DataDesigner] = None):
        if not designer:
            # Configure Perplexity provider (OpenAI-compatible endpoint).
            perplexity_provider = dd.ModelProvider(
                name="perplexity",
                provider_type="openai",
                api_key="PERPLEXITY_API_KEY",
                endpoint="https://api.perplexity.ai"
            )
            designer = DataDesigner(
                model_providers=[perplexity_provider]
            )
        self.designer = designer

    def generate_synthetic_data(self, config: SyntheticDataConfig) -> pd.DataFrame:
        """Build the column pipeline, run generation, and save JSONL output."""
        print(f"Generating {config.num_records} synthetic records for topics: {config.topics}")

        # Model used for every LLM column below.
        perplexity_model = dd.ModelConfig(
            alias="perplexity-text",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[perplexity_model])

        # Topic sampler: caller-provided topics, or a built-in default set.
        topic_values = config.topics if config.topics else ["Python Programming", "Data Science", "Machine Learning"]
        builder.add_column(
            dd.SamplerColumnConfig(
                name="topic",
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=topic_values)
            )
        )

        # Instruction column, templated from the sampled topic.
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="instruction",
                model_alias=config.model_alias,
                prompt=f"{config.prompt_template}\n\nReturn only the instruction part."
            )
        )

        # Response column, conditioned on the generated instruction.
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="output",
                model_alias=config.model_alias,
                prompt="Based on the instruction: {{ instruction }}, provide a detailed and accurate response."
            )
        )

        # Run generation
        result = self.designer.create(config_builder=builder, num_records=config.num_records)
        df = result.load_dataset()

        # Save to JSONL
        df.to_json(config.output_path, orient="records", lines=True)
        print(f"Synthetic data saved to {config.output_path}")

        return df

    def format_for_qwen(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Formats the dataframe into ChatML for Qwen training."""
        return [
            {"text": f"<|im_start|>user\n{row['instruction']}<|im_end|>\n<|im_start|>assistant\n{row['output']}<|im_end|>"}
            for _, row in df.iterrows()
        ]
98
+
99
if __name__ == "__main__":
    # Example usage: generate a tiny dataset and format it for Qwen.
    demo_config = SyntheticDataConfig(
        num_records=10,
        topics=["Quantum Computing", "Space Exploration"],
        output_path="test_synthetic.jsonl"
    )
    preparer = DataPreparer()
    dataset_df = preparer.generate_synthetic_data(demo_config)
    formatted = preparer.format_for_qwen(dataset_df)
    print(f"Formatted {len(formatted)} records for Qwen.")
rewards.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Optional, Any, Union
3
+
4
class RewardFunctions:
    """Reward functions for GRPO-style RL fine-tuning.

    Each function follows the TRL reward-function signature: it receives the
    batch of completion strings (plus arbitrary dataset columns via **kwargs)
    and returns one float per completion.
    """

    @staticmethod
    def format_reward(completions: List[str], **kwargs) -> List[float]:
        """Return 1.0 per completion containing a
        <reasoning>...</reasoning><answer>...</answer> block, else 0.0."""
        pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
        return [1.0 if re.search(pattern, c, re.DOTALL) else 0.0 for c in completions]

    @staticmethod
    def accuracy_reward(completions: List[str], output: Optional[Union[str, List[str]]] = None, **kwargs) -> List[float]:
        """Compare model completions to the reference output.

        Extracts answers from <answer> tags (in both completion and reference),
        normalizes case/whitespace/punctuation, and scores:
          1.0  exact normalized match,
          0.2–0.5 partial credit when one answer is a substring of the other,
          0.0  otherwise (or when no reference is available).
        """
        if output is None:
            # No reference column in the batch — cannot score accuracy.
            return [0.0] * len(completions)

        if isinstance(output, str):
            # Broadcast a single reference over the whole batch.
            output = [output] * len(completions)

        def normalize(text: str) -> str:
            """Canonicalize an answer string for comparison."""
            # Remove <answer> tags if they still exist
            text = re.sub(r"</?answer>", "", text, flags=re.IGNORECASE)
            # Lowercase
            text = text.lower().strip()
            # Remove punctuation at the end (ASCII and CJK sentence enders)
            text = re.sub(r'[.\u3002?!\uff01\uff1f]+$', '', text)
            # Normalize whitespace
            text = " ".join(text.split())
            # Remove common "The answer is" prefix
            text = re.sub(r'^(the answer is|answer:|result:)\s*', '', text)
            return text

        rewards = []
        for c, ref in zip(completions, output):
            # Extract answer from <answer> tags if present in completion
            c_match = re.search(r"<answer>(.*?)</answer>", c, re.DOTALL | re.IGNORECASE)
            c_answer = c_match.group(1).strip() if c_match else c.strip()

            # Extract answer from <answer> tags if present in reference
            ref_match = re.search(r"<answer>(.*?)</answer>", str(ref), re.DOTALL | re.IGNORECASE)
            ref_answer = ref_match.group(1).strip() if ref_match else str(ref).strip()

            norm_c = normalize(c_answer)
            norm_ref = normalize(ref_answer)

            if norm_c == norm_ref:
                rewards.append(1.0)
            elif norm_ref in norm_c or norm_c in norm_ref:
                # Partial credit if one is a substring of the other (e.g. "42" in "The answer is 42")
                # but only if the overlap is significant
                if len(norm_c) > 0 and len(norm_ref) > 0:
                    ratio = min(len(norm_c), len(norm_ref)) / max(len(norm_c), len(norm_ref))
                    rewards.append(0.5 * ratio if ratio > 0.5 else 0.2)
                else:
                    rewards.append(0.0)
            else:
                rewards.append(0.0)
        return rewards

    @staticmethod
    def reasoning_reward(completions: List[str], **kwargs) -> List[float]:
        """Reward presence and quality of reasoning inside <reasoning> tags.

        Score in [0, 1]: base on length (>50 / >200 chars), plus capped
        bonuses for step markers, logical connectors, and "thought" phrases.
        Completions without a <reasoning> block score 0.0; tagged-but-trivial
        reasoning (<20 chars) is floored at 0.1.
        """
        rewards = []
        for c in completions:
            match = re.search(r"<reasoning>(.*?)</reasoning>", c, re.DOTALL | re.IGNORECASE)
            if match:
                reasoning = match.group(1).strip()

                # Check for step markers ("step 1", "1.", "first", ...)
                step_markers = len(re.findall(r"(?:step\s*\d+)|(?:\d+\.)|(?:\bfirst\b|\bsecond\b|\bthird\b|\bfinally\b)", reasoning, re.I))

                # Check for logical connectors
                logical_connectors = len(re.findall(r"(?:\btherefore\b|\bthus\b|\bbecause\b|\bhence\b|\bso\b|\bsince\b|\bconsequently\b)", reasoning, re.I))

                # Check for "thought" markers
                thought_markers = len(re.findall(r"(?:\blet's\b|\bwe can\b|\bif we\b|\bthen\b|\bassume\b)", reasoning, re.I))

                # Base score on length and diversity
                score = 0.0
                if len(reasoning) > 200:
                    score += 0.4
                elif len(reasoning) > 50:
                    score += 0.2

                # Bonus for steps and logic (each capped so one signal can't dominate)
                score += min(0.3, step_markers * 0.1)
                score += min(0.2, logical_connectors * 0.05)
                score += min(0.1, thought_markers * 0.02)

                # Penalty for very short reasoning with tags
                if len(reasoning) < 20:
                    score = 0.1

                rewards.append(min(1.0, score))
            else:
                rewards.append(0.0)
        return rewards

    @staticmethod
    def length_penalty(completions: List[str], max_len: int = 1000, **kwargs) -> List[float]:
        """Penalize excessively long completions.

        Returns 1.0 for completions within max_len, then a linearly decreasing
        score for the excess: 1.0 - (len - max_len)/max_len, floored at 0.0
        (so 2*max_len or longer scores 0.0).

        BUG FIX: the previous formula max(0.0, 1.0 - len(c)/max_len) is always
        <= 0 whenever len(c) > max_len, so every over-length completion scored
        exactly 0.0 — a binary cliff instead of the intended graded penalty.
        """
        return [
            max(0.0, 1.0 - (len(c) - max_len) / max_len) if len(c) > max_len else 1.0
            for c in completions
        ]

    @staticmethod
    def combined_reward(completions: List[str], **kwargs) -> List[float]:
        """Combine format, accuracy, reasoning, and length rewards into one score.

        The reference answers (if any) arrive via the 'output' keyword in
        **kwargs and are forwarded to accuracy_reward.
        """
        f_rewards = RewardFunctions.format_reward(completions, **kwargs)
        a_rewards = RewardFunctions.accuracy_reward(completions, **kwargs)
        r_rewards = RewardFunctions.reasoning_reward(completions, **kwargs)
        l_rewards = RewardFunctions.length_penalty(completions, **kwargs)

        # Weight: 15% format, 55% accuracy, 20% reasoning, 10% length
        return [
            f * 0.15 + a * 0.55 + r * 0.2 + l * 0.1
            for f, a, r, l in zip(f_rewards, a_rewards, r_rewards, l_rewards)
        ]
submit.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import subprocess
4
+ from typing import Literal, Optional
5
+
6
def submit_hf_job(
    image: str = "pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
    flavor: str = "a10g-small",
    command: str = "python3 train.py",
    timeout: str = "2h",
    secrets: Optional[list] = None
):
    """Submit a job to Hugging Face Jobs using the hf-cli.

    Args:
        image: Docker image the job runs in.
        flavor: Hardware flavor (e.g. "a10g-small").
        command: Shell-style command string executed inside the image.
        timeout: Job timeout string passed to the CLI (e.g. "2h").
        secrets: Extra secret names to forward; HF_TOKEN is always forwarded.

    Raises:
        subprocess.CalledProcessError: if the hf CLI exits non-zero.
    """
    import shlex  # local import: only needed to tokenize `command`

    print(f"Submitting job to Hugging Face (Flavor: {flavor})")

    cmd = [
        "hf", "jobs", "run",
        "--flavor", flavor,
        "--timeout", timeout,
        # HF_TOKEN is always forwarded so the job can pull/push on the Hub.
        "--secrets", "HF_TOKEN"
    ]

    if secrets:
        for s in secrets:
            cmd.extend(["--secrets", s])

    # BUG FIX: str.split() broke shell-quoted arguments (e.g. the default
    # command elsewhere in this repo uses --task 'Complex Reasoning', which
    # str.split() would tear into --task, 'Complex, Reasoning').
    # shlex.split honors shell-style quoting.
    cmd.extend([image] + shlex.split(command))

    print(f"Executing: {' '.join(cmd)}")
    # List-form subprocess.run (shell=False) — arguments are passed verbatim.
    subprocess.run(cmd, check=True)
31
+
32
def submit_kaggle_job(
    script_path: str,
    competition: Optional[str] = None,
    dataset_path: Optional[str] = None
):
    """Outline a Kaggle submission for the given training script.

    Kaggle submission is often for competitions, but for general training it
    usually involves pushing a kernel/notebook. This is currently a stub: it
    only prints the intended steps (generating kernel-metadata.json and
    running `kaggle kernels push`) without executing anything.
    """
    print(f"Submitting script {script_path} to Kaggle...")

    # Planned (not yet executed) submission steps; a real implementation
    # would generate kernel-metadata.json and run:
    #   kaggle kernels push -p /path/to/kernel
    planned_steps = (
        "Step 1: Generate kernel-metadata.json",
        "Step 2: kaggle kernels push -p .",
    )
    for step in planned_steps:
        print(step)
51
+
52
if __name__ == "__main__":
    # CLI entry point: dispatch a training job to Hugging Face Jobs or Kaggle.
    parser = argparse.ArgumentParser(description="Unified Job Submission for Qwen Trainer")
    parser.add_argument("--platform", choices=["hf", "kaggle"], required=True)
    parser.add_argument("--flavor", type=str, default="a10g-small", help="HF Job flavor")
    parser.add_argument("--image", type=str, default="pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel")
    # NOTE(review): the default command contains a quoted argument
    # ('Complex Reasoning'); confirm submit_hf_job tokenizes it with
    # shell-style quoting rather than plain str.split().
    parser.add_argument("--command", type=str, default="python3 skills/qwen-trainer/scripts/train.py --model Qwen/Qwen3.5-7B --method grpo --use_agentic --task 'Complex Reasoning' --num_synthetic 100")
    parser.add_argument("--timeout", type=str, default="2h")

    args = parser.parse_args()

    if args.platform == "hf":
        submit_hf_job(
            image=args.image,
            flavor=args.flavor,
            command=args.command,
            timeout=args.timeout
        )
    elif args.platform == "kaggle":
        # For Kaggle we'd typically need the full script plus deps;
        # the script path is currently hard-coded, ignoring --command.
        submit_kaggle_job("skills/qwen-trainer/scripts/train.py")
train.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
# Disable Unsloth compilation for GRPO stability - must be set before imports
os.environ["UNSLOTH_COMPILE_DISABLE"] = "1"

import torch
# from unsloth import FastLanguageModel # Moved to lazy import
# Monkeypatch for TRANSFORMERS_CACHE which is needed by older llm_blender:
# newer transformers removed this attribute, so recreate it pointing at the
# default HF hub cache location if it is missing.
import transformers.utils.hub
if not hasattr(transformers.utils.hub, "TRANSFORMERS_CACHE"):
    transformers.utils.hub.TRANSFORMERS_CACHE = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub")

import pandas as pd
from dataclasses import dataclass, field
from typing import Optional, List, Literal, Dict, Any
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig, GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
from transformers import TrainingArguments
from huggingface_hub import HfApi
# Local module providing the agentic synthetic-data pipeline used by load_data().
from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
20
+
21
@dataclass
class TrainerConfig:
    """All knobs for the unified Qwen trainer (SFT / DPO / GRPO).

    Grouped as: model/dataset selection, quantization/dtype, LoRA, training
    hyperparameters, output/Hub publishing, agentic data generation, and
    GRPO-specific settings.
    """
    model_name: str = "Qwen/Qwen2.5-7B"
    dataset_name: str = ""  # HF dataset id or local file path; unused when use_agentic_data
    method: Literal["sft", "dpo", "grpo"] = "sft"
    platform: Literal["kaggle", "hf_jobs", "local"] = "local"
    max_seq_length: int = 4096
    load_in_4bit: bool = True
    load_in_8bit: bool = False
    torch_dtype: str = "bfloat16"  # "bfloat16", "float16", "float32"
    lora_r: int = 16
    lora_alpha: int = 16
    lora_dropout: float = 0.0
    learning_rate: float = 2e-4
    per_device_train_batch_size: int = 4
    gradient_accumulation_steps: int = 2
    num_train_epochs: int = 3
    output_dir: str = "outputs"
    push_to_hub: bool = True
    hub_model_id: Optional[str] = None
    # BUG FIX: previously `os.environ.get("HF_TOKEN")` was evaluated once at
    # class-definition (import) time, so a token exported after import was
    # never seen. default_factory defers the lookup to instance creation.
    hf_token: Optional[str] = field(default_factory=lambda: os.environ.get("HF_TOKEN"))

    # Agentic Data Generation
    use_agentic_data: bool = False
    task_description: str = ""
    num_synthetic_records: int = 10
    synthetic_data_path: str = "synthetic_data.jsonl"
    generate_reasoning: bool = False  # Whether to generate <reasoning>...<answer> format

    # GRPO-specific
    num_generations: int = 4
    max_completion_length: int = 512
    max_prompt_length: int = 512
    use_compile: bool = False  # Disable by default for GRPO stability
55
+
56
class QwenTrainer:
    """Unified trainer for Qwen models supporting SFT, DPO, and GRPO.

    Lifecycle (see run()): setup_model() -> load_data() -> run_<method>() ->
    save_and_push(). GRPO uses plain transformers+peft (to work around Unsloth
    instabilities noted inline); SFT and DPO use Unsloth for speed.
    """

    def __init__(self, config: TrainerConfig):
        """Store the config; model and tokenizer are created lazily in setup_model()."""
        self.config = config
        self.model = None      # set by setup_model()
        self.tokenizer = None  # set by setup_model()

    def setup_model(self):
        """Load the base model + tokenizer and attach LoRA adapters.

        GRPO path: transformers AutoModel + peft LoRA (optionally 4/8-bit via
        bitsandbytes). SFT/DPO path: Unsloth FastLanguageModel. Both paths end
        with a pad-token fallback to EOS.
        """
        print(f"Loading model: {self.config.model_name}")

        # Determine torch_dtype from the string config value.
        if self.config.torch_dtype == "bfloat16":
            dtype = torch.bfloat16
        elif self.config.torch_dtype == "float16":
            dtype = torch.float16
        else:
            dtype = torch.float32

        # GRPO Stability Fix: Use standard transformers for GRPO due to Unsloth bugs
        if self.config.method == "grpo":
            print(f"Using standard transformers + peft for GRPO stability (dtype: {self.config.torch_dtype})")
            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
            from peft import LoraConfig, get_peft_model

            # 4-bit takes priority over 8-bit; neither -> full-precision load.
            bnb_config = None
            if self.config.load_in_4bit:
                print("Loading in 4-bit quantization")
                bnb_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_quant_type="nf4",
                    bnb_4bit_compute_dtype=dtype,
                    bnb_4bit_use_double_quant=True,
                )
            elif self.config.load_in_8bit:
                print("Loading in 8-bit quantization")
                bnb_config = BitsAndBytesConfig(
                    load_in_8bit=True,
                )
            else:
                print(f"Loading in full {self.config.torch_dtype}")

            self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_name,
                quantization_config=bnb_config,
                torch_dtype=dtype,
                device_map="auto",
            )

            # GRPO Stability Fix: Ensure all non-quantized parts are in the target dtype
            # This is critical for preventing scalar type mismatches during KL div calculation
            print(f"Ensuring non-quantized layers are in {self.config.torch_dtype}")
            for name, module in self.model.named_modules():
                # Name-based heuristic: norm layers, the LM head, and embeddings
                # are the parts bitsandbytes leaves unquantized.
                if "norm" in name.lower() or "lm_head" in name.lower() or "embed" in name.lower():
                    module.to(dtype)

            peft_config = LoraConfig(
                r=self.config.lora_r,
                lora_alpha=self.config.lora_alpha,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                "gate_proj", "up_proj", "down_proj"],
                lora_dropout=self.config.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            self.model = get_peft_model(self.model, peft_config)

            # GRPO Stability Fix: Fix for TRL GRPOTrainer trying to access warnings_issued
            if not hasattr(self.model, "warnings_issued"):
                self.model.warnings_issued = {}

        else:
            # SFT and DPO still use Unsloth for performance
            from unsloth import FastLanguageModel
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.config.model_name,
                max_seq_length=self.config.max_seq_length,
                load_in_4bit=self.config.load_in_4bit,
                dtype=dtype,
            )

            print("Attaching LoRA via Unsloth")
            self.model = FastLanguageModel.get_peft_model(
                self.model,
                r=self.config.lora_r,
                target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                                "gate_proj", "up_proj", "down_proj"],
                lora_alpha=self.config.lora_alpha,
                lora_dropout=self.config.lora_dropout,
                bias="none",
                random_state=3407,
            )

        # Some Qwen tokenizers ship without a pad token; fall back to EOS.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate_agentic_data(self):
        """Generate synthetic training data via AgenticDataGenerator.

        Returns a pandas DataFrame of records that passed quality filtering.

        Raises:
            ValueError: if no generated record passed the quality threshold.
        """
        print(f"Generating agentic synthetic data for task: {self.config.task_description}")
        gen_config = AgenticDataConfig(
            num_records=self.config.num_synthetic_records,
            task_description=self.config.task_description,
            output_path=self.config.synthetic_data_path,
            min_quality_score=2,  # Allow partially correct/minor issues to pass
            generate_dpo=(self.config.method == "dpo"),
            generate_reasoning=(self.config.method == "grpo" or self.config.generate_reasoning)
        )
        generator = AgenticDataGenerator()
        df = generator.generate(gen_config)

        if df.empty:
            raise ValueError("No records passed the quality threshold during agentic data generation. Try a different task description or lower min_quality_score.")

        return df

    def load_data(self):
        """Load (or generate) the training dataset and shape it for the method.

        SFT gets a ChatML 'text' column; GRPO needs a 'prompt' column; DPO
        needs 'prompt'/'chosen'/'rejected'. Local jsonl/json/csv/parquet paths
        are loaded directly; anything else is treated as an HF dataset id.
        """
        if self.config.use_agentic_data:
            df = self.generate_agentic_data()
            dataset = Dataset.from_pandas(df)
        else:
            print(f"Loading dataset: {self.config.dataset_name}")
            if os.path.exists(self.config.dataset_name):
                # Local file: dispatch on extension.
                ext = self.config.dataset_name.split(".")[-1]
                if ext in ["jsonl", "json"]:
                    dataset = load_dataset("json", data_files=self.config.dataset_name, split="train")
                elif ext == "csv":
                    dataset = load_dataset("csv", data_files=self.config.dataset_name, split="train")
                elif ext == "parquet":
                    dataset = load_dataset("parquet", data_files=self.config.dataset_name, split="train")
                else:
                    # Unknown extension: let the datasets library resolve it.
                    dataset = load_dataset(self.config.dataset_name, split="train")
            else:
                dataset = load_dataset(self.config.dataset_name, split="train")

        # Standard ChatML formatting (assumes 'instruction'/'output' columns —
        # TODO confirm for externally supplied datasets).
        if self.config.method == "sft":
            def format_chatml(example):
                return {"text": f"<|im_start|>user\n{example['instruction']}<|im_end|>\n<|im_start|>assistant\n{example['output']}<|im_end|>"}
            dataset = dataset.map(format_chatml)
        elif self.config.method == "grpo":
            # For GRPO, we need 'prompt' column
            if "prompt" not in dataset.column_names:
                print("Mapping 'instruction' to 'prompt' for GRPO")
                def map_prompt(example):
                    return {"prompt": example["instruction"]}
                dataset = dataset.map(map_prompt)
        elif self.config.method == "dpo":
            # For DPO, we need 'prompt', 'chosen', 'rejected'
            if "prompt" not in dataset.column_names:
                print("Mapping columns for DPO")
                def map_dpo(example):
                    return {
                        "prompt": example["instruction"],
                        "chosen": example["output"],
                        # Placeholder rejected response when the dataset has none.
                        "rejected": example.get("rejected", "I don't know.")
                    }
                dataset = dataset.map(map_dpo)

        return dataset

    def run_sft(self, dataset):
        """Run supervised fine-tuning on a dataset with a 'text' column."""
        print("Running SFT")
        trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=dataset,
            dataset_text_field="text",
            max_seq_length=self.config.max_seq_length,
            args=SFTConfig(
                per_device_train_batch_size=self.config.per_device_train_batch_size,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                num_train_epochs=self.config.num_train_epochs,
                learning_rate=self.config.learning_rate,
                # Use bf16 when the GPU supports it, otherwise fp16.
                fp16=not torch.cuda.is_bf16_supported(),
                bf16=torch.cuda.is_bf16_supported(),
                logging_steps=1,
                optim="adamw_8bit",
                weight_decay=0.01,
                lr_scheduler_type="linear",
                seed=3407,
                output_dir=self.config.output_dir,
            ),
        )
        trainer.train()

    def run_dpo(self, dataset):
        """Run Direct Preference Optimization on a prompt/chosen/rejected dataset."""
        print("Running DPO")
        trainer = DPOTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            train_dataset=dataset,
            args=DPOConfig(
                per_device_train_batch_size=self.config.per_device_train_batch_size,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                num_train_epochs=self.config.num_train_epochs,
                learning_rate=self.config.learning_rate,
                fp16=not torch.cuda.is_bf16_supported(),
                bf16=torch.cuda.is_bf16_supported(),
                logging_steps=1,
                optim="adamw_8bit",
                output_dir=self.config.output_dir,
            ),
        )
        trainer.train()

    def run_grpo(self, dataset):
        """Run GRPO RL fine-tuning using RewardFunctions.combined_reward."""
        print("Running GRPO")
        from rewards import RewardFunctions
        # For GRPO, batch size must be a multiple of num_generations
        # Unsloth prefers per_device_train_batch_size == num_generations
        batch_size = max(self.config.per_device_train_batch_size, self.config.num_generations)

        # Stability: adjust max_completion_length by 1 if it's a power of 2 or common boundary
        # NOTE(review): empirical workaround — confirm the underlying issue is
        # still present in the pinned TRL/Unsloth versions.
        max_comp = self.config.max_completion_length
        if max_comp % 16 == 0:
            max_comp += 1
            print(f"Adjusted max_completion_length to {max_comp} for stability")

        trainer = GRPOTrainer(
            model=self.model,
            args=GRPOConfig(
                per_device_train_batch_size=batch_size,
                num_generations=self.config.num_generations,
                learning_rate=self.config.learning_rate,
                max_completion_length=max_comp,
                # max_prompt_length=self.config.max_prompt_length, # Not supported in this version
                beta=0.01,
                warmup_steps=10,
                logging_steps=1,
                output_dir=self.config.output_dir,
                optim="adamw_8bit",
                seed=3407,
            ),
            reward_funcs=[RewardFunctions.combined_reward],
            train_dataset=dataset,
        )
        trainer.train()

    def save_and_push(self):
        """Merge adapters into the base model and upload to the HF Hub.

        No-op unless config.push_to_hub is set. Unsloth models are merged via
        save_pretrained_merged; plain PEFT models via merge_and_unload.
        """
        if self.config.push_to_hub:
            print(f"Saving and pushing to Hub: {self.config.hub_model_id}")
            if self.config.method != "grpo":
                # Imported for its side effects on the model classes —
                # presumably required before save_pretrained_merged; confirm.
                from unsloth import FastLanguageModel

            if hasattr(self.model, "save_pretrained_merged"):
                # Unsloth path: merges LoRA weights and saves in 16-bit.
                self.model.save_pretrained_merged(
                    "merged_model", self.tokenizer, save_method="merged_16bit"
                )
            else:
                print("Merging and saving standard PEFT model")
                merged_model = self.model.merge_and_unload()
                merged_model.save_pretrained("merged_model")
                self.tokenizer.save_pretrained("merged_model")

            api = HfApi()
            api.create_repo(repo_id=self.config.hub_model_id, token=self.config.hf_token, exist_ok=True)
            api.upload_folder(
                folder_path="merged_model",
                repo_id=self.config.hub_model_id,
                token=self.config.hf_token,
            )

    def run(self):
        """End-to-end pipeline: load model, prepare data, train, then publish."""
        self.setup_model()
        dataset = self.load_data()

        if self.config.method == "sft":
            self.run_sft(dataset)
        elif self.config.method == "dpo":
            self.run_dpo(dataset)
        elif self.config.method == "grpo":
            self.run_grpo(dataset)

        self.save_and_push()
328
+
329
if __name__ == "__main__":
    # CLI entry point: parse arguments, build a TrainerConfig, and run training.
    import argparse
    parser = argparse.ArgumentParser(description="Qwen Unified Trainer (SFT, DPO, GRPO)")

    # Model/Dataset
    parser.add_argument("--model", type=str, default="Qwen/Qwen3.5-2B", help="HF model ID")
    parser.add_argument("--dataset", type=str, default="", help="HF dataset name or local path")
    parser.add_argument("--method", type=str, choices=["sft", "dpo", "grpo"], default="sft", help="Training method")

    # Training Hyperparameters
    parser.add_argument("--lr", type=float, default=2e-4, help="Learning rate")
    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs")
    parser.add_argument("--batch_size", type=int, default=4, help="Batch size per device")
    parser.add_argument("--grad_acc", type=int, default=2, help="Gradient accumulation steps")
    # NOTE(review): this CLI default (2048) differs from TrainerConfig's
    # field default (4096) — confirm which is intended.
    parser.add_argument("--max_seq_len", type=int, default=2048, help="Max sequence length")

    # Agentic Data
    parser.add_argument("--use_agentic", action="store_true", help="Generate synthetic data before training")
    parser.add_argument("--task", type=str, default="", help="Task description for synthetic data")
    parser.add_argument("--num_synthetic", type=int, default=10, help="Number of synthetic records")
    parser.add_argument("--synthetic_path", type=str, default="synthetic_data.jsonl", help="Path to save synthetic data")
    parser.add_argument("--reasoning", action="store_true", help="Generate reasoning format")

    # Output/Hub
    parser.add_argument("--output_dir", type=str, default="outputs", help="Output directory")
    parser.add_argument("--push", action="store_true", help="Push to HF Hub")
    parser.add_argument("--hub_id", type=str, default=None, help="HF Hub model ID")
    # NOTE(review): use_compile is stored on the config but has no visible
    # consumer in this file — verify it is read elsewhere.
    parser.add_argument("--no_compile", action="store_true", help="Disable Unsloth compilation for stability")
    parser.add_argument("--dtype", type=str, choices=["bfloat16", "float16", "float32"], default="bfloat16", help="Torch dtype")
    parser.add_argument("--load_8bit", action="store_true", help="Load in 8-bit")
    parser.add_argument("--no_4bit", action="store_true", help="Disable 4-bit loading")

    args = parser.parse_args()

    # Translate CLI flags into the dataclass config (note the negated flags:
    # --no_compile -> use_compile=False, --no_4bit -> load_in_4bit=False).
    config = TrainerConfig(
        model_name=args.model,
        dataset_name=args.dataset,
        method=args.method,
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_acc,
        max_seq_length=args.max_seq_len,
        use_agentic_data=args.use_agentic,
        task_description=args.task,
        num_synthetic_records=args.num_synthetic,
        synthetic_data_path=args.synthetic_path,
        generate_reasoning=args.reasoning,
        output_dir=args.output_dir,
        push_to_hub=args.push,
        hub_model_id=args.hub_id,
        use_compile=not args.no_compile,
        torch_dtype=args.dtype,
        load_in_8bit=args.load_8bit,
        load_in_4bit=not args.no_4bit
    )

    trainer = QwenTrainer(config)
    trainer.run()