jefffffff9 and Claude Sonnet 4.6 committed on
Commit cd017e2 · 1 Parent(s): 24b1617

Fix Cell 16 ValueError: load model fp32 so AMP gradient scaler works

Root cause: loading the model in fp16 while also setting fp16=True in the
training args makes the AMP GradScaler see fp16 params and raise
"Attempting to unscale FP16 gradients".
Fix: load fp32 and let AMP cast internally during the forward pass.
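
For context, a minimal sketch of the failure mode (a hypothetical toy
parameter standing in for the model; needs a CUDA device; not the notebook's
code). GradScaler refuses to unscale gradients that are themselves fp16, and
gradients inherit the parameter dtype:

    import torch

    # Toy fp16 parameter, like a model loaded with torch_dtype=torch.float16
    p = torch.nn.Parameter(torch.ones(4, device="cuda", dtype=torch.float16))
    opt = torch.optim.SGD([p], lr=1e-3)
    scaler = torch.cuda.amp.GradScaler()  # torch.amp.GradScaler("cuda") on newer torch

    with torch.autocast("cuda", dtype=torch.float16):
        loss = (p * 2).sum()

    scaler.scale(loss).backward()  # grads come out fp16, matching the param dtype
    scaler.unscale_(opt)           # ValueError: Attempting to unscale FP16 gradients.

With the parameter stored in fp32, autocast still runs the forward pass in
fp16, but gradients land in fp32 and unscale_() succeeds -- which is why the
diff below loads the model with torch_dtype=torch.float32 while keeping
fp16=True in the training args.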

Cell 13: torch_dtype float16 -> float32; remove manual gradient_checkpointing_enable
Cell 14: remove explicit .to(float16) cast in data collator (AMP handles it)
Cell 15: gradient_checkpointing False->True; remove deprecated logging_dir kwarg
Cell 17: remove .half() on eval features (fp32 model needs fp32 input)
Cell 19: fix f-string ternary precedence so commit msg always includes all fields (see the sketch below)
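
The Cell 19 bug deserves a standalone illustration because it fails silently.
A hypothetical sketch with invented names (not the notebook's code):

    have_wer = True
    msg = (
        'prefix | '
        'wer 12.3% | ' if have_wer else 'wer n/a | '
        'suffix'
    )
    print(msg)  # 'prefix | wer 12.3% | ' -- 'suffix' is silently dropped

    # Adjacent string literals concatenate before the conditional applies, so the
    # expression parses as ('prefix | wer 12.3% | ') if have_wer else ('wer n/a | suffix').
    # Binding the variable part to its own name first keeps every field:
    wer_part = 'wer 12.3% | ' if have_wer else 'wer n/a | '
    msg = 'prefix | ' + wer_part + 'suffix'
    print(msg)  # 'prefix | wer 12.3% | suffix'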

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -170,7 +170,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "# -- Cell 13: Load Whisper-small (fp16) + freeze most layers ------------------\n# PEFT LoRA causes TypeError with transformers 5.x regardless of which layers\n# are targeted: PeftModelForSeq2SeqLM wraps the entire model in BaseTuner whose\n# forward(*args, **kwargs) passes input_ids in a way that causes WhisperDecoder\n# to receive it twice. Fix: skip PEFT entirely. Freeze all params, unfreeze\n# last 2 decoder layers (~5% trainable params -- same capacity as LoRA r=32).\nimport torch\nfrom transformers.models.whisper.modeling_whisper import WhisperForConditionalGeneration\n\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\nprint(f\"Loading {WHISPER_MODEL_ID} in fp16 on {device} ...\")\n\nmodel = WhisperForConditionalGeneration.from_pretrained(\n WHISPER_MODEL_ID,\n torch_dtype=torch.float16,\n token=HF_TOKEN,\n)\nmodel = model.to(device)\n\n# Force target language -- avoids language-detection overhead during training\n# Move generation params to GenerationConfig (avoids deprecation warning)\n_dec_ids = processor.get_decoder_prompt_ids(language='fr', task='transcribe')\nmodel.generation_config.forced_decoder_ids = _dec_ids\nmodel.generation_config.suppress_tokens = []\nmodel.config.use_cache = False # required for gradient checkpointing\n\n# ── Freeze all params, then selectively unfreeze ─────────────────────────────\nfor param in model.parameters():\n param.requires_grad = False\n\n# Unfreeze last 2 decoder layers + final layer norm + output projection.\n# These handle language-specific token generation.\nfor module in [\n model.model.decoder.layers[-2],\n model.model.decoder.layers[-1],\n model.model.decoder.layer_norm,\n model.proj_out,\n]:\n for param in module.parameters():\n param.requires_grad = True\n\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal = sum(p.numel() for p in model.parameters())\nprint(f\"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.1f}%)\")\n\n# Enable gradient checkpointing -- reduces activation memory at slight speed cost\nmodel.gradient_checkpointing_enable()\nmodel.train()\n\nvram_mb = torch.cuda.memory_allocated() / 1e6 if torch.cuda.is_available() else 0\ntotal_vram = torch.cuda.get_device_properties(0).total_memory / 1e6 if torch.cuda.is_available() else 0\nprint(f\"VRAM used: {vram_mb:.0f} MB / {total_vram:.0f} MB\")\nprint(f\"Model ready on {device}\")\n"
+ "# -- Cell 13: Load Whisper-small (fp32) + freeze most layers ------------------\n# PEFT LoRA causes TypeError with transformers 5.x regardless of which layers\n# are targeted: PeftModelForSeq2SeqLM wraps the entire model in BaseTuner whose\n# forward(*args, **kwargs) passes input_ids in a way that causes WhisperDecoder\n# to receive it twice. Fix: skip PEFT entirely. Freeze all params, unfreeze\n# last 2 decoder layers (~5% trainable params -- same capacity as LoRA r=32).\nimport torch\nfrom transformers.models.whisper.modeling_whisper import WhisperForConditionalGeneration\n\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\"\nprint(f\"Loading {WHISPER_MODEL_ID} in fp32 on {device} ...\")\n\nmodel = WhisperForConditionalGeneration.from_pretrained(\n WHISPER_MODEL_ID,\n torch_dtype=torch.float32, # fp32 storage -- AMP casts internally during training\n token=HF_TOKEN,\n)\nmodel = model.to(device)\n\n# Force target language -- avoids language-detection overhead during training\n# Move generation params to GenerationConfig (avoids deprecation warning)\n_dec_ids = processor.get_decoder_prompt_ids(language='fr', task='transcribe')\nmodel.generation_config.forced_decoder_ids = _dec_ids\nmodel.generation_config.suppress_tokens = []\nmodel.config.use_cache = False # required for gradient checkpointing\n\n# ── Freeze all params, then selectively unfreeze ─────────────────────────────\nfor param in model.parameters():\n param.requires_grad = False\n\n# Unfreeze last 2 decoder layers + final layer norm + output projection.\n# These handle language-specific token generation.\nfor module in [\n model.model.decoder.layers[-2],\n model.model.decoder.layers[-1],\n model.model.decoder.layer_norm,\n model.proj_out,\n]:\n for param in module.parameters():\n param.requires_grad = True\n\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal = sum(p.numel() for p in model.parameters())\nprint(f\"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.1f}%)\")\n\n# gradient_checkpointing enabled via TrainingArguments below (args handle enable/disable)\nmodel.train()\n\nvram_mb = torch.cuda.memory_allocated() / 1e6 if torch.cuda.is_available() else 0\ntotal_vram = torch.cuda.get_device_properties(0).total_memory / 1e6 if torch.cuda.is_available() else 0\nprint(f\"VRAM used: {vram_mb:.0f} MB / {total_vram:.0f} MB\")\nprint(f\"Model ready on {device}\")\n"
  ]
  },
  {
@@ -179,7 +179,9 @@
  "id": "cell-collator",
  "metadata": {},
  "outputs": [],
- "source": "# -- Cell 14: Data collator + WER metric --------------------------------------\nimport jiwer\nfrom dataclasses import dataclass\nfrom typing import Any, Dict, List\n\ntransform = jiwer.Compose([\n jiwer.ToLowerCase(),\n jiwer.RemoveMultipleSpaces(),\n jiwer.Strip(),\n jiwer.RemovePunctuation(),\n jiwer.ReduceToListOfListOfWords(),\n])\n\n\n@dataclass\nclass DataCollatorSpeechSeq2SeqWithPadding:\n processor: Any\n\n def __call__(self, features: List[Dict]) -> Dict:\n import torch\n input_feats = [{'input_features': f['input_features']} for f in features]\n batch = self.processor.feature_extractor.pad(input_feats, return_tensors='pt')\n\n # Cast to fp16 to match the model -- avoids dtype mismatch in conv1\n batch['input_features'] = batch['input_features'].to(torch.float16)\n\n label_feats = [{'input_ids': f['labels']} for f in features]\n labels_batch = self.processor.tokenizer.pad(label_feats, return_tensors='pt')\n labels = labels_batch['input_ids'].masked_fill(\n labels_batch.attention_mask.ne(1), -100\n )\n if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().item():\n labels = labels[:, 1:]\n batch['labels'] = labels\n return batch\n\n\ndef compute_metrics(pred):\n pred_ids = pred.predictions\n label_ids = pred.label_ids\n label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n\n pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n\n wer = jiwer.wer(label_str, pred_str,\n hypothesis_transform=transform,\n reference_transform=transform)\n return {'wer': round(wer, 4)}\n\n\ncollator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)\nprint('Collator and WER metric ready')"
+ "source": [
+ "# -- Cell 14: Data collator + WER metric --------------------------------------\nimport jiwer\nfrom dataclasses import dataclass\nfrom typing import Any, Dict, List\n\ntransform = jiwer.Compose([\n jiwer.ToLowerCase(),\n jiwer.RemoveMultipleSpaces(),\n jiwer.Strip(),\n jiwer.RemovePunctuation(),\n jiwer.ReduceToListOfListOfWords(),\n])\n\n\n@dataclass\nclass DataCollatorSpeechSeq2SeqWithPadding:\n processor: Any\n\n def __call__(self, features: List[Dict]) -> Dict:\n import torch\n input_feats = [{'input_features': f['input_features']} for f in features]\n batch = self.processor.feature_extractor.pad(input_feats, return_tensors='pt')\n\n # Leave features in fp32 -- AMP (fp16=True in TrainingArgs) handles casting\n\n label_feats = [{'input_ids': f['labels']} for f in features]\n labels_batch = self.processor.tokenizer.pad(label_feats, return_tensors='pt')\n labels = labels_batch['input_ids'].masked_fill(\n labels_batch.attention_mask.ne(1), -100\n )\n if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().item():\n labels = labels[:, 1:]\n batch['labels'] = labels\n return batch\n\n\ndef compute_metrics(pred):\n pred_ids = pred.predictions\n label_ids = pred.label_ids\n label_ids[label_ids == -100] = processor.tokenizer.pad_token_id\n\n pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)\n label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)\n\n wer = jiwer.wer(label_str, pred_str,\n hypothesis_transform=transform,\n reference_transform=transform)\n return {'wer': round(wer, 4)}\n\n\ncollator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)\nprint('Collator and WER metric ready')"
+ ]
  },
  {
  "cell_type": "markdown",
@@ -194,7 +196,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "# -- Cell 15: Training arguments ----------------------------------------------\nimport inspect\nfrom transformers import Seq2SeqTrainingArguments\n\n# transformers 4.x used 'evaluation_strategy'; 4.45+ renamed to 'eval_strategy'.\n# Detect which name this installed version accepts.\n_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters\n_eval_key = 'eval_strategy' if 'eval_strategy' in _params else 'evaluation_strategy'\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=OUTPUT_DIR,\n\n max_steps=MAX_STEPS,\n warmup_steps=WARMUP_STEPS,\n logging_steps=LOGGING_STEPS,\n save_steps=SAVE_STEPS,\n eval_steps=EVAL_STEPS,\n\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=8,\n gradient_accumulation_steps=GRAD_ACCUM,\n\n fp16=True,\n gradient_checkpointing=False,\n\n learning_rate=LEARNING_RATE,\n lr_scheduler_type='cosine',\n weight_decay=0.0,\n adam_beta1=0.9,\n adam_beta2=0.98,\n adam_epsilon=1e-6,\n\n **{_eval_key: 'steps'},\n predict_with_generate=True,\n generation_max_length=225,\n load_best_model_at_end=True,\n metric_for_best_model='wer',\n greater_is_better=False,\n\n save_total_limit=3,\n save_strategy='steps',\n\n report_to=['tensorboard'],\n logging_dir=f'{OUTPUT_DIR}/logs',\n push_to_hub=False,\n)\n\nprint(f'Training arguments ready (using {_eval_key}=steps)')\nprint(f' Effective batch size: {BATCH_SIZE * GRAD_ACCUM}')\nprint(f' Max steps : {MAX_STEPS}')\n"
+ "# -- Cell 15: Training arguments ----------------------------------------------\nimport inspect\nfrom transformers import Seq2SeqTrainingArguments\n\n# transformers 4.x used 'evaluation_strategy'; 4.45+ renamed to 'eval_strategy'.\n# Detect which name this installed version accepts.\n_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters\n_eval_key = 'eval_strategy' if 'eval_strategy' in _params else 'evaluation_strategy'\n\ntraining_args = Seq2SeqTrainingArguments(\n output_dir=OUTPUT_DIR,\n\n max_steps=MAX_STEPS,\n warmup_steps=WARMUP_STEPS,\n logging_steps=LOGGING_STEPS,\n save_steps=SAVE_STEPS,\n eval_steps=EVAL_STEPS,\n\n per_device_train_batch_size=BATCH_SIZE,\n per_device_eval_batch_size=8,\n gradient_accumulation_steps=GRAD_ACCUM,\n\n fp16=True,\n gradient_checkpointing=True, # reduces activation memory on T4\n\n learning_rate=LEARNING_RATE,\n lr_scheduler_type='cosine',\n weight_decay=0.0,\n adam_beta1=0.9,\n adam_beta2=0.98,\n adam_epsilon=1e-6,\n\n **{_eval_key: 'steps'},\n predict_with_generate=True,\n generation_max_length=225,\n load_best_model_at_end=True,\n metric_for_best_model='wer',\n greater_is_better=False,\n\n save_total_limit=3,\n save_strategy='steps',\n\n report_to=['tensorboard'], # tensorboard logs to OUTPUT_DIR/runs by default\n push_to_hub=False,\n)\n\nprint(f'Training arguments ready (using {_eval_key}=steps)')\nprint(f' Effective batch size: {BATCH_SIZE * GRAD_ACCUM}')\nprint(f' Max steps : {MAX_STEPS}')\n"
  ]
  },
  {
@@ -220,7 +222,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "# ── Cell 17: WER evaluation ───────────────────────────────────────────────────\nprint('Running full evaluation on eval split ...')\neval_results = trainer.evaluate()\n\nwer_score = eval_results.get('eval_wer', float('nan'))\nprint(f'\\n📊 Final WER : {wer_score:.1%}')\nprint(f' Eval loss : {eval_results.get(\"eval_loss\", float(\"nan\")):.4f}')\n\n# Show a few example transcriptions side-by-side\nimport random, torch\nprint('\\n── Sample predictions ───────────────────────────────')\nsamples = random.sample(range(len(eval_ds)), min(5, len(eval_ds)))\nfor idx in samples:\n item = eval_ds[idx]\n feats = torch.tensor(item['input_features']).unsqueeze(0).to(model.device)\n with torch.no_grad():\n pred_ids = model.generate(\n feats.half(),\n max_new_tokens=128,\n )\n pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0]\n labels = [t if t != -100 else processor.tokenizer.pad_token_id\n for t in item['labels']]\n ref_str = processor.tokenizer.decode(labels, skip_special_tokens=True)\n print(f' Ref : {ref_str}')\n print(f' Pred: {pred_str}')\n print()"
+ "# ── Cell 17: WER evaluation ───────────────────────────────────────────────────\nprint('Running full evaluation on eval split ...')\neval_results = trainer.evaluate()\n\nwer_score = eval_results.get('eval_wer', float('nan'))\nprint(f'\\n📊 Final WER : {wer_score:.1%}')\nprint(f' Eval loss : {eval_results.get(\"eval_loss\", float(\"nan\")):.4f}')\n\n# Show a few example transcriptions side-by-side\nimport random, torch\nprint('\\n── Sample predictions ───────────────────────────────')\nsamples = random.sample(range(len(eval_ds)), min(5, len(eval_ds)))\nfor idx in samples:\n item = eval_ds[idx]\n feats = torch.tensor(item['input_features']).unsqueeze(0).to(model.device)\n with torch.no_grad():\n pred_ids = model.generate(\n feats, # fp32 to match model dtype\n max_new_tokens=128,\n )\n pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0]\n labels = [t if t != -100 else processor.tokenizer.pad_token_id\n for t in item['labels']]\n ref_str = processor.tokenizer.decode(labels, skip_special_tokens=True)\n print(f' Ref : {ref_str}')\n print(f' Pred: {pred_str}')\n print()"
  ]
  },
  {
@@ -246,7 +248,7 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "# ── Cell 19: Push adapter to HF Model repo ───────────────────────────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} fine-tuned checkpoint — '\n f'{train_result.global_step} steps | '\n f'WER {wer_score:.1%} | ' if wer_score == wer_score else f'WER n/a | '\n f'{len(correction_records)} corrections + WaxalNLP'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
+ "# ── Cell 19: Push adapter to HF Model repo ───────────────────────────────────\nfrom huggingface_hub import HfApi, create_repo\n\n# Ensure repo exists\ncreate_repo(ADAPTER_REPO_ID, repo_type='model', private=True,\n exist_ok=True, token=HF_TOKEN)\n\n_wer_part = f'{wer_score:.1%}' if wer_score == wer_score else 'n/a'\ncommit_msg = (\n f'[{VERSION_TAG}] {LANG_NAME} fine-tuned checkpoint — '\n f'{train_result.global_step} steps | WER {_wer_part} | '\n f'{len(correction_records)} corrections + WaxalNLP'\n)\n\napi.upload_folder(\n folder_path=OUTPUT_DIR,\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n path_in_repo=PATH_IN_REPO,\n commit_message=commit_msg,\n)\nprint(f'✅ Adapter uploaded: {ADAPTER_REPO_ID}/{PATH_IN_REPO}')\n\n# Create a Git tag for this version\ntry:\n api.create_tag(\n repo_id=ADAPTER_REPO_ID,\n repo_type='model',\n tag=VERSION_TAG,\n tag_message=commit_msg,\n token=HF_TOKEN,\n )\n print(f'✅ Tag created : {VERSION_TAG}')\nexcept Exception as e:\n print(f'⚠️ Tag creation skipped: {e}')"
  ]
  },
  {