jefffffff9 Claude Opus 4.7 committed on
Commit
bb78cbf
·
1 Parent(s): 9049ef3

Add torchcodec install for datasets 4.x audio decoding

Browse files

datasets >= 4.0 uses torchcodec as the audio backend; without it,
prepare_dataset fails with ImportError when accessing batch['audio'].

- scripts/runpod_setup.sh: pip install torchcodec, falling back to a pin chosen
from the installed torch version (torch 2.4/2.5/2.6/2.7–2.8 → torchcodec 0.1/0.2/0.3/0.4)
- kaggle_master_trainer.ipynb Cell 2: defensive try/except install so
the notebook is self-healing on any fresh environment

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

notebooks/kaggle_master_trainer.ipynb CHANGED
@@ -34,25 +34,7 @@
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
37
- "# -- Cell 2: Install minimal missing dependencies ----------------------------\n",
38
- "# We do NOT use PEFT/LoRA, so system transformers/numpy/scipy are fine as-is.\n",
39
- "# Kaggle does not ship jiwer (WER metric) -- install it now.\n",
40
- "import subprocess, sys\n",
41
- "\n",
42
- "subprocess.check_call([\n",
43
- " sys.executable, '-m', 'pip', 'install', '-q', 'jiwer==3.0.4',\n",
44
- "])\n",
45
- "\n",
46
- "import torch\n",
47
- "print(f\"torch : {torch.__version__}\")\n",
48
- "print(f\"CUDA avail : {torch.cuda.is_available()}\")\n",
49
- "if torch.cuda.is_available():\n",
50
- " print(f\"GPU : {torch.cuda.get_device_name(0)}\")\n",
51
- "\n",
52
- "import transformers, datasets as ds_lib\n",
53
- "print(f\"transformers: {transformers.__version__}\")\n",
54
- "print(f\"datasets : {ds_lib.__version__}\")\n",
55
- "print(\"All packages ready.\")\n"
56
  ]
57
  },
58
  {
@@ -81,7 +63,9 @@
81
  "id": "cell-login",
82
  "metadata": {},
83
  "outputs": [],
84
- "source": "# ── Cell 5: HuggingFace login + directory setup ───────────────────────────────\nimport os\nfrom pathlib import Path\n\nHF_TOKEN = None\n\n# Kaggle secrets (preferred)\ntry:\n from kaggle_secrets import UserSecretsClient # type: ignore\n HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN')\n print('HF_TOKEN loaded from Kaggle secrets.')\nexcept Exception:\n pass\n\n# Colab secrets (fallback)\nif not HF_TOKEN:\n try:\n from google.colab import userdata # type: ignore\n HF_TOKEN = userdata.get('HF_TOKEN')\n print('HF_TOKEN loaded from Colab secrets.')\n except Exception:\n pass\n\nif not HF_TOKEN:\n HF_TOKEN = os.environ.get('HF_TOKEN', '')\n\nif not HF_TOKEN:\n raise ValueError(\n 'HF_TOKEN not found.\\n'\n 'Kaggle: Add-ons → Secrets → add HF_TOKEN → toggle \"Attach to notebook\" ON'\n )\n\nfrom huggingface_hub import login, HfApi\nlogin(token=HF_TOKEN, add_to_git_credential=False)\napi = HfApi(token=HF_TOKEN)\nos.environ['HF_TOKEN'] = HF_TOKEN\n\n# Create output directories\nfor d in [OUTPUT_DIR, DATA_DIR, AUDIO_DIR]:\n Path(d).mkdir(parents=True, exist_ok=True)\n\nprint(f'✅ Logged in | output: {OUTPUT_DIR}')"
 
 
85
  },
86
  {
87
  "cell_type": "code",
 
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
37
+ "# -- Cell 2: Install minimal missing dependencies ----------------------------\n# We do NOT use PEFT/LoRA, so system transformers/numpy/scipy are fine as-is.\n# Kaggle does not ship jiwer (WER metric) -- install it now.\nimport subprocess, sys\n\nsubprocess.check_call([\n sys.executable, '-m', 'pip', 'install', '-q', 'jiwer==3.0.4',\n])\n\n# datasets >= 4.0 uses torchcodec for audio decoding. Install if missing.\ntry:\n import torchcodec # noqa\nexcept ImportError:\n print('torchcodec not found — installing to match torch ...')\n try:\n subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'torchcodec'])\n except subprocess.CalledProcessError:\n import torch as _t\n _tv = _t.__version__.split('+')[0]\n _pin = {'2.4': '0.1.*', '2.5': '0.2.*', '2.6': '0.3.*', '2.7': '0.4.*', '2.8': '0.4.*'}.get(_tv[:3])\n if _pin:\n subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', f'torchcodec=={_pin}'])\n else:\n print(f'⚠️ Unknown torch {_tv}; install torchcodec manually if audio decoding fails')\n\nimport torch\nprint(f\"torch : {torch.__version__}\")\nprint(f\"CUDA avail : {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU : {torch.cuda.get_device_name(0)}\")\n\nimport transformers, datasets as ds_lib\nprint(f\"transformers: {transformers.__version__}\")\nprint(f\"datasets : {ds_lib.__version__}\")\nprint(\"All packages ready.\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  ]
39
  },
40
  {
 
63
  "id": "cell-login",
64
  "metadata": {},
65
  "outputs": [],
66
+ "source": [
67
+ "# ── Cell 5: HuggingFace login + directory setup ───────────────────────────────\nimport os\nfrom pathlib import Path\n\nHF_TOKEN = None\n\n# Kaggle secrets (preferred)\ntry:\n from kaggle_secrets import UserSecretsClient # type: ignore\n HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN')\n print('HF_TOKEN loaded from Kaggle secrets.')\nexcept Exception:\n pass\n\n# Colab secrets (fallback)\nif not HF_TOKEN:\n try:\n from google.colab import userdata # type: ignore\n HF_TOKEN = userdata.get('HF_TOKEN')\n print('HF_TOKEN loaded from Colab secrets.')\n except Exception:\n pass\n\n# .env file (RunPod / local) - look in common locations\nif not HF_TOKEN:\n for _env_path in ['/workspace/sahel-voice/.env', './.env', '../.env', os.path.expanduser('~/.env')]:\n if os.path.isfile(_env_path):\n with open(_env_path, encoding='utf-8') as _f:\n for _line in _f:\n _line = _line.strip()\n if _line.startswith('HF_TOKEN='):\n _val = _line.split('=', 1)[1].strip()\n if _val and len(_val) >= 2 and _val[0] in ('\"', \"'\") and _val[-1] == _val[0]:\n _val = _val[1:-1]\n HF_TOKEN = _val\n print(f'HF_TOKEN loaded from {_env_path}')\n break\n if HF_TOKEN:\n break\n\nif not HF_TOKEN:\n HF_TOKEN = os.environ.get('HF_TOKEN', '')\n if HF_TOKEN:\n print('HF_TOKEN loaded from environment variable.')\n\nif not HF_TOKEN:\n raise ValueError(\n 'HF_TOKEN not found.\\n'\n 'Kaggle: Add-ons → Secrets → add HF_TOKEN → toggle \"Attach to notebook\" ON'\n )\n\nfrom huggingface_hub import login, HfApi\nlogin(token=HF_TOKEN, add_to_git_credential=False)\napi = HfApi(token=HF_TOKEN)\nos.environ['HF_TOKEN'] = HF_TOKEN\n\n# Create output directories\nfor d in [OUTPUT_DIR, DATA_DIR, AUDIO_DIR]:\n Path(d).mkdir(parents=True, exist_ok=True)\n\nprint(f'✅ Logged in | output: {OUTPUT_DIR}')"
68
+ ]
69
  },
70
  {
71
  "cell_type": "code",
scripts/runpod_setup.sh CHANGED
@@ -59,6 +59,21 @@ pip install -q \
59
  "pypdf>=4.0.0" \
60
  "python-docx>=1.1.0"
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  # 3. HF token prompt (one-time)
63
  ENV_FILE="${REPO_DIR}/.env"
64
  if [[ -z "${HF_TOKEN:-}" ]] && [[ ! -f "${ENV_FILE}" ]]; then
 
59
  "pypdf>=4.0.0" \
60
  "python-docx>=1.1.0"
61
 
62
+ # torchcodec — required by datasets>=4.0 for audio decoding.
63
+ # Version must match the installed torch; let pip resolve, fallback to pinned.
64
+ echo ">> Installing torchcodec (audio backend for datasets 4.x)..."
65
+ pip install -q torchcodec || {
66
+ TORCH_VER=$(python -c "import torch; print(torch.__version__.split('+')[0])" 2>/dev/null || echo "unknown")
67
+ echo " pip resolve failed; torch=${TORCH_VER}. Trying pinned versions..."
68
+ case "${TORCH_VER}" in
69
+ 2.4.*) pip install -q "torchcodec==0.1.*" ;;
70
+ 2.5.*) pip install -q "torchcodec==0.2.*" ;;
71
+ 2.6.*) pip install -q "torchcodec==0.3.*" ;;
72
+ 2.7.*|2.8.*) pip install -q "torchcodec==0.4.*" ;;
73
+ *) echo " ⚠️ Unknown torch version — install torchcodec manually." ;;
74
+ esac
75
+ }
76
+
77
  # 3. HF token prompt (one-time)
78
  ENV_FILE="${REPO_DIR}/.env"
79
  if [[ -z "${HF_TOKEN:-}" ]] && [[ ! -f "${ENV_FILE}" ]]; then