Spaces:

MataStrategy
/

ground-zero

Running

jefffffff9 Claude Opus 4.7 committed 14 days ago

Commit

bb78cbf

1 Parent(s): 9049ef3

Add torchcodec install for datasets 4.x audio decoding

datasets >= 4.0 uses torchcodec as the audio backend; prepare_dataset
would fail with ImportError on batch['audio'] without it.

- scripts/runpod_setup.sh: pip install torchcodec with torch-version
pinning fallback (0.1/0.2/0.3/0.4 → torch 2.4/2.5/2.6/2.7+)
- kaggle_master_trainer.ipynb Cell 2: defensive try/except install so
the notebook is self-healing on any fresh environment

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show

notebooks/kaggle_master_trainer.ipynb +4 -20
scripts/runpod_setup.sh +15 -0

notebooks/kaggle_master_trainer.ipynb CHANGED Viewed

@@ -34,25 +34,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# -- Cell 2: Install minimal missing dependencies ----------------------------\n",
-    "# We do NOT use PEFT/LoRA, so system transformers/numpy/scipy are fine as-is.\n",
-    "# Kaggle does not ship jiwer (WER metric) -- install it now.\n",
-    "import subprocess, sys\n",
-    "\n",
-    "subprocess.check_call([\n",
-    "    sys.executable, '-m', 'pip', 'install', '-q', 'jiwer==3.0.4',\n",
-    "])\n",
-    "\n",
-    "import torch\n",
-    "print(f\"torch      : {torch.__version__}\")\n",
-    "print(f\"CUDA avail : {torch.cuda.is_available()}\")\n",
-    "if torch.cuda.is_available():\n",
-    "    print(f\"GPU        : {torch.cuda.get_device_name(0)}\")\n",
-    "\n",
-    "import transformers, datasets as ds_lib\n",
-    "print(f\"transformers: {transformers.__version__}\")\n",
-    "print(f\"datasets    : {ds_lib.__version__}\")\n",
-    "print(\"All packages ready.\")\n"
    ]
   },
   {
@@ -81,7 +63,9 @@
    "id": "cell-login",
    "metadata": {},
    "outputs": [],
-   "source": "# ── Cell 5: HuggingFace login + directory setup ───────────────────────────────\nimport os\nfrom pathlib import Path\n\nHF_TOKEN = None\n\n# Kaggle secrets (preferred)\ntry:\n    from kaggle_secrets import UserSecretsClient  # type: ignore\n    HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN')\n    print('HF_TOKEN loaded from Kaggle secrets.')\nexcept Exception:\n    pass\n\n# Colab secrets (fallback)\nif not HF_TOKEN:\n    try:\n        from google.colab import userdata  # type: ignore\n        HF_TOKEN = userdata.get('HF_TOKEN')\n        print('HF_TOKEN loaded from Colab secrets.')\n    except Exception:\n        pass\n\nif not HF_TOKEN:\n    HF_TOKEN = os.environ.get('HF_TOKEN', '')\n\nif not HF_TOKEN:\n    raise ValueError(\n        'HF_TOKEN not found.\\n'\n        'Kaggle: Add-ons → Secrets → add HF_TOKEN → toggle \"Attach to notebook\" ON'\n    )\n\nfrom huggingface_hub import login, HfApi\nlogin(token=HF_TOKEN, add_to_git_credential=False)\napi = HfApi(token=HF_TOKEN)\nos.environ['HF_TOKEN'] = HF_TOKEN\n\n# Create output directories\nfor d in [OUTPUT_DIR, DATA_DIR, AUDIO_DIR]:\n    Path(d).mkdir(parents=True, exist_ok=True)\n\nprint(f'✅ Logged in | output: {OUTPUT_DIR}')"
   },
   {
    "cell_type": "code",

    "metadata": {},
    "outputs": [],
    "source": [
+    "# -- Cell 2: Install minimal missing dependencies ----------------------------\n# We do NOT use PEFT/LoRA, so system transformers/numpy/scipy are fine as-is.\n# Kaggle does not ship jiwer (WER metric) -- install it now.\nimport subprocess, sys\n\nsubprocess.check_call([\n    sys.executable, '-m', 'pip', 'install', '-q', 'jiwer==3.0.4',\n])\n\n# datasets >= 4.0 uses torchcodec for audio decoding. Install if missing.\ntry:\n    import torchcodec  # noqa\nexcept ImportError:\n    print('torchcodec not found — installing to match torch ...')\n    try:\n        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'torchcodec'])\n    except subprocess.CalledProcessError:\n        # Plain install failed: fall back to a torchcodec pin chosen from the installed torch.\n        import torch as _t\n        _tv = _t.__version__.split('+')[0]\n        # Key on 'major.minor'; slicing the first 3 chars would read 2.10.x as 2.1.\n        _mm = '.'.join(_tv.split('.')[:2])\n        # Pairs follow the torchcodec README compatibility table.\n        # NOTE(review): confirm the oldest pins still expose the audio decoder datasets 4.x needs.\n        _pin = {'2.4': '0.0.3', '2.5': '0.1.*', '2.6': '0.2.*', '2.7': '0.4.*', '2.8': '0.7.*', '2.9': '0.8.*'}.get(_mm)\n        if _pin:\n            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', f'torchcodec=={_pin}'])\n        else:\n            print(f'⚠️  Unknown torch {_tv}; install torchcodec manually if audio decoding fails')\n\nimport torch\nprint(f\"torch      : {torch.__version__}\")\nprint(f\"CUDA avail : {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n    print(f\"GPU        : {torch.cuda.get_device_name(0)}\")\n\nimport transformers, datasets as ds_lib\nprint(f\"transformers: {transformers.__version__}\")\nprint(f\"datasets    : {ds_lib.__version__}\")\nprint(\"All packages ready.\")\n"
    ]
   },
   {
    "id": "cell-login",
    "metadata": {},
    "outputs": [],
+   "source": [
+    "# ── Cell 5: HuggingFace login + directory setup ───────────────────────────────\nimport os\nfrom pathlib import Path\n\nHF_TOKEN = None\n\n# Kaggle secrets (preferred)\ntry:\n    from kaggle_secrets import UserSecretsClient  # type: ignore\n    HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN')\n    print('HF_TOKEN loaded from Kaggle secrets.')\nexcept Exception:\n    pass\n\n# Colab secrets (fallback)\nif not HF_TOKEN:\n    try:\n        from google.colab import userdata  # type: ignore\n        HF_TOKEN = userdata.get('HF_TOKEN')\n        print('HF_TOKEN loaded from Colab secrets.')\n    except Exception:\n        pass\n\n# .env file (RunPod / local) - look in common locations\nif not HF_TOKEN:\n    for _env_path in ['/workspace/sahel-voice/.env', './.env', '../.env', os.path.expanduser('~/.env')]:\n        if os.path.isfile(_env_path):\n            with open(_env_path, encoding='utf-8') as _f:\n                for _line in _f:\n                    _line = _line.strip()\n                    if _line.startswith('HF_TOKEN='):\n                        _val = _line.split('=', 1)[1].strip()\n                        if _val and len(_val) >= 2 and _val[0] in ('\"', \"'\") and _val[-1] == _val[0]:\n                            _val = _val[1:-1]\n                        HF_TOKEN = _val\n                        print(f'HF_TOKEN loaded from {_env_path}')\n                        break\n            if HF_TOKEN:\n                break\n\nif not HF_TOKEN:\n    HF_TOKEN = os.environ.get('HF_TOKEN', '')\n    if HF_TOKEN:\n        print('HF_TOKEN loaded from environment variable.')\n\nif not HF_TOKEN:\n    raise ValueError(\n        'HF_TOKEN not found.\\n'\n        'Kaggle: Add-ons → Secrets → add HF_TOKEN → toggle \"Attach to notebook\" ON'\n    )\n\nfrom huggingface_hub import login, HfApi\nlogin(token=HF_TOKEN, add_to_git_credential=False)\napi = HfApi(token=HF_TOKEN)\nos.environ['HF_TOKEN'] = HF_TOKEN\n\n# Create output directories\nfor d in [OUTPUT_DIR, DATA_DIR, AUDIO_DIR]:\n    Path(d).mkdir(parents=True, exist_ok=True)\n\nprint(f'✅ Logged in | output: {OUTPUT_DIR}')"
+   ]
   },
   {
    "cell_type": "code",

scripts/runpod_setup.sh CHANGED Viewed

@@ -59,6 +59,21 @@ pip install -q \
     "pypdf>=4.0.0" \
     "python-docx>=1.1.0"
 # 3. HF token prompt (one-time)
 ENV_FILE="${REPO_DIR}/.env"
 if [[ -z "${HF_TOKEN:-}" ]] && [[ ! -f "${ENV_FILE}" ]]; then

     "pypdf>=4.0.0" \
     "python-docx>=1.1.0"
+# torchcodec — required by datasets>=4.0 for audio decoding.
+# The torchcodec version must match the installed torch: try the plain install
+# first, and if pip fails fall back to a pin chosen from the torch version.
+# Pairs below follow the compatibility table in the torchcodec README.
+echo ">> Installing torchcodec (audio backend for datasets 4.x)..."
+pip install -q torchcodec || {
+    # Strip any local build suffix (e.g. "+cu121") so the case patterns match.
+    TORCH_VER=$(python -c "import torch; print(torch.__version__.split('+')[0])" 2>/dev/null || echo "unknown")
+    echo "   pip resolve failed; torch=${TORCH_VER}. Trying pinned versions..."
+    case "${TORCH_VER}" in
+        2.4.*) pip install -q "torchcodec==0.0.3" ;;
+        2.5.*) pip install -q "torchcodec==0.1.*" ;;
+        2.6.*) pip install -q "torchcodec==0.2.*" ;;
+        2.7.*) pip install -q "torchcodec==0.4.*" ;;
+        2.8.*) pip install -q "torchcodec==0.7.*" ;;
+        2.9.*) pip install -q "torchcodec==0.8.*" ;;
+        *)     echo "   ⚠️  Unknown torch version — install torchcodec manually." ;;
+    esac
+}
 # 3. HF token prompt (one-time)
 ENV_FILE="${REPO_DIR}/.env"
 if [[ -z "${HF_TOKEN:-}" ]] && [[ ! -f "${ENV_FILE}" ]]; then