File size: 31,338 Bytes

b386992

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import libraries\n",
    "\n",
    "import glob\n",
    "import json\n",
    "import librosa\n",
    "import numpy as np\n",
    "from omegaconf import OmegaConf, open_dict\n",
    "import os\n",
    "import soundfile as sf\n",
    "import subprocess\n",
    "import tarfile\n",
    "import tqdm\n",
    "import wget\n",
    "\n",
    "import torch\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"/home/ubuntu/respair/Tsukasa_LITE_Qanary.csv\")\n",
    "\n",
    "import re\n",
    "\n",
    "\n",
    "\n",
    "# Replace any sequence containing \"HAHA\" with <|🤣|>\n",
    "df['text'] = df['text'].apply(lambda x: re.sub(r'\\S*HAHA\\S*', '<|🤣|>', x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "notebookRunGroups": {
     "groupValue": "1"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Files with .wav: 461086\n",
      "Files with .ogg: 27643\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "def check_and_fix_extension(filepath):\n",
    "    \"\"\"Check if file exists, if not try .ogg extension\"\"\"\n",
    "    if os.path.exists(filepath):\n",
    "        return filepath\n",
    "    \n",
    "    if filepath.endswith('.wav'):\n",
    "        ogg_path = filepath.replace('.wav', '.ogg')\n",
    "        if os.path.exists(ogg_path):\n",
    "            return ogg_path\n",
    "    \n",
    "    return filepath  # Return original if neither exists\n",
    "\n",
    "# Apply the fix to all filenames\n",
    "df['filename'] = df['filename'].apply(check_and_fix_extension)\n",
    "\n",
    "# Show summary\n",
    "wav_count = df['filename'].str.endswith('.wav').sum()\n",
    "ogg_count = df['filename'].str.endswith('.ogg').sum()\n",
    "print(f\"Files with .wav: {wav_count}\")\n",
    "print(f\"Files with .ogg: {ogg_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "notebookRunGroups": {
     "groupValue": "1"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(27643, 4)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['filename'].str.contains(\".ogg\")].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['text'].str.contains(\"🤣\")]\n",
    "df.to_csv(\"/home/ubuntu/respair/Tsukasa_LITE_Qanary.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Duration column found in CSV. Using provided durations.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 488729 entries in the CSV file.\n",
      "Processing entries with provided durations...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 488729/488729 [00:04<00:00, 102634.90it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Processing complete!\n",
      "Successfully processed: 488729/488729 entries\n",
      "Total duration: 766.35 hours\n",
      "Manifest created at: /home/ubuntu/NeMo/data/tsukasa_manifest.json\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "import csv\n",
    "import librosa\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from multiprocessing import Pool, cpu_count\n",
    "from functools import partial\n",
    "\n",
    "\n",
    "def get_audio_duration(audio_path):\n",
    "    \"\"\"Get duration of an audio file.\"\"\"\n",
    "    try:\n",
    "        duration = librosa.core.get_duration(path=audio_path)\n",
    "        return duration\n",
    "    except Exception as e:\n",
    "        print(f\"Error processing {audio_path}: {e}\")\n",
    "        return None\n",
    "\n",
    "\n",
    "def process_row_with_duration(row, lang=\"jp\"):\n",
    "    \"\"\"Process a row that already has duration information.\"\"\"\n",
    "    metadata = {\n",
    "        \"audio_filepath\": row['filename'],\n",
    "        \"duration\": float(row['duration']),\n",
    "        \"text\": row['text'],\n",
    "        \"lang\": lang,\n",
    "        \"target_lang\": lang,\n",
    "        \"source_lang\": lang,\n",
    "        \"pnc\": \"False\"\n",
    "    }\n",
    "    return metadata\n",
    "\n",
    "\n",
    "def process_row_without_duration(row, lang=\"jp\"):\n",
    "    \"\"\"Process a row and calculate duration.\"\"\"\n",
    "    audio_path = row['filename']\n",
    "    duration = get_audio_duration(audio_path)\n",
    "    \n",
    "    if duration is None:\n",
    "        return None\n",
    "    \n",
    "    metadata = {\n",
    "        \"audio_filepath\": audio_path,\n",
    "        \"duration\": duration,\n",
    "        \"text\": row['text'],\n",
    "        \"lang\": lang,\n",
    "        \"target_lang\": lang,\n",
    "        \"source_lang\": lang,\n",
    "        \"pnc\": \"False\"\n",
    "    }\n",
    "    return metadata\n",
    "\n",
    "\n",
    "def build_manifest_from_csv(csv_path, manifest_path, lang=\"jp\", n_jobs=None):\n",
    "    \"\"\"\n",
    "    Build a manifest file from a CSV dataset.\n",
    "    \n",
    "    Args:\n",
    "        csv_path: Path to the CSV file containing filename and text columns\n",
    "        manifest_path: Path where the manifest JSON file will be saved\n",
    "        lang: Language code (default: \"jp\" for Japanese)\n",
    "        n_jobs: Number of parallel jobs for duration calculation (default: CPU count - 1)\n",
    "    \"\"\"\n",
    "    if n_jobs is None:\n",
    "        n_jobs = max(1, cpu_count() - 1)\n",
    "    \n",
    "    # Read the CSV file\n",
    "    rows = []\n",
    "    has_duration = False\n",
    "    \n",
    "    with open(csv_path, 'r', encoding='utf-8') as f:\n",
    "        reader = csv.DictReader(f)\n",
    "        \n",
    "        # Check if duration column exists\n",
    "        if 'duration' in reader.fieldnames:\n",
    "            has_duration = True\n",
    "            print(\"Duration column found in CSV. Using provided durations.\")\n",
    "        else:\n",
    "            print(f\"Duration column not found. Will calculate durations using {n_jobs} parallel workers.\")\n",
    "        \n",
    "        for row in reader:\n",
    "            rows.append(row)\n",
    "    \n",
    "    print(f\"Found {len(rows)} entries in the CSV file.\")\n",
    "    \n",
    "    # Process rows\n",
    "    tot_duration = 0\n",
    "    successful_entries = 0\n",
    "    \n",
    "    # Create/clear the manifest file\n",
    "    with open(manifest_path, 'w') as fout:\n",
    "        pass\n",
    "    \n",
    "    if has_duration:\n",
    "        # Process without parallel computation\n",
    "        print(\"Processing entries with provided durations...\")\n",
    "        with open(manifest_path, 'a') as fout:\n",
    "            for row in tqdm(rows):\n",
    "                metadata = process_row_with_duration(row, lang)\n",
    "                if metadata:\n",
    "                    json.dump(metadata, fout)\n",
    "                    fout.write('\\n')\n",
    "                    tot_duration += metadata['duration']\n",
    "                    successful_entries += 1\n",
    "    else:\n",
    "        # Process with parallel duration calculation\n",
    "        print(\"Calculating audio durations in parallel...\")\n",
    "        \n",
    "        # Split processing into chunks for better progress tracking\n",
    "        chunk_size = 100\n",
    "        chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]\n",
    "        \n",
    "        with open(manifest_path, 'a') as fout:\n",
    "            for chunk in tqdm(chunks, desc=\"Processing chunks\"):\n",
    "                # Use multiprocessing pool for duration calculation\n",
    "                with Pool(n_jobs) as pool:\n",
    "                    process_func = partial(process_row_without_duration, lang=lang)\n",
    "                    results = pool.map(process_func, chunk)\n",
    "                \n",
    "                # Write results\n",
    "                for metadata in results:\n",
    "                    if metadata:\n",
    "                        json.dump(metadata, fout)\n",
    "                        fout.write('\\n')\n",
    "                        tot_duration += metadata['duration']\n",
    "                        successful_entries += 1\n",
    "    \n",
    "    print(f\"\\nProcessing complete!\")\n",
    "    print(f\"Successfully processed: {successful_entries}/{len(rows)} entries\")\n",
    "    print(f\"Total duration: {np.round(tot_duration/3600, 2)} hours\")\n",
    "    print(f\"Manifest created at: {manifest_path}\")\n",
    "    \n",
    "    return manifest_path, tot_duration\n",
    "\n",
    "\n",
    "def verify_manifest(manifest_path, sample_size=5):\n",
    "    \"\"\"Verify the manifest by displaying a few sample entries.\"\"\"\n",
    "    print(f\"\\nVerifying manifest: {manifest_path}\")\n",
    "    print(f\"Sample entries (first {sample_size}):\")\n",
    "    \n",
    "    with open(manifest_path, 'r') as f:\n",
    "        for i, line in enumerate(f):\n",
    "            if i >= sample_size:\n",
    "                break\n",
    "            entry = json.loads(line)\n",
    "            print(f\"\\nEntry {i+1}:\")\n",
    "            print(f\"  Audio: {entry['audio_filepath']}\")\n",
    "            print(f\"  Duration: {entry['duration']:.2f}s\")\n",
    "            print(f\"  Text: {entry['text'][:50]}{'...' if len(entry['text']) > 50 else ''}\")\n",
    "\n",
    "\n",
    "# Example usage\n",
    "if __name__ == \"__main__\":\n",
    "    # Example 1: Process the provided CSV file\n",
    "    csv_path = \"/home/ubuntu/respair/Tsukasa_LITE_Qanary.csv\"\n",
    "    manifest_path = \"/home/ubuntu/NeMo/data/tsukasa_manifest.json\"\n",
    "    \n",
    "    # Build the manifest\n",
    "    build_manifest_from_csv(\n",
    "        csv_path=csv_path,\n",
    "        manifest_path=manifest_path,\n",
    "        lang=\"ja\",  # Japanese\n",
    "        n_jobs=None  # Use all cores - 1\n",
    "    )\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "BRANCH='r2.3.0'\n",
    "def wget_from_nemo(nemo_script_path, local_dir=\"scripts\"):\n",
    "    os.makedirs(local_dir, exist_ok=True)\n",
    "    script_url = f\"https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/{BRANCH}/{nemo_script_path}\"\n",
    "    script_path = os.path.basename(nemo_script_path)\n",
    "    if not os.path.exists(f\"{local_dir}/{script_path}\"):\n",
    "        !wget -P {local_dir}/ {script_url}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# wget_from_nemo(\"scripts/speech_recognition/canary/build_canary_2_special_tokenizer.py\")\n",
    "output_dir = \"tokenizers/spl_tokens\"\n",
    "!mkdir -p {output_dir}\n",
    "!python scripts/speech_recognition/canary/build_canary_2_special_tokenizer.py {output_dir}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n"
     ]
    }
   ],
   "source": [
    "!sudo rm -r /home/ubuntu/NeMo/tokenizers/spl_tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
      "--2025-08-02 15:48:13--  https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/r2.3.0/scripts/tokenizers/process_asr_text_tokenizer.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 17146 (17K) [text/plain]\n",
      "Saving to: ‘scripts/process_asr_text_tokenizer.py’\n",
      "\n",
      "process_asr_text_to 100%[===================>]  16.74K  --.-KB/s    in 0.003s  \n",
      "\n",
      "2025-08-02 15:48:13 (6.02 MB/s) - ‘scripts/process_asr_text_tokenizer.py’ saved [17146/17146]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "wget_from_nemo('scripts/tokenizers/process_asr_text_tokenizer.py')\n",
    "LANG='jp'\n",
    "DATA='TSUKA'\n",
    "VOCAB_SIZE=1024\n",
    "OUT_DIR = f\"tokenizers/{LANG}_{DATA}_{VOCAB_SIZE}\"\n",
    "manifest_path = \"/home/ubuntu/NeMo/data/tsukasa_manifest.json\"\n",
    "train_text_path =\"/home/ubuntu/NeMo/data/tsukasa_manifest.lst\"\n",
    "with open(manifest_path, \"r\") as f:\n",
    "    data = [json.loads(line.strip()) for line in f.readlines()]\n",
    "with open(train_text_path, \"w\") as f:\n",
    "    for line in data:\n",
    "        f.write(f\"{line['text']}\\n\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
      "[NeMo I 2025-08-02 15:50:45 nemo_logging:393] Processing /home/ubuntu/NeMo/data/tsukasa_manifest.lst and store at tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024\n",
      "sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=/home/ubuntu/NeMo/data/tsukasa_manifest.lst --model_prefix=tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer --vocab_size=1024 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1 --remove_extra_whitespaces=false\n",
      "sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : \n",
      "trainer_spec {\n",
      "  input: /home/ubuntu/NeMo/data/tsukasa_manifest.lst\n",
      "  input_format: \n",
      "  model_prefix: tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer\n",
      "  model_type: BPE\n",
      "  vocab_size: 1024\n",
      "  self_test_sample_size: 0\n",
      "  character_coverage: 1\n",
      "  input_sentence_size: 0\n",
      "  shuffle_input_sentence: 1\n",
      "  seed_sentencepiece_size: 1000000\n",
      "  shrinking_factor: 0.75\n",
      "  max_sentence_length: 4192\n",
      "  num_threads: 16\n",
      "  num_sub_iterations: 2\n",
      "  max_sentencepiece_length: 16\n",
      "  split_by_unicode_script: 1\n",
      "  split_by_number: 1\n",
      "  split_by_whitespace: 1\n",
      "  split_digits: 0\n",
      "  pretokenization_delimiter: \n",
      "  treat_whitespace_as_suffix: 0\n",
      "  allow_whitespace_only_pieces: 0\n",
      "  required_chars: \n",
      "  byte_fallback: 0\n",
      "  vocabulary_output_piece_score: 1\n",
      "  train_extremely_large_corpus: 0\n",
      "  seed_sentencepieces_file: \n",
      "  hard_vocab_limit: 0\n",
      "  use_all_vocab: 0\n",
      "  unk_id: 0\n",
      "  bos_id: -1\n",
      "  eos_id: -1\n",
      "  pad_id: -1\n",
      "  unk_piece: <unk>\n",
      "  bos_piece: <s>\n",
      "  eos_piece: </s>\n",
      "  pad_piece: <pad>\n",
      "  unk_surface:  ⁇ \n",
      "  enable_differential_privacy: 0\n",
      "  differential_privacy_noise_level: 0\n",
      "  differential_privacy_clipping_threshold: 0\n",
      "}\n",
      "normalizer_spec {\n",
      "  name: nmt_nfkc\n",
      "  add_dummy_prefix: 1\n",
      "  remove_extra_whitespaces: 0\n",
      "  escape_whitespaces: 1\n",
      "  normalization_rule_tsv: \n",
      "}\n",
      "denormalizer_spec {}\n",
      "trainer_interface.cc(353) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.\n",
      "trainer_interface.cc(185) LOG(INFO) Loading corpus: /home/ubuntu/NeMo/data/tsukasa_manifest.lst\n",
      "trainer_interface.cc(409) LOG(INFO) Loaded all 488729 sentences\n",
      "trainer_interface.cc(425) LOG(INFO) Adding meta_piece: <unk>\n",
      "trainer_interface.cc(430) LOG(INFO) Normalizing sentences...\n",
      "trainer_interface.cc(539) LOG(INFO) all chars count=32921930\n",
      "trainer_interface.cc(560) LOG(INFO) Alphabet size=89\n",
      "trainer_interface.cc(561) LOG(INFO) Final character coverage=1\n",
      "trainer_interface.cc(592) LOG(INFO) Done! preprocessed 488729 sentences.\n",
      "trainer_interface.cc(598) LOG(INFO) Tokenizing input sentences with whitespace: 488729\n",
      "trainer_interface.cc(609) LOG(INFO) Done! 291302\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=1034735 min_freq=1\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=271215 size=20 all=1868 active=1762 piece=ka\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=161716 size=40 all=2954 active=2848 piece=ʔte\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=83371 size=60 all=4380 active=4274 piece=▁desɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=56419 size=80 all=6100 active=5994 piece=▁ɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=39490 size=100 all=8433 active=8327 piece=▁dʑa\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=39478 min_freq=1761\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=26896 size=120 all=10127 active=2649 piece=ː,\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=21363 size=140 all=12094 active=4616 piece=ɕo\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=17930 size=160 all=14308 active=6830 piece=ː.\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=15622 size=180 all=16373 active=8895 piece=▁naɽa\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=13828 size=200 all=18200 active=10722 piece=▁ze\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=13777 min_freq=1551\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=11759 size=220 all=19976 active=2730 piece=▁ano\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=10250 size=240 all=21790 active=4544 piece=▁mitai\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=8631 size=260 all=23455 active=6209 piece=taɽi\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=7995 size=280 all=24911 active=7665 piece=▁tsɯka\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=7268 size=300 all=26692 active=9446 piece=▁sen\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=7217 min_freq=977\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=6778 size=320 all=28308 active=2910 piece=toɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=6354 size=340 all=29507 active=4109 piece=▁tsɯkɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5983 size=360 all=30874 active=5476 piece=ː—\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5580 size=380 all=32629 active=7231 piece=▁kakɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=5169 size=400 all=34217 active=8819 piece=▁ona\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=5142 min_freq=650\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4900 size=420 all=35396 active=2854 piece=ɽei\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4645 size=440 all=36976 active=4434 piece=▁kao\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4280 size=460 all=38553 active=6011 piece=seɴ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=4053 size=480 all=39979 active=7437 piece=rɯɴ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3842 size=500 all=41237 active=8695 piece=kenai\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=3830 min_freq=492\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3611 size=520 all=42650 active=3382 piece=▁kawai\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3449 size=540 all=44120 active=4852 piece=▁toʔte\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3305 size=560 all=45261 active=5993 piece=▁hoɽa\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3170 size=580 all=46443 active=7175 piece=▁moʔte\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3026 size=600 all=47398 active=8130 piece=▁tsɯite\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=3017 min_freq=390\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2908 size=620 all=48580 active=3527 piece=▁tsɯzɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2779 size=640 all=49765 active=4712 piece=▁baɕo\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2635 size=660 all=51134 active=6081 piece=▁kɯtɕi\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2554 size=680 all=52065 active=7012 piece=▁wakaɽi\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2444 size=700 all=53397 active=8344 piece=waɽi\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=2444 min_freq=319\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2381 size=720 all=54657 active=3818 piece=▁harɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2286 size=740 all=55443 active=4604 piece=▁ikenai\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2164 size=760 all=57108 active=6269 piece=▁hadʑimete\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2102 size=780 all=58234 active=7395 piece=gaʔte\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=2040 size=800 all=59522 active=8683 piece=▁ɕimai\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=2037 min_freq=269\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1964 size=820 all=60584 active=4013 piece=▁sɯkɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1888 size=840 all=61568 active=4997 piece=▁natsɯ\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1834 size=860 all=62710 active=6139 piece=▁totemo\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1768 size=880 all=63277 active=6706 piece=rɯi\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1703 size=900 all=64460 active=7889 piece=eta\n",
      "bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=1701 min_freq=233\n",
      "bpe_model_trainer.cc(268) LOG(INFO) Added: freq=1640 size=920 all=65670 active=4333 piece=ʔkakɯ\n",
      "trainer_interface.cc(687) LOG(INFO) Saving model: tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer.model\n",
      "trainer_interface.cc(699) LOG(INFO) Saving vocabs: tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024/tokenizer.vocab\n",
      "Serialized tokenizer at location : tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024\n"
     ]
    }
   ],
   "source": [
    "\n",
    "!python scripts/process_asr_text_tokenizer.py \\\n",
    "  --data_file={train_text_path} \\\n",
    "  --vocab_size={VOCAB_SIZE} \\\n",
    "  --data_root={OUT_DIR} \\\n",
    "  --tokenizer=\"spe\" \\\n",
    "  --spe_type=bpe \\\n",
    "  --spe_character_coverage=1.0 \\\n",
    "  --no_lower_case \\\n",
    "  --log"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
      "--2025-08-02 15:54:46--  https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/r2.3.0/examples/asr/speech_multitask/speech_to_text_aed.py\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 3875 (3.8K) [text/plain]\n",
      "Saving to: ‘scripts/speech_to_text_aed.py’\n",
      "\n",
      "speech_to_text_aed. 100%[===================>]   3.78K  --.-KB/s    in 0s      \n",
      "\n",
      "2025-08-02 15:54:46 (59.9 MB/s) - ‘scripts/speech_to_text_aed.py’ saved [3875/3875]\n",
      "\n",
      "/bin/bash: /home/ubuntu/miniconda3/envs/respair/lib/libtinfo.so.6: no version information available (required by /bin/bash)\n",
      "--2025-08-02 15:54:47--  https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/r2.3.0/examples/asr/conf/speech_multitask/fast-conformer_aed.yaml\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 12239 (12K) [text/plain]\n",
      "Saving to: ‘config/fast-conformer_aed.yaml’\n",
      "\n",
      "fast-conformer_aed. 100%[===================>]  11.95K  --.-KB/s    in 0.001s  \n",
      "\n",
      "2025-08-02 15:54:47 (14.3 MB/s) - ‘config/fast-conformer_aed.yaml’ saved [12239/12239]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "wget_from_nemo('examples/asr/speech_multitask/speech_to_text_aed.py')\n",
    "wget_from_nemo('examples/asr/conf/speech_multitask/fast-conformer_aed.yaml', 'config')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# %%bash\n",
    "\n",
    "# HYDRA_FULL_ERROR=1 python scripts/speech_to_text_aed.py \\\n",
    "#   --config-path=\"/home/ubuntu/NeMo/config\" \\\n",
    "#   --config-name=\"/home/ubuntu/NeMo/config/fast-conformer_aed.yaml\" \\\n",
    "#   name=\"canary-small\" \\\n",
    "#   model.prompt_format=\"canary2\" \\\n",
    "#   model.train_ds.manifest_filepath=\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\" \\\n",
    "#   model.validation_ds.manifest_filepath=\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\" \\\n",
    "#   model.test_ds.manifest_filepath=\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\" \\\n",
    "#   model.tokenizer.langs.jp.dir=\"/home/ubuntu/NeMo/tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024\" \\\n",
    "#   model.tokenizer.langs.spl_tokens.dir=\"/home/ubuntu/NeMo/tokenizers/spl_tokens\" \\\n",
    "#   spl_tokens.model_dir=\"/home/ubuntu/NeMo/tokenizers/spl_tokens\" \\\n",
    "#   model.encoder.n_layers=17 \\\n",
    "#   model.transf_decoder.config_dict.num_layers=4 \\\n",
    "#   model.transf_decoder.config_dict.max_sequence_length=512 \\ \n",
    "#   model.model_defaults.asr_enc_hidden=512 \\\n",
    "#   model.model_defaults.lm_dec_hidden=1024 \\\n",
    "#   exp_manager.exp_dir=\"canary_results\" \\\n",
    "#   exp_manager.resume_ignore_no_checkpoint=true \\\n",
    "#   trainer.max_steps=200_000 \\\n",
    "#   trainer.log_every_n_steps=50\n",
    "\n",
    "!bash /home/ubuntu/NeMo/train_qanary.sh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: 487507 samples (99.7%)\n",
      "Val: 1222 samples (0.3%)\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import random\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Read all entries\n",
    "with open(\"/home/ubuntu/NeMo/data/tsukasa_manifest.json\", 'r') as f:\n",
    "    all_data = [json.loads(line) for line in f]\n",
    "\n",
    "# Split 90/10 for train/val (adjust ratio as needed)\n",
    "train_data, val_data = train_test_split(all_data, test_size=0.0025, random_state=42)\n",
    "\n",
    "# Write train manifest\n",
    "with open(\"/home/ubuntu/NeMo/data/tsukasa_train.json\", 'w') as f:\n",
    "    for entry in train_data:\n",
    "        json.dump(entry, f)\n",
    "        f.write('\\n')\n",
    "\n",
    "# Write validation manifest  \n",
    "with open(\"/home/ubuntu/NeMo/data/tsukasa_val.json\", 'w') as f:\n",
    "    for entry in val_data:\n",
    "        json.dump(entry, f)\n",
    "        f.write('\\n')\n",
    "\n",
    "print(f\"Train: {len(train_data)} samples ({len(train_data)/len(all_data)*100:.1f}%)\")\n",
    "print(f\"Val: {len(val_data)} samples ({len(val_data)/len(all_data)*100:.1f}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train: 794 samples (65.0%)\n",
      "Val: 428 samples (35.0%)\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import random\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Read all entries\n",
    "with open(\"/home/ubuntu/NeMo/data/tsukasa_val.json\", 'r') as f:\n",
    "    all_data = [json.loads(line) for line in f]\n",
    "\n",
    "# Split 90/10 for train/val (adjust ratio as needed)\n",
    "train_data, val_data = train_test_split(all_data, test_size=0.35, random_state=42)\n",
    "\n",
    "# Write train manifest\n",
    "with open(\"/home/ubuntu/NeMo/data/tsukasa_val.json\", 'w') as f:\n",
    "    for entry in train_data:\n",
    "        json.dump(entry, f)\n",
    "        f.write('\\n')\n",
    "\n",
    "# Write validation manifest  \n",
    "with open(\"/home/ubuntu/NeMo/data/tsukasa_test.json\", 'w') as f:\n",
    "    for entry in val_data:\n",
    "        json.dump(entry, f)\n",
    "        f.write('\\n')\n",
    "\n",
    "print(f\"Train: {len(train_data)} samples ({len(train_data)/len(all_data)*100:.1f}%)\")\n",
    "print(f\"Val: {len(val_data)} samples ({len(val_data)/len(all_data)*100:.1f}%)\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "respair",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}