Spaces:

yalsaffar
/

S3TVR-Demo

Sleeping

File size: 14,379 Bytes

aa7cb02

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\spn\\anaconda3\\envs\\capstone\\Lib\\site-packages\\torchvision\\io\\image.py:13: UserWarning: Failed to load image Python extension: '[WinError 127] The specified procedure could not be found'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n",
      "  warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-06-10 23:30:49,190] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2024-06-10 23:30:49,544] torch.distributed.elastic.multiprocessing.redirects: [WARNING] NOTE: Redirects are currently not supported in Windows or MacOs.\n",
      "[NeMo W 2024-06-10 23:30:52 nemo_logging:393] Could not import NeMo NLP collection which is required for speech translation model.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2024-06-10 23:31:08 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2024-06-10 23:31:08 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
      "    Train config : \n",
      "    manifest_filepath: /disk1/NVIDIA/datasets/LibriSpeech_NeMo/librivox-train-all.json\n",
      "    sample_rate: 16000\n",
      "    batch_size: 16\n",
      "    shuffle: true\n",
      "    num_workers: 8\n",
      "    pin_memory: true\n",
      "    use_start_end_token: false\n",
      "    trim_silence: false\n",
      "    max_duration: 16.7\n",
      "    min_duration: 0.1\n",
      "    is_tarred: false\n",
      "    tarred_audio_filepaths: null\n",
      "    shuffle_n: 2048\n",
      "    bucketing_strategy: fully_randomized\n",
      "    bucketing_batch_size: null\n",
      "    \n",
      "[NeMo W 2024-06-10 23:31:08 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
      "    Validation config : \n",
      "    manifest_filepath: /disk1/NVIDIA/datasets/LibriSpeech_NeMo/librivox-dev-clean.json\n",
      "    sample_rate: 16000\n",
      "    batch_size: 16\n",
      "    shuffle: false\n",
      "    use_start_end_token: false\n",
      "    num_workers: 8\n",
      "    pin_memory: true\n",
      "    \n",
      "[NeMo W 2024-06-10 23:31:08 nemo_logging:393] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n",
      "    Test config : \n",
      "    manifest_filepath: null\n",
      "    sample_rate: 16000\n",
      "    batch_size: 16\n",
      "    shuffle: false\n",
      "    use_start_end_token: false\n",
      "    num_workers: 8\n",
      "    pin_memory: true\n",
      "    \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2024-06-10 23:31:08 nemo_logging:381] PADDING: 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `method_cfg` is deprecated and will be removed in the future. Please use `measure_cfg` instead.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `measure_cfg` with the value of `method_cfg`.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `temperature` is deprecated and will be removed in the future. Please use `alpha` instead.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `alpha` with the value of `temperature`.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `method_cfg` is deprecated and will be removed in the future. Please use `measure_cfg` instead.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `measure_cfg` with the value of `method_cfg`.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] `temperature` is deprecated and will be removed in the future. Please use `alpha` instead.\n",
      "[NeMo W 2024-06-10 23:31:11 nemo_logging:393] Re-writing `alpha` with the value of `temperature`.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2024-06-10 23:31:16 nemo_logging:381] Model EncDecCTCModelBPE was successfully restored from C:\\Users\\spn\\.cache\\huggingface\\hub\\models--nvidia--parakeet-ctc-0.6b\\snapshots\\097ffc5b027beabc73acb627def2d1d278e774e9\\parakeet-ctc-0.6b.nemo.\n"
     ]
    }
   ],
   "source": [
    "from models.nllb import nllb\n",
    "#from models.TTS_utils import xtts_v2\n",
    "from models.parakeet import parakeet_ctc_model\n",
    "from models.es_fastconformer import stt_es_model\n",
    "model_nllb, tokinizer_nllb = nllb()\n",
    "#xtts_v2_model = xtts_v2()\n",
    "parakeet = parakeet_ctc_model()\n",
    "#sst = stt_es_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Writing audio_segments\\segment_0.wav...\n",
      "Processing segment...\n",
      "0.021454915\n",
      "Noise reduction done!\n",
      "Noise removed. Time: 0.06042814254760742\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6909654da05f4b0a88458139a9b37d6d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Transcription: hello can you hear me\n",
      "Transcription time: 1.3255603313446045\n",
      "Translating...\n",
      "Processing translation...\n",
      "Translation: Hola, ¿ me escuchas?\n",
      "Translation time: 0.932790994644165\n",
      "Writing audio_segments\\segment_1.wav...\n",
      "Processing segment...\n",
      "0.010297036\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_2.wav...\n",
      "Processing segment...\n",
      "0.006772096\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_3.wav...\n",
      "Processing segment...\n",
      "0.0034770737\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_4.wav...\n",
      "Processing segment...\n",
      "0.0039069764\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_5.wav...\n",
      "Processing segment...\n",
      "0.0046523036\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_6.wav...\n",
      "Processing segment...\n",
      "0.0040206155\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_7.wav...\n",
      "Processing segment...\n",
      "0.0043495107\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_8.wav...\n",
      "Processing segment...\n",
      "0.00421352\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_9.wav...\n",
      "Processing segment...\n",
      "0.0040656724\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_10.wav...\n",
      "Processing segment...\n",
      "0.0042125704\n",
      "No speech detected.\n",
      "Writing audio_segments\\segment_11.wav...\n",
      "Processing segment...\n",
      "0.015398192\n",
      "Noise reduction done!\n",
      "Noise removed. Time: 0.020929336547851562\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "de3d4b3a7bc14de2afbb01ff82252dc2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Transcribing:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from stream_VAD import stream\n",
    "stream(parakeet, model_nllb, tokinizer_nllb, \"english\", \"spanish\", 'record_temp.json', 'record_per.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fdc0440dfcaf4c9f814689fc47c10e3e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "(…)tt_es_fastconformer_hybrid_large_pc.nemo:   0%|          | 0.00/459M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2024-04-12 16:10:09 nemo_logging:381] Tokenizer SentencePieceTokenizer initialized with 1024 tokens\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2024-04-12 16:10:10 nemo_logging:393] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.\n",
      "    Train config : \n",
      "    manifest_filepath: null\n",
      "    sample_rate: 16000\n",
      "    batch_size: 16\n",
      "    shuffle: true\n",
      "    num_workers: 8\n",
      "    pin_memory: true\n",
      "    use_start_end_token: false\n",
      "    trim_silence: false\n",
      "    max_duration: 20\n",
      "    min_duration: 0.1\n",
      "    is_tarred: false\n",
      "    tarred_audio_filepaths: null\n",
      "    shuffle_n: 2048\n",
      "    bucketing_strategy: fully_randomized\n",
      "    bucketing_batch_size: null\n",
      "    is_concat: false\n",
      "    concat_sampling_technique: random\n",
      "    concat_sampling_probabilities: ''\n",
      "    \n",
      "[NeMo W 2024-04-12 16:10:10 nemo_logging:393] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). \n",
      "    Validation config : \n",
      "    manifest_filepath: null\n",
      "    sample_rate: 16000\n",
      "    batch_size: 32\n",
      "    shuffle: false\n",
      "    num_workers: 8\n",
      "    pin_memory: true\n",
      "    use_start_end_token: false\n",
      "    is_concat: true\n",
      "    concat_sampling_technique: random\n",
      "    concat_sampling_probabilities:\n",
      "    - 0.099\n",
      "    - 0.2771\n",
      "    - 0.5482\n",
      "    - 0.0757\n",
      "    concat_shuffle: false\n",
      "    concat_sampling_seed: 1234\n",
      "    max_duration: 20\n",
      "    \n",
      "[NeMo W 2024-04-12 16:10:10 nemo_logging:393] Please call the ModelPT.setup_test_data() or ModelPT.setup_multiple_test_data() method and provide a valid configuration file to setup the test data loader(s).\n",
      "    Test config : \n",
      "    manifest_filepath: null\n",
      "    sample_rate: 16000\n",
      "    batch_size: 16\n",
      "    shuffle: false\n",
      "    num_workers: 8\n",
      "    pin_memory: true\n",
      "    use_start_end_token: false\n",
      "    \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2024-04-12 16:10:10 nemo_logging:381] PADDING: 0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[NeMo W 2024-04-12 16:10:11 nemo_logging:393] c:\\Users\\spn\\anaconda3\\envs\\capstone\\Lib\\site-packages\\torch\\nn\\modules\\rnn.py:83: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
      "      warnings.warn(\"dropout option adds dropout after all but last \"\n",
      "    \n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NeMo I 2024-04-12 16:10:11 nemo_logging:381] Using RNNT Loss : warprnnt_numba\n",
      "    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}\n",
      "[NeMo I 2024-04-12 16:10:12 nemo_logging:381] Model EncDecHybridRNNTCTCBPEModel was successfully restored from C:\\Users\\spn\\.cache\\huggingface\\hub\\models--nvidia--stt_es_fastconformer_hybrid_large_pc\\snapshots\\65f775445d5947d6784c3e80d9a14d859571947f\\stt_es_fastconformer_hybrid_large_pc.nemo.\n"
     ]
    }
   ],
   "source": [
    "from models.es_fastconformer import stt_es_model\n",
    "model = stt_es_model()\n",
    "# check how much memory is used by the model\n",
    "import torch\n",
    "import psutil\n",
    "import os\n",
    "import time\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model size: 458.86 MB\n"
     ]
    }
   ],
   "source": [
    "# get the size of the model in term of memory in MB\n",
    "def get_size(model):\n",
    "    torch.save(model.state_dict(), 'temp.p')\n",
    "    size = os.path.getsize('temp.p') / 1e6\n",
    "    os.remove('temp.p')\n",
    "    return size\n",
    "size = get_size(model)\n",
    "print(f\"Model size: {size:.2f} MB\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "capstone",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}