{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
      "Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118. Bad things might happen unless you revert torch to 1.x.\n",
      "CPU times: user 826 ms, sys: 96.7 ms, total: 923 ms\n",
      "Wall time: 831 ms\n",
      "[{'text': 'đó là ước vọng của nguyễn ái quốc từ những năm hai mươi của thế kỷ trước về một nhà nước việt nam độc lập dân chủ', 'start': 0.008, 'end': 6.556}]\n"
     ]
    }
   ],
   "source": [
    "import whisperx\n",
    "import gc \n",
    "\n",
    "device = \"cuda\" \n",
    "audio_file = \"6.wav\"\n",
    "batch_size = 16 \n",
    "compute_type = \"float16\" # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
    "model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
    "# 1. Transcribe with original whisper (batched)\n",
    "model = whisperx.load_model(model_path, device, compute_type=compute_type,language='vi')\n",
    "\n",
    "audio = whisperx.load_audio(audio_file)\n",
    "%time result = model.transcribe(audio, batch_size=batch_size)\n",
    "print(result[\"segments\"]) # before alignment\n",
    "\n",
    "# delete model if low on GPU resources\n",
    "# import gc; gc.collect(); torch.cuda.empty_cache(); del model\n",
    "\n"
   ]
  },
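  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: a minimal sketch of picking device/compute_type automatically.\n",
    "# float16 requires a CUDA GPU; int8 on CPU is a common low-resource fallback\n",
    "# (this fallback choice is an assumption, not part of the original run).\n",
    "import torch\n",
    "\n",
    "if torch.cuda.is_available():\n",
    "    device, compute_type = \"cuda\", \"float16\"\n",
    "else:\n",
    "    device, compute_type = \"cpu\", \"int8\"\n",
    "print(f\"Using device={device}, compute_type={compute_type}\")\n"
   ]
  },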
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc; gc.collect()\n",
    "import torch\n",
    "torch.cuda.empty_cache(); del model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vi and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'start': 0.008, 'end': 2.396, 'text': 'với một người đi làm thuê như anh số tiền kiếm được chưa đủ để thoả mãn nhu cầu cá nhân nói gì đến chăm lo cho gia đình', 'words': [{'word': 'với', 'start': 0.008, 'end': 0.068, 'score': 0.01}, {'word': 'một', 'start': 0.088, 'end': 0.148, 'score': 0.01}, {'word': 'người', 'start': 0.169, 'end': 0.269, 'score': 0.011}, {'word': 'đi', 'start': 0.289, 'end': 0.329, 'score': 0.01}, {'word': 'làm', 'start': 0.349, 'end': 0.409, 'score': 0.011}, {'word': 'thuê', 'start': 0.429, 'end': 0.51, 'score': 0.01}, {'word': 'như', 'start': 0.53, 'end': 0.59, 'score': 0.012}, {'word': 'anh', 'start': 0.61, 'end': 0.67, 'score': 0.01}, {'word': 'số', 'start': 0.69, 'end': 0.73, 'score': 0.01}, {'word': 'tiền', 'start': 0.75, 'end': 0.831, 'score': 0.01}, {'word': 'kiếm', 'start': 0.851, 'end': 0.931, 'score': 0.01}, {'word': 'được', 'start': 0.951, 'end': 1.031, 'score': 0.01}, {'word': 'chưa', 'start': 1.051, 'end': 1.132, 'score': 0.01}, {'word': 'đủ', 'start': 1.152, 'end': 1.192, 'score': 0.01}, {'word': 'để', 'start': 1.212, 'end': 1.252, 'score': 0.01}, {'word': 'thoả', 'start': 1.272, 'end': 1.353, 'score': 0.01}, {'word': 'mãn', 'start': 1.373, 'end': 1.433, 'score': 0.011}, {'word': 'nhu', 'start': 1.453, 'end': 1.513, 'score': 0.011}, {'word': 'cầu', 'start': 1.533, 'end': 1.593, 'score': 0.011}, {'word': 'cá', 'start': 1.613, 'end': 1.654, 'score': 0.01}, {'word': 'nhân', 'start': 1.674, 'end': 1.754, 'score': 0.011}, {'word': 'nói', 'start': 1.774, 'end': 1.834, 'score': 0.01}, {'word': 'gì', 'start': 1.854, 'end': 1.894, 'score': 0.011}, {'word': 'đến', 'start': 1.914, 'end': 1.975, 'score': 0.01}, {'word': 'chăm', 'start': 1.995, 'end': 2.075, 'score': 0.011}, {'word': 'lo', 'start': 2.095, 'end': 2.135, 'score': 0.009}, {'word': 'cho', 'start': 2.155, 'end': 2.215, 'score': 0.011}, {'word': 'gia', 'start': 2.235, 'end': 2.296, 'score': 0.01}, {'word': 'đình', 'start': 2.316, 'end': 2.396, 'score': 0.011}]}]\n"
     ]
    }
   ],
   "source": [
    "# 2. Align whisper output\n",
    "device = \"cuda\" \n",
    "audio_file = \"audio.wav\"\n",
    "batch_size = 16 \n",
    "compute_type = \"float16\" # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
    "model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
    "model_a, metadata = whisperx.load_align_model(language_code=\"vi\" ,device=device)\n",
    "result = whisperx.align(result[\"segments\"], model_a, metadata, audio, device, return_char_alignments=False)\n",
    "\n",
    "print(result[\"segments\"]) # after alignment\n",
    "\n",
    "# delete model if low on GPU resources\n",
    "import gc; gc.collect(); torch.cuda.empty_cache(); del model_a\n",
    "\n"
   ]
  },
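  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A small sketch of consuming the aligned output: one line per word with\n",
    "# timestamps, based on the segment/word structure printed above. Using\n",
    "# .get() defensively is an assumption: the aligner may leave some words\n",
    "# without 'start'/'end' keys.\n",
    "for segment in result[\"segments\"]:\n",
    "    for w in segment.get(\"words\", []):\n",
    "        start = w.get(\"start\", \"?\")\n",
    "        end = w.get(\"end\", \"?\")\n",
    "        print(f\"{start:>8} - {end:>8}  {w['word']}\")\n"
   ]
  },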
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Assign speaker labels\n",
    "diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)\n",
    "\n",
    "# add min/max number of speakers if known\n",
    "diarize_segments = diarize_model(audio)\n",
    "# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)\n",
    "\n",
    "result = whisperx.assign_word_speakers(diarize_segments, result)\n",
    "print(diarize_segments)\n",
    "print(result[\"segments\"]) # segments are now assigned speaker IDs"
   ]
  },
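  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sketch of exporting the speaker-labelled segments to SRT.\n",
    "# Assumptions: 'speaker' is set by assign_word_speakers (segments that\n",
    "# diarization could not label may lack it), and \"transcript.srt\" is an\n",
    "# arbitrary output path.\n",
    "def fmt_ts(seconds):\n",
    "    # Convert seconds to the SRT timestamp format HH:MM:SS,mmm\n",
    "    ms = int(round(seconds * 1000))\n",
    "    h, ms = divmod(ms, 3_600_000)\n",
    "    m, ms = divmod(ms, 60_000)\n",
    "    s, ms = divmod(ms, 1_000)\n",
    "    return f\"{h:02}:{m:02}:{s:02},{ms:03}\"\n",
    "\n",
    "with open(\"transcript.srt\", \"w\", encoding=\"utf-8\") as f:\n",
    "    for i, seg in enumerate(result[\"segments\"], start=1):\n",
    "        speaker = seg.get(\"speaker\", \"UNKNOWN\")\n",
    "        f.write(f\"{i}\\n{fmt_ts(seg['start'])} --> {fmt_ts(seg['end'])}\\n\")\n",
    "        f.write(f\"[{speaker}] {seg['text'].strip()}\\n\\n\")\n"
   ]
  }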
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DUY",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}