{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Transcribe with original whisper (batched)\n",
    "import gc\n",
    "import os\n",
    "\n",
    "import torch\n",
    "import whisperx\n",
    "\n",
    "device = \"cuda\"\n",
    "audio_file = \"6.wav\"\n",
    "batch_size = 16\n",
    "compute_type = \"float16\"  # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
    "model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
    "\n",
    "model = whisperx.load_model(model_path, device, compute_type=compute_type, language='vi')\n",
    "\n",
    "audio = whisperx.load_audio(audio_file)\n",
    "%time result = model.transcribe(audio, batch_size=batch_size)\n",
    "print(result[\"segments\"])  # before alignment\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free the GPU memory held by the transcription model before loading the\n",
    "# alignment model. Order matters: the model must be deleted (and collected)\n",
    "# before empty_cache() can return its CUDA memory to the driver.\n",
    "del model\n",
    "gc.collect()\n",
    "torch.cuda.empty_cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Align whisper output to get word-level timestamps.\n",
    "# Reuses `audio` and `result` from the transcription cell above.\n",
    "model_a, metadata = whisperx.load_align_model(language_code=\"vi\", device=device)\n",
    "result = whisperx.align(result[\"segments\"], model_a, metadata, audio, device, return_char_alignments=False)\n",
    "\n",
    "print(result[\"segments\"])  # after alignment\n",
    "\n",
    "# Free the GPU memory held by the alignment model (delete first, then flush).\n",
    "del model_a\n",
    "gc.collect()\n",
    "torch.cuda.empty_cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Assign speaker labels\n",
    "# The pyannote diarization pipeline needs a Hugging Face access token;\n",
    "# read it from the environment rather than hardcoding it in the notebook.\n",
    "hf_token = os.environ[\"HF_TOKEN\"]\n",
    "diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token, device=device)\n",
    "\n",
    "# add min/max number of speakers if known\n",
    "diarize_segments = diarize_model(audio)\n",
    "# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)\n",
    "\n",
    "result = whisperx.assign_word_speakers(diarize_segments, result)\n",
    "print(diarize_segments)\n",
    "print(result[\"segments\"])  # segments are now assigned speaker IDs"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "DUY",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.17"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}