{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model was trained with pyannote.audio 0.0.1, yours is 2.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
"Model was trained with torch 1.10.0+cu102, yours is 2.0.0+cu118. Bad things might happen unless you revert torch to 1.x.\n",
"CPU times: user 826 ms, sys: 96.7 ms, total: 923 ms\n",
"Wall time: 831 ms\n",
"[{'text': 'đó là ước vọng của nguyễn ái quốc từ những năm hai mươi của thế kỷ trước về một nhà nước việt nam độc lập dân chủ', 'start': 0.008, 'end': 6.556}]\n"
]
}
],
"source": [
"import whisperx\n",
"import gc \n",
"\n",
"device = \"cuda\" \n",
"audio_file = \"6.wav\"\n",
"batch_size = 16 \n",
"compute_type = \"float16\" # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
"model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
"# 1. Transcribe with original whisper (batched)\n",
"model = whisperx.load_model(model_path, device, compute_type=compute_type,language='vi')\n",
"\n",
"audio = whisperx.load_audio(audio_file)\n",
"%time result = model.transcribe(audio, batch_size=batch_size)\n",
"print(result[\"segments\"]) # before alignment\n",
"\n",
"# delete model if low on GPU resources\n",
"# import gc; gc.collect(); torch.cuda.empty_cache(); del model\n",
"\n"
]
},
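{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch (not part of the original pipeline): write the raw segments to\n",
"# an SRT subtitle file using only the standard library. Assumes `result` from\n",
"# the cell above; the output name \"transcript.srt\" is arbitrary.\n",
"def to_srt_time(seconds):\n",
"    ms = int(round(seconds * 1000))\n",
"    h, ms = divmod(ms, 3_600_000)\n",
"    m, ms = divmod(ms, 60_000)\n",
"    s, ms = divmod(ms, 1_000)\n",
"    return f\"{h:02d}:{m:02d}:{s:02d},{ms:03d}\"\n",
"\n",
"with open(\"transcript.srt\", \"w\", encoding=\"utf-8\") as f:\n",
"    for i, seg in enumerate(result[\"segments\"], start=1):\n",
"        f.write(f\"{i}\\n{to_srt_time(seg['start'])} --> {to_srt_time(seg['end'])}\\n\")\n",
"        f.write(seg[\"text\"].strip() + \"\\n\\n\")\n"
]
},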
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gc; gc.collect()\n",
"import torch\n",
"torch.cuda.empty_cache(); del model"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vi and are newly initialized: ['lm_head.weight', 'lm_head.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'start': 0.008, 'end': 2.396, 'text': 'với một người đi làm thuê như anh số tiền kiếm được chưa đủ để thoả mãn nhu cầu cá nhân nói gì đến chăm lo cho gia đình', 'words': [{'word': 'với', 'start': 0.008, 'end': 0.068, 'score': 0.01}, {'word': 'một', 'start': 0.088, 'end': 0.148, 'score': 0.01}, {'word': 'người', 'start': 0.169, 'end': 0.269, 'score': 0.011}, {'word': 'đi', 'start': 0.289, 'end': 0.329, 'score': 0.01}, {'word': 'làm', 'start': 0.349, 'end': 0.409, 'score': 0.011}, {'word': 'thuê', 'start': 0.429, 'end': 0.51, 'score': 0.01}, {'word': 'như', 'start': 0.53, 'end': 0.59, 'score': 0.012}, {'word': 'anh', 'start': 0.61, 'end': 0.67, 'score': 0.01}, {'word': 'số', 'start': 0.69, 'end': 0.73, 'score': 0.01}, {'word': 'tiền', 'start': 0.75, 'end': 0.831, 'score': 0.01}, {'word': 'kiếm', 'start': 0.851, 'end': 0.931, 'score': 0.01}, {'word': 'được', 'start': 0.951, 'end': 1.031, 'score': 0.01}, {'word': 'chưa', 'start': 1.051, 'end': 1.132, 'score': 0.01}, {'word': 'đủ', 'start': 1.152, 'end': 1.192, 'score': 0.01}, {'word': 'để', 'start': 1.212, 'end': 1.252, 'score': 0.01}, {'word': 'thoả', 'start': 1.272, 'end': 1.353, 'score': 0.01}, {'word': 'mãn', 'start': 1.373, 'end': 1.433, 'score': 0.011}, {'word': 'nhu', 'start': 1.453, 'end': 1.513, 'score': 0.011}, {'word': 'cầu', 'start': 1.533, 'end': 1.593, 'score': 0.011}, {'word': 'cá', 'start': 1.613, 'end': 1.654, 'score': 0.01}, {'word': 'nhân', 'start': 1.674, 'end': 1.754, 'score': 0.011}, {'word': 'nói', 'start': 1.774, 'end': 1.834, 'score': 0.01}, {'word': 'gì', 'start': 1.854, 'end': 1.894, 'score': 0.011}, {'word': 'đến', 'start': 1.914, 'end': 1.975, 'score': 0.01}, {'word': 'chăm', 'start': 1.995, 'end': 2.075, 'score': 0.011}, {'word': 'lo', 'start': 2.095, 'end': 2.135, 'score': 0.009}, {'word': 'cho', 'start': 2.155, 'end': 2.215, 'score': 0.011}, {'word': 'gia', 'start': 2.235, 'end': 2.296, 'score': 0.01}, {'word': 'đình', 'start': 2.316, 'end': 2.396, 'score': 0.011}]}]\n"
]
}
],
"source": [
"# 2. Align whisper output\n",
"device = \"cuda\" \n",
"audio_file = \"audio.wav\"\n",
"batch_size = 16 \n",
"compute_type = \"float16\" # change to \"int8\" if low on GPU mem (may reduce accuracy)\n",
"model_path = \"./Vietnamese_ASR/ct2ranslate\"\n",
"model_a, metadata = whisperx.load_align_model(language_code=\"vi\" ,device=device)\n",
"result = whisperx.align(result[\"segments\"], model_a, metadata, audio, device, return_char_alignments=False)\n",
"\n",
"print(result[\"segments\"]) # after alignment\n",
"\n",
"# delete model if low on GPU resources\n",
"import gc; gc.collect(); torch.cuda.empty_cache(); del model_a\n",
"\n"
]
},
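{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: flatten the word-level alignments into a CSV for inspection.\n",
"# Assumes `result` from the alignment cell above; \"words.csv\" is an arbitrary\n",
"# output name.\n",
"import csv\n",
"\n",
"with open(\"words.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as f:\n",
"    writer = csv.writer(f)\n",
"    writer.writerow([\"word\", \"start\", \"end\", \"score\"])\n",
"    for seg in result[\"segments\"]:\n",
"        for w in seg.get(\"words\", []):\n",
"            # words that fail to align may lack timestamps or scores\n",
"            writer.writerow([w[\"word\"], w.get(\"start\"), w.get(\"end\"), w.get(\"score\")])\n"
]
},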
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 3. Assign speaker labels\n",
"diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)\n",
"\n",
"# add min/max number of speakers if known\n",
"diarize_segments = diarize_model(audio)\n",
"# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)\n",
"\n",
"result = whisperx.assign_word_speakers(diarize_segments, result)\n",
"print(diarize_segments)\n",
"print(result[\"segments\"]) # segments are now assigned speaker IDs"
]
}
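,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: print a simple speaker-labelled transcript. Assumes `result`\n",
"# from the diarization cell above; segments the pipeline could not attribute\n",
"# to a speaker are labelled UNKNOWN.\n",
"for seg in result[\"segments\"]:\n",
"    speaker = seg.get(\"speaker\", \"UNKNOWN\")\n",
"    print(f\"[{seg['start']:.2f}-{seg['end']:.2f}] {speaker}: {seg['text'].strip()}\")\n"
]
}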
],
"metadata": {
"kernelspec": {
"display_name": "DUY",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}