File size: 2,456 Bytes
5c641bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import wave\n",
"\n",
"from librosa import resample\n",
"from IPython.display import Audio\n",
"from transformers import pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def open_wave(wav_filename):\n",
"    \"\"\"Read a 16-bit PCM WAV file and down-mix it to a single channel.\n",
"\n",
"    Args:\n",
"        wav_filename: File name (or file-like object) handed to ``wave.open``.\n",
"\n",
"    Returns:\n",
"        A ``(samples, framerate)`` tuple. ``samples`` is a list of Python ints,\n",
"        one per frame, each the mean of that frame's channels truncated toward\n",
"        zero. ``framerate`` is the value reported by ``getframerate()``.\n",
"\n",
"    Raises:\n",
"        ValueError: If the sample width is not 2 bytes (16-bit).\n",
"    \"\"\"\n",
"    with wave.open(wav_filename, mode=\"rb\") as wav_in:\n",
"        if wav_in.getsampwidth() != 2:\n",
"            raise ValueError(\"Input not 16-bit\")\n",
"\n",
"        nchannels = wav_in.getnchannels()\n",
"        framerate = wav_in.getframerate()\n",
"        raw = wav_in.readframes(wav_in.getnframes())\n",
"\n",
"    # Trust the bytes actually read rather than the header's frame count, and\n",
"    # drop any trailing partial frame so the buffer reshapes cleanly.\n",
"    frame_bytes = 2 * nchannels\n",
"    raw = raw[: len(raw) - len(raw) % frame_bytes]\n",
"\n",
"    # WAV PCM is little-endian; \"<i2\" decodes it correctly on any host.\n",
"    frames = np.frombuffer(raw, dtype=\"<i2\").reshape(-1, nchannels)\n",
"\n",
"    # Sum the channels as integers and divide once, so no per-channel float\n",
"    # rounding error can build up before the truncation toward zero.\n",
"    mono = np.trunc(frames.sum(axis=1, dtype=np.int64) / nchannels)\n",
"    samples = mono.astype(np.int64).tolist()\n",
"\n",
"    return (samples, framerate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the speech-recognition pipeline once; transcribe() reuses this object.\n",
"pipe = pipeline(\n",
"    \"automatic-speech-recognition\",\n",
"    model=\"openai/whisper-base\",\n",
"    chunk_length_s=30,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def transcribe(samples, orig_sr=44100, target_sr=16000):\n",
"    \"\"\"Transcribe a mono sample sequence with the notebook-level ``pipe``.\n",
"\n",
"    The samples are min-max scaled into [-1.0, 1.0], resampled from\n",
"    ``orig_sr`` to ``target_sr`` and passed to ``pipe``.\n",
"\n",
"    Args:\n",
"        samples: Sequence (list or array) of numeric sample values.\n",
"        orig_sr: Sample rate of ``samples``.\n",
"        target_sr: Sample rate the audio is resampled to before recognition.\n",
"\n",
"    Returns:\n",
"        The ``\"text\"`` entry of the pipeline result, stripped of surrounding\n",
"        whitespace and lower-cased.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``samples`` is empty.\n",
"    \"\"\"\n",
"    audio = np.asarray(samples, dtype=np.float64)\n",
"    if audio.size == 0:\n",
"        raise ValueError(\"Cannot transcribe empty audio\")\n",
"\n",
"    min_s, max_s = audio.min(), audio.max()\n",
"    span = max_s - min_s\n",
"    if span == 0:\n",
"        # A constant signal has no dynamic range; treat it as silence instead\n",
"        # of dividing by zero and feeding NaN/inf to the model.\n",
"        samples_f = np.zeros_like(audio)\n",
"    else:\n",
"        samples_f = 2.0 * (audio - min_s) / span - 1.0\n",
"\n",
"    resamples = resample(samples_f, orig_sr=orig_sr, target_sr=target_sr)\n",
"    prediction = pipe(resamples.copy(), batch_size=8)\n",
"    return prediction[\"text\"].strip().lower()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the demo clip, embed a player for it, and leave the transcript as the cell output.\n",
"samples, sr = open_wave(wav_filename=\"./audio/plain_01.wav\")\n",
"display(Audio(data=samples, rate=sr))\n",
"transcribe(samples, orig_sr=sr)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "gradio",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|