{ "cells": [ { "cell_type": "markdown", "id": "807e94de-b600-46ca-9808-372619e38e69", "metadata": {}, "source": [ "# Making kurianbenoy/faster-speech-to-text-for-malayalam with Jupyter notebooks" ] }, { "cell_type": "markdown", "id": "f0e04921-4634-4d16-940f-bf8dd20bb63b", "metadata": {}, "source": [ "## Install packages" ] }, { "cell_type": "code", "execution_count": 1, "id": "7a6257dd-ea39-44e1-b103-3f9588d6cf4d", "metadata": {}, "outputs": [], "source": [ "!pip install -Uqq nbdev gradio==3.31.0 faster-whisper==0.5.1" ] }, { "cell_type": "markdown", "id": "d7ba223d-8043-4aab-8df3-f6cf3a4ac6b2", "metadata": {}, "source": [ "## Basic inference code" ] }, { "cell_type": "code", "execution_count": 2, "id": "22e6e9c5-7a3f-4546-8039-ecf98004235b", "metadata": {}, "outputs": [], "source": [ "#|export\n", "import gradio as gr\n", "from faster_whisper import WhisperModel" ] }, { "cell_type": "code", "execution_count": 3, "id": "81691362-0c73-4af0-9f99-96ffb7dc318b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'3.31.0'" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gr.__version__" ] }, { "cell_type": "code", "execution_count": 5, "id": "de8e21b9-449a-4ae3-bd64-bba334075fdd", "metadata": {}, "outputs": [], "source": [ "def t_asr(folder=\"vegam-whisper-medium-ml-fp16\", audio_file=\"vegam-whisper-medium-ml-fp16/00b38e80-80b8-4f70-babf-566e848879fc.webm\", compute_type=\"float16\", device=\"cpu\"):\n", " model = WhisperModel(folder, device=device, compute_type=compute_type)\n", " \n", " segments, info = model.transcribe(audio_file, beam_size=5)\n", " \n", " for segment in segments:\n", " print(\"[%.2fs -> %.2fs] %s\" % (segment.start, segment.end, segment.text))" ] }, { "cell_type": "code", "execution_count": 6, "id": "25e1413f-8f80-4704-a94e-26b8d9581a6a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0.00s -> 4.58s] പാലം കടുക്കുവോളം നാരായണ പാലം കടന്നാലോ കൂരായണ\n", "CPU times: user 11.2 s, sys: 2.2 s, total: 13.4 s\n", "Wall time: 6.54 s\n" ] } ], "source": [ "%%time\n", "t_asr(compute_type=\"int8\", device=\"cuda\")" ] }, { "cell_type": "code", "execution_count": 7, "id": "b255a0f0-4987-4f04-b63c-5ca0167917b6", "metadata": {}, "outputs": [], "source": [ "#|export \n", "def transcribe_malayalam_speech(audio_file, compute_type=\"int8\", device=\"cpu\", folder=\"vegam-whisper-medium-ml-fp16\"):\n", " \n", " model = WhisperModel(folder, device=device, compute_type=compute_type)\n", " segments, info = model.transcribe(audio_file, beam_size=5)\n", "\n", " lst = []\n", " for segment in segments:\n", " # print(\"[%.2fs -> %.2fs] %s\" % (segment.start, segment.end, segment.text))\n", " lst.append(segment.text)\n", "\n", " return(\" \".join(lst))" ] }, { "cell_type": "code", "execution_count": 8, "id": "48cd4ec3-512f-49d0-87ac-3ef989e25b80", "metadata": {}, "outputs": [], "source": [ "#|export\n", "def gr_transcribe_malayalam_speech(microphone, file_upload, compute_type=\"int8\", device=\"cpu\", folder=\"vegam-whisper-medium-ml-fp16\"):\n", " warn_output = \"\"\n", " if (microphone is not None) and (file_upload is not None):\n", " warn_output = (\n", " \"WARNING: You've uploaded an audio file and used the microphone. 
\"\n", " \"The recorded file from the microphone will be used and the uploaded audio will be discarded.\\n\"\n", " )\n", "\n", " elif (microphone is None) and (file_upload is None):\n", " return \"ERROR: You have to either use the microphone or upload an audio file\"\n", "\n", " audio_file = microphone if microphone is not None else file_upload\n", " \n", " model = WhisperModel(folder, device=device, compute_type=compute_type)\n", " segments, info = model.transcribe(audio_file, beam_size=5)\n", "\n", " lst = []\n", " for segment in segments:\n", " # print(\"[%.2fs -> %.2fs] %s\" % (segment.start, segment.end, segment.text))\n", " lst.append(segment.text)\n", "\n", " return(\" \".join(lst))" ] }, { "cell_type": "code", "execution_count": 9, "id": "14fda29a-aee1-44b2-9269-048cc8b98ea8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 40.6 s, sys: 9.76 s, total: 50.3 s\n", "Wall time: 13.6 s\n" ] }, { "data": { "text/plain": [ "'പാലം കടുക്കുവോളം നാരായണ പാലം കടന്നാലോ കൂരായണ'" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "transcribe_malayalam_speech(audio_file=\"vegam-whisper-medium-ml-fp16/00b38e80-80b8-4f70-babf-566e848879fc.webm\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "bbdadecf-68d1-4183-8e43-7965c1aecf6a", "metadata": {}, "outputs": [], "source": [ "## Haha, You are burning GPUs and wasting CO2" ] }, { "cell_type": "markdown", "id": "45fade75-e0b1-4c5d-90a3-ebd7345a4d16", "metadata": {}, "source": [ "## Figure out Whisper Demo by Huggingface" ] }, { "cell_type": "code", "execution_count": null, "id": "023ffa7c-b82f-49ea-b6ca-00f84e2c8698", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "fe37c9e1-bc56-422d-9547-be94ab4e4844", "metadata": {}, "source": [ "## Make an app with Gradio" ] }, { "cell_type": "code", "execution_count": 10, "id": "9badfdcd-dd99-49ea-a318-eda88cddefb6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://0.0.0.0:6006\n", "Running on public URL: https://9fa992d2ba37b0af49.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import gradio as gr\n", "\n", "def greet(name):\n", " return \"Hello \" + name + \"!!\"\n", "\n", "iface = gr.Interface(fn=greet, inputs=\"text\", outputs=\"text\")\n", "iface.launch(share=True)" ] }, { "cell_type": "code", "execution_count": 20, "id": "81f3b241-8a6d-4ff0-bb70-d389d4d4e93a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/gradio/inputs.py:321: UserWarning: Usage of gradio.inputs is deprecated, and will not be supported in the future, please import your components from gradio.components\n", " warnings.warn(\n", "/opt/conda/lib/python3.10/site-packages/gradio/inputs.py:324: UserWarning: `optional` parameter is deprecated, and it has no effect\n", " super().__init__(source=source, type=type, label=label, optional=optional)\n" ] } ], "source": [ "#|export\n", "mf_transcribe = gr.Interface(\n", " fn=gr_transcribe_malayalam_speech,\n", " inputs=[\n", " gr.inputs.Audio(source=\"microphone\", type=\"filepath\", optional=True),\n", " gr.inputs.Audio(source=\"upload\", type=\"filepath\", optional=True),\n", " ],\n", " outputs=\"text\",\n", " title=\"PALLAKKU (പല്ലക്ക്)\",\n", " description=(\n", " \"Pallakku is a Malayalam speech to text demo leveraging the model-weights of [vegam-whisper-medium-ml](https://huggingface.co/kurianbenoy/vegam-whisper-medium-ml-fp16).\"\n", " ),\n", " article=\"Please note that this demo now uses CPU only and in my testing for a 5 seconds audio file it can take upto 15 seconds for results to come. If you are interested to use a GPU based API instead, feel free to contact the author @ kurian.bkk@gmail.com\",\n", " allow_flagging=\"never\",\n", ")" ] }, { "cell_type": "code", "execution_count": 24, "id": "b1e34fa5-8340-4329-a348-b641ca4db341", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rerunning server... use `close()` to stop if you need to change `launch()` parameters.\n", "----\n", "Running on local URL: http://0.0.0.0:6010\n", "Running on public URL: https://19b32861466405ac95.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#|export\n", "mf_transcribe.launch(share=True)" ] }, { "cell_type": "markdown", "id": "7ec1f78d-d9c0-46c7-9466-0408bc6c6cdc", "metadata": {}, "source": [ "## Create a requirements.txt file" ] }, { "cell_type": "code", "execution_count": 22, "id": "7c3e753f-5051-4c3b-a5ab-fa65c7e7cae9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing requirements.txt\n" ] } ], "source": [ "%%writefile requirements.txt\n", "gradio==3.31.0\n", "faster-whisper==0.5.1\n", "torch" ] }, { "cell_type": "markdown", "id": "43505375-9b3d-4661-93d1-11965cd8d6b5", "metadata": {}, "source": [ "## Convert this notebook into a Gradio app" ] }, { "cell_type": "code", "execution_count": 25, "id": "fba83810-1f0f-4777-b831-aabb4cfead39", "metadata": {}, "outputs": [], "source": [ "from nbdev.export import nb_export\n", "nb_export('app.ipynb', lib_path='.', name='app')" ] }, { "cell_type": "markdown", "id": "2c7c52be-c7c4-4026-9886-ae9f71dec603", "metadata": {}, "source": [ "## Reference\n", "\n", "1. [Create A 🤗 Space From A Notebook](https://nbdev.fast.ai/blog/posts/2022-11-07-spaces/index.html)\n", "2. [Nbdev Demo](https://gist.github.com/hamelsmu/35be07d242f3f19063c3a3839127dc67)\n", "3. [Whisper-demo space by 🤗](https://huggingface.co/spaces/whisper-event/whisper-demo)" ] }, { "cell_type": "code", "execution_count": null, "id": "5384528f-9a83-4a0d-b4fd-8ed8458b0eda", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 }