Spaces:

kurianbenoy
/

Pallakku

Running

App Files Files Community

kurianbenoy committed on May 21, 2023

Commit

ecc934a

•

1 Parent(s): 5857953

add nbs

Browse files

Files changed (1) hide show

app.ipynb +630 -0

app.ipynb ADDED Viewed

	@@ -0,0 +1,630 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "807e94de-b600-46ca-9808-372619e38e69",
+   "metadata": {},
+   "source": [
+    "# Making kurianbenoy/faster-speech-to-text-for-malayalam with Jupyter notebooks"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f0e04921-4634-4d16-940f-bf8dd20bb63b",
+   "metadata": {},
+   "source": [
+    "## Install packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7a6257dd-ea39-44e1-b103-3f9588d6cf4d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
+    "%pip install -Uqq nbdev gradio==3.31.0 faster-whisper==0.5.1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7ba223d-8043-4aab-8df3-f6cf3a4ac6b2",
+   "metadata": {},
+   "source": [
+    "## Basic inference code"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "22e6e9c5-7a3f-4546-8039-ecf98004235b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "import gradio as gr\n",
+    "from faster_whisper import WhisperModel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "81691362-0c73-4af0-9f99-96ffb7dc318b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'3.31.0'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gr.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "5f4d3586-a6b9-4d3e-b02a-9f25f5068dbe",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "module 'faster_whisper' has no attribute '__version__'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[10], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mfaster_whisper\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mfaster_whisper\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__version__\u001b[49m\n",
+      "\u001b[0;31mAttributeError\u001b[0m: module 'faster_whisper' has no attribute '__version__'"
+     ]
+    }
+   ],
+   "source": [
+    "# import faster_whisper\n",
+    "# faster_whisper.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "de8e21b9-449a-4ae3-bd64-bba334075fdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def t_asr(folder=\"vegam-whisper-medium-ml-fp16\", audio_file=\"00b38e80-80b8-4f70-babf-566e848879fc.webm\", compute_type=\"float16\", device=\"cpu\"):\n",
+    "    \"\"\"Print a timestamped transcript of `audio_file`.\n",
+    "\n",
+    "    Builds a `WhisperModel` from `folder` for the given `device` and\n",
+    "    `compute_type`, transcribes `audio_file` with `beam_size=5`, and prints\n",
+    "    one `[start -> end] text` line per returned segment. Returns None.\n",
+    "    \"\"\"\n",
+    "    whisper = WhisperModel(folder, device=device, compute_type=compute_type)\n",
+    "    # transcribe() returns a pair; only the segments are used here.\n",
+    "    parts, _ = whisper.transcribe(audio_file, beam_size=5)\n",
+    "    for part in parts:\n",
+    "        print(\"[%.2fs -> %.2fs] %s\" % (part.start, part.end, part.text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "87c58dd2-7d3d-4fb3-821c-cdac673fee0d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0.00s -> 4.58s] പാലം കടുക്കുവോളം നാരായണ പാലം കടന്നാലോ കൂരായണ\n",
+      "CPU times: user 42.2 s, sys: 9.58 s, total: 51.8 s\n",
+      "Wall time: 13.5 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "t_asr(compute_type=\"int8\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "a5624cf5-b3b8-4ae3-aa82-ee19505bb42d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Detected language 'ta' with probability 0.372757\n",
+      "[0.00s -> 4.74s] പാലം കടുക്കുവോളം നാരായണ പാലം കടന്നാലൊ കൂരായണ\n",
+      "CPU times: user 36.5 s, sys: 9.52 s, total: 46.1 s\n",
+      "Wall time: 12.3 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "t_asr(folder=\"vegam-whisper-medium-ml\", compute_type=\"int8\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "25e1413f-8f80-4704-a94e-26b8d9581a6a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0.00s -> 4.58s] പാലം കടുക്കുവോളം നാരായണ പാലം കടന്നാലോ കൂരായണ\n",
+      "CPU times: user 9.39 s, sys: 792 ms, total: 10.2 s\n",
+      "Wall time: 4.51 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "t_asr(compute_type=\"int8\", device=\"cuda\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "48cd4ec3-512f-49d0-87ac-3ef989e25b80",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|export\n",
+    "_LOADED_MODELS = {}  # (folder, device, compute_type) -> WhisperModel, filled on first use\n",
+    "\n",
+    "\n",
+    "def transcribe_malayalam_speech(audio_file, compute_type=\"int8\", device=\"cpu\", folder=\"vegam-whisper-medium-ml-fp16\"):\n",
+    "    \"\"\"Transcribe `audio_file` and return the segment texts joined by single spaces.\n",
+    "\n",
+    "    audio_file: passed unchanged to `WhisperModel.transcribe`.\n",
+    "    compute_type, device, folder: passed to the `WhisperModel` constructor.\n",
+    "\n",
+    "    The model for a given (folder, device, compute_type) is constructed once,\n",
+    "    kept in `_LOADED_MODELS`, and reused by later calls instead of being\n",
+    "    rebuilt on every call.\n",
+    "    \"\"\"\n",
+    "    key = (folder, device, compute_type)\n",
+    "    model = _LOADED_MODELS.get(key)\n",
+    "    if model is None:\n",
+    "        model = _LOADED_MODELS[key] = WhisperModel(folder, device=device, compute_type=compute_type)\n",
+    "    segments, _ = model.transcribe(audio_file, beam_size=5)\n",
+    "    return \" \".join(segment.text for segment in segments)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "14fda29a-aee1-44b2-9269-048cc8b98ea8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 43.1 s, sys: 12.3 s, total: 55.4 s\n",
+      "Wall time: 14.8 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'പാലം കടുക്കുവോളം നാരായണ പാലം കടന്നാലോ കൂരായണ'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "transcribe_malayalam_speech(audio_file=\"00b38e80-80b8-4f70-babf-566e848879fc.webm\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "bbdadecf-68d1-4183-8e43-7965c1aecf6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Haha, You are burning GPUs and wasting CO2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf706a0a-c3a2-489c-a1fe-df4fbf700d9c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "45fade75-e0b1-4c5d-90a3-ebd7345a4d16",
+   "metadata": {},
+   "source": [
+    "## Figure out the Whisper demo by Hugging Face"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "fa06f8a6-87b7-45af-b36b-fb5ebe362455",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e437727ccbcd40838a43a0c1bbb00143",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)lve/main/config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2f654c303e24413cb73990bdd9d99907",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading pytorch_model.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "16386d3b586d475fa021ea8d6f925161",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)neration_config.json:   0%|          | 0.00/3.51k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bf5964b1ba024ce685a04127f21f78d0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)okenizer_config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "038082a393084da998eed2085960e634",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "105ef799439d4c1ea0e3d2cbbfbcaf5d",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e3369330ed9a4a9f8208ba6f160210bf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4c3a9c73c84245b0b88e42980d65abdf",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8d61551d78914036a2b6475a6d840663",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "262213d3b6364b4e8648180c903c3008",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "258a3b7a9eb94dcdb8355c09c1b683b3",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from transformers import pipeline\n",
+    "from huggingface_hub import model_info\n",
+    "\n",
+    "MODEL_NAME = \"openai/whisper-small\" #this always needs to stay in line 8 :D sorry for the hackiness\n",
+    "lang = \"en\"\n",
+    "\n",
+    "device = 0 if torch.cuda.is_available() else \"cpu\"\n",
+    "pipe = pipeline(\n",
+    "    task=\"automatic-speech-recognition\",\n",
+    "    model=MODEL_NAME,\n",
+    "    chunk_length_s=30,\n",
+    "    device=device,\n",
+    ")\n",
+    "\n",
+    "pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task=\"transcribe\")\n",
+    "\n",
+    "def transcribe(microphone, file_upload):\n",
+    "    \"\"\"Run `pipe` on the microphone recording, falling back to the uploaded file.\n",
+    "\n",
+    "    Returns an error message when both arguments are None. When both are\n",
+    "    given, the microphone recording is used and a warning is prepended to\n",
+    "    the transcribed text.\n",
+    "    \"\"\"\n",
+    "    has_mic = microphone is not None\n",
+    "    has_upload = file_upload is not None\n",
+    "\n",
+    "    if not (has_mic or has_upload):\n",
+    "        return \"ERROR: You have to either use the microphone or upload an audio file\"\n",
+    "\n",
+    "    notice = \"\"\n",
+    "    if has_mic and has_upload:\n",
+    "        notice = (\n",
+    "            \"WARNING: You've uploaded an audio file and used the microphone. \"\n",
+    "            \"The recorded file from the microphone will be used and the uploaded audio will be discarded.\\n\"\n",
+    "        )\n",
+    "\n",
+    "    chosen = microphone if has_mic else file_upload\n",
+    "    return notice + pipe(chosen)[\"text\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "023ffa7c-b82f-49ea-b6ca-00f84e2c8698",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe37c9e1-bc56-422d-9547-be94ab4e4844",
+   "metadata": {},
+   "source": [
+    "## Make an app with Gradio"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "9badfdcd-dd99-49ea-a318-eda88cddefb6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://0.0.0.0:6007\n",
+      "Running on public URL: https://537af5b5b55ed185f5.gradio.live\n",
+      "\n",
+      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"https://537af5b5b55ed185f5.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import gradio as gr\n",
+    "\n",
+    "def greet(name):\n",
+    "    return \"Hello \" + name + \"!!\"\n",
+    "\n",
+    "iface = gr.Interface(fn=greet, inputs=\"text\", outputs=\"text\")\n",
+    "iface.launch(share=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "81f3b241-8a6d-4ff0-bb70-d389d4d4e93a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Gradio UI for `transcribe`: one microphone input and one file-upload input,\n",
+    "# both handed to the function as file paths (`transcribe` checks each for None).\n",
+    "# `gr.Audio` is used directly; the `gr.inputs` namespace and the `optional`\n",
+    "# argument are deprecated in Gradio 3.x.\n",
+    "mf_transcribe = gr.Interface(\n",
+    "    fn=transcribe,\n",
+    "    inputs=[\n",
+    "        gr.Audio(source=\"microphone\", type=\"filepath\"),\n",
+    "        gr.Audio(source=\"upload\", type=\"filepath\"),\n",
+    "    ],\n",
+    "    outputs=\"text\",\n",
+    "    title=\"Whisper Demo: Transcribe Audio\",\n",
+    "    description=(\n",
+    "        \"Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned\"\n",
+    "        f\" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files\"\n",
+    "        \" of arbitrary length.\"\n",
+    "    ),\n",
+    "    allow_flagging=\"never\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1e34fa5-8340-4329-a348-b641ca4db341",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ec1f78d-d9c0-46c7-9466-0408bc6c6cdc",
+   "metadata": {},
+   "source": [
+    "## Create a requirements.txt file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "7c3e753f-5051-4c3b-a5ab-fa65c7e7cae9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting requirements.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile requirements.txt\n",
+    "gradio==3.31.0\n",
+    "faster-whisper==0.5.1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43505375-9b3d-4661-93d1-11965cd8d6b5",
+   "metadata": {},
+   "source": [
+    "## Convert this notebook into a Gradio app"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "fba83810-1f0f-4777-b831-aabb4cfead39",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nbdev.export import nb_export\n",
+    "nb_export('app.ipynb', lib_path='.', name='app')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c7c52be-c7c4-4026-9886-ae9f71dec603",
+   "metadata": {},
+   "source": [
+    "## Reference\n",
+    "\n",
+    "1. [Create A 🤗 Space From A Notebook](https://nbdev.fast.ai/blog/posts/2022-11-07-spaces/index.html)\n",
+    "2. [Nbdev Demo](https://gist.github.com/hamelsmu/35be07d242f3f19063c3a3839127dc67)\n",
+    "3. [Whisper-demo space by 🤗](https://huggingface.co/spaces/whisper-event/whisper-demo)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5384528f-9a83-4a0d-b4fd-8ed8458b0eda",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}