{ "cells": [ { "cell_type": "markdown", "id": "exact-strand", "metadata": {}, "source": [ "## Install the NVIDIA NeMo environment" ] }, { "cell_type": "markdown", "id": "compound-found", "metadata": {}, "source": [ "You can install the NeMo environment locally by following the [installation guide](https://github.com/heartexlabs/NeMo#installation), or quick-start it from the prebuilt Docker container:" ] }, { "cell_type": "code", "execution_count": null, "id": "pacific-pepper", "metadata": {}, "outputs": [], "source": [ "!docker run --gpus all -it --rm --shm-size=8g \\\n", "-p 8888:8888 -p 6006:6006 -p 8080:8080 --ulimit memlock=-1 --ulimit \\\n", "stack=67108864 --device=/dev/snd nvcr.io/nvidia/nemo:1.0.1" ] }, { "cell_type": "markdown", "id": "strong-therapist", "metadata": {}, "source": [ "Note that the default Label Studio port (8080) is exposed from the Docker container." ] }, { "cell_type": "markdown", "id": "everyday-depth", "metadata": {}, "source": [ "## Install Label Studio" ] }, { "cell_type": "code", "execution_count": null, "id": "spoken-venice", "metadata": {}, "outputs": [], "source": [ "!pip install label-studio" ] }, { "cell_type": "markdown", "id": "integral-introduction", "metadata": {}, "source": [ "## Create an ML backend with a NeMo model" ] }, { "cell_type": "markdown", "id": "dimensional-playing", "metadata": {}, "source": [ "Let's create a simple script, `asr.py`, that wraps the NeMo inference call and converts its output to the annotation format expected by Label Studio:" ] }, { "cell_type": "code", "execution_count": null, "id": "tribal-minority", "metadata": {}, "outputs": [], "source": [ "import nemo\n", "import nemo.collections.asr as nemo_asr\n", "from label_studio_ml.model import LabelStudioMLBase\n", "\n", "\n", "class NemoASR(LabelStudioMLBase):\n", "\n", " def __init__(self, model_name='QuartzNet15x5Base-En', **kwargs):\n", " super(NemoASR, self).__init__(**kwargs)\n", "\n", " # Find TextArea control tag and bind ASR model to it\n", " self.from_name, self.to_name, self.value 
= self._bind_to_textarea()\n", "\n", " # This line will download pre-trained QuartzNet15x5 model from NVIDIA's NGC cloud and instantiate it for you\n", " self.model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name=model_name)\n", "\n", " def predict(self, tasks, **kwargs):\n", " \"\"\"Returns NeMo ASR predictions given audio files in Label Studio's tasks\"\"\"\n", " audio_path = self.get_local_path(tasks[0]['data'][self.value])\n", " transcription = self.model.transcribe(paths2audio_files=[audio_path])[0]\n", " return [{\n", " 'result': [{\n", " 'from_name': self.from_name,\n", " 'to_name': self.to_name,\n", " 'type': 'textarea',\n", " 'value': {\n", " 'text': [transcription]\n", " }\n", " }],\n", " 'score': 1.0\n", " }]\n", "\n", " def _bind_to_textarea(self):\n", " \"\"\"Helper to bind inference output to annotation format expected by Label Studio\"\"\"\n", " from_name, to_name, value = None, None, None\n", " for tag_name, tag_info in self.parsed_label_config.items():\n", " if tag_info['type'] == 'TextArea':\n", " from_name = tag_name\n", " if len(tag_info['inputs']) > 1:\n", " logger.warning(\n", " 'ASR model works with single Audio or AudioPlus input, '\n", " 'but {0} found: {1}. We\\'ll use only the first one'.format(\n", " len(tag_info['inputs']), ', '.join(tag_info['to_name'])))\n", " if tag_info['inputs'][0]['type'] not in ('Audio', 'AudioPlus'):\n", " raise ValueError('{0} tag expected to be of type Audio or AudioPlus, but type {1} found'.format(\n", " tag_info['to_name'][0], tag_info['inputs'][0]['type']))\n", " to_name = tag_info['to_name'][0]\n", " value = tag_info['inputs'][0]['value']\n", " if from_name is None:\n", " raise ValueError('ASR model expects