diff --git "a/prueba.ipynb" "b/prueba.ipynb" new file mode 100644--- /dev/null +++ "b/prueba.ipynb" @@ -0,0 +1,966 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/matias/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from transformers import pipeline\n", + "import torch\n", + "from IPython.display import Audio\n", + "\n", + "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", + "\n", + "classifier = pipeline(\n", + " \"audio-classification\", model=\"MIT/ast-finetuned-speech-commands-v2\", device=device\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reconocer palabra para empezar" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: 'backward', 1: 'follow', 2: 'five', 3: 'bed', 4: 'zero', 5: 'on', 6: 'learn', 7: 'two', 8: 'house', 9: 'tree', 10: 'dog', 11: 'stop', 12: 'seven', 13: 'eight', 14: 'down', 15: 'six', 16: 'forward', 17: 'cat', 18: 'right', 19: 'visual', 20: 'four', 21: 'wow', 22: 'no', 23: 'nine', 24: 'off', 25: 'three', 26: 'left', 27: 'marvin', 28: 'yes', 29: 'up', 30: 'sheila', 31: 'happy', 32: 'bird', 33: 'go', 34: 'one'}\n", + "Listening for wake word...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/matias/.local/lib/python3.10/site-packages/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py:96: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. 
(Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n", + " waveform = torch.from_numpy(waveform).unsqueeze(0)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'score': 0.04762890934944153, 'label': 'two'}\n", + "{'score': 0.1426355093717575, 'label': 'six'}\n", + "{'score': 0.11895965784788132, 'label': 'up'}\n", + "{'score': 0.13453948497772217, 'label': 'off'}\n", + "{'score': 0.12843511998653412, 'label': 'stop'}\n", + "{'score': 0.13055971264839172, 'label': 'stop'}\n", + "{'score': 0.13055971264839172, 'label': 'stop'}\n", + "{'score': 0.13055971264839172, 'label': 'stop'}\n", + "{'score': 0.14089344441890717, 'label': 'up'}\n", + "{'score': 0.35520532727241516, 'label': 'stop'}\n", + "{'score': 0.33248171210289, 'label': 'stop'}\n", + "{'score': 0.33248171210289, 'label': 'stop'}\n", + "{'score': 0.33248171210289, 'label': 'stop'}\n", + "{'score': 0.33248171210289, 'label': 'stop'}\n", + "{'score': 0.18591004610061646, 'label': 'stop'}\n", + "{'score': 0.16003373265266418, 'label': 'stop'}\n", + "{'score': 0.16003373265266418, 'label': 'stop'}\n", + "{'score': 0.16003373265266418, 'label': 'stop'}\n", + "{'score': 0.16003373265266418, 'label': 'stop'}\n", + "{'score': 0.7757278680801392, 'label': 'marvin'}\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers.pipelines.audio_utils import ffmpeg_microphone_live\n", + "\n", + "print(classifier.model.config.id2label)\n", + "\n", + "def launch_fn(\n", + " wake_word=\"marvin\",\n", + " prob_threshold=0.5,\n", + " chunk_length_s=2.0,\n", + " stream_chunk_s=0.25,\n", + " debug=False,\n", + "):\n", + " if wake_word not in classifier.model.config.label2id.keys():\n", + " raise ValueError(\n", + " f\"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}.\"\n", + " )\n", + "\n", + " sampling_rate = classifier.feature_extractor.sampling_rate\n", + "\n", + " mic = ffmpeg_microphone_live(\n", + " sampling_rate=sampling_rate,\n", + " chunk_length_s=chunk_length_s,\n", + " stream_chunk_s=stream_chunk_s,\n", + " )\n", + "\n", + " print(\"Listening for wake word...\")\n", + " for prediction in classifier(mic):\n", + " prediction = prediction[0]\n", + " if debug:\n", + " print(prediction)\n", + " if prediction[\"label\"] == wake_word:\n", + " if prediction[\"score\"] > prob_threshold:\n", + " return True\n", + " \n", + "launch_fn(debug=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transcribir audio" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "transcriber = pipeline(\n", + " \"automatic-speech-recognition\", model=\"openai/whisper-base.en\", device=device\n", + ")\n", + "import sys\n", + "\n", + "\n", + "def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):\n", + " sampling_rate = transcriber.feature_extractor.sampling_rate\n", + "\n", + " mic = ffmpeg_microphone_live(\n", + " sampling_rate=sampling_rate,\n", + " chunk_length_s=chunk_length_s,\n", + " stream_chunk_s=stream_chunk_s,\n", + " )\n", + "\n", + " print(\"Start speaking...\")\n", + " for item in transcriber(mic, generate_kwargs={\"max_new_tokens\": 128}):\n", + " sys.stdout.write(\"\\033[K\")\n", + " print(item[\"text\"], end=\"\\r\")\n", + " if not item[\"partial\"][0]:\n", + " break\n", + "\n", + " return item[\"text\"]" + ] + }, + { + 
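As a side note, `launch_fn` as written blocks indefinitely if the wake word is never detected above the threshold. Below is a minimal sketch of a variant with an upper bound on listening time; the `max_chunks` parameter is an illustrative addition, not part of the cells above, and the sketch assumes `classifier` and `ffmpeg_microphone_live` are already defined as in this notebook.

```python
def launch_fn_with_timeout(
    wake_word="marvin",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    max_chunks=240,  # illustrative cap: ~1 minute of 0.25 s stream chunks
    debug=False,
):
    # Same validation and microphone setup as launch_fn above.
    if wake_word not in classifier.model.config.label2id:
        raise ValueError(f"Wake word {wake_word} not in the set of valid class labels.")

    sampling_rate = classifier.feature_extractor.sampling_rate
    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for i, prediction in enumerate(classifier(mic)):
        prediction = prediction[0]
        if debug:
            print(prediction)
        if prediction["label"] == wake_word and prediction["score"] > prob_threshold:
            return True
        if i >= max_chunks:
            return False  # stop listening instead of blocking forever
```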
"cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start speaking...\n", + "\u001b[K hola, how are you??\r" + ] + }, + { + "data": { + "text/plain": [ + "' hola, how are you?'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_id = \"openai/whisper-base\" # update with your model id\n", + "#model_id =\"openai/whisper-tiny\"\n", + "transcriber = pipeline(\n", + " \"automatic-speech-recognition\",\n", + " model = model_id,\n", + " max_new_tokens=256,\n", + " generate_kwargs={\n", + " \"task\": \"transcribe\",\n", + " \"language\": \"spanish\",\n", + " }, \n", + " )\n", + "\n", + "def transcribe_whisper(chunk_length_s=5.0, stream_chunk_s=1.0):\n", + " sampling_rate = transcriber.feature_extractor.sampling_rate\n", + "\n", + " mic = ffmpeg_microphone_live(\n", + " sampling_rate=sampling_rate,\n", + " chunk_length_s=chunk_length_s,\n", + " stream_chunk_s=stream_chunk_s,\n", + " )\n", + "\n", + " print(\"Start speaking...\")\n", + " for item in transcriber(mic):\n", + " sys.stdout.write(\"\\033[K\")\n", + " print(item[\"text\"], end=\"\\r\")\n", + " if not item[\"partial\"][0]:\n", + " break\n", + "\n", + " return item[\"text\"]\n", + "transcribe()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def transcribe_speech(filepath):\n", + " pipe = pipeline(\"automatic-speech-recognition\", model=model_id)\n", + " output = pipe(\n", + " filepath,\n", + " max_new_tokens=256,\n", + " generate_kwargs={\n", + " \"task\": \"transcribe\",\n", + " \"language\": \"spanish\",\n", + " }, # update with the language you've fine-tuned on\n", + " chunk_length_s=30,\n", + " batch_size=8,\n", + " )\n", + " return output[\"text\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Responder el mensaje" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Querying...: How many days are in a week?\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "from huggingface_hub import HfFolder\n", + "import requests\n", + "\n", + "#\"tiiuae/falcon-7b-instruct\"\n", + "def query(text, model_id=\"PulsarAI/llama-2-alpacagpt4-1000step\"):\n", + " api_url = f\"https://api-inference.huggingface.co/models/{model_id}\"\n", + " headers = {\"Authorization\": f\"Bearer {HfFolder().get_token()}\"}\n", + " payload = {\"inputs\": text}\n", + "\n", + " print(f\"Querying...: {text}\")\n", + " response = requests.post(api_url, headers=headers, json=payload)\n", + " # return response.json()[0][\"generated_text\"][len(text) + 1 :]\n", + " return response\n", + "\n", + "query(\"How many days are in a week?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import openai\n", + "from dotenv import load_dotenv\n", + "\n", + "# Load environment variables from the .env file de forma local\n", + "load_dotenv()\n", + "openai.api_key = os.environ['OPENAI_API_KEY']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def 
query_chatgpt(text):\n", + " messages = []\n", + " messages.append({'role': 'user', 'content': '{}'.format(text)})\n", + " print(\"Preguntando \"+text)\n", + " response = openai.ChatCompletion.create(\n", + " model=\"gpt-3.5-turbo\",\n", + " messages= messages,\n", + " temperature=0.5,\n", + " max_tokens=120\n", + " ).choices[0].message.content\n", + " return response" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preguntando hola, cómo estas?\n" + ] + }, + { + "data": { + "text/plain": [ + "'¡Hola! Estoy bien, gracias. ¿Y tú?'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query_chatgpt(\"hola, cómo estas?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generar la respuesta\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### microsoft t5" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n", + "\n", + "processor = SpeechT5Processor.from_pretrained(\"microsoft/speecht5_tts\")\n", + "\n", + "model = SpeechT5ForTextToSpeech.from_pretrained(\"microsoft/speecht5_tts\").to(device)\n", + "vocoder = SpeechT5HifiGan.from_pretrained(\"microsoft/speecht5_hifigan\").to(device)\n", + "\n", + "from datasets import load_dataset\n", + "\n", + "embeddings_dataset = load_dataset(\"Matthijs/cmu-arctic-xvectors\", split=\"validation\")\n", + "speaker_embeddings = torch.tensor(embeddings_dataset[7306][\"xvector\"]).unsqueeze(0)\n", + "\n", + "\n", + "def synthesise(text):\n", + " print(\"sintetizando respuesta\")\n", + " inputs = processor(text=text, return_tensors=\"pt\")\n", + " speech = model.generate_speech(\n", + " inputs[\"input_ids\"].to(device), speaker_embeddings.to(device), vocoder=vocoder\n", + " )\n", + " return speech.cpu()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### facebook mms, no lo pude hacer funcionar" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pwd\n", + "!git clone https://github.com/jaywalnut310/vits.git\n", + "!python --version\n", + "%cd vits/\n", + "\n", + "!pip install Cython==0.29.21\n", + "!pip install librosa==0.8.0\n", + "!pip install phonemizer==2.2.1\n", + "!pip install scipy\n", + "!pip install numpy\n", + "!pip install torch\n", + "!pip install torchvision\n", + "!pip install matplotlib\n", + "!pip install Unidecode==1.1.1\n", + "\n", + "%cd monotonic_align/\n", + "%mkdir monotonic_align\n", + "!python3 setup.py build_ext --inplace\n", + "%cd ../\n", + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd chat_otrosmodelos\n", + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "from tts import synthesize" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "228.60s - pydevd: Sending message related to process being replaced timed-out after 5 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Defaulting to user installation because normal site-packages is not writeable\n", + "Collecting fairseq\n", + " Downloading fairseq-0.12.2.tar.gz (9.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", + "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", + "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: sentencepiece in /home/matias/.local/lib/python3.10/site-packages (0.1.99)\n", + "Requirement already satisfied: numpy in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (1.25.2)\n", + "Requirement already satisfied: cffi in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (1.15.1)\n", + "Requirement already satisfied: regex in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (2023.8.8)\n", + "Collecting omegaconf<2.1\n", + " Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)\n", + "Requirement already satisfied: tqdm in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (4.66.1)\n", + "Requirement already satisfied: torchaudio>=0.8.0 in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (2.0.2)\n", + "Collecting sacrebleu>=1.4.12\n", + " Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m118.9/118.9 KB\u001b[0m \u001b[31m35.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: torch in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (2.0.1)\n", + "Requirement already satisfied: cython in /home/matias/.local/lib/python3.10/site-packages (from fairseq) (0.29.21)\n", + "Collecting bitarray\n", + " Downloading bitarray-2.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (286 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m286.2/286.2 KB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting hydra-core<1.1,>=1.0.7\n", + " Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m123.8/123.8 KB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting antlr4-python3-runtime==4.8\n", + " Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.4/112.4 KB\u001b[0m \u001b[31m31.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25hRequirement already satisfied: typing-extensions in /home/matias/.local/lib/python3.10/site-packages (from omegaconf<2.1->fairseq) (4.7.1)\n", + "Requirement already satisfied: PyYAML>=5.1.* in /usr/lib/python3/dist-packages (from omegaconf<2.1->fairseq) (5.4.1)\n", + "Collecting portalocker\n", + " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n", + "Requirement already satisfied: tabulate>=0.8.9 in /home/matias/.local/lib/python3.10/site-packages (from sacrebleu>=1.4.12->fairseq) (0.9.0)\n", + "Requirement already satisfied: lxml in /home/matias/.local/lib/python3.10/site-packages (from sacrebleu>=1.4.12->fairseq) (4.9.3)\n", + "Requirement already satisfied: colorama in /usr/lib/python3/dist-packages (from sacrebleu>=1.4.12->fairseq) (0.4.4)\n", + "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (10.9.0.58)\n", + "Requirement already satisfied: networkx in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (3.1)\n", + "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (10.2.10.91)\n", + "Requirement already satisfied: sympy in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (1.12)\n", + "Requirement already satisfied: triton==2.0.0 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (2.0.0)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.7.101)\n", + "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (8.5.0.96)\n", + "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.4.0.1)\n", + "Requirement already satisfied: filelock in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (3.12.3)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.7.99)\n", + "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.10.3.66)\n", + "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.7.4.91)\n", + "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.7.91)\n", + "Requirement already satisfied: jinja2 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (3.1.2)\n", + "Requirement already satisfied: nvidia-nccl-cu11==2.14.3 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (2.14.3)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99 in /home/matias/.local/lib/python3.10/site-packages (from torch->fairseq) (11.7.99)\n", + "Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch->fairseq) (0.37.1)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from nvidia-cublas-cu11==11.10.3.66->torch->fairseq) (59.6.0)\n", + "Requirement already satisfied: lit in /home/matias/.local/lib/python3.10/site-packages (from triton==2.0.0->torch->fairseq) 
(16.0.6)\n", + "Requirement already satisfied: cmake in /home/matias/.local/lib/python3.10/site-packages (from triton==2.0.0->torch->fairseq) (3.27.2)\n", + "Requirement already satisfied: pycparser in /home/matias/.local/lib/python3.10/site-packages (from cffi->fairseq) (2.21)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/lib/python3/dist-packages (from jinja2->torch->fairseq) (2.0.1)\n", + "Requirement already satisfied: mpmath>=0.19 in /home/matias/.local/lib/python3.10/site-packages (from sympy->torch->fairseq) (1.3.0)\n", + "Building wheels for collected packages: fairseq, antlr4-python3-runtime\n", + " Building wheel for fairseq (pyproject.toml) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for fairseq: filename=fairseq-0.12.2-cp310-cp310-linux_x86_64.whl size=11291819 sha256=4118e3f9771ef60e2501205a92f3683459f17c18835a49a6a9ef5dee8b65de7f\n", + " Stored in directory: /home/matias/.cache/pip/wheels/e4/35/55/9c66f65ec7c83fd6fbc2b9502a0ac81b2448a1196159dacc32\n", + " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141230 sha256=dbecaf43800b09a2d86a181a3c820ce1113ee9b4857893dde0ef7bc0beac0d31\n", + " Stored in directory: /home/matias/.cache/pip/wheels/a7/20/bd/e1477d664f22d99989fd28ee1a43d6633dddb5cb9e801350d5\n", + "Successfully built fairseq antlr4-python3-runtime\n", + "Installing collected packages: bitarray, antlr4-python3-runtime, portalocker, omegaconf, sacrebleu, hydra-core, fairseq\n", + "Successfully installed antlr4-python3-runtime-4.8 bitarray-2.8.1 fairseq-0.12.2 hydra-core-1.0.7 omegaconf-2.0.6 portalocker-2.8.2 sacrebleu-2.3.1\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install fairseq sentencepiece" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 10 files: 100%|██████████| 10/10 [00:00<00:00, 56603.29it/s]\n", + "2023-09-27 16:24:41 | INFO | fairseq.tasks.speech_to_text | dictionary size (spm_char.txt): 107\n", + "2023-09-27 16:24:42 | INFO | fairseq.models.text_to_speech.vocoder | loaded HiFiGAN checkpoint from /home/matias/.cache/fairseq/models--facebook--tts_transformer-es-css10/snapshots/f52cf36f741df546bed60cdd5e6b71e0b85378c1/hifigan.bin\n", + "2023-09-27 16:24:42 | INFO | fairseq.models.text_to_speech.vocoder | loaded HiFiGAN checkpoint from /home/matias/.cache/fairseq/models--facebook--tts_transformer-es-css10/snapshots/f52cf36f741df546bed60cdd5e6b71e0b85378c1/hifigan.bin\n" + ] + } + ], + "source": [ + "from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub\n", + "from fairseq.models.text_to_speech.hub_interface import TTSHubInterface\n", + "import IPython.display as ipd\n", + "\n", + "\n", + "models, cfg, task = load_model_ensemble_and_task_from_hf_hub(\n", + " \"facebook/tts_transformer-es-css10\",\n", + " arg_overrides={\"vocoder\": \"hifigan\", \"fp16\": False}\n", + ")\n", + "model = models[0]\n", + "TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)\n", + "generator = task.build_generator([model], cfg)\n", + "\n", + "# text = \"Había una vez.\"\n", + "\n", + "# sample = TTSHubInterface.get_model_input(task, text)\n", + "# wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)\n", + "\n", + "# ipd.Audio(wav, rate=rate)\n", + "\n", + "def 
syn_facebookmms(text):\n", + " sample = TTSHubInterface.get_model_input(task, text)\n", + " wav,rate = TTSHubInterface.get_prediction(task, model, generator, sample)\n", + " return wav,rate\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "audio, rate = syn_facebookmms(\"hola, cómo estás? yo estoy bien, gracias por preguntar\")\n", + "Audio(audio, rate=rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### suno, es muy pesado" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoProcessor, AutoModel\n", + "\n", + "\n", + "processor = AutoProcessor.from_pretrained(\"suno/bark-small\")\n", + "model = AutoModel.from_pretrained(\"suno/bark-small\")\n", + "\n", + "inputs = processor(\n", + " text=[\"Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.\"],\n", + " return_tensors=\"pt\",\n", + ")\n", + "\n", + "speech_values = model.generate(**inputs, do_sample=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Audio\n", + "\n", + "sampling_rate = model.generation_config.sample_rate\n", + "Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api-inference.huggingface.co:443\n", + "DEBUG:urllib3.connectionpool:https://api-inference.huggingface.co:443 \"POST /models/suno/bark-small HTTP/1.1\" 503 89\n" + ] + } + ], + "source": [ + "\n", + "from huggingface_hub import HfFolder\n", + "import json\n", + "import requests\n", + "API_URL = \"https://api-inference.huggingface.co/models/suno/bark-small\"\n", + "headers = {\"Authorization\": f\"Bearer {HfFolder().get_token()}\"}\n", + "def query(payload):\n", + " data = json.dumps(payload)\n", + " response = requests.request(\"POST\", API_URL, headers=headers, data=data)\n", + " return json.loads(response.content.decode(\"utf-8\"))\n", + "data = query(\"Can you please let us know more details about your \")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api-inference.huggingface.co:443\n", + "DEBUG:urllib3.connectionpool:https://api-inference.huggingface.co:443 \"POST /models/gpt2 HTTP/1.1\" 500 None\n" + ] + } + ], + "source": [ + "import json\n", + "import requests\n", + "\n", + "headers = {\"Authorization\": f\"Bearer {HfFolder().get_token()}\"}\n", + "API_URL = \"https://api-inference.huggingface.co/models/gpt2\"\n", + "\n", + "def query(payload):\n", + " response = requests.post(API_URL, headers=headers, json=payload)\n", + " return response\n", + "\n", + "output = query({\"text_inputs\": \"This is a test\"})" + ] + }, + { + "cell_type": "code", + 
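The 503 response from the `suno/bark-small` Inference API call above usually means the hosted model is still loading, and the raw `requests.Response` cannot be fed to `Audio` directly. A hedged sketch of how that call could be retried and its returned audio bytes decoded follows; the `wait_for_model` option and the use of `soundfile` to decode the returned bytes are assumptions about the hosted API, not something verified in this notebook.

```python
import io

import requests
import soundfile as sf  # assumption: the API returns flac/wav bytes soundfile can decode
from huggingface_hub import HfFolder
from IPython.display import Audio

API_URL = "https://api-inference.huggingface.co/models/suno/bark-small"
headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}

def query_tts(text):
    # "wait_for_model" asks the API to block until the model is loaded
    # instead of answering 503 immediately (assumed API behaviour).
    payload = {"inputs": text, "options": {"wait_for_model": True}}
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()
    audio_array, sampling_rate = sf.read(io.BytesIO(response.content))
    return audio_array, sampling_rate

# audio_array, sr = query_tts("Can you please let us know more details?")
# Audio(audio_array, rate=sr)
```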
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "Audio(output, rate=16000, autoplay=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Probar todo junto" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Listening for wake word...\n", + "Start speaking...\n", + "Preguntando ¿Cómo hacer el color amariche?\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.display import Audio\n", + "launch_fn()\n", + "transcription = transcribe_whisper()\n", + "response = query_chatgpt(transcription)\n", + "# audio = synthesise(response)\n", + "audio, rate = syn_facebookmms(response)\n", + "\n", + "Audio(audio, rate=rate, autoplay=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def answer_question(filepath):\n", + " transcription = transcribe_speech(filepath)\n", + " response = query_chatgpt(transcription)\n", + " # audio = synthesise(response)\n", + " audio, rate = syn_facebookmms(response)\n", + " return rate,audio" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'como es el dia?'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[30], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m rate, audio \u001b[39m=\u001b[39m answer_question(\u001b[39m\"\u001b[39;49m\u001b[39mcomo es el dia?\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m 2\u001b[0m Audio(audio,rate)\n", + "Cell \u001b[0;32mIn[29], line 2\u001b[0m, in \u001b[0;36manswer_question\u001b[0;34m(filepath)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39manswer_question\u001b[39m(filepath):\n\u001b[0;32m----> 2\u001b[0m transcription \u001b[39m=\u001b[39m transcribe_speech(filepath)\n\u001b[1;32m 3\u001b[0m response \u001b[39m=\u001b[39m query_chatgpt(transcription)\n\u001b[1;32m 4\u001b[0m \u001b[39m# audio = synthesise(response)\u001b[39;00m\n", + "Cell \u001b[0;32mIn[27], line 3\u001b[0m, in \u001b[0;36mtranscribe_speech\u001b[0;34m(filepath)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mtranscribe_speech\u001b[39m(filepath):\n\u001b[1;32m 2\u001b[0m pipe \u001b[39m=\u001b[39m pipeline(\u001b[39m\"\u001b[39m\u001b[39mautomatic-speech-recognition\u001b[39m\u001b[39m\"\u001b[39m, model\u001b[39m=\u001b[39mmodel_id)\n\u001b[0;32m----> 3\u001b[0m output \u001b[39m=\u001b[39m pipe(\n\u001b[1;32m 4\u001b[0m filepath,\n\u001b[1;32m 5\u001b[0m max_new_tokens\u001b[39m=\u001b[39;49m\u001b[39m256\u001b[39;49m,\n\u001b[1;32m 6\u001b[0m generate_kwargs\u001b[39m=\u001b[39;49m{\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39;49m\u001b[39mtask\u001b[39;49m\u001b[39m\"\u001b[39;49m: \u001b[39m\"\u001b[39;49m\u001b[39mtranscribe\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 8\u001b[0m \u001b[39m\"\u001b[39;49m\u001b[39mlanguage\u001b[39;49m\u001b[39m\"\u001b[39;49m: \u001b[39m\"\u001b[39;49m\u001b[39mspanish\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[1;32m 9\u001b[0m }, 
\u001b[39m# update with the language you've fine-tuned on\u001b[39;49;00m\n\u001b[1;32m 10\u001b[0m chunk_length_s\u001b[39m=\u001b[39;49m\u001b[39m30\u001b[39;49m,\n\u001b[1;32m 11\u001b[0m batch_size\u001b[39m=\u001b[39;49m\u001b[39m8\u001b[39;49m,\n\u001b[1;32m 12\u001b[0m )\n\u001b[1;32m 13\u001b[0m \u001b[39mreturn\u001b[39;00m output[\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m]\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/pipelines/automatic_speech_recognition.py:356\u001b[0m, in \u001b[0;36mAutomaticSpeechRecognitionPipeline.__call__\u001b[0;34m(self, inputs, **kwargs)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__call__\u001b[39m(\n\u001b[1;32m 295\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 296\u001b[0m inputs: Union[np\u001b[39m.\u001b[39mndarray, \u001b[39mbytes\u001b[39m, \u001b[39mstr\u001b[39m],\n\u001b[1;32m 297\u001b[0m \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs,\n\u001b[1;32m 298\u001b[0m ):\n\u001b[1;32m 299\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 300\u001b[0m \u001b[39m Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[39m documentation for more information.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[39m `\"\".join(chunk[\"text\"] for chunk in output[\"chunks\"])`.\u001b[39;00m\n\u001b[1;32m 355\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 356\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49m\u001b[39m__call__\u001b[39;49m(inputs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/pipelines/base.py:1121\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[0;34m(self, inputs, num_workers, batch_size, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1119\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39miterate(inputs, preprocess_params, forward_params, postprocess_params)\n\u001b[1;32m 1120\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mframework \u001b[39m==\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mpt\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39misinstance\u001b[39m(\u001b[39mself\u001b[39m, ChunkPipeline):\n\u001b[0;32m-> 1121\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mnext\u001b[39;49m(\n\u001b[1;32m 1122\u001b[0m \u001b[39miter\u001b[39;49m(\n\u001b[1;32m 1123\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mget_iterator(\n\u001b[1;32m 1124\u001b[0m [inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params\n\u001b[1;32m 1125\u001b[0m )\n\u001b[1;32m 1126\u001b[0m )\n\u001b[1;32m 1127\u001b[0m )\n\u001b[1;32m 1128\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 1129\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mrun_single(inputs, preprocess_params, forward_params, postprocess_params)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py:124\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mloader_batch_item()\n\u001b[1;32m 123\u001b[0m \u001b[39m# We're out of items within a batch\u001b[39;00m\n\u001b[0;32m--> 124\u001b[0m item 
\u001b[39m=\u001b[39m \u001b[39mnext\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49miterator)\n\u001b[1;32m 125\u001b[0m processed \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfer(item, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams)\n\u001b[1;32m 126\u001b[0m \u001b[39m# We now have a batch of \"inferred things\".\u001b[39;00m\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py:266\u001b[0m, in \u001b[0;36mPipelinePackIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[39mreturn\u001b[39;00m accumulator\n\u001b[1;32m 265\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mnot\u001b[39;00m is_last:\n\u001b[0;32m--> 266\u001b[0m processed \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfer(\u001b[39mnext\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49miterator), \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams)\n\u001b[1;32m 267\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mloader_batch_size \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 268\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed, torch\u001b[39m.\u001b[39mTensor):\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/utils/data/dataloader.py:633\u001b[0m, in \u001b[0;36m_BaseDataLoaderIter.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 630\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sampler_iter \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 631\u001b[0m \u001b[39m# TODO(https://github.com/pytorch/pytorch/issues/76750)\u001b[39;00m\n\u001b[1;32m 632\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_reset() \u001b[39m# type: ignore[call-arg]\u001b[39;00m\n\u001b[0;32m--> 633\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_next_data()\n\u001b[1;32m 634\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_num_yielded \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[1;32m 635\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_dataset_kind \u001b[39m==\u001b[39m _DatasetKind\u001b[39m.\u001b[39mIterable \u001b[39mand\u001b[39;00m \\\n\u001b[1;32m 636\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_IterableDataset_len_called \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \\\n\u001b[1;32m 637\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_num_yielded \u001b[39m>\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_IterableDataset_len_called:\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/utils/data/dataloader.py:677\u001b[0m, in \u001b[0;36m_SingleProcessDataLoaderIter._next_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 675\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_next_data\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[1;32m 676\u001b[0m index \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_next_index() \u001b[39m# may raise StopIteration\u001b[39;00m\n\u001b[0;32m--> 677\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_dataset_fetcher\u001b[39m.\u001b[39;49mfetch(index) \u001b[39m# may raise StopIteration\u001b[39;00m\n\u001b[1;32m 678\u001b[0m 
\u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pin_memory:\n\u001b[1;32m 679\u001b[0m data \u001b[39m=\u001b[39m _utils\u001b[39m.\u001b[39mpin_memory\u001b[39m.\u001b[39mpin_memory(data, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pin_memory_device)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:32\u001b[0m, in \u001b[0;36m_IterableDatasetFetcher.fetch\u001b[0;34m(self, possibly_batched_index)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[39mfor\u001b[39;00m _ \u001b[39min\u001b[39;00m possibly_batched_index:\n\u001b[1;32m 31\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 32\u001b[0m data\u001b[39m.\u001b[39mappend(\u001b[39mnext\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mdataset_iter))\n\u001b[1;32m 33\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m:\n\u001b[1;32m 34\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mended \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/pipelines/pt_utils.py:183\u001b[0m, in \u001b[0;36mPipelineChunkIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msubiterator \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfer(\u001b[39mnext\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39miterator), \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams)\n\u001b[1;32m 181\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 182\u001b[0m \u001b[39m# Try to return next item\u001b[39;00m\n\u001b[0;32m--> 183\u001b[0m processed \u001b[39m=\u001b[39m \u001b[39mnext\u001b[39;49m(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msubiterator)\n\u001b[1;32m 184\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m:\n\u001b[1;32m 185\u001b[0m \u001b[39m# When a preprocess iterator ends, we can start lookig at the next item\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \u001b[39m# ChunkIterator will keep feeding until ALL elements of iterator\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[39m# Another way to look at it, is we're basically flattening lists of lists\u001b[39;00m\n\u001b[1;32m 190\u001b[0m \u001b[39m# into a single list, but with generators\u001b[39;00m\n\u001b[1;32m 191\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msubiterator \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39minfer(\u001b[39mnext\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39miterator), \u001b[39m*\u001b[39m\u001b[39m*\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mparams)\n", + "File \u001b[0;32m~/.local/lib/python3.10/site-packages/transformers/pipelines/automatic_speech_recognition.py:429\u001b[0m, in \u001b[0;36mAutomaticSpeechRecognitionPipeline.preprocess\u001b[0;34m(self, inputs, chunk_length_s, stride_length_s)\u001b[0m\n\u001b[1;32m 427\u001b[0m inputs \u001b[39m=\u001b[39m requests\u001b[39m.\u001b[39mget(inputs)\u001b[39m.\u001b[39mcontent\n\u001b[1;32m 428\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m--> 429\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39;49m(inputs, \u001b[39m\"\u001b[39;49m\u001b[39mrb\u001b[39;49m\u001b[39m\"\u001b[39;49m) \u001b[39mas\u001b[39;00m f:\n\u001b[1;32m 430\u001b[0m inputs \u001b[39m=\u001b[39m f\u001b[39m.\u001b[39mread()\n\u001b[1;32m 432\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(inputs, 
\u001b[39mbytes\u001b[39m):\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'como es el dia?'" + ] + } + ], + "source": [ + "rate, audio = answer_question(\"como es el dia?\")\n", + "Audio(audio,rate)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running on local URL: http://127.0.0.1:7862\n", + "\n", + "To create a public link, set `share=True` in `launch()`.\n" + ] + }, + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/matias/.local/lib/python3.10/site-packages/gradio/processing_utils.py:188: UserWarning: Trying to convert audio automatically from int32 to 16-bit int format.\n", + " warnings.warn(warning.format(data.dtype))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Preguntando ¿Cuántos años tienen centenaria?\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"/home/matias/.local/lib/python3.10/site-packages/gradio/routes.py\", line 488, in run_predict\n", + " output = await app.get_blocks().process_api(\n", + " File \"/home/matias/.local/lib/python3.10/site-packages/gradio/blocks.py\", line 1434, in process_api\n", + " data = self.postprocess_data(fn_index, result[\"prediction\"], state)\n", + " File \"/home/matias/.local/lib/python3.10/site-packages/gradio/blocks.py\", line 1335, in postprocess_data\n", + " prediction_value = block.postprocess(prediction_value)\n", + " File \"/home/matias/.local/lib/python3.10/site-packages/gradio/components/audio.py\", line 349, in postprocess\n", + " file_path = self.audio_to_temp_file(\n", + " File \"/home/matias/.local/lib/python3.10/site-packages/gradio/components/base.py\", line 325, in audio_to_temp_file\n", + " temp_dir = Path(self.DEFAULT_TEMP_DIR) / self.hash_bytes(data.tobytes())\n", + "AttributeError: 'Tensor' object has no attribute 'tobytes'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Keyboard interruption in main thread... closing server.\n" + ] + }, + { + "data": { + "text/plain": [] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import gradio as gr\n", + "with gr.Blocks() as demo:\n", + " entrada = gr.Audio(source=\"microphone\",type=\"filepath\")\n", + " salida = gr.Audio()\n", + " boton = gr.Button(\"Responder\")\n", + " boton.click(answer_question,entrada,salida)\n", + "demo.launch(debug=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}