{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "6afcd792", "metadata": {}, "outputs": [], "source": [ "import librosa\n", "import torch\n", "from transformers import Wav2Vec2Processor, HubertForCTC\n", "from huggingsound import SpeechRecognitionModel\n", "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n", "audio_paths = [\"1.wav\"]\n", "transcriptions = model.transcribe(audio_paths)\n", "\n", "\n", "input_audio, sr = librosa.load('english.wav', sr = 16000)\n", "input_values = processor(input_audio, return_tensors=\"pt\").input_values # Batch size 1\n", "logits = model(input_values).logits\n", "predicted_ids = torch.argmax(logits, dim=-1)\n", "transcription = processor.decode(predicted_ids[0])" ] }, { "cell_type": "code", "execution_count": null, "id": "e1831eab", "metadata": {}, "outputs": [], "source": [ "\n", "model = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n", "\n", "\n", "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n", "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8b7aee3c", "metadata": {}, "outputs": [], "source": [ "def pipeline(path_to_audio):\n", " \n", " \n", " if :\n", " input_audio, sr = librosa.load(path_to_audio, sr = 16000)\n", " input_values = processor(input_audio, return_tensors=\"pt\").input_values # Batch size 1\n", " logits = model(input_values).logits\n", " predicted_ids = torch.argmax(logits, dim=-1)\n", " transcription = processor.decode(predicted_ids[0])\n", " \n", " else:\n", " transcriptions = model.transcribe([path_to_audio])\n", " " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }