{ "cells": [ { "cell_type": "code", "execution_count": 92, "id": "edc2e2ff", "metadata": {}, "outputs": [], "source": [ "import librosa\n", "import torch\n", "from transformers import Wav2Vec2Processor, HubertForCTC\n", "from huggingsound import SpeechRecognitionModel\n", "import torchaudio\n", "from speechbrain.pretrained import EncoderClassifier\n", "import time\n", "from transformers import Pipeline" ] }, { "cell_type": "code", "execution_count": 93, "id": "76f25cc3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12/06/2022 13:42:19 - INFO - huggingsound.speech_recognition.model - Loading model...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "12/06/2022 13:42:23 - WARNING - root - bos_token not in provided tokens. It will be added to the list of tokens\n", "12/06/2022 13:42:23 - WARNING - root - eos_token not in provided tokens. It will be added to the list of tokens\n" ] } ], "source": [ "model_chinese = SpeechRecognitionModel(\"./wav2vec2-large-xlsr-chinese\")\n", "processor = Wav2Vec2Processor.from_pretrained(\"./english_fine_tune\")\n", "model = HubertForCTC.from_pretrained(\"./english_fine_tune\")\n", "language_id = EncoderClassifier.from_hparams(source=\"speechbrain/lang-id-voxlingua107-ecapa\", savedir=\"tmp\")" ] }, { "cell_type": "code", "execution_count": 94, "id": "3b142546", "metadata": {}, "outputs": [], "source": [ "def pipeline(path_to_audio):\n", " signal = language_id.load_audio(path_to_audio)\n", " prediction = language_id.classify_batch(signal)\n", " prediction[3]\n", " \n", " if prediction[3][0] == 'zh: Chinese':\n", " print('Detected Language is Chinese')\n", " transcriptions = model_chinese.transcribe([path_to_audio])\n", " print(transcriptions[0]['transcription'])\n", " else:\n", " print('Detected language is English')\n", " input_audio, sr = librosa.load(path_to_audio, sr = 16000)\n", " input_values = processor(input_audio, return_tensors=\"pt\").input_values \n", " logits = model(input_values).logits\n", " predicted_ids = torch.argmax(logits, dim=-1)\n", " transcription = processor.decode(predicted_ids[0])\n", " print(transcription)\n", " " ] }, { "cell_type": "code", "execution_count": 95, "id": "b0fae1dd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Detected language is English\n", "NISHE JUAN FANMA HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HEAR HIM\n" ] } ], "source": [ "start = time.time()\n", "pipeline('combine.wav')\n", "end = time.time()" ] }, { "cell_type": "code", "execution_count": 96, "id": "1e0321b5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Detected Language is Chinese\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 1.28it/s]\n", "It is strongly recommended to pass the ``sampling_rate`` argument to this function. 
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "1e0321b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detected Language is Chinese\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.28it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "你喜欢饭吗\n",
      "Detected language is English\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Detected language is English\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HE MOVED ABOUT\n",
      "Detected language is English\n",
      "INVISIBLE BUT EVERYONE COULD HEAR HIM\n"
     ]
    }
   ],
   "source": [
    "from pydub import AudioSegment\n",
    "from pydub.silence import split_on_silence\n",
    "\n",
    "# Split the mixed-language recording on silences so each chunk is (mostly) monolingual\n",
    "sound_file = AudioSegment.from_wav(\"combine.wav\")\n",
    "audio_chunks = split_on_silence(\n",
    "    sound_file,\n",
    "    min_silence_len=100,  # minimum length of a silence, in ms\n",
    "    silence_thresh=-50    # anything quieter than -50 dBFS is treated as silence\n",
    ")\n",
    "\n",
    "# Run language detection and transcription on each chunk separately\n",
    "for i, chunk in enumerate(audio_chunks):\n",
    "    out_file = \"./chunk{0}.wav\".format(i)\n",
    "    chunk.export(out_file, format=\"wav\")\n",
    "    pipeline(out_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}