diff --git "a/dev.ipynb" "b/dev.ipynb" new file mode 100644--- /dev/null +++ "b/dev.ipynb" @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/hewliyang/speech-to-speech-translation/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import torch\n", + "import librosa\n", + "from transformers import VitsModel, VitsTokenizer, pipeline\n", + "from IPython.display import Audio" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using cuda:0 with dtype torch.float16\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at facebook/mms-tts-zlm were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.weight_g', 'flow.flows.1.wavenet.in_layers.1.weight_v', 'flow.flows.1.wavenet.in_layers.2.weight_g', 'flow.flows.1.wavenet.in_layers.2.weight_v', 'flow.flows.1.wavenet.in_layers.3.weight_g', 'flow.flows.1.wavenet.in_layers.3.weight_v', 'flow.flows.1.wavenet.res_skip_layers.0.weight_g', 'flow.flows.1.wavenet.res_skip_layers.0.weight_v', 'flow.flows.1.wavenet.res_skip_layers.1.weight_g', 'flow.flows.1.wavenet.res_skip_layers.1.weight_v', 'flow.flows.1.wavenet.res_skip_layers.2.weight_g', 'flow.flows.1.wavenet.res_skip_layers.2.weight_v', 'flow.flows.1.wavenet.res_skip_layers.3.weight_g', 'flow.flows.1.wavenet.res_skip_layers.3.weight_v', 'flow.flows.2.wavenet.in_layers.0.weight_g', 'flow.flows.2.wavenet.in_layers.0.weight_v', 'flow.flows.2.wavenet.in_layers.1.weight_g', 'flow.flows.2.wavenet.in_layers.1.weight_v', 'flow.flows.2.wavenet.in_layers.2.weight_g', 'flow.flows.2.wavenet.in_layers.2.weight_v', 'flow.flows.2.wavenet.in_layers.3.weight_g', 'flow.flows.2.wavenet.in_layers.3.weight_v', 'flow.flows.2.wavenet.res_skip_layers.0.weight_g', 'flow.flows.2.wavenet.res_skip_layers.0.weight_v', 'flow.flows.2.wavenet.res_skip_layers.1.weight_g', 'flow.flows.2.wavenet.res_skip_layers.1.weight_v', 'flow.flows.2.wavenet.res_skip_layers.2.weight_g', 'flow.flows.2.wavenet.res_skip_layers.2.weight_v', 'flow.flows.2.wavenet.res_skip_layers.3.weight_g', 'flow.flows.2.wavenet.res_skip_layers.3.weight_v', 'flow.flows.3.wavenet.in_layers.0.weight_g', 'flow.flows.3.wavenet.in_layers.0.weight_v', 'flow.flows.3.wavenet.in_layers.1.weight_g', 'flow.flows.3.wavenet.in_layers.1.weight_v', 'flow.flows.3.wavenet.in_layers.2.weight_g', 'flow.flows.3.wavenet.in_layers.2.weight_v', 'flow.flows.3.wavenet.in_layers.3.weight_g', 'flow.flows.3.wavenet.in_layers.3.weight_v', 'flow.flows.3.wavenet.res_skip_layers.0.weight_g', 'flow.flows.3.wavenet.res_skip_layers.0.weight_v', 'flow.flows.3.wavenet.res_skip_layers.1.weight_g', 'flow.flows.3.wavenet.res_skip_layers.1.weight_v', 'flow.flows.3.wavenet.res_skip_layers.2.weight_g', 'flow.flows.3.wavenet.res_skip_layers.2.weight_v', 'flow.flows.3.wavenet.res_skip_layers.3.weight_g', 'flow.flows.3.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.in_layers.0.weight_g', 'posterior_encoder.wavenet.in_layers.0.weight_v', 'posterior_encoder.wavenet.in_layers.1.weight_g', 'posterior_encoder.wavenet.in_layers.1.weight_v', 'posterior_encoder.wavenet.in_layers.10.weight_g', 'posterior_encoder.wavenet.in_layers.10.weight_v', 'posterior_encoder.wavenet.in_layers.11.weight_g', 'posterior_encoder.wavenet.in_layers.11.weight_v', 'posterior_encoder.wavenet.in_layers.12.weight_g', 'posterior_encoder.wavenet.in_layers.12.weight_v', 'posterior_encoder.wavenet.in_layers.13.weight_g', 'posterior_encoder.wavenet.in_layers.13.weight_v', 'posterior_encoder.wavenet.in_layers.14.weight_g', 'posterior_encoder.wavenet.in_layers.14.weight_v', 'posterior_encoder.wavenet.in_layers.15.weight_g', 'posterior_encoder.wavenet.in_layers.15.weight_v', 'posterior_encoder.wavenet.in_layers.2.weight_g', 'posterior_encoder.wavenet.in_layers.2.weight_v', 'posterior_encoder.wavenet.in_layers.3.weight_g', 'posterior_encoder.wavenet.in_layers.3.weight_v', 'posterior_encoder.wavenet.in_layers.4.weight_g', 'posterior_encoder.wavenet.in_layers.4.weight_v', 'posterior_encoder.wavenet.in_layers.5.weight_g', 'posterior_encoder.wavenet.in_layers.5.weight_v', 'posterior_encoder.wavenet.in_layers.6.weight_g', 'posterior_encoder.wavenet.in_layers.6.weight_v', 'posterior_encoder.wavenet.in_layers.7.weight_g', 'posterior_encoder.wavenet.in_layers.7.weight_v', 'posterior_encoder.wavenet.in_layers.8.weight_g', 'posterior_encoder.wavenet.in_layers.8.weight_v', 'posterior_encoder.wavenet.in_layers.9.weight_g', 'posterior_encoder.wavenet.in_layers.9.weight_v', 'posterior_encoder.wavenet.res_skip_layers.0.weight_g', 'posterior_encoder.wavenet.res_skip_layers.0.weight_v', 'posterior_encoder.wavenet.res_skip_layers.1.weight_g', 'posterior_encoder.wavenet.res_skip_layers.1.weight_v', 'posterior_encoder.wavenet.res_skip_layers.10.weight_g', 'posterior_encoder.wavenet.res_skip_layers.10.weight_v', 'posterior_encoder.wavenet.res_skip_layers.11.weight_g', 'posterior_encoder.wavenet.res_skip_layers.11.weight_v', 'posterior_encoder.wavenet.res_skip_layers.12.weight_g', 'posterior_encoder.wavenet.res_skip_layers.12.weight_v', 'posterior_encoder.wavenet.res_skip_layers.13.weight_g', 'posterior_encoder.wavenet.res_skip_layers.13.weight_v', 'posterior_encoder.wavenet.res_skip_layers.14.weight_g', 'posterior_encoder.wavenet.res_skip_layers.14.weight_v', 'posterior_encoder.wavenet.res_skip_layers.15.weight_g', 'posterior_encoder.wavenet.res_skip_layers.15.weight_v', 'posterior_encoder.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.2.weight_v', 'posterior_encoder.wavenet.res_skip_layers.3.weight_g', 'posterior_encoder.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.res_skip_layers.4.weight_g', 'posterior_encoder.wavenet.res_skip_layers.4.weight_v', 'posterior_encoder.wavenet.res_skip_layers.5.weight_g', 'posterior_encoder.wavenet.res_skip_layers.5.weight_v', 'posterior_encoder.wavenet.res_skip_layers.6.weight_g', 'posterior_encoder.wavenet.res_skip_layers.6.weight_v', 'posterior_encoder.wavenet.res_skip_layers.7.weight_g', 'posterior_encoder.wavenet.res_skip_layers.7.weight_v', 'posterior_encoder.wavenet.res_skip_layers.8.weight_g', 'posterior_encoder.wavenet.res_skip_layers.8.weight_v', 'posterior_encoder.wavenet.res_skip_layers.9.weight_g', 'posterior_encoder.wavenet.res_skip_layers.9.weight_v']\n", + "- This IS expected if you are initializing VitsModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing VitsModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of VitsModel were not initialized from the model checkpoint at facebook/mms-tts-zlm and are newly initialized: ['flow.flows.0.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.0.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.0.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.0.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.0.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.0.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.1.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.1.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.10.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.10.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.11.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.11.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.12.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.12.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.13.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.13.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.14.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.14.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.15.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.15.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.2.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.2.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.3.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.3.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.4.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.4.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.5.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.5.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.6.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.6.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.7.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.7.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.8.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.8.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.9.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.9.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.10.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.10.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.11.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.11.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.12.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.12.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.13.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.13.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.14.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.14.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.15.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.15.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.4.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.4.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.5.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.5.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.6.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.6.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.7.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.7.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.8.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.8.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.9.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.9.parametrizations.weight.original1']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "config.json: 100%|██████████| 1.27k/1.27k [00:00<00:00, 4.96MB/s]\n", + "model.safetensors: 100%|██████████| 3.09G/3.09G [03:04<00:00, 16.7MB/s]\n", + "generation_config.json: 100%|██████████| 3.94k/3.94k [00:00<00:00, 21.4MB/s]\n", + "tokenizer_config.json: 100%|██████████| 283k/283k [00:00<00:00, 6.43MB/s]\n", + "vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 15.4MB/s]\n", + "tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 7.27MB/s]\n", + "merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 16.5MB/s]\n", + "normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 223kB/s]\n", + "added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 54.4MB/s]\n", + "special_tokens_map.json: 100%|██████████| 2.07k/2.07k [00:00<00:00, 10.3MB/s]\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "preprocessor_config.json: 100%|██████████| 340/340 [00:00<00:00, 1.69MB/s]\n" + ] + } + ], + "source": [ + "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n", + "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n", + "print(f\"Using {device} with dtype {torch_dtype}\")\n", + "\n", + "model = VitsModel.from_pretrained(\"facebook/mms-tts-zlm\")\n", + "tokenizer = VitsTokenizer.from_pretrained(\"facebook/mms-tts-zlm\")\n", + "\n", + "asr_pipe = pipeline( # noqa: F821\n", + " \"automatic-speech-recognition\",\n", + " model=\"openai/whisper-large-v3\",\n", + " device=device,\n", + " torch_dtype=torch_dtype,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def synthesise(text):\n", + " inputs = tokenizer(text=text, return_tensors=\"pt\")\n", + " input_ids = inputs[\"input_ids\"]\n", + "\n", + " with torch.no_grad():\n", + " outputs = model(input_ids)\n", + "\n", + " speech = outputs[\"waveform\"]\n", + " return speech\n", + "\n", + "def translate(audio):\n", + " outputs = asr_pipe(\n", + " audio,\n", + " max_new_tokens=256,\n", + " generate_kwargs={\"task\": \"transcribe\", \"language\": \"ms\"},\n", + " )\n", + " return outputs[\"text\"]\n", + "\n", + "def speech_to_speech_translation(audio):\n", + " translated_text = translate(audio)\n", + " synthesised_speech = synthesise(translated_text)\n", + " synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)\n", + " return 16000, synthesised_speech" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "x, sr = librosa.load(\"audio.wav\", sr=16_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Audio(x, rate=sr)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Pada bab 16. Saya mungkin telah memberitahu anda tentang permulaan penyelidikan ini dalam beberapa lirik. Tetapi saya mahu anda melihat setiap langkah dengan mana kami datang. Saya juga setuju dengan apa-apa pun yang Marguerite mahukan.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = translate(x)\n", + "print(text)\n", + "tts = synthesise(text)\n", + "\n", + "Audio(tts, rate=16_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-8, -8, -5, ..., -4, -2, -3]], dtype=int16)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "(tts.numpy() * 32767).astype(np.int16)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Audio((tts.numpy() * 32767).astype(np.int16), rate=16_000)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sr, x = speech_to_speech_translation(x)\n", + "Audio(x, rate=sr)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}