rohitptnk committed on
Commit 06582f9 · 1 Parent(s): 69ecbc4

Refactor: move transcription logic to separate script

Files changed (2)
  1. Voice2VoiceTranslation.ipynb +197 -225
  2. transcribe.py +0 -0
Voice2VoiceTranslation.ipynb CHANGED
@@ -1,27 +1,10 @@
1
  {
2
- "nbformat": 4,
3
- "nbformat_minor": 0,
4
- "metadata": {
5
- "colab": {
6
- "provenance": [],
7
- "gpuType": "T4",
8
- "include_colab_link": true
9
- },
10
- "kernelspec": {
11
- "name": "python3",
12
- "display_name": "Python 3"
13
- },
14
- "language_info": {
15
- "name": "python"
16
- },
17
- "accelerator": "GPU"
18
- },
19
  "cells": [
20
  {
21
  "cell_type": "markdown",
22
  "metadata": {
23
- "id": "view-in-github",
24
- "colab_type": "text"
25
  },
26
  "source": [
27
  "<a href=\"https://colab.research.google.com/github/rohitptnk/Voice2VoiceTranslate/blob/main/Voice2VoiceTranslation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
@@ -29,9 +12,7 @@
29
  },
30
  {
31
  "cell_type": "code",
32
- "source": [
33
- "!nvidia-smi"
34
- ],
35
  "metadata": {
36
  "colab": {
37
  "base_uri": "https://localhost:8080/"
@@ -39,11 +20,10 @@
39
  "id": "8locV9sOyhkn",
40
  "outputId": "23b45964-93da-47fe-f12f-04f9a857e205"
41
  },
42
- "execution_count": 1,
43
  "outputs": [
44
  {
45
- "output_type": "stream",
46
  "name": "stdout",
 
47
  "text": [
48
  "Thu Jun 12 10:52:12 2025 \n",
49
  "+-----------------------------------------------------------------------------------------+\n",
@@ -67,14 +47,14 @@
67
  "+-----------------------------------------------------------------------------------------+\n"
68
  ]
69
  }
70
  ]
71
  },
72
  {
73
  "cell_type": "code",
74
- "source": [
75
- "!git clone https://github.com/rohitptnk/Voice2VoiceTranslate.git\n",
76
- "%cd Voice2VoiceTranslate"
77
- ],
78
  "metadata": {
79
  "colab": {
80
  "base_uri": "https://localhost:8080/"
@@ -82,11 +62,10 @@
82
  "id": "NnO-D2woRdMR",
83
  "outputId": "16075dc5-7586-45e5-d6a9-0bd0a5c8b70a"
84
  },
85
- "execution_count": 2,
86
  "outputs": [
87
  {
88
- "output_type": "stream",
89
  "name": "stdout",
 
90
  "text": [
91
  "Cloning into 'Voice2VoiceTranslate'...\n",
92
  "remote: Enumerating objects: 44, done.\u001b[K\n",
@@ -98,35 +77,36 @@
98
  "/content/Voice2VoiceTranslate\n"
99
  ]
100
  }
101
  ]
102
  },
103
  {
104
  "cell_type": "markdown",
105
- "source": [
106
- "# Install and Import"
107
- ],
108
  "metadata": {
109
  "id": "UCVhY-RBkP_Q"
110
- }
111
  },
112
  {
113
  "cell_type": "code",
114
- "source": [
115
- "!pip install --quiet -r requirements.txt"
116
- ],
117
  "metadata": {
118
  "colab": {
119
  "base_uri": "https://localhost:8080/"
120
  },
 
121
  "id": "poBUySdreSA9",
122
- "outputId": "323ac8fa-2e91-4b81-8a79-a17dda19d1c8",
123
- "collapsed": true
124
  },
125
- "execution_count": 12,
126
  "outputs": [
127
  {
128
- "output_type": "stream",
129
  "name": "stdout",
 
130
  "text": [
131
  " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
132
  " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
@@ -136,77 +116,49 @@
136
  " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
137
  ]
138
  }
139
  ]
140
  },
141
  {
142
  "cell_type": "markdown",
143
- "source": [
144
- "# Convert Speech to Text using Whisper"
145
- ],
146
  "metadata": {
147
  "id": "HBzVTrKIjCFz"
148
- }
149
- },
150
- {
151
- "cell_type": "code",
152
- "source": [
153
- "import whisper\n",
154
- "def transcribe_audio_locally(audio_file_path, model_size=\"base\"):\n",
155
- " \"\"\"\n",
156
- " Transcribe audio using locally installed Whisper\n",
157
- "\n",
158
- " Args:\n",
159
- " audio_file_path (str): Path to audio file\n",
160
- " model_size (str): Whisper model size (tiny, base, small, medium, large)\n",
161
- "\n",
162
- " Returns:\n",
163
- " dict: Transcription result containing text and other info\n",
164
- " \"\"\"\n",
165
- " # Load the model\n",
166
- " model = whisper.load_model(model_size)\n",
167
- "\n",
168
- " # Transcribe the audio\n",
169
- " result = model.transcribe(audio_file_path)\n",
170
- "\n",
171
- " return result"
172
- ],
173
- "metadata": {
174
- "id": "rbXMa6MoeriP"
175
  },
176
- "execution_count": 3,
177
- "outputs": []
 
178
  },
179
  {
180
  "cell_type": "code",
181
- "source": [
182
- "# Usage\n",
183
- "audio_file = \"Input Audio Sample.wav\" # Supports many audio formats\n",
184
- "result = transcribe_audio_locally(audio_file, \"base\") # Using base model"
185
- ],
186
  "metadata": {
187
- "id": "QIlmE2rffVCc",
188
  "colab": {
189
  "base_uri": "https://localhost:8080/"
190
  },
 
191
  "outputId": "1c313b3e-a79b-4cc3-f9e8-6a55f891526f"
192
  },
193
- "execution_count": 4,
194
  "outputs": [
195
  {
196
- "output_type": "stream",
197
  "name": "stderr",
 
198
  "text": [
199
  "100%|███████████████████████████████████████| 139M/139M [00:06<00:00, 20.9MiB/s]\n"
200
  ]
201
  }
202
  ]
203
  },
204
  {
205
  "cell_type": "code",
206
- "source": [
207
- "text = result[\"text\"]\n",
208
- "print(text)"
209
- ],
210
  "metadata": {
211
  "colab": {
212
  "base_uri": "https://localhost:8080/"
@@ -214,55 +166,63 @@
214
  "id": "czqyT0rziVZz",
215
  "outputId": "fdac7ad1-b51d-4ab4-d556-8477494206e4"
216
  },
217
- "execution_count": 5,
218
  "outputs": [
219
  {
220
- "output_type": "stream",
221
  "name": "stdout",
 
222
  "text": [
223
  " Not the best because eventually see the thing is for us everybody shines when a film shines. For us the film is the hero of the film. So we are always hoping and praying that the film is the thing that people take back the most. But this is also fine. It's like a good second prize.\n"
224
  ]
225
  }
226
  ]
227
  },
228
  {
229
  "cell_type": "code",
230
- "source": [
231
- "# Save to a text file\n",
232
- "with open(\"transcribed_text.txt\", \"w\", encoding=\"utf-8\") as f:\n",
233
- " f.write(text)\n",
234
- "\n",
235
- "print(\"Saved to transcribed_text.txt\")"
236
- ],
237
  "metadata": {
238
- "id": "mSsUsAS-UdSw",
239
  "colab": {
240
  "base_uri": "https://localhost:8080/"
241
  },
 
242
  "outputId": "5033abbe-4d7f-42bd-ffcd-7f2290690fd2"
243
  },
244
- "execution_count": 6,
245
  "outputs": [
246
  {
247
- "output_type": "stream",
248
  "name": "stdout",
 
249
  "text": [
250
  "Saved to transcribed_text.txt\n"
251
  ]
252
  }
253
  ]
254
  },
255
  {
256
  "cell_type": "markdown",
257
- "source": [
258
- "# Translate text-to-text using Argos Translate"
259
- ],
260
  "metadata": {
261
  "id": "V7ipQTRbjPXq"
262
- }
263
  },
264
  {
265
  "cell_type": "code",
266
  "source": [
267
  "import argostranslate.package\n",
268
  "import argostranslate.translate\n",
@@ -273,34 +233,23 @@
273
  "package = next(filter(lambda x: x.from_code == \"en\" and x.to_code == \"hi\", available_packages))\n",
274
  "argostranslate.package.install_from_path(package.download())\n",
275
  "\n"
276
- ],
277
- "metadata": {
278
- "id": "NNkRgTgkjwoG"
279
- },
280
- "execution_count": 7,
281
- "outputs": []
282
  },
283
  {
284
  "cell_type": "code",
285
- "source": [
286
- "with open(\"transcribed_text.txt\", \"r\", encoding=\"utf-8\") as f:\n",
287
- " text = f.read()"
288
- ],
289
  "metadata": {
290
  "id": "fR_Q7Bb8w2ho"
291
  },
292
- "execution_count": 8,
293
- "outputs": []
294
  },
295
  {
296
  "cell_type": "code",
297
- "source": [
298
- "# Translate offline\n",
299
- "hindi_translation = argostranslate.translate.translate(text, \"en\", \"hi\")\n",
300
- "\n",
301
- "print(\"English:\", text)\n",
302
- "print(\"Hindi:\", hindi_translation)"
303
- ],
304
  "metadata": {
305
  "colab": {
306
  "base_uri": "https://localhost:8080/"
@@ -308,101 +257,103 @@
308
  "id": "ksXnYJNTkSji",
309
  "outputId": "7e5ef285-d3aa-4a0b-89f5-0f0a81efda8f"
310
  },
311
- "execution_count": 9,
312
  "outputs": [
313
  {
314
- "output_type": "stream",
315
  "name": "stdout",
 
316
  "text": [
317
  "English: Not the best because eventually see the thing is for us everybody shines when a film shines. For us the film is the hero of the film. So we are always hoping and praying that the film is the thing that people take back the most. But this is also fine. It's like a good second prize.\n",
318
  "Hindi: क्योंकि अंततः यह देखने की बात हमारे लिए है, हर कोई चमकता है जब एक फिल्म चमकती है। हमारे लिए फिल्म फिल्म का हीरो है। इसलिए हम हमेशा उम्मीद करते हैं और प्रार्थना करते हैं कि फिल्म वह चीज है जिसे लोग वापस लेते हैं। लेकिन यह भी ठीक है। यह एक अच्छा दूसरा पुरस्कार है।\n"
319
  ]
320
  }
321
  ]
322
  },
323
  {
324
  "cell_type": "code",
325
- "source": [
326
- "# Save to a text file\n",
327
- "with open(\"hindi_translation.txt\", \"w\", encoding=\"utf-8\") as f:\n",
328
- " f.write(hindi_translation)\n",
329
- "\n",
330
- "print(\"Saved to hindi_translation.txt\")"
331
- ],
332
  "metadata": {
333
- "id": "f6YatTXjT5EH",
334
  "colab": {
335
  "base_uri": "https://localhost:8080/"
336
  },
 
337
  "outputId": "459cc759-e54e-4376-eae5-ba6dba2a67e7"
338
  },
339
- "execution_count": 10,
340
  "outputs": [
341
  {
342
- "output_type": "stream",
343
  "name": "stdout",
 
344
  "text": [
345
  "Saved to hindi_translation.txt\n"
346
  ]
347
  }
348
  ]
349
  },
350
  {
351
  "cell_type": "markdown",
352
- "source": [
353
- "# Text to Speech using Suno-Bark"
354
- ],
355
  "metadata": {
356
  "id": "1BECaz-clOJB"
357
- }
358
  },
359
  {
360
  "cell_type": "code",
361
  "source": [
362
  "from transformers import BarkModel\n",
363
  "\n",
364
  "model = BarkModel.from_pretrained(\"suno/bark-small\")"
365
- ],
366
- "metadata": {
367
- "id": "E2VsTRWLMAqX"
368
- },
369
- "execution_count": null,
370
- "outputs": []
371
  },
372
  {
373
  "cell_type": "code",
374
  "source": [
375
  "import torch\n",
376
  "\n",
377
  "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
378
  "model = model.to(device)"
379
- ],
380
- "metadata": {
381
- "id": "b8itQckZJqm6"
382
- },
383
- "execution_count": 5,
384
- "outputs": []
385
  },
386
  {
387
  "cell_type": "code",
388
  "source": [
389
  "from transformers import AutoProcessor\n",
390
  "\n",
391
  "processor = AutoProcessor.from_pretrained(\"suno/bark\")"
392
- ],
393
- "metadata": {
394
- "id": "UG5dU7LpL6rv"
395
- },
396
- "execution_count": null,
397
- "outputs": []
398
  },
399
  {
400
  "cell_type": "code",
401
- "source": [
402
- "with open(\"hindi_translation.txt\", \"r\", encoding=\"utf-8\") as f:\n",
403
- " hindi_translation = f.read()\n",
404
- " print(hindi_translation)"
405
- ],
406
  "metadata": {
407
  "colab": {
408
  "base_uri": "https://localhost:8080/",
@@ -411,19 +362,28 @@
411
  "id": "Qho3YgeEZHJx",
412
  "outputId": "707a7092-cc95-4680-91d5-44367fd7b092"
413
  },
414
- "execution_count": 7,
415
  "outputs": [
416
  {
417
- "output_type": "stream",
418
  "name": "stdout",
 
419
  "text": [
420
  "क्योंकि अंततः यह देखने की बात हमारे लिए है, हर कोई चमकता है जब एक फिल्म चमकती है। हमारे लिए फिल्म फिल्म का हीरो है। इसलिए हम हमेशा उम्मीद करते हैं और प्रार्थना करते हैं कि फिल्म वह चीज है जिसे लोग वापस लेते हैं। लेकिन यह भी ठीक है। यह एक अच्छा दूसरा पुरस्कार है।\n"
421
  ]
422
  }
423
  ]
424
  },
425
  {
426
  "cell_type": "code",
427
  "source": [
428
  "voice_preset = \"v2/hi_speaker_2\"\n",
429
  "# prepare the inputs\n",
@@ -431,48 +391,30 @@
431
  "inputs = processor(text_prompt, voice_preset=voice_preset)\n",
432
  "for key, value in inputs.items():\n",
433
  " inputs[key] = inputs[key].to(device)"
434
- ],
435
- "metadata": {
436
- "id": "9WnxxN8oOGk0"
437
- },
438
- "execution_count": null,
439
- "outputs": []
440
  },
441
  {
442
  "cell_type": "code",
443
- "source": [
444
- "from IPython.display import Audio\n",
445
- "sampling_rate = model.generation_config.sample_rate\n",
446
- "# generate speech\n",
447
- "speech_output = model.generate(**inputs)\n",
448
- "\n",
449
- "# let's hear it\n",
450
- "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
451
- ],
452
  "metadata": {
453
- "id": "5nAg0xqOWz9W",
454
  "colab": {
455
  "base_uri": "https://localhost:8080/",
456
  "height": 131
457
  },
 
458
  "outputId": "e0476c8e-9513-433d-82b7-2372d936a168"
459
  },
460
- "execution_count": 11,
461
  "outputs": [
462
  {
463
- "output_type": "stream",
464
  "name": "stderr",
 
465
  "text": [
466
  "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
467
  "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n"
468
  ]
469
  },
470
  {
471
- "output_type": "execute_result",
472
  "data": {
473
- "text/plain": [
474
- "<IPython.lib.display.Audio object>"
475
- ],
476
  "text/html": [
477
  "\n",
478
  " <audio controls=\"controls\" >\n",
@@ -480,64 +422,75 @@
480
  " Your browser does not support the audio element.\n",
481
  " </audio>\n",
482
  " "
483
  ]
484
  },
 
485
  "metadata": {},
486
- "execution_count": 11
487
  }
488
  ]
489
  },
490
  {
491
  "cell_type": "code",
492
  "source": [
493
  "import scipy\n",
494
  "\n",
495
  "sample_rate = model.generation_config.sample_rate\n",
496
  "scipy.io.wavfile.write(\"output_audio.wav\", rate=sample_rate, data=audio_array)"
497
- ],
498
- "metadata": {
499
- "id": "Sc2zcYQMW48e"
500
- },
501
- "execution_count": null,
502
- "outputs": []
503
  },
504
  {
505
  "cell_type": "markdown",
506
- "source": [
507
- "# Git"
508
- ],
509
  "metadata": {
510
  "id": "GZDcQfEFxznp"
511
- }
512
  },
513
  {
514
  "cell_type": "code",
515
- "source": [
516
- "!git config --global user.email \"rohitptnk03@gmail.com\"\n",
517
- "!git config --global user.name \"Rohit Patnaik\""
518
- ],
519
  "metadata": {
520
  "id": "Be0dEZo0fnrX"
521
  },
522
- "execution_count": null,
523
- "outputs": []
524
  },
525
  {
526
  "cell_type": "code",
527
- "source": [
528
- "!git add ."
529
- ],
530
  "metadata": {
531
  "id": "y_LWxXcR6jik"
532
  },
533
- "execution_count": null,
534
- "outputs": []
 
 
535
  },
536
  {
537
  "cell_type": "code",
538
- "source": [
539
- "!git status"
540
- ],
541
  "metadata": {
542
  "colab": {
543
  "base_uri": "https://localhost:8080/"
@@ -545,11 +498,10 @@
545
  "id": "GJQBBcsMfZlK",
546
  "outputId": "395d67da-bb71-46b4-d68a-b5e656ec53f3"
547
  },
548
- "execution_count": null,
549
  "outputs": [
550
  {
551
- "output_type": "stream",
552
  "name": "stdout",
 
553
  "text": [
554
  "On branch main\n",
555
  "Your branch is up to date with 'origin/main'.\n",
@@ -557,13 +509,14 @@
557
  "nothing to commit, working tree clean\n"
558
  ]
559
  }
560
  ]
561
  },
562
  {
563
  "cell_type": "code",
564
- "source": [
565
- "!git commit -m\"save output_audio from suno_bark\""
566
- ],
567
  "metadata": {
568
  "colab": {
569
  "base_uri": "https://localhost:8080/"
@@ -571,24 +524,24 @@
571
  "id": "-CIUFNSsfatq",
572
  "outputId": "cdd48a15-cd1c-4648-ad64-550853e60fa9"
573
  },
574
- "execution_count": null,
575
  "outputs": [
576
  {
577
- "output_type": "stream",
578
  "name": "stdout",
 
579
  "text": [
580
  "[main 7ec7d5d] save output_audio from suno_bark\n",
581
  " 1 file changed, 0 insertions(+), 0 deletions(-)\n",
582
  " rewrite output_audio.wav (82%)\n"
583
  ]
584
  }
585
  ]
586
  },
587
  {
588
  "cell_type": "code",
589
- "source": [
590
- "!git push origin main"
591
- ],
592
  "metadata": {
593
  "colab": {
594
  "base_uri": "https://localhost:8080/"
@@ -596,25 +549,44 @@
596
  "id": "cvivcfrxflSu",
597
  "outputId": "e1aba536-b30c-429e-a7c4-b5cc8152fd8f"
598
  },
599
- "execution_count": null,
600
  "outputs": [
601
  {
602
- "output_type": "stream",
603
  "name": "stdout",
 
604
  "text": [
605
  "fatal: could not read Username for 'https://github.com': No such device or address\n"
606
  ]
607
  }
608
  ]
609
  },
610
  {
611
  "cell_type": "code",
612
- "source": [],
613
  "metadata": {
614
  "id": "gwN-bzaSgnJX"
615
  },
616
- "execution_count": null,
617
- "outputs": []
618
  }
619
- ]
620
- }
1
  {
2
  "cells": [
3
  {
4
  "cell_type": "markdown",
5
  "metadata": {
6
+ "colab_type": "text",
7
+ "id": "view-in-github"
8
  },
9
  "source": [
10
  "<a href=\"https://colab.research.google.com/github/rohitptnk/Voice2VoiceTranslate/blob/main/Voice2VoiceTranslation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 
12
  },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 1,
 
 
16
  "metadata": {
17
  "colab": {
18
  "base_uri": "https://localhost:8080/"
 
20
  "id": "8locV9sOyhkn",
21
  "outputId": "23b45964-93da-47fe-f12f-04f9a857e205"
22
  },
 
23
  "outputs": [
24
  {
 
25
  "name": "stdout",
26
+ "output_type": "stream",
27
  "text": [
28
  "Thu Jun 12 10:52:12 2025 \n",
29
  "+-----------------------------------------------------------------------------------------+\n",
 
47
  "+-----------------------------------------------------------------------------------------+\n"
48
  ]
49
  }
50
+ ],
51
+ "source": [
52
+ "!nvidia-smi"
53
  ]
54
  },
55
  {
56
  "cell_type": "code",
57
+ "execution_count": 2,
58
  "metadata": {
59
  "colab": {
60
  "base_uri": "https://localhost:8080/"
 
62
  "id": "NnO-D2woRdMR",
63
  "outputId": "16075dc5-7586-45e5-d6a9-0bd0a5c8b70a"
64
  },
 
65
  "outputs": [
66
  {
 
67
  "name": "stdout",
68
+ "output_type": "stream",
69
  "text": [
70
  "Cloning into 'Voice2VoiceTranslate'...\n",
71
  "remote: Enumerating objects: 44, done.\u001b[K\n",
 
77
  "/content/Voice2VoiceTranslate\n"
78
  ]
79
  }
80
+ ],
81
+ "source": [
82
+ "!git clone https://github.com/rohitptnk/Voice2VoiceTranslate.git\n",
83
+ "%cd Voice2VoiceTranslate"
84
  ]
85
  },
86
  {
87
  "cell_type": "markdown",
88
  "metadata": {
89
  "id": "UCVhY-RBkP_Q"
90
+ },
91
+ "source": [
92
+ "# Install and Import"
93
+ ]
94
  },
95
  {
96
  "cell_type": "code",
97
+ "execution_count": 12,
 
 
98
  "metadata": {
99
  "colab": {
100
  "base_uri": "https://localhost:8080/"
101
  },
102
+ "collapsed": true,
103
  "id": "poBUySdreSA9",
104
+ "outputId": "323ac8fa-2e91-4b81-8a79-a17dda19d1c8"
 
105
  },
 
106
  "outputs": [
107
  {
 
108
  "name": "stdout",
109
+ "output_type": "stream",
110
  "text": [
111
  " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
112
  " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
 
116
  " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
117
  ]
118
  }
119
+ ],
120
+ "source": [
121
+ "!pip install --quiet -r requirements.txt"
122
  ]
123
  },
124
  {
125
  "cell_type": "markdown",
126
  "metadata": {
127
  "id": "HBzVTrKIjCFz"
128
  },
129
+ "source": [
130
+ "# Convert Speech to Text using Whisper"
131
+ ]
132
  },
133
  {
134
  "cell_type": "code",
135
+ "execution_count": null,
136
  "metadata": {
 
137
  "colab": {
138
  "base_uri": "https://localhost:8080/"
139
  },
140
+ "id": "QIlmE2rffVCc",
141
  "outputId": "1c313b3e-a79b-4cc3-f9e8-6a55f891526f"
142
  },
 
143
  "outputs": [
144
  {
 
145
  "name": "stderr",
146
+ "output_type": "stream",
147
  "text": [
148
  "100%|███████████████████████████████████████| 139M/139M [00:06<00:00, 20.9MiB/s]\n"
149
  ]
150
  }
151
+ ],
152
+ "source": [
153
+ "# Transcribe audio\n",
154
+ "from transcribe import transcribe_audio_locally\n",
155
+ "audio_file = \"Input Audio Sample.wav\" # Supports many audio formats\n",
156
+ "result = transcribe_audio_locally(audio_file, \"base\") # Using base model"
157
  ]
158
  },
159
  {
160
  "cell_type": "code",
161
+ "execution_count": 5,
 
  "metadata": {
163
  "colab": {
164
  "base_uri": "https://localhost:8080/"
 
166
  "id": "czqyT0rziVZz",
167
  "outputId": "fdac7ad1-b51d-4ab4-d556-8477494206e4"
168
  },
 
169
  "outputs": [
170
  {
 
171
  "name": "stdout",
172
+ "output_type": "stream",
173
  "text": [
174
  " Not the best because eventually see the thing is for us everybody shines when a film shines. For us the film is the hero of the film. So we are always hoping and praying that the film is the thing that people take back the most. But this is also fine. It's like a good second prize.\n"
175
  ]
176
  }
177
+ ],
178
+ "source": [
179
+ "text = result[\"text\"]\n",
180
+ "print(text)"
181
  ]
182
  },
183
  {
184
  "cell_type": "code",
185
+ "execution_count": 6,
186
  "metadata": {
 
187
  "colab": {
188
  "base_uri": "https://localhost:8080/"
189
  },
190
+ "id": "mSsUsAS-UdSw",
191
  "outputId": "5033abbe-4d7f-42bd-ffcd-7f2290690fd2"
192
  },
 
193
  "outputs": [
194
  {
 
195
  "name": "stdout",
196
+ "output_type": "stream",
197
  "text": [
198
  "Saved to transcribed_text.txt\n"
199
  ]
200
  }
201
+ ],
202
+ "source": [
203
+ "# Save to a text file\n",
204
+ "with open(\"transcribed_text.txt\", \"w\", encoding=\"utf-8\") as f:\n",
205
+ " f.write(text)\n",
206
+ "\n",
207
+ "print(\"Saved to transcribed_text.txt\")"
208
  ]
209
  },
210
  {
211
  "cell_type": "markdown",
212
  "metadata": {
213
  "id": "V7ipQTRbjPXq"
214
+ },
215
+ "source": [
216
+ "# Translate text-to-text using Argos Translate"
217
+ ]
218
  },
219
  {
220
  "cell_type": "code",
221
+ "execution_count": 7,
222
+ "metadata": {
223
+ "id": "NNkRgTgkjwoG"
224
+ },
225
+ "outputs": [],
226
  "source": [
227
  "import argostranslate.package\n",
228
  "import argostranslate.translate\n",
 
233
  "package = next(filter(lambda x: x.from_code == \"en\" and x.to_code == \"hi\", available_packages))\n",
234
  "argostranslate.package.install_from_path(package.download())\n",
235
  "\n"
236
+ ]
237
  },
238
  {
239
  "cell_type": "code",
240
+ "execution_count": 8,
241
  "metadata": {
242
  "id": "fR_Q7Bb8w2ho"
243
  },
244
+ "outputs": [],
245
+ "source": [
246
+ "with open(\"transcribed_text.txt\", \"r\", encoding=\"utf-8\") as f:\n",
247
+ " text = f.read()"
248
+ ]
249
  },
250
  {
251
  "cell_type": "code",
252
+ "execution_count": 9,
253
  "metadata": {
254
  "colab": {
255
  "base_uri": "https://localhost:8080/"
 
257
  "id": "ksXnYJNTkSji",
258
  "outputId": "7e5ef285-d3aa-4a0b-89f5-0f0a81efda8f"
259
  },
 
260
  "outputs": [
261
  {
 
262
  "name": "stdout",
263
+ "output_type": "stream",
264
  "text": [
265
  "English: Not the best because eventually see the thing is for us everybody shines when a film shines. For us the film is the hero of the film. So we are always hoping and praying that the film is the thing that people take back the most. But this is also fine. It's like a good second prize.\n",
266
  "Hindi: क्योंकि अंततः यह देखने की बात हमारे लिए है, हर कोई चमकता है जब एक फिल्म चमकती है। हमारे लिए फिल्म फिल्म का हीरो है। इसलिए हम हमेशा उम्मीद करते हैं और प्रार्थना करते हैं कि फिल्म वह चीज है जिसे लोग वापस लेते हैं। लेकिन यह भी ठीक है। यह एक अच्छा दूसरा पुरस्कार है।\n"
267
  ]
268
  }
269
+ ],
270
+ "source": [
271
+ "# Translate offline\n",
272
+ "hindi_translation = argostranslate.translate.translate(text, \"en\", \"hi\")\n",
273
+ "\n",
274
+ "print(\"English:\", text)\n",
275
+ "print(\"Hindi:\", hindi_translation)"
276
  ]
277
  },
278
  {
279
  "cell_type": "code",
280
+ "execution_count": 10,
281
  "metadata": {
 
282
  "colab": {
283
  "base_uri": "https://localhost:8080/"
284
  },
285
+ "id": "f6YatTXjT5EH",
286
  "outputId": "459cc759-e54e-4376-eae5-ba6dba2a67e7"
287
  },
 
288
  "outputs": [
289
  {
 
290
  "name": "stdout",
291
+ "output_type": "stream",
292
  "text": [
293
  "Saved to hindi_translation.txt\n"
294
  ]
295
  }
296
+ ],
297
+ "source": [
298
+ "# Save to a text file\n",
299
+ "with open(\"hindi_translation.txt\", \"w\", encoding=\"utf-8\") as f:\n",
300
+ " f.write(hindi_translation)\n",
301
+ "\n",
302
+ "print(\"Saved to hindi_translation.txt\")"
303
  ]
304
  },
305
  {
306
  "cell_type": "markdown",
307
  "metadata": {
308
  "id": "1BECaz-clOJB"
309
+ },
310
+ "source": [
311
+ "# Text to Speech using Suno-Bark"
312
+ ]
313
  },
314
  {
315
  "cell_type": "code",
316
+ "execution_count": null,
317
+ "metadata": {
318
+ "id": "E2VsTRWLMAqX"
319
+ },
320
+ "outputs": [],
321
  "source": [
322
  "from transformers import BarkModel\n",
323
  "\n",
324
  "model = BarkModel.from_pretrained(\"suno/bark-small\")"
325
+ ]
326
  },
327
  {
328
  "cell_type": "code",
329
+ "execution_count": 5,
330
+ "metadata": {
331
+ "id": "b8itQckZJqm6"
332
+ },
333
+ "outputs": [],
334
  "source": [
335
  "import torch\n",
336
  "\n",
337
  "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
338
  "model = model.to(device)"
339
+ ]
340
  },
341
  {
342
  "cell_type": "code",
343
+ "execution_count": null,
344
+ "metadata": {
345
+ "id": "UG5dU7LpL6rv"
346
+ },
347
+ "outputs": [],
348
  "source": [
349
  "from transformers import AutoProcessor\n",
350
  "\n",
351
  "processor = AutoProcessor.from_pretrained(\"suno/bark\")"
352
+ ]
353
  },
354
  {
355
  "cell_type": "code",
356
+ "execution_count": 7,
357
  "metadata": {
358
  "colab": {
359
  "base_uri": "https://localhost:8080/",
 
362
  "id": "Qho3YgeEZHJx",
363
  "outputId": "707a7092-cc95-4680-91d5-44367fd7b092"
364
  },
 
365
  "outputs": [
366
  {
 
367
  "name": "stdout",
368
+ "output_type": "stream",
369
  "text": [
370
  "क्योंकि अंततः यह देखने की बात हमारे लिए है, हर कोई चमकता है जब एक फिल्म चमकती है। हमारे लिए फिल्म फिल्म का हीरो है। इसलिए हम हमेशा उम्मीद करते हैं और प्रार्थना करते हैं कि फिल्म वह चीज है जिसे लोग वापस लेते हैं। लेकिन यह भी ठीक है। यह एक अच्छा दूसरा पुरस्कार है।\n"
371
  ]
372
  }
373
+ ],
374
+ "source": [
375
+ "with open(\"hindi_translation.txt\", \"r\", encoding=\"utf-8\") as f:\n",
376
+ " hindi_translation = f.read()\n",
377
+ " print(hindi_translation)"
378
  ]
379
  },
380
  {
381
  "cell_type": "code",
382
+ "execution_count": null,
383
+ "metadata": {
384
+ "id": "9WnxxN8oOGk0"
385
+ },
386
+ "outputs": [],
387
  "source": [
388
  "voice_preset = \"v2/hi_speaker_2\"\n",
389
  "# prepare the inputs\n",
 
391
  "inputs = processor(text_prompt, voice_preset=voice_preset)\n",
392
  "for key, value in inputs.items():\n",
393
  " inputs[key] = inputs[key].to(device)"
394
+ ]
395
  },
396
  {
397
  "cell_type": "code",
398
+ "execution_count": 11,
399
  "metadata": {
 
400
  "colab": {
401
  "base_uri": "https://localhost:8080/",
402
  "height": 131
403
  },
404
+ "id": "5nAg0xqOWz9W",
405
  "outputId": "e0476c8e-9513-433d-82b7-2372d936a168"
406
  },
 
407
  "outputs": [
408
  {
 
409
  "name": "stderr",
410
+ "output_type": "stream",
411
  "text": [
412
  "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
413
  "Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n"
414
  ]
415
  },
416
  {
 
417
  "data": {
 
 
 
418
  "text/html": [
419
  "\n",
420
  " <audio controls=\"controls\" >\n",
 
422
  " Your browser does not support the audio element.\n",
423
  " </audio>\n",
424
  " "
425
+ ],
426
+ "text/plain": [
427
+ "<IPython.lib.display.Audio object>"
428
  ]
429
  },
430
+ "execution_count": 11,
431
  "metadata": {},
432
+ "output_type": "execute_result"
433
  }
434
+ ],
435
+ "source": [
436
+ "from IPython.display import Audio\n",
437
+ "sampling_rate = model.generation_config.sample_rate\n",
438
+ "# generate speech\n",
439
+ "speech_output = model.generate(**inputs)\n",
440
+ "\n",
441
+ "# let's hear it\n",
442
+ "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
443
  ]
444
  },
445
  {
446
  "cell_type": "code",
447
+ "execution_count": null,
448
+ "metadata": {
449
+ "id": "Sc2zcYQMW48e"
450
+ },
451
+ "outputs": [],
452
  "source": [
453
  "import scipy\n",
454
  "\n",
455
  "sample_rate = model.generation_config.sample_rate\n",
456
  "scipy.io.wavfile.write(\"output_audio.wav\", rate=sample_rate, data=audio_array)"
457
+ ]
458
  },
459
  {
460
  "cell_type": "markdown",
461
  "metadata": {
462
  "id": "GZDcQfEFxznp"
463
+ },
464
+ "source": [
465
+ "# Git"
466
+ ]
467
  },
468
  {
469
  "cell_type": "code",
470
+ "execution_count": null,
471
  "metadata": {
472
  "id": "Be0dEZo0fnrX"
473
  },
474
+ "outputs": [],
475
+ "source": [
476
+ "!git config --global user.email \"rohitptnk03@gmail.com\"\n",
477
+ "!git config --global user.name \"Rohit Patnaik\""
478
+ ]
479
  },
480
  {
481
  "cell_type": "code",
482
+ "execution_count": null,
 
 
483
  "metadata": {
484
  "id": "y_LWxXcR6jik"
485
  },
486
+ "outputs": [],
487
+ "source": [
488
+ "!git add ."
489
+ ]
490
  },
491
  {
492
  "cell_type": "code",
493
+ "execution_count": null,
 
 
494
  "metadata": {
495
  "colab": {
496
  "base_uri": "https://localhost:8080/"
 
498
  "id": "GJQBBcsMfZlK",
499
  "outputId": "395d67da-bb71-46b4-d68a-b5e656ec53f3"
500
  },
 
501
  "outputs": [
502
  {
 
503
  "name": "stdout",
504
+ "output_type": "stream",
505
  "text": [
506
  "On branch main\n",
507
  "Your branch is up to date with 'origin/main'.\n",
 
509
  "nothing to commit, working tree clean\n"
510
  ]
511
  }
512
+ ],
513
+ "source": [
514
+ "!git status"
515
  ]
516
  },
517
  {
518
  "cell_type": "code",
519
+ "execution_count": null,
 
 
520
  "metadata": {
521
  "colab": {
522
  "base_uri": "https://localhost:8080/"
 
524
  "id": "-CIUFNSsfatq",
525
  "outputId": "cdd48a15-cd1c-4648-ad64-550853e60fa9"
526
  },
 
527
  "outputs": [
528
  {
 
529
  "name": "stdout",
530
+ "output_type": "stream",
531
  "text": [
532
  "[main 7ec7d5d] save output_audio from suno_bark\n",
533
  " 1 file changed, 0 insertions(+), 0 deletions(-)\n",
534
  " rewrite output_audio.wav (82%)\n"
535
  ]
536
  }
537
+ ],
538
+ "source": [
539
+ "!git commit -m\"save output_audio from suno_bark\""
540
  ]
541
  },
542
  {
543
  "cell_type": "code",
544
+ "execution_count": null,
 
 
545
  "metadata": {
546
  "colab": {
547
  "base_uri": "https://localhost:8080/"
 
549
  "id": "cvivcfrxflSu",
550
  "outputId": "e1aba536-b30c-429e-a7c4-b5cc8152fd8f"
551
  },
 
552
  "outputs": [
553
  {
 
554
  "name": "stdout",
555
+ "output_type": "stream",
556
  "text": [
557
  "fatal: could not read Username for 'https://github.com': No such device or address\n"
558
  ]
559
  }
560
+ ],
561
+ "source": [
562
+ "!git push origin main"
563
  ]
564
  },
565
  {
566
  "cell_type": "code",
567
+ "execution_count": null,
568
  "metadata": {
569
  "id": "gwN-bzaSgnJX"
570
  },
571
+ "outputs": [],
572
+ "source": []
573
  }
574
+ ],
575
+ "metadata": {
576
+ "accelerator": "GPU",
577
+ "colab": {
578
+ "gpuType": "T4",
579
+ "include_colab_link": true,
580
+ "provenance": []
581
+ },
582
+ "kernelspec": {
583
+ "display_name": "Python 3",
584
+ "name": "python3"
585
+ },
586
+ "language_info": {
587
+ "name": "python"
588
+ }
589
+ },
590
+ "nbformat": 4,
591
+ "nbformat_minor": 0
592
+ }
transcribe.py ADDED
File without changes
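Note: the diff viewer lists transcribe.py as added with +0 -0 and displays no contents, so the committed file itself is not visible on this page. Judging from the transcription cell removed from the notebook above and the "from transcribe import transcribe_audio_locally" import added to the updated notebook, the module presumably contains something like the following sketch (a reconstruction from the removed cell, not the verified file contents):

# transcribe.py -- presumed contents, reconstructed from the notebook cell
# that this commit removed; the actual committed file is not shown in the diff.
import whisper

def transcribe_audio_locally(audio_file_path, model_size="base"):
    """
    Transcribe audio using locally installed Whisper.

    Args:
        audio_file_path (str): Path to audio file
        model_size (str): Whisper model size (tiny, base, small, medium, large)

    Returns:
        dict: Transcription result containing text and other info
    """
    # Load the selected Whisper model (weights are downloaded on first use)
    model = whisper.load_model(model_size)

    # Run transcription on the given audio file
    result = model.transcribe(audio_file_path)

    return result

The updated notebook calls it as: result = transcribe_audio_locally("Input Audio Sample.wav", "base")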