matdmiller committed on
Commit
7c60390
1 Parent(s): 9c145c1

fixed cartesia audio concat/conversion

Browse files
Files changed (2) hide show
  1. app.ipynb +71 -102
  2. app.py +33 -29
app.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "id": "3bedf0dc-8d8e-4ede-a9e6-b8f35136aa00",
7
  "metadata": {},
8
  "outputs": [],
@@ -42,7 +42,7 @@
42
  },
43
  {
44
  "cell_type": "code",
45
- "execution_count": 2,
46
  "id": "667802a7-0f36-4136-a381-e66210b20462",
47
  "metadata": {},
48
  "outputs": [
@@ -94,7 +94,7 @@
94
  },
95
  {
96
  "cell_type": "code",
97
- "execution_count": 3,
98
  "id": "7664bc24-e8a7-440d-851d-eb16dc2d69fb",
99
  "metadata": {},
100
  "outputs": [
@@ -128,7 +128,7 @@
128
  },
129
  {
130
  "cell_type": "code",
131
- "execution_count": 4,
132
  "id": "4d9863fc-969e-409b-8e20-b9c3cd2cc3e7",
133
  "metadata": {},
134
  "outputs": [],
@@ -142,7 +142,7 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": 5,
146
  "id": "4f486d3a",
147
  "metadata": {},
148
  "outputs": [],
@@ -187,7 +187,7 @@
187
  },
188
  {
189
  "cell_type": "code",
190
- "execution_count": 6,
191
  "id": "ecb7f207-0fc2-4d19-a313-356c05776832",
192
  "metadata": {},
193
  "outputs": [
@@ -208,7 +208,7 @@
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": 7,
212
  "id": "e5d6cac2-0dee-42d8-9b41-184b5be9cc3f",
213
  "metadata": {},
214
  "outputs": [],
@@ -219,7 +219,7 @@
219
  },
220
  {
221
  "cell_type": "code",
222
- "execution_count": 8,
223
  "id": "b77ad8d6-3289-463c-b213-1c0cc215b141",
224
  "metadata": {},
225
  "outputs": [
@@ -248,7 +248,7 @@
248
  },
249
  {
250
  "cell_type": "code",
251
- "execution_count": 9,
252
  "id": "87fca48b-a16a-4d2b-919c-75e88e4e5eb5",
253
  "metadata": {},
254
  "outputs": [
@@ -316,7 +316,7 @@
316
  },
317
  {
318
  "cell_type": "code",
319
- "execution_count": 10,
320
  "id": "8eb7e7d5-7121-4762-b8d1-e5a9539e2b36",
321
  "metadata": {},
322
  "outputs": [],
@@ -327,7 +327,7 @@
327
  },
328
  {
329
  "cell_type": "code",
330
- "execution_count": 11,
331
  "id": "52d373be-3a79-412e-8ca2-92bb443fa52d",
332
  "metadata": {},
333
  "outputs": [],
@@ -352,7 +352,7 @@
352
  },
353
  {
354
  "cell_type": "code",
355
- "execution_count": 12,
356
  "id": "b5b29507-92bc-453d-bcc5-6402c17e9a0d",
357
  "metadata": {},
358
  "outputs": [],
@@ -372,7 +372,7 @@
372
  },
373
  {
374
  "cell_type": "code",
375
- "execution_count": 13,
376
  "id": "24674094-4d47-4e48-b591-55faabcff8df",
377
  "metadata": {},
378
  "outputs": [],
@@ -413,26 +413,22 @@
413
  },
414
  {
415
  "cell_type": "code",
416
- "execution_count": 14,
417
  "id": "e6224ae5-3792-42b2-8392-3abd42998a50",
418
  "metadata": {},
419
  "outputs": [],
420
  "source": [
421
  "#| export\n",
422
- "def concatenate_mp3(mp3_files:list):\n",
423
  "\n",
424
  " # Initialize an empty AudioSegment object for concatenation\n",
425
  " combined = AudioSegment.empty()\n",
426
- " \n",
427
- " # Write out audio file responses as individual files for debugging\n",
428
- " # for idx, mp3_data in enumerate(mp3_files):\n",
429
- " # with open(f'./{idx}.mp3', 'wb') as f:\n",
430
- " # f.write(mp3_data)\n",
431
  "\n",
432
  " # Loop through the list of mp3 binary data\n",
433
- " for mp3_data in mp3_files:\n",
434
  " # Convert binary data to an audio segment\n",
435
- " audio_segment = AudioSegment.from_file(io.BytesIO(mp3_data), format=\"mp3\")\n",
 
436
  " # Concatenate this segment to the combined segment\n",
437
  " combined += audio_segment\n",
438
  "\n",
@@ -456,7 +452,7 @@
456
  },
457
  {
458
  "cell_type": "code",
459
- "execution_count": 15,
460
  "id": "4691703d-ed0f-4481-8006-b2906289b780",
461
  "metadata": {},
462
  "outputs": [],
@@ -508,7 +504,7 @@
508
  },
509
  {
510
  "cell_type": "code",
511
- "execution_count": 16,
512
  "id": "3420c868-71cb-4ac6-ac65-6f02bfd841d1",
513
  "metadata": {},
514
  "outputs": [],
@@ -516,19 +512,36 @@
516
  "#| export\n",
517
  "def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon', \n",
518
  " voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah\n",
519
- " websocket=False, output_format='pcm_44100', **kwargs):\n",
 
 
520
  " client = cartesia.tts.CartesiaTTS()\n",
521
  " \n",
522
- " @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))\n",
523
  " def _create_speech_with_backoff(**kwargs):\n",
524
  " return client.generate(**kwargs)\n",
525
  " \n",
526
- " response = _create_speech_with_backoff(transcript=input, model_id=model, voice=voice, \n",
527
- " websocket=websocket, output_format=output_format, **kwargs)\n",
 
 
 
528
  " client.close()\n",
529
  " return chunk_idx, response[\"audio\"]"
530
  ]
531
  },
 
 
 
 
 
 
 
 
 
 
 
 
532
  {
533
  "cell_type": "code",
534
  "execution_count": 17,
@@ -537,23 +550,26 @@
537
  "outputs": [],
538
  "source": [
539
  "#| export\n",
540
- "def create_speech(input_text, provider, model='tts-1', voice='alloy', profile: gr.OAuthProfile|None=None, progress=gr.Progress(), **kwargs):\n",
 
 
541
  "\n",
542
  " #Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!\n",
543
- " verify_authorization(profile)\n",
544
  " start = datetime.now()\n",
545
- "\n",
546
  " \n",
547
  " if provider == 'cartesiaai':\n",
548
  " create_speech_func = create_speech_cartesiaai\n",
549
  " max_chunk_size = 500\n",
550
  " chunk_processing_time = 20\n",
551
  " threads = CARTESIAAI_CLIENT_TTS_THREADS\n",
 
552
  " elif provider == 'openai':\n",
553
  " create_speech_func = create_speech_openai\n",
554
  " max_chunk_size = 4000\n",
555
  " chunk_processing_time = 60\n",
556
  " threads = OPENAI_CLIENT_TTS_THREADS\n",
 
557
  " else:\n",
558
  " raise ValueError(f'Invalid argument provider: {provider}')\n",
559
  " \n",
@@ -578,7 +594,7 @@
578
  " progress(.9, desc=f\"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.\")\n",
579
  " \n",
580
  " # Concatenate the audio data from all chunks\n",
581
- " combined_audio = concatenate_mp3(audio_data)\n",
582
  "\n",
583
  " # Final update to the progress bar\n",
584
  " progress(1, desc=f\"Processing completed... {(datetime.now()-start).seconds} seconds to process.\")\n",
@@ -590,17 +606,19 @@
590
  },
591
  {
592
  "cell_type": "code",
593
- "execution_count": 18,
594
  "id": "ca2c6f8c-62ed-4ac1-9c2f-e3b2bfb47e8d",
595
  "metadata": {},
596
  "outputs": [],
597
  "source": [
598
- "# create_speech(\"Hi. What's your name?\", provider='openai', model='tts-1', voice='alloy')"
 
 
599
  ]
600
  },
601
  {
602
  "cell_type": "code",
603
- "execution_count": 19,
604
  "id": "236dd8d3-4364-4731-af93-7dcdec6f18a1",
605
  "metadata": {},
606
  "outputs": [],
@@ -612,7 +630,7 @@
612
  },
613
  {
614
  "cell_type": "code",
615
- "execution_count": 20,
616
  "id": "0523a158-ee07-48b3-9350-ee39d4deee7f",
617
  "metadata": {},
618
  "outputs": [],
@@ -634,7 +652,7 @@
634
  },
635
  {
636
  "cell_type": "code",
637
- "execution_count": 21,
638
  "id": "f4d1ba0b-6960-4e22-8dba-7de70370753a",
639
  "metadata": {},
640
  "outputs": [],
@@ -646,7 +664,7 @@
646
  },
647
  {
648
  "cell_type": "code",
649
- "execution_count": 22,
650
  "id": "efa28cf2-548d-439f-bf2a-21a5edbf9eba",
651
  "metadata": {},
652
  "outputs": [],
@@ -654,12 +672,12 @@
654
  "#| export\n",
655
  "def update_model_choices(provider):\n",
656
  " choices = get_model_choices(provider)\n",
657
- " return gr.update(choices=choices,value=choices[0])"
658
  ]
659
  },
660
  {
661
  "cell_type": "code",
662
- "execution_count": 23,
663
  "id": "cdc1dde5-5edd-4dbf-bd11-30eb418c571d",
664
  "metadata": {},
665
  "outputs": [],
@@ -671,7 +689,7 @@
671
  },
672
  {
673
  "cell_type": "code",
674
- "execution_count": 24,
675
  "id": "035c33dd-c8e6-42b4-91d4-6bc5f1b36df3",
676
  "metadata": {},
677
  "outputs": [],
@@ -679,12 +697,12 @@
679
  "#| export\n",
680
  "def update_voice_choices(provider, model):\n",
681
  " choices = get_voice_choices(provider, model)\n",
682
- " return gr.update(choices=choices,value=choices[0])"
683
  ]
684
  },
685
  {
686
  "cell_type": "code",
687
- "execution_count": 29,
688
  "id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
689
  "metadata": {},
690
  "outputs": [],
@@ -739,7 +757,7 @@
739
  },
740
  {
741
  "cell_type": "code",
742
- "execution_count": 30,
743
  "id": "a00648a1-891b-470b-9959-f5d502055713",
744
  "metadata": {},
745
  "outputs": [],
@@ -753,7 +771,7 @@
753
  },
754
  {
755
  "cell_type": "code",
756
- "execution_count": 31,
757
  "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
758
  "metadata": {},
759
  "outputs": [
@@ -761,7 +779,7 @@
761
  "name": "stdout",
762
  "output_type": "stream",
763
  "text": [
764
- "Running on local URL: http://127.0.0.1:7860\n",
765
  "\n",
766
  "To create a public link, set `share=True` in `launch()`.\n"
767
  ]
@@ -769,7 +787,7 @@
769
  {
770
  "data": {
771
  "text/html": [
772
- "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
773
  ],
774
  "text/plain": [
775
  "<IPython.core.display.HTML object>"
@@ -782,59 +800,9 @@
782
  "data": {
783
  "text/plain": []
784
  },
785
- "execution_count": 31,
786
  "metadata": {},
787
  "output_type": "execute_result"
788
- },
789
- {
790
- "name": "stderr",
791
- "output_type": "stream",
792
- "text": [
793
- "/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/components/dropdown.py:181: UserWarning: The value passed into gr.Dropdown() is not in the list of choices. Please update the list of choices to include: $0.000 or set allow_custom_value=True.\n",
794
- " warnings.warn(\n",
795
- "Traceback (most recent call last):\n",
796
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/queueing.py\", line 532, in process_events\n",
797
- " response = await route_utils.call_process_api(\n",
798
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
799
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 276, in call_process_api\n",
800
- " output = await app.get_blocks().process_api(\n",
801
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
802
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1928, in process_api\n",
803
- " result = await self.call_function(\n",
804
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
805
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1500, in call_function\n",
806
- " processed_input, progress_index, _ = special_args(\n",
807
- " ^^^^^^^^^^^^^\n",
808
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/helpers.py\", line 891, in special_args\n",
809
- " getattr(request, \"session\", {})\n",
810
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 158, in __getattr__\n",
811
- " return self.dict_to_obj(getattr(self.request, name))\n",
812
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
813
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/starlette/requests.py\", line 157, in session\n",
814
- " \"session\" in self.scope\n",
815
- "AssertionError: SessionMiddleware must be installed to access request.session\n",
816
- "Traceback (most recent call last):\n",
817
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/queueing.py\", line 532, in process_events\n",
818
- " response = await route_utils.call_process_api(\n",
819
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
820
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 276, in call_process_api\n",
821
- " output = await app.get_blocks().process_api(\n",
822
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
823
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1928, in process_api\n",
824
- " result = await self.call_function(\n",
825
- " ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
826
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/blocks.py\", line 1500, in call_function\n",
827
- " processed_input, progress_index, _ = special_args(\n",
828
- " ^^^^^^^^^^^^^\n",
829
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/helpers.py\", line 891, in special_args\n",
830
- " getattr(request, \"session\", {})\n",
831
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/gradio/route_utils.py\", line 158, in __getattr__\n",
832
- " return self.dict_to_obj(getattr(self.request, name))\n",
833
- " ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
834
- " File \"/Users/mathewmiller/anaconda3/envs/gradio1/lib/python3.11/site-packages/starlette/requests.py\", line 157, in session\n",
835
- " \"session\" in self.scope\n",
836
- "AssertionError: SessionMiddleware must be installed to access request.session\n"
837
- ]
838
  }
839
  ],
840
  "source": [
@@ -860,7 +828,7 @@
860
  },
861
  {
862
  "cell_type": "code",
863
- "execution_count": 33,
864
  "id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
865
  "metadata": {},
866
  "outputs": [
@@ -868,7 +836,7 @@
868
  "name": "stdout",
869
  "output_type": "stream",
870
  "text": [
871
- "Closing server running on port: 7860\n"
872
  ]
873
  }
874
  ],
@@ -879,7 +847,7 @@
879
  },
880
  {
881
  "cell_type": "code",
882
- "execution_count": 37,
883
  "id": "afbc9699-4d16-4060-88f4-cd1251754cbd",
884
  "metadata": {},
885
  "outputs": [],
@@ -890,12 +858,13 @@
890
  },
891
  {
892
  "cell_type": "code",
893
- "execution_count": 38,
894
  "id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
895
  "metadata": {},
896
  "outputs": [],
897
  "source": [
898
  "#| hide\n",
 
899
  "nbdev.export.nb_export('app.ipynb',lib_path='.')"
900
  ]
901
  },
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 30,
6
  "id": "3bedf0dc-8d8e-4ede-a9e6-b8f35136aa00",
7
  "metadata": {},
8
  "outputs": [],
 
42
  },
43
  {
44
  "cell_type": "code",
45
+ "execution_count": 1,
46
  "id": "667802a7-0f36-4136-a381-e66210b20462",
47
  "metadata": {},
48
  "outputs": [
 
94
  },
95
  {
96
  "cell_type": "code",
97
+ "execution_count": 2,
98
  "id": "7664bc24-e8a7-440d-851d-eb16dc2d69fb",
99
  "metadata": {},
100
  "outputs": [
 
128
  },
129
  {
130
  "cell_type": "code",
131
+ "execution_count": 3,
132
  "id": "4d9863fc-969e-409b-8e20-b9c3cd2cc3e7",
133
  "metadata": {},
134
  "outputs": [],
 
142
  },
143
  {
144
  "cell_type": "code",
145
+ "execution_count": 4,
146
  "id": "4f486d3a",
147
  "metadata": {},
148
  "outputs": [],
 
187
  },
188
  {
189
  "cell_type": "code",
190
+ "execution_count": 5,
191
  "id": "ecb7f207-0fc2-4d19-a313-356c05776832",
192
  "metadata": {},
193
  "outputs": [
 
208
  },
209
  {
210
  "cell_type": "code",
211
+ "execution_count": 6,
212
  "id": "e5d6cac2-0dee-42d8-9b41-184b5be9cc3f",
213
  "metadata": {},
214
  "outputs": [],
 
219
  },
220
  {
221
  "cell_type": "code",
222
+ "execution_count": 7,
223
  "id": "b77ad8d6-3289-463c-b213-1c0cc215b141",
224
  "metadata": {},
225
  "outputs": [
 
248
  },
249
  {
250
  "cell_type": "code",
251
+ "execution_count": 8,
252
  "id": "87fca48b-a16a-4d2b-919c-75e88e4e5eb5",
253
  "metadata": {},
254
  "outputs": [
 
316
  },
317
  {
318
  "cell_type": "code",
319
+ "execution_count": 9,
320
  "id": "8eb7e7d5-7121-4762-b8d1-e5a9539e2b36",
321
  "metadata": {},
322
  "outputs": [],
 
327
  },
328
  {
329
  "cell_type": "code",
330
+ "execution_count": 10,
331
  "id": "52d373be-3a79-412e-8ca2-92bb443fa52d",
332
  "metadata": {},
333
  "outputs": [],
 
352
  },
353
  {
354
  "cell_type": "code",
355
+ "execution_count": 11,
356
  "id": "b5b29507-92bc-453d-bcc5-6402c17e9a0d",
357
  "metadata": {},
358
  "outputs": [],
 
372
  },
373
  {
374
  "cell_type": "code",
375
+ "execution_count": 12,
376
  "id": "24674094-4d47-4e48-b591-55faabcff8df",
377
  "metadata": {},
378
  "outputs": [],
 
413
  },
414
  {
415
  "cell_type": "code",
416
+ "execution_count": 13,
417
  "id": "e6224ae5-3792-42b2-8392-3abd42998a50",
418
  "metadata": {},
419
  "outputs": [],
420
  "source": [
421
  "#| export\n",
422
+ "def concatenate_audio(files:list, **kwargs):\n",
423
  "\n",
424
  " # Initialize an empty AudioSegment object for concatenation\n",
425
  " combined = AudioSegment.empty()\n",
 
 
 
 
 
426
  "\n",
427
  " # Loop through the list of mp3 binary data\n",
428
+ " for data in files:\n",
429
  " # Convert binary data to an audio segment\n",
430
+ " audio_segment = AudioSegment.from_file(io.BytesIO(data), **kwargs)\n",
431
+ " \n",
432
  " # Concatenate this segment to the combined segment\n",
433
  " combined += audio_segment\n",
434
  "\n",
 
452
  },
453
  {
454
  "cell_type": "code",
455
+ "execution_count": 14,
456
  "id": "4691703d-ed0f-4481-8006-b2906289b780",
457
  "metadata": {},
458
  "outputs": [],
 
504
  },
505
  {
506
  "cell_type": "code",
507
+ "execution_count": 15,
508
  "id": "3420c868-71cb-4ac6-ac65-6f02bfd841d1",
509
  "metadata": {},
510
  "outputs": [],
 
512
  "#| export\n",
513
  "def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon', \n",
514
  " voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah\n",
515
+ " websocket=False, \n",
516
+ " output_format='pcm_44100', \n",
517
+ " **kwargs):\n",
518
  " client = cartesia.tts.CartesiaTTS()\n",
519
  " \n",
520
+ " # @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))\n",
521
  " def _create_speech_with_backoff(**kwargs):\n",
522
  " return client.generate(**kwargs)\n",
523
  " \n",
524
+ " response = _create_speech_with_backoff(transcript=input, model_id=model, \n",
525
+ " voice=client.get_voice_embedding(voice_id=voice), \n",
526
+ " websocket=websocket, \n",
527
+ " output_format=output_format, \n",
528
+ " **kwargs)\n",
529
  " client.close()\n",
530
  " return chunk_idx, response[\"audio\"]"
531
  ]
532
  },
533
+ {
534
+ "cell_type": "code",
535
+ "execution_count": 16,
536
+ "id": "d0082383-9d03-4b25-b68a-080d0b28caa9",
537
+ "metadata": {},
538
+ "outputs": [],
539
+ "source": [
540
+ "# test\n",
541
+ "# create_speech_cartesiaai(1,\"Hi. What's your name?\", model='upbeat-moon',\n",
542
+ "# voice='63ff761f-c1e8-414b-b969-d1833d1c870c')"
543
+ ]
544
+ },
545
  {
546
  "cell_type": "code",
547
  "execution_count": 17,
 
550
  "outputs": [],
551
  "source": [
552
  "#| export\n",
553
+ "def create_speech(input_text, provider, model='tts-1', voice='alloy', \n",
554
+ " # profile: gr.OAuthProfile|None=None, \n",
555
+ " progress=gr.Progress(), **kwargs):\n",
556
  "\n",
557
  " #Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!\n",
558
+ " if REQUIRE_AUTH: verify_authorization(profile)\n",
559
  " start = datetime.now()\n",
 
560
  " \n",
561
  " if provider == 'cartesiaai':\n",
562
  " create_speech_func = create_speech_cartesiaai\n",
563
  " max_chunk_size = 500\n",
564
  " chunk_processing_time = 20\n",
565
  " threads = CARTESIAAI_CLIENT_TTS_THREADS\n",
566
+ " audio_file_conversion_kwargs = {'format': 'raw', 'frame_rate': 44100, 'channels': 1, 'sample_width': 2}\n",
567
  " elif provider == 'openai':\n",
568
  " create_speech_func = create_speech_openai\n",
569
  " max_chunk_size = 4000\n",
570
  " chunk_processing_time = 60\n",
571
  " threads = OPENAI_CLIENT_TTS_THREADS\n",
572
+ " audio_file_conversion_kwargs = {'format': 'mp3'}\n",
573
  " else:\n",
574
  " raise ValueError(f'Invalid argument provider: {provider}')\n",
575
  " \n",
 
594
  " progress(.9, desc=f\"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.\")\n",
595
  " \n",
596
  " # Concatenate the audio data from all chunks\n",
597
+ " combined_audio = concatenate_audio(audio_data, **audio_file_conversion_kwargs)\n",
598
  "\n",
599
  " # Final update to the progress bar\n",
600
  " progress(1, desc=f\"Processing completed... {(datetime.now()-start).seconds} seconds to process.\")\n",
 
606
  },
607
  {
608
  "cell_type": "code",
609
+ "execution_count": 19,
610
  "id": "ca2c6f8c-62ed-4ac1-9c2f-e3b2bfb47e8d",
611
  "metadata": {},
612
  "outputs": [],
613
  "source": [
614
+ "# create_speech(\"Hi. What's your name?\", provider='openai', model='tts-1', voice='alloy')\n",
615
+ "# create_speech(\"Hi. What's your name?\", provider='cartesiaai', model='upbeat-moon',\n",
616
+ "# voice='63ff761f-c1e8-414b-b969-d1833d1c870c')"
617
  ]
618
  },
619
  {
620
  "cell_type": "code",
621
+ "execution_count": 20,
622
  "id": "236dd8d3-4364-4731-af93-7dcdec6f18a1",
623
  "metadata": {},
624
  "outputs": [],
 
630
  },
631
  {
632
  "cell_type": "code",
633
+ "execution_count": 21,
634
  "id": "0523a158-ee07-48b3-9350-ee39d4deee7f",
635
  "metadata": {},
636
  "outputs": [],
 
652
  },
653
  {
654
  "cell_type": "code",
655
+ "execution_count": 22,
656
  "id": "f4d1ba0b-6960-4e22-8dba-7de70370753a",
657
  "metadata": {},
658
  "outputs": [],
 
664
  },
665
  {
666
  "cell_type": "code",
667
+ "execution_count": 23,
668
  "id": "efa28cf2-548d-439f-bf2a-21a5edbf9eba",
669
  "metadata": {},
670
  "outputs": [],
 
672
  "#| export\n",
673
  "def update_model_choices(provider):\n",
674
  " choices = get_model_choices(provider)\n",
675
+ " return gr.update(choices=choices,value=choices[0][1])"
676
  ]
677
  },
678
  {
679
  "cell_type": "code",
680
+ "execution_count": 24,
681
  "id": "cdc1dde5-5edd-4dbf-bd11-30eb418c571d",
682
  "metadata": {},
683
  "outputs": [],
 
689
  },
690
  {
691
  "cell_type": "code",
692
+ "execution_count": 25,
693
  "id": "035c33dd-c8e6-42b4-91d4-6bc5f1b36df3",
694
  "metadata": {},
695
  "outputs": [],
 
697
  "#| export\n",
698
  "def update_voice_choices(provider, model):\n",
699
  " choices = get_voice_choices(provider, model)\n",
700
+ " return gr.update(choices=choices,value=choices[0][1])"
701
  ]
702
  },
703
  {
704
  "cell_type": "code",
705
+ "execution_count": 26,
706
  "id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
707
  "metadata": {},
708
  "outputs": [],
 
757
  },
758
  {
759
  "cell_type": "code",
760
+ "execution_count": 27,
761
  "id": "a00648a1-891b-470b-9959-f5d502055713",
762
  "metadata": {},
763
  "outputs": [],
 
771
  },
772
  {
773
  "cell_type": "code",
774
+ "execution_count": 28,
775
  "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
776
  "metadata": {},
777
  "outputs": [
 
779
  "name": "stdout",
780
  "output_type": "stream",
781
  "text": [
782
+ "Running on local URL: http://127.0.0.1:7861\n",
783
  "\n",
784
  "To create a public link, set `share=True` in `launch()`.\n"
785
  ]
 
787
  {
788
  "data": {
789
  "text/html": [
790
+ "<div><iframe src=\"http://127.0.0.1:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
791
  ],
792
  "text/plain": [
793
  "<IPython.core.display.HTML object>"
 
800
  "data": {
801
  "text/plain": []
802
  },
803
+ "execution_count": 28,
804
  "metadata": {},
805
  "output_type": "execute_result"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
806
  }
807
  ],
808
  "source": [
 
828
  },
829
  {
830
  "cell_type": "code",
831
+ "execution_count": 29,
832
  "id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
833
  "metadata": {},
834
  "outputs": [
 
836
  "name": "stdout",
837
  "output_type": "stream",
838
  "text": [
839
+ "Closing server running on port: 7861\n"
840
  ]
841
  }
842
  ],
 
847
  },
848
  {
849
  "cell_type": "code",
850
+ "execution_count": 30,
851
  "id": "afbc9699-4d16-4060-88f4-cd1251754cbd",
852
  "metadata": {},
853
  "outputs": [],
 
858
  },
859
  {
860
  "cell_type": "code",
861
+ "execution_count": 53,
862
  "id": "0420310d-930b-4904-8bd4-3458ad8bdbd3",
863
  "metadata": {},
864
  "outputs": [],
865
  "source": [
866
  "#| hide\n",
867
+ "import nbdev\n",
868
  "nbdev.export.nb_export('app.ipynb',lib_path='.')"
869
  ]
870
  },
app.py CHANGED
@@ -3,7 +3,7 @@
3
  # %% auto 0
4
  __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'providers', 'clean_text_prompt', 'OPENAI_CLIENT_TTS_THREADS',
5
  'CARTESIAAI_CLIENT_TTS_THREADS', 'DEFAULT_PROVIDER', 'DEFAULT_MODEL', 'DEFAULT_VOICE', 'launch_kwargs',
6
- 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_mp3', 'create_speech_openai',
7
  'create_speech_cartesiaai', 'create_speech', 'get_input_text_len', 'get_generation_cost',
8
  'get_model_choices', 'update_model_choices', 'get_voice_choices', 'update_voice_choices']
9
 
@@ -163,20 +163,16 @@ def split_text(input_text, max_length=4000, lookback=1000):
163
  return chunks
164
 
165
  # %% app.ipynb 21
166
- def concatenate_mp3(mp3_files:list):
167
 
168
  # Initialize an empty AudioSegment object for concatenation
169
  combined = AudioSegment.empty()
170
-
171
- # Write out audio file responses as individual files for debugging
172
- # for idx, mp3_data in enumerate(mp3_files):
173
- # with open(f'./{idx}.mp3', 'wb') as f:
174
- # f.write(mp3_data)
175
 
176
  # Loop through the list of mp3 binary data
177
- for mp3_data in mp3_files:
178
  # Convert binary data to an audio segment
179
- audio_segment = AudioSegment.from_file(io.BytesIO(mp3_data), format="mp3")
 
180
  # Concatenate this segment to the combined segment
181
  combined += audio_segment
182
 
@@ -212,36 +208,44 @@ def create_speech_openai(chunk_idx, input, model='tts-1', voice='alloy', speed=1
212
  # %% app.ipynb 24
213
  def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon',
214
  voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah
215
- websocket=False, output_format='pcm_44100', **kwargs):
 
 
216
  client = cartesia.tts.CartesiaTTS()
217
 
218
- @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))
219
  def _create_speech_with_backoff(**kwargs):
220
  return client.generate(**kwargs)
221
 
222
- response = _create_speech_with_backoff(transcript=input, model_id=model, voice=voice,
223
- websocket=websocket, output_format=output_format, **kwargs)
 
 
 
224
  client.close()
225
  return chunk_idx, response["audio"]
226
 
227
- # %% app.ipynb 25
228
- def create_speech(input_text, provider, model='tts-1', voice='alloy', profile: gr.OAuthProfile|None=None, progress=gr.Progress(), **kwargs):
 
 
229
 
230
  #Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!
231
- verify_authorization(profile)
232
  start = datetime.now()
233
-
234
 
235
  if provider == 'cartesiaai':
236
  create_speech_func = create_speech_cartesiaai
237
  max_chunk_size = 500
238
  chunk_processing_time = 20
239
  threads = CARTESIAAI_CLIENT_TTS_THREADS
 
240
  elif provider == 'openai':
241
  create_speech_func = create_speech_openai
242
  max_chunk_size = 4000
243
  chunk_processing_time = 60
244
  threads = OPENAI_CLIENT_TTS_THREADS
 
245
  else:
246
  raise ValueError(f'Invalid argument provider: {provider}')
247
 
@@ -266,7 +270,7 @@ def create_speech(input_text, provider, model='tts-1', voice='alloy', profile: g
266
  progress(.9, desc=f"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.")
267
 
268
  # Concatenate the audio data from all chunks
269
- combined_audio = concatenate_mp3(audio_data)
270
 
271
  # Final update to the progress bar
272
  progress(1, desc=f"Processing completed... {(datetime.now()-start).seconds} seconds to process.")
@@ -276,11 +280,11 @@ def create_speech(input_text, provider, model='tts-1', voice='alloy', profile: g
276
  return combined_audio
277
 
278
 
279
- # %% app.ipynb 27
280
  def get_input_text_len(input_text):
281
  return len(input_text)
282
 
283
- # %% app.ipynb 28
284
  def get_generation_cost(input_text, tts_model_dropdown, provider):
285
  text_len = len(input_text)
286
  if provider == 'openai':
@@ -294,25 +298,25 @@ def get_generation_cost(input_text, tts_model_dropdown, provider):
294
  raise ValueError(f'Invalid argument provider: {provider}')
295
  return "${:,.3f}".format(cost)
296
 
297
- # %% app.ipynb 29
298
  def get_model_choices(provider):
299
  return sorted([(v,k) for k,v in providers[provider]['models'].items()])
300
 
301
- # %% app.ipynb 30
302
  def update_model_choices(provider):
303
  choices = get_model_choices(provider)
304
- return gr.update(choices=choices,value=choices[0])
305
 
306
- # %% app.ipynb 31
307
  def get_voice_choices(provider, model):
308
  return sorted([(v['name'],v['id']) for v in providers[provider]['voices'].values()])
309
 
310
- # %% app.ipynb 32
311
  def update_voice_choices(provider, model):
312
  choices = get_voice_choices(provider, model)
313
- return gr.update(choices=choices,value=choices[0])
314
 
315
- # %% app.ipynb 33
316
  with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
317
  gr.Markdown("# TTS")
318
  gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
@@ -359,13 +363,13 @@ For requests longer than allowed by the API they will be broken into chunks auto
359
  app.load(verify_authorization, None, m)
360
 
361
 
362
- # %% app.ipynb 34
363
  # launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
364
  # 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
365
  launch_kwargs = {}
366
  queue_kwargs = {'default_concurrency_limit':10}
367
 
368
- # %% app.ipynb 36
369
  #.py launch
370
  if __name__ == "__main__":
371
  app.queue(**queue_kwargs)
 
3
  # %% auto 0
4
  __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'providers', 'clean_text_prompt', 'OPENAI_CLIENT_TTS_THREADS',
5
  'CARTESIAAI_CLIENT_TTS_THREADS', 'DEFAULT_PROVIDER', 'DEFAULT_MODEL', 'DEFAULT_VOICE', 'launch_kwargs',
6
+ 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio', 'create_speech_openai',
7
  'create_speech_cartesiaai', 'create_speech', 'get_input_text_len', 'get_generation_cost',
8
  'get_model_choices', 'update_model_choices', 'get_voice_choices', 'update_voice_choices']
9
 
 
163
  return chunks
164
 
165
  # %% app.ipynb 21
166
+ def concatenate_audio(files:list, **kwargs):
167
 
168
  # Initialize an empty AudioSegment object for concatenation
169
  combined = AudioSegment.empty()
 
 
 
 
 
170
 
171
  # Loop through the list of mp3 binary data
172
+ for data in files:
173
  # Convert binary data to an audio segment
174
+ audio_segment = AudioSegment.from_file(io.BytesIO(data), **kwargs)
175
+
176
  # Concatenate this segment to the combined segment
177
  combined += audio_segment
178
 
 
208
  # %% app.ipynb 24
209
  def create_speech_cartesiaai(chunk_idx, input, model='upbeat-moon',
210
  voice='248be419-c632-4f23-adf1-5324ed7dbf1d', #Hannah
211
+ websocket=False,
212
+ output_format='pcm_44100',
213
+ **kwargs):
214
  client = cartesia.tts.CartesiaTTS()
215
 
216
+ # @retry(wait=wait_random_exponential(min=1, max=180), stop=stop_after_attempt(6))
217
  def _create_speech_with_backoff(**kwargs):
218
  return client.generate(**kwargs)
219
 
220
+ response = _create_speech_with_backoff(transcript=input, model_id=model,
221
+ voice=client.get_voice_embedding(voice_id=voice),
222
+ websocket=websocket,
223
+ output_format=output_format,
224
+ **kwargs)
225
  client.close()
226
  return chunk_idx, response["audio"]
227
 
228
+ # %% app.ipynb 26
229
+ def create_speech(input_text, provider, model='tts-1', voice='alloy',
230
+ # profile: gr.OAuthProfile|None=None,
231
+ progress=gr.Progress(), **kwargs):
232
 
233
  #Verify auth if it is required. This is very important if this is in a HF space. DO NOT DELETE!!!
234
+ if REQUIRE_AUTH: verify_authorization(profile)
235
  start = datetime.now()
 
236
 
237
  if provider == 'cartesiaai':
238
  create_speech_func = create_speech_cartesiaai
239
  max_chunk_size = 500
240
  chunk_processing_time = 20
241
  threads = CARTESIAAI_CLIENT_TTS_THREADS
242
+ audio_file_conversion_kwargs = {'format': 'raw', 'frame_rate': 44100, 'channels': 1, 'sample_width': 2}
243
  elif provider == 'openai':
244
  create_speech_func = create_speech_openai
245
  max_chunk_size = 4000
246
  chunk_processing_time = 60
247
  threads = OPENAI_CLIENT_TTS_THREADS
248
+ audio_file_conversion_kwargs = {'format': 'mp3'}
249
  else:
250
  raise ValueError(f'Invalid argument provider: {provider}')
251
 
 
270
  progress(.9, desc=f"Merging audio chunks... {(datetime.now()-start).seconds} seconds to process.")
271
 
272
  # Concatenate the audio data from all chunks
273
+ combined_audio = concatenate_audio(audio_data, **audio_file_conversion_kwargs)
274
 
275
  # Final update to the progress bar
276
  progress(1, desc=f"Processing completed... {(datetime.now()-start).seconds} seconds to process.")
 
280
  return combined_audio
281
 
282
 
283
+ # %% app.ipynb 28
284
  def get_input_text_len(input_text):
285
  return len(input_text)
286
 
287
+ # %% app.ipynb 29
288
  def get_generation_cost(input_text, tts_model_dropdown, provider):
289
  text_len = len(input_text)
290
  if provider == 'openai':
 
298
  raise ValueError(f'Invalid argument provider: {provider}')
299
  return "${:,.3f}".format(cost)
300
 
301
+ # %% app.ipynb 30
302
  def get_model_choices(provider):
303
  return sorted([(v,k) for k,v in providers[provider]['models'].items()])
304
 
305
+ # %% app.ipynb 31
306
  def update_model_choices(provider):
307
  choices = get_model_choices(provider)
308
+ return gr.update(choices=choices,value=choices[0][1])
309
 
310
+ # %% app.ipynb 32
311
  def get_voice_choices(provider, model):
312
  return sorted([(v['name'],v['id']) for v in providers[provider]['voices'].values()])
313
 
314
+ # %% app.ipynb 33
315
  def update_voice_choices(provider, model):
316
  choices = get_voice_choices(provider, model)
317
+ return gr.update(choices=choices,value=choices[0][1])
318
 
319
+ # %% app.ipynb 34
320
  with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
321
  gr.Markdown("# TTS")
322
  gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
 
363
  app.load(verify_authorization, None, m)
364
 
365
 
366
+ # %% app.ipynb 35
367
  # launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
368
  # 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
369
  launch_kwargs = {}
370
  queue_kwargs = {'default_concurrency_limit':10}
371
 
372
+ # %% app.ipynb 37
373
  #.py launch
374
  if __name__ == "__main__":
375
  app.queue(**queue_kwargs)