matdmiller committed on
Commit
99246a0
1 Parent(s): e8a8dd0

added get and clean url text contents functionality

Browse files
Files changed (2) hide show
  1. app.ipynb +149 -64
  2. app.py +66 -7
app.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "id": "3bedf0dc-8d8e-4ede-a9e6-b8f35136aa00",
7
  "metadata": {},
8
  "outputs": [],
@@ -42,7 +42,7 @@
42
  },
43
  {
44
  "cell_type": "code",
45
- "execution_count": 3,
46
  "id": "667802a7-0f36-4136-a381-e66210b20462",
47
  "metadata": {},
48
  "outputs": [
@@ -94,7 +94,7 @@
94
  },
95
  {
96
  "cell_type": "code",
97
- "execution_count": 4,
98
  "id": "7664bc24-e8a7-440d-851d-eb16dc2d69fb",
99
  "metadata": {},
100
  "outputs": [
@@ -128,7 +128,7 @@
128
  },
129
  {
130
  "cell_type": "code",
131
- "execution_count": 5,
132
  "id": "4d9863fc-969e-409b-8e20-b9c3cd2cc3e7",
133
  "metadata": {},
134
  "outputs": [],
@@ -142,7 +142,7 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": 6,
146
  "id": "4f486d3a",
147
  "metadata": {},
148
  "outputs": [],
@@ -166,7 +166,9 @@
166
  ") # for exponential backoff\n",
167
  "import traceback\n",
168
  "# from cartesia.tts import CartesiaTTS\n",
169
- "import cartesia"
 
 
170
  ]
171
  },
172
  {
@@ -187,7 +189,7 @@
187
  },
188
  {
189
  "cell_type": "code",
190
- "execution_count": 7,
191
  "id": "ecb7f207-0fc2-4d19-a313-356c05776832",
192
  "metadata": {},
193
  "outputs": [
@@ -208,7 +210,7 @@
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": 8,
212
  "id": "52d373be-3a79-412e-8ca2-92bb443fa52d",
213
  "metadata": {},
214
  "outputs": [],
@@ -225,7 +227,7 @@
225
  },
226
  {
227
  "cell_type": "code",
228
- "execution_count": 9,
229
  "id": "e5d6cac2-0dee-42d8-9b41-184b5be9cc3f",
230
  "metadata": {},
231
  "outputs": [],
@@ -236,7 +238,7 @@
236
  },
237
  {
238
  "cell_type": "code",
239
- "execution_count": 10,
240
  "id": "b77ad8d6-3289-463c-b213-1c0cc215b141",
241
  "metadata": {},
242
  "outputs": [
@@ -268,7 +270,7 @@
268
  },
269
  {
270
  "cell_type": "code",
271
- "execution_count": 11,
272
  "id": "87fca48b-a16a-4d2b-919c-75e88e4e5eb5",
273
  "metadata": {},
274
  "outputs": [
@@ -340,7 +342,7 @@
340
  },
341
  {
342
  "cell_type": "code",
343
- "execution_count": 12,
344
  "id": "d1352f28-f761-4e91-a9bc-4efe47552f4d",
345
  "metadata": {},
346
  "outputs": [],
@@ -389,13 +391,13 @@
389
  },
390
  {
391
  "cell_type": "code",
392
- "execution_count": 13,
393
  "id": "8eb7e7d5-7121-4762-b8d1-e5a9539e2b36",
394
  "metadata": {},
395
  "outputs": [],
396
  "source": [
397
  "#| export\n",
398
- "clean_text_prompt = \"\"\"Your job is to clean up text that is going to be fed into a text to speech (TTS) model. You must remove parts of the text that would not normally be spoken such as reference marks `[1]`, spurious citations such as `(Reddy et al., 2021; Wu et al., 2022; Chang et al., 2022; Kondratyuk et al., 2023)` and any other part of the text that is not normally spoken. Please also clean up sections and headers so they are on new lines with proper numbering. You must also clean up any math formulas that are salvageable from being copied from a scientific paper. If they are garbled and do not make sense then remove them. You must carefully perform the text cleanup so it is translated into speech that is easy to listen to however you must not modify the text otherwise. It is critical that you repeat all of the text without modifications except for the cleanup activities you've been instructed to do. Also you must clean all of the text you are given, you may not omit any of it or stop the cleanup task early.\"\"\"\n"
399
  ]
400
  },
401
  {
@@ -408,7 +410,7 @@
408
  },
409
  {
410
  "cell_type": "code",
411
- "execution_count": 14,
412
  "id": "b5b29507-92bc-453d-bcc5-6402c17e9a0d",
413
  "metadata": {},
414
  "outputs": [],
@@ -428,7 +430,7 @@
428
  },
429
  {
430
  "cell_type": "code",
431
- "execution_count": 15,
432
  "id": "24674094-4d47-4e48-b591-55faabcff8df",
433
  "metadata": {},
434
  "outputs": [],
@@ -473,7 +475,7 @@
473
  },
474
  {
475
  "cell_type": "code",
476
- "execution_count": 16,
477
  "id": "e6224ae5-3792-42b2-8392-3abd42998a50",
478
  "metadata": {},
479
  "outputs": [],
@@ -512,7 +514,7 @@
512
  },
513
  {
514
  "cell_type": "code",
515
- "execution_count": 17,
516
  "id": "4691703d-ed0f-4481-8006-b2906289b780",
517
  "metadata": {},
518
  "outputs": [
@@ -575,7 +577,7 @@
575
  },
576
  {
577
  "cell_type": "code",
578
- "execution_count": 18,
579
  "id": "3420c868-71cb-4ac6-ac65-6f02bfd841d1",
580
  "metadata": {},
581
  "outputs": [
@@ -614,7 +616,7 @@
614
  },
615
  {
616
  "cell_type": "code",
617
- "execution_count": 19,
618
  "id": "d0082383-9d03-4b25-b68a-080d0b28caa9",
619
  "metadata": {},
620
  "outputs": [],
@@ -635,7 +637,7 @@
635
  },
636
  {
637
  "cell_type": "code",
638
- "execution_count": 20,
639
  "id": "649d90a5-9398-4cb5-a1e8-a464d463a11c",
640
  "metadata": {},
641
  "outputs": [],
@@ -646,7 +648,7 @@
646
  },
647
  {
648
  "cell_type": "code",
649
- "execution_count": 21,
650
  "id": "e34bb4aa-698c-4452-8cda-bd02b38f7122",
651
  "metadata": {},
652
  "outputs": [],
@@ -699,7 +701,7 @@
699
  },
700
  {
701
  "cell_type": "code",
702
- "execution_count": 22,
703
  "id": "ca2c6f8c-62ed-4ac1-9c2f-e3b2bfb47e8d",
704
  "metadata": {},
705
  "outputs": [],
@@ -711,7 +713,7 @@
711
  },
712
  {
713
  "cell_type": "code",
714
- "execution_count": 23,
715
  "id": "236dd8d3-4364-4731-af93-7dcdec6f18a1",
716
  "metadata": {},
717
  "outputs": [],
@@ -723,7 +725,7 @@
723
  },
724
  {
725
  "cell_type": "code",
726
- "execution_count": 24,
727
  "id": "0523a158-ee07-48b3-9350-ee39d4deee7f",
728
  "metadata": {},
729
  "outputs": [],
@@ -745,7 +747,7 @@
745
  },
746
  {
747
  "cell_type": "code",
748
- "execution_count": 25,
749
  "id": "f4d1ba0b-6960-4e22-8dba-7de70370753a",
750
  "metadata": {},
751
  "outputs": [],
@@ -757,7 +759,7 @@
757
  },
758
  {
759
  "cell_type": "code",
760
- "execution_count": 26,
761
  "id": "efa28cf2-548d-439f-bf2a-21a5edbf9eba",
762
  "metadata": {},
763
  "outputs": [],
@@ -770,7 +772,7 @@
770
  },
771
  {
772
  "cell_type": "code",
773
- "execution_count": 27,
774
  "id": "cdc1dde5-5edd-4dbf-bd11-30eb418c571d",
775
  "metadata": {},
776
  "outputs": [],
@@ -782,7 +784,7 @@
782
  },
783
  {
784
  "cell_type": "code",
785
- "execution_count": 28,
786
  "id": "035c33dd-c8e6-42b4-91d4-6bc5f1b36df3",
787
  "metadata": {},
788
  "outputs": [],
@@ -795,7 +797,7 @@
795
  },
796
  {
797
  "cell_type": "code",
798
- "execution_count": 29,
799
  "id": "c97c03af-a377-42e1-93e0-1df957c0e4cc",
800
  "metadata": {},
801
  "outputs": [],
@@ -806,12 +808,120 @@
806
  " return '# Text Splits:\\n' + '<br>----------<br>'.join(output)"
807
  ]
808
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  {
810
  "cell_type": "code",
811
  "execution_count": 30,
812
- "id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
 
 
 
 
 
 
 
 
 
 
 
813
  "metadata": {},
814
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
815
  "source": [
816
  "#| export\n",
817
  "with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:\n",
@@ -821,6 +931,9 @@
821
  " gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
822
  "For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\"\"\")\n",
823
  " with gr.Row():\n",
 
 
 
824
  " input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
825
  " with gr.Row():\n",
826
  " tts_provider_dropdown = gr.Dropdown(value=DEFAULT_PROVIDER,\n",
@@ -843,6 +956,8 @@
843
  " \n",
844
  "\n",
845
  " ### Define UI Actions ###\n",
 
 
846
  " \n",
847
  " # input_text \n",
848
  " input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
@@ -878,7 +993,7 @@
878
  },
879
  {
880
  "cell_type": "code",
881
- "execution_count": 31,
882
  "id": "a00648a1-891b-470b-9959-f5d502055713",
883
  "metadata": {},
884
  "outputs": [],
@@ -892,40 +1007,10 @@
892
  },
893
  {
894
  "cell_type": "code",
895
- "execution_count": 32,
896
  "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
897
  "metadata": {},
898
- "outputs": [
899
- {
900
- "name": "stdout",
901
- "output_type": "stream",
902
- "text": [
903
- "Running on local URL: http://127.0.0.1:7860\n",
904
- "\n",
905
- "To create a public link, set `share=True` in `launch()`.\n"
906
- ]
907
- },
908
- {
909
- "data": {
910
- "text/html": [
911
- "<div><iframe src=\"http://127.0.0.1:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
912
- ],
913
- "text/plain": [
914
- "<IPython.core.display.HTML object>"
915
- ]
916
- },
917
- "metadata": {},
918
- "output_type": "display_data"
919
- },
920
- {
921
- "data": {
922
- "text/plain": []
923
- },
924
- "execution_count": 32,
925
- "metadata": {},
926
- "output_type": "execute_result"
927
- }
928
- ],
929
  "source": [
930
  "#| hide\n",
931
  "#Notebook launch\n",
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "id": "3bedf0dc-8d8e-4ede-a9e6-b8f35136aa00",
7
  "metadata": {},
8
  "outputs": [],
 
42
  },
43
  {
44
  "cell_type": "code",
45
+ "execution_count": 2,
46
  "id": "667802a7-0f36-4136-a381-e66210b20462",
47
  "metadata": {},
48
  "outputs": [
 
94
  },
95
  {
96
  "cell_type": "code",
97
+ "execution_count": 3,
98
  "id": "7664bc24-e8a7-440d-851d-eb16dc2d69fb",
99
  "metadata": {},
100
  "outputs": [
 
128
  },
129
  {
130
  "cell_type": "code",
131
+ "execution_count": 4,
132
  "id": "4d9863fc-969e-409b-8e20-b9c3cd2cc3e7",
133
  "metadata": {},
134
  "outputs": [],
 
142
  },
143
  {
144
  "cell_type": "code",
145
+ "execution_count": 5,
146
  "id": "4f486d3a",
147
  "metadata": {},
148
  "outputs": [],
 
166
  ") # for exponential backoff\n",
167
  "import traceback\n",
168
  "# from cartesia.tts import CartesiaTTS\n",
169
+ "import cartesia\n",
170
+ "import requests\n",
171
+ "import urllib"
172
  ]
173
  },
174
  {
 
189
  },
190
  {
191
  "cell_type": "code",
192
+ "execution_count": 6,
193
  "id": "ecb7f207-0fc2-4d19-a313-356c05776832",
194
  "metadata": {},
195
  "outputs": [
 
210
  },
211
  {
212
  "cell_type": "code",
213
+ "execution_count": 7,
214
  "id": "52d373be-3a79-412e-8ca2-92bb443fa52d",
215
  "metadata": {},
216
  "outputs": [],
 
227
  },
228
  {
229
  "cell_type": "code",
230
+ "execution_count": 8,
231
  "id": "e5d6cac2-0dee-42d8-9b41-184b5be9cc3f",
232
  "metadata": {},
233
  "outputs": [],
 
238
  },
239
  {
240
  "cell_type": "code",
241
+ "execution_count": 9,
242
  "id": "b77ad8d6-3289-463c-b213-1c0cc215b141",
243
  "metadata": {},
244
  "outputs": [
 
270
  },
271
  {
272
  "cell_type": "code",
273
+ "execution_count": 10,
274
  "id": "87fca48b-a16a-4d2b-919c-75e88e4e5eb5",
275
  "metadata": {},
276
  "outputs": [
 
342
  },
343
  {
344
  "cell_type": "code",
345
+ "execution_count": 11,
346
  "id": "d1352f28-f761-4e91-a9bc-4efe47552f4d",
347
  "metadata": {},
348
  "outputs": [],
 
391
  },
392
  {
393
  "cell_type": "code",
394
+ "execution_count": 12,
395
  "id": "8eb7e7d5-7121-4762-b8d1-e5a9539e2b36",
396
  "metadata": {},
397
  "outputs": [],
398
  "source": [
399
  "#| export\n",
400
+ "CLEAN_TEXT_SYSTEM_PROMPT = \"\"\"You are a helpful expert AI assistant. You are an autoregressive LLM. Your job is to take markdown that was created from a web page html and clean it up so it can be fed to a text to speech model. Remove all hyperlink URL's, navigation references, citations or complex formulas that are not useful when only listening to in audio format. It is also helpful to spell out dates, long numbers, abbreviations, acronym, units etc. For example if you see `50C` in the context of temperature change it to `50 degrees celsius`. If you see an acronym, for example NASA, please spell it out `National Aeronautics and Space Administration`. When you have finished your task please finish the text you return with <<COMPLETE>>. The maximum context length you can return in one shot is 4,000 tokens so you may get cut off. If that happens I will send you another message with the text <<CONTINUE>> and you should continue the task where you had previously left off. This is why I need you finish your response with <<COMPLETE>> once you have fully completed your task. DO NOT MODIFY THE TEXT IN ANY WAY EXCEPT FOR AS INSTRUCTED HERE.\"\"\"\n"
401
  ]
402
  },
403
  {
 
410
  },
411
  {
412
  "cell_type": "code",
413
+ "execution_count": 13,
414
  "id": "b5b29507-92bc-453d-bcc5-6402c17e9a0d",
415
  "metadata": {},
416
  "outputs": [],
 
430
  },
431
  {
432
  "cell_type": "code",
433
+ "execution_count": 14,
434
  "id": "24674094-4d47-4e48-b591-55faabcff8df",
435
  "metadata": {},
436
  "outputs": [],
 
475
  },
476
  {
477
  "cell_type": "code",
478
+ "execution_count": 15,
479
  "id": "e6224ae5-3792-42b2-8392-3abd42998a50",
480
  "metadata": {},
481
  "outputs": [],
 
514
  },
515
  {
516
  "cell_type": "code",
517
+ "execution_count": 16,
518
  "id": "4691703d-ed0f-4481-8006-b2906289b780",
519
  "metadata": {},
520
  "outputs": [
 
577
  },
578
  {
579
  "cell_type": "code",
580
+ "execution_count": 17,
581
  "id": "3420c868-71cb-4ac6-ac65-6f02bfd841d1",
582
  "metadata": {},
583
  "outputs": [
 
616
  },
617
  {
618
  "cell_type": "code",
619
+ "execution_count": 18,
620
  "id": "d0082383-9d03-4b25-b68a-080d0b28caa9",
621
  "metadata": {},
622
  "outputs": [],
 
637
  },
638
  {
639
  "cell_type": "code",
640
+ "execution_count": 19,
641
  "id": "649d90a5-9398-4cb5-a1e8-a464d463a11c",
642
  "metadata": {},
643
  "outputs": [],
 
648
  },
649
  {
650
  "cell_type": "code",
651
+ "execution_count": 20,
652
  "id": "e34bb4aa-698c-4452-8cda-bd02b38f7122",
653
  "metadata": {},
654
  "outputs": [],
 
701
  },
702
  {
703
  "cell_type": "code",
704
+ "execution_count": 21,
705
  "id": "ca2c6f8c-62ed-4ac1-9c2f-e3b2bfb47e8d",
706
  "metadata": {},
707
  "outputs": [],
 
713
  },
714
  {
715
  "cell_type": "code",
716
+ "execution_count": 22,
717
  "id": "236dd8d3-4364-4731-af93-7dcdec6f18a1",
718
  "metadata": {},
719
  "outputs": [],
 
725
  },
726
  {
727
  "cell_type": "code",
728
+ "execution_count": 23,
729
  "id": "0523a158-ee07-48b3-9350-ee39d4deee7f",
730
  "metadata": {},
731
  "outputs": [],
 
747
  },
748
  {
749
  "cell_type": "code",
750
+ "execution_count": 24,
751
  "id": "f4d1ba0b-6960-4e22-8dba-7de70370753a",
752
  "metadata": {},
753
  "outputs": [],
 
759
  },
760
  {
761
  "cell_type": "code",
762
+ "execution_count": 25,
763
  "id": "efa28cf2-548d-439f-bf2a-21a5edbf9eba",
764
  "metadata": {},
765
  "outputs": [],
 
772
  },
773
  {
774
  "cell_type": "code",
775
+ "execution_count": 26,
776
  "id": "cdc1dde5-5edd-4dbf-bd11-30eb418c571d",
777
  "metadata": {},
778
  "outputs": [],
 
784
  },
785
  {
786
  "cell_type": "code",
787
+ "execution_count": 27,
788
  "id": "035c33dd-c8e6-42b4-91d4-6bc5f1b36df3",
789
  "metadata": {},
790
  "outputs": [],
 
797
  },
798
  {
799
  "cell_type": "code",
800
+ "execution_count": 28,
801
  "id": "c97c03af-a377-42e1-93e0-1df957c0e4cc",
802
  "metadata": {},
803
  "outputs": [],
 
808
  " return '# Text Splits:\\n' + '<br>----------<br>'.join(output)"
809
  ]
810
  },
811
+ {
812
+ "cell_type": "code",
813
+ "execution_count": 29,
814
+ "id": "db54a6a6-4bdc-430a-b1ea-444c249b77fb",
815
+ "metadata": {},
816
+ "outputs": [],
817
+ "source": [
818
+ "#| export\n",
819
+ "def get_page_md(url):\n",
820
+ " # result = requests.get('https://r.jina.ai/'+urllib.parse.quote_plus(url))\n",
821
+ " result = requests.get('https://r.jina.ai/'+url)\n",
822
+ " result.raise_for_status()\n",
823
+ " return result.text"
824
+ ]
825
+ },
826
  {
827
  "cell_type": "code",
828
  "execution_count": 30,
829
+ "id": "75891855-6c08-4a42-9ad5-a02e0b43bb3d",
830
+ "metadata": {},
831
+ "outputs": [],
832
+ "source": [
833
+ "# test_page_md = get_page_md('https://simonwillison.net/2024/Jun/16/jina-ai-reader/')\n",
834
+ "# test_page_md"
835
+ ]
836
+ },
837
+ {
838
+ "cell_type": "code",
839
+ "execution_count": 31,
840
+ "id": "340089c7-0693-43bc-8fc0-cea4fcd0f3f0",
841
  "metadata": {},
842
  "outputs": [],
843
+ "source": [
844
+ "#| export\n",
845
+ "# import json\n",
846
+ "def clean_page_md(text):\n",
847
+ " max_iters = 15\n",
848
+ " complete = False\n",
849
+ " client = openai.OpenAI()\n",
850
+ "\n",
851
+ " tokens = 0\n",
852
+ " messages = messages=[\n",
853
+ " {\"role\": \"system\", \"content\": CLEAN_TEXT_SYSTEM_PROMPT},\n",
854
+ " {\"role\": \"user\", \"content\": text},\n",
855
+ " # {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
856
+ " # {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
857
+ " ]\n",
858
+ "\n",
859
+ " idx = 0\n",
860
+ " while complete == False and idx < max_iters:\n",
861
+ " idx += 1\n",
862
+ " response = client.chat.completions.create(\n",
863
+ " model=\"gpt-4o\",\n",
864
+ " messages=messages\n",
865
+ " )\n",
866
+ " # print(response,'\\n\\n\\n')\n",
867
+ " response_text = response.choices[0].message.content\n",
868
+ " if '<<complete>>' in response_text.lower():\n",
869
+ " complete = True\n",
870
+ " messages += [\n",
871
+ " {\"role\": \"assistant\", \"content\": response_text},\n",
872
+ " {\"role\": \"user\", \"content\": \"Please continue.\"},\n",
873
+ " ]\n",
874
+ " tokens += response.usage.total_tokens\n",
875
+ " # print(json.dumps(messages, indent=4))\n",
876
+ "\n",
877
+ " print('TOKENS CLEANUP:', tokens)\n",
878
+ " result = ' '.join([o['content'] for o in messages if o['role'] == 'assistant'])\n",
879
+ " \n",
880
+ " return result.replace('<<COMPLETE>>','')\n",
881
+ "# res = clean_page_md(test_page_md)\n",
882
+ "# res"
883
+ ]
884
+ },
885
+ {
886
+ "cell_type": "code",
887
+ "execution_count": 32,
888
+ "id": "d55dbe5b-83c6-4ba9-836c-48a181badd38",
889
+ "metadata": {},
890
+ "outputs": [],
891
+ "source": [
892
+ "# clean_page_md(get_page_md('https://www.ineteconomics.org/perspectives/blog/from-long-covid-odds-to-lost-iq-points-ongoing-threats-you-dont-know-about'))"
893
+ ]
894
+ },
895
+ {
896
+ "cell_type": "code",
897
+ "execution_count": 33,
898
+ "id": "7899e7b2-beeb-40a4-a571-a2ccfc7c9618",
899
+ "metadata": {},
900
+ "outputs": [],
901
+ "source": [
902
+ "#| export\n",
903
+ "def get_page_text(url):\n",
904
+ " return clean_page_md(get_page_md(url))"
905
+ ]
906
+ },
907
+ {
908
+ "cell_type": "code",
909
+ "execution_count": 34,
910
+ "id": "e4fb3159-579b-4271-bc96-4cd1e2816eca",
911
+ "metadata": {},
912
+ "outputs": [
913
+ {
914
+ "ename": "TypeError",
915
+ "evalue": "EventListener._setup.<locals>.event_trigger() got an unexpected keyword argument 'input'",
916
+ "output_type": "error",
917
+ "traceback": [
918
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
919
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
920
+ "Cell \u001b[0;32mIn[34], line 35\u001b[0m\n\u001b[1;32m 30\u001b[0m chunks_md \u001b[38;5;241m=\u001b[39m gr\u001b[38;5;241m.\u001b[39mMarkdown(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m,label\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mChunks\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;66;03m### Define UI Actions ###\u001b[39;00m\n\u001b[0;32m---> 35\u001b[0m \u001b[43mget_url_content_btn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclick\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mget_page_text\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_url\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_text\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;66;03m# input_text \u001b[39;00m\n\u001b[1;32m 38\u001b[0m input_text\u001b[38;5;241m.\u001b[39minput(fn\u001b[38;5;241m=\u001b[39mget_input_text_len, inputs\u001b[38;5;241m=\u001b[39minput_text, outputs\u001b[38;5;241m=\u001b[39minput_text_length)\n",
921
+ "\u001b[0;31mTypeError\u001b[0m: EventListener._setup.<locals>.event_trigger() got an unexpected keyword argument 'input'"
922
+ ]
923
+ }
924
+ ],
925
  "source": [
926
  "#| export\n",
927
  "with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:\n",
 
931
  " gr.Markdown(\"\"\"Start typing below and then click **Go** to create the speech from your text.\n",
932
  "For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href=\"https://matdmiller-tts-openai.hf.space/\" target=\"_blank\">Spaces Link HTML</a>\"\"\")\n",
933
  " with gr.Row():\n",
934
+ " input_url = gr.Textbox(max_lines=1, label=\"Optional - Enter a URL\")\n",
935
+ " get_url_content_btn = gr.Button(\"Get URL Contents\")\n",
936
+ " with gr.Row():\n",
937
  " input_text = gr.Textbox(max_lines=100, label=\"Enter text here\")\n",
938
  " with gr.Row():\n",
939
  " tts_provider_dropdown = gr.Dropdown(value=DEFAULT_PROVIDER,\n",
 
956
  " \n",
957
  "\n",
958
  " ### Define UI Actions ###\n",
959
+ "\n",
960
+ " get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)\n",
961
  " \n",
962
  " # input_text \n",
963
  " input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)\n",
 
993
  },
994
  {
995
  "cell_type": "code",
996
+ "execution_count": null,
997
  "id": "a00648a1-891b-470b-9959-f5d502055713",
998
  "metadata": {},
999
  "outputs": [],
 
1007
  },
1008
  {
1009
  "cell_type": "code",
1010
+ "execution_count": null,
1011
  "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
1012
  "metadata": {},
1013
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1014
  "source": [
1015
  "#| hide\n",
1016
  "#Notebook launch\n",
app.py CHANGED
@@ -2,10 +2,11 @@
2
 
3
  # %% auto 0
4
  __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'OPENAI_CLIENT_TTS_THREADS', 'CARTESIAAI_CLIENT_TTS_THREADS',
5
- 'DEFAULT_PROVIDER', 'DEFAULT_MODEL', 'DEFAULT_VOICE', 'providers', 'clean_text_prompt', 'launch_kwargs',
6
- 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio', 'create_speech_openai',
7
- 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len', 'get_generation_cost',
8
- 'get_model_choices', 'update_model_choices', 'get_voice_choices', 'update_voice_choices', 'split_text_as_md']
 
9
 
10
  # %% app.ipynb 4
11
  import os
@@ -69,6 +70,8 @@ from tenacity import (
69
  import traceback
70
  # from cartesia.tts import CartesiaTTS
71
  import cartesia
 
 
72
 
73
  # %% app.ipynb 11
74
  TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
@@ -121,7 +124,7 @@ except Exception as e:
121
  # providers
122
 
123
  # %% app.ipynb 19
124
- clean_text_prompt = """Your job is to clean up text that is going to be fed into a text to speech (TTS) model. You must remove parts of the text that would not normally be spoken such as reference marks `[1]`, spurious citations such as `(Reddy et al., 2021; Wu et al., 2022; Chang et al., 2022; Kondratyuk et al., 2023)` and any other part of the text that is not normally spoken. Please also clean up sections and headers so they are on new lines with proper numbering. You must also clean up any math formulas that are salvageable from being copied from a scientific paper. If they are garbled and do not make sense then remove them. You must carefully perform the text cleanup so it is translated into speech that is easy to listen to however you must not modify the text otherwise. It is critical that you repeat all of the text without modifications except for the cleanup activities you've been instructed to do. Also you must clean all of the text you are given, you may not omit any of it or stop the cleanup task early."""
125
 
126
 
127
  # %% app.ipynb 21
@@ -330,12 +333,66 @@ def split_text_as_md(*args, **kwargs):
330
  return '# Text Splits:\n' + '<br>----------<br>'.join(output)
331
 
332
  # %% app.ipynb 38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
334
 
335
  ### Define UI ###
336
  gr.Markdown("# TTS")
337
  gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
338
  For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>""")
 
 
 
339
  with gr.Row():
340
  input_text = gr.Textbox(max_lines=100, label="Enter text here")
341
  with gr.Row():
@@ -359,6 +416,8 @@ For requests longer than allowed by the API they will be broken into chunks auto
359
 
360
 
361
  ### Define UI Actions ###
 
 
362
 
363
  # input_text
364
  input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
@@ -391,13 +450,13 @@ For requests longer than allowed by the API they will be broken into chunks auto
391
 
392
 
393
 
394
- # %% app.ipynb 39
395
  # launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
396
  # 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
397
  launch_kwargs = {}
398
  queue_kwargs = {'default_concurrency_limit':10}
399
 
400
- # %% app.ipynb 41
401
  #.py launch
402
  if __name__ == "__main__":
403
  app.queue(**queue_kwargs)
 
2
 
3
  # %% auto 0
4
  __all__ = ['secret_import_failed', 'TEMP', 'TEMP_DIR', 'OPENAI_CLIENT_TTS_THREADS', 'CARTESIAAI_CLIENT_TTS_THREADS',
5
+ 'DEFAULT_PROVIDER', 'DEFAULT_MODEL', 'DEFAULT_VOICE', 'providers', 'CLEAN_TEXT_SYSTEM_PROMPT',
6
+ 'launch_kwargs', 'queue_kwargs', 'verify_authorization', 'split_text', 'concatenate_audio',
7
+ 'create_speech_openai', 'create_speech_cartesiaai', 'create_speech', 'get_input_text_len',
8
+ 'get_generation_cost', 'get_model_choices', 'update_model_choices', 'get_voice_choices',
9
+ 'update_voice_choices', 'split_text_as_md', 'get_page_md', 'clean_page_md', 'get_page_text']
10
 
11
  # %% app.ipynb 4
12
  import os
 
70
  import traceback
71
  # from cartesia.tts import CartesiaTTS
72
  import cartesia
73
+ import requests
74
+ import urllib
75
 
76
  # %% app.ipynb 11
77
  TEMP = os.environ.get('GRADIO_TEMP_DIR','/tmp/')
 
124
  # providers
125
 
126
  # %% app.ipynb 19
127
+ CLEAN_TEXT_SYSTEM_PROMPT = """You are a helpful expert AI assistant. You are an autoregressive LLM. Your job is to take markdown that was created from a web page html and clean it up so it can be fed to a text to speech model. Remove all hyperlink URL's, navigation references, citations or complex formulas that are not useful when only listening to in audio format. It is also helpful to spell out dates, long numbers, abbreviations, acronym, units etc. For example if you see `50C` in the context of temperature change it to `50 degrees celsius`. If you see an acronym, for example NASA, please spell it out `National Aeronautics and Space Administration`. When you have finished your task please finish the text you return with <<COMPLETE>>. The maximum context length you can return in one shot is 4,000 tokens so you may get cut off. If that happens I will send you another message with the text <<CONTINUE>> and you should continue the task where you had previously left off. This is why I need you finish your response with <<COMPLETE>> once you have fully completed your task. DO NOT MODIFY THE TEXT IN ANY WAY EXCEPT FOR AS INSTRUCTED HERE."""
128
 
129
 
130
  # %% app.ipynb 21
 
333
  return '# Text Splits:\n' + '<br>----------<br>'.join(output)
334
 
335
  # %% app.ipynb 38
336
def get_page_md(url, timeout=30):
    """Fetch a web page as markdown through the r.jina.ai reader endpoint.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.
        timeout: Seconds to wait for the reader service before giving up.

    Returns:
        The response body as text.

    Raises:
        ValueError: If ``url`` is empty or only whitespace.
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Reject blank input up front: otherwise the bare service root would be
    # fetched and its content handed on to the (paid) cleanup step.
    if not url or not url.strip():
        raise ValueError('A URL is required to fetch page contents.')

    # The target URL is appended unescaped. A urllib.parse.quote_plus variant
    # was tried earlier and left disabled -- presumably the reader expects the
    # raw URL (NOTE(review): confirm before re-enabling escaping).
    # Without a timeout, requests waits indefinitely on a stalled connection.
    result = requests.get('https://r.jina.ai/' + url.strip(), timeout=timeout)
    result.raise_for_status()
    return result.text
341
+
342
+ # %% app.ipynb 40
343
+ # import json
344
def clean_page_md(text, max_iters=15, model="gpt-4o"):
    """Clean page markdown for text-to-speech with an OpenAI chat model.

    The model is driven by ``CLEAN_TEXT_SYSTEM_PROMPT``, which asks it to end
    its output with ``<<COMPLETE>>`` and to resume when it receives
    ``<<CONTINUE>>``. Replies are requested until the marker appears or
    ``max_iters`` requests have been made, then joined into one string.

    Args:
        text: Markdown to clean.
        max_iters: Upper bound on the number of chat-completion requests.
        model: Name of the chat model to call.

    Returns:
        The concatenated model replies with the completion marker removed,
        or ``''`` when ``text`` is empty or only whitespace.
    """
    import re  # local import: this block cannot assume a file-level one

    # Nothing to clean: skip the API call entirely.
    if not text or not text.strip():
        return ''

    client = openai.OpenAI()
    messages = [
        {"role": "system", "content": CLEAN_TEXT_SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]

    tokens = 0
    replies = []
    for _ in range(max_iters):
        response = client.chat.completions.create(
            model=model,
            messages=messages,
        )
        # The message content can be None; treat that as an empty reply
        # instead of failing on .lower() below.
        response_text = response.choices[0].message.content or ''
        replies.append(response_text)
        tokens += response.usage.total_tokens

        if '<<complete>>' in response_text.lower():
            break

        # Use the exact continuation token the system prompt announces.
        messages += [
            {"role": "assistant", "content": response_text},
            {"role": "user", "content": "<<CONTINUE>>"},
        ]

    print('TOKENS CLEANUP:', tokens)
    result = ' '.join(replies)

    # Detection above ignores case, so removal must as well; otherwise a
    # marker such as '<<Complete>>' would be read aloud by the TTS model.
    return re.sub(r'<<complete>>', '', result, flags=re.IGNORECASE)
379
+ # res = clean_page_md(test_page_md)
380
+ # res
381
+
382
+ # %% app.ipynb 42
383
def get_page_text(url):
    """Return the cleaned text for the page at ``url``.

    Passes the result of ``get_page_md(url)`` through ``clean_page_md``.
    """
    page_md = get_page_md(url)
    return clean_page_md(page_md)
385
+
386
+ # %% app.ipynb 43
387
  with gr.Blocks(title='TTS', head='TTS', delete_cache=(3600,3600)) as app:
388
 
389
  ### Define UI ###
390
  gr.Markdown("# TTS")
391
  gr.Markdown("""Start typing below and then click **Go** to create the speech from your text.
392
  For requests longer than allowed by the API they will be broken into chunks automatically. [Spaces Link](https://matdmiller-tts-openai.hf.space/) | <a href="https://matdmiller-tts-openai.hf.space/" target="_blank">Spaces Link HTML</a>""")
393
+ with gr.Row():
394
+ input_url = gr.Textbox(max_lines=1, label="Optional - Enter a URL")
395
+ get_url_content_btn = gr.Button("Get URL Contents")
396
  with gr.Row():
397
  input_text = gr.Textbox(max_lines=100, label="Enter text here")
398
  with gr.Row():
 
416
 
417
 
418
  ### Define UI Actions ###
419
+
420
+ get_url_content_btn.click(fn=get_page_text, inputs=input_url, outputs=input_text)
421
 
422
  # input_text
423
  input_text.input(fn=get_input_text_len, inputs=input_text, outputs=input_text_length)
 
450
 
451
 
452
 
453
+ # %% app.ipynb 44
454
  # launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
455
  # 'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
456
  launch_kwargs = {}
457
  queue_kwargs = {'default_concurrency_limit':10}
458
 
459
+ # %% app.ipynb 46
460
  #.py launch
461
  if __name__ == "__main__":
462
  app.queue(**queue_kwargs)