Spaces:

matdmiller
/

tts-openai

Runtime error

App Files Files Community

matdmiller committed on Jan 31, 2024

Commit

d9a62b3

1 Parent(s): 990159e

added text chunking for text over 4,000 chars

Browse files

Files changed (4) hide show

app.ipynb +125 -99
app.py +110 -13
packages.txt +1 -0
requirements.txt +3 -2

app.ipynb CHANGED Viewed

@@ -15,16 +15,7 @@
    "execution_count": null,
    "id": "667802a7-0f36-4136-a381-e66210b20462",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "OPENAI_API_KEY var not found. Trying import tts_openai_secrets\n",
-      "import tts_openai_secrets succeeded\n"
-     ]
-    }
-   ],
    "source": [
     "#| export\n",
     "#tts_openai_secrets.py content:\n",
@@ -74,7 +65,9 @@
    "source": [
     "#| export\n",
     "import gradio as gr\n",
-    "import openai"
    ]
   },
   {
@@ -82,15 +75,7 @@
    "execution_count": null,
    "id": "0ffd33b4-cb9b-4c01-bff6-4c3102854ab6",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "successfully got tts model list: ['tts-1-hd', 'tts-1-hd-1106', 'canary-tts', 'tts-1', 'tts-1-1106']\n"
-     ]
-    }
-   ],
    "source": [
     "#| export\n",
     "try:\n",
@@ -111,6 +96,85 @@
     "tts_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -119,16 +183,46 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "def create_speech(input_text, model='tts-1', voice='alloy'):\n",
     "    client = openai.OpenAI()\n",
-    "    response = client.audio.speech.create(\n",
-    "        model=model,\n",
-    "        voice=voice,\n",
-    "        input=input_text,\n",
-    "        speed=1.0\n",
-    "    )\n",
     "    client.close()\n",
-    "    return response.content"
    ]
   },
   {
@@ -186,37 +280,7 @@
    "execution_count": null,
    "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running on local URL:  http://0.0.0.0:7860\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://localhost:7860/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": []
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "#| hide\n",
     "#Notebook launch\n",
@@ -228,37 +292,7 @@
    "execution_count": null,
    "id": "cb886d45",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Running on local URL:  http://0.0.0.0:7861\n",
-      "\n",
-      "To create a public link, set `share=True` in `launch()`.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div><iframe src=\"http://localhost:7861/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-      ],
-      "text/plain": [
-       "<IPython.core.display.HTML object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/plain": []
-     },
-     "execution_count": null,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
    "source": [
     "#| export\n",
     "#.py launch\n",
@@ -271,15 +305,7 @@
    "execution_count": null,
    "id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Closing server running on port: 7861\n"
-     ]
-    }
-   ],
    "source": [
     "#| hide\n",
     "app.close()"

    "execution_count": null,
    "id": "667802a7-0f36-4136-a381-e66210b20462",
    "metadata": {},
+   "outputs": [],
    "source": [
     "#| export\n",
     "#tts_openai_secrets.py content:\n",
    "source": [
     "#| export\n",
     "import gradio as gr\n",
+    "import openai\n",
+    "from pydub import AudioSegment\n",
+    "import io"
    ]
   },
   {
    "execution_count": null,
    "id": "0ffd33b4-cb9b-4c01-bff6-4c3102854ab6",
    "metadata": {},
+   "outputs": [],
    "source": [
     "#| export\n",
     "try:\n",
     "tts_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24674094-4d47-4e48-b591-55faabcff8df",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def split_text(input_text, max_length=4000, lookback=1000):\n",
+    "    # If the text is shorter than the max_length, return it as is\n",
+    "    if len(input_text) <= max_length:\n",
+    "        return [input_text]\n",
+    "\n",
+    "    chunks = []\n",
+    "    while input_text:\n",
+    "        # Check if the remaining text is shorter than the max_length\n",
+    "        if len(input_text) <= max_length:\n",
+    "            chunks.append(input_text)\n",
+    "            break\n",
+    "\n",
+    "        # Define the split point, initially set to max_length\n",
+    "        split_point = max_length\n",
+    "\n",
+    "        # Look for a newline in the last 'lookback' characters\n",
+    "        newline_index = input_text.rfind('\\n', max_length-lookback, max_length)\n",
+    "        if newline_index != -1:\n",
+    "            split_point = newline_index + 1  # Include the newline in the current chunk\n",
+    "\n",
+    "        # If no newline, look for a period followed by space\n",
+    "        elif '. ' in input_text[max_length-lookback:max_length]:\n",
+    "            # Find the last '. ' in the lookback range\n",
+    "            period_index = input_text.rfind('. ', max_length-lookback, max_length)\n",
+    "            split_point = period_index + 2  # Split after the space\n",
+    "\n",
+    "        # Split the text and update the input_text\n",
+    "        chunks.append(input_text[:split_point])\n",
+    "        input_text = input_text[split_point:]\n",
+    "\n",
+    "    return chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e6224ae5-3792-42b2-8392-3abd42998a50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "def concatenate_mp3(mp3_files):\n",
+    "    if len(mp3_files) == 1:\n",
+    "        return mp3_files[0]\n",
+    "    else:\n",
+    "        # Initialize an empty AudioSegment object for concatenation\n",
+    "        combined = AudioSegment.empty()\n",
+    "        \n",
+    "        # Write out audio file responses as individual files for debugging\n",
+    "        # for idx, mp3_data in enumerate(mp3_files):\n",
+    "        #     with open(f'./{idx}.mp3', 'wb') as f:\n",
+    "        #         f.write(mp3_data)\n",
+    "\n",
+    "        # Loop through the list of mp3 binary data\n",
+    "        for mp3_data in mp3_files:\n",
+    "            # Convert binary data to an audio segment\n",
+    "            audio_segment = AudioSegment.from_file(io.BytesIO(mp3_data), format=\"mp3\")\n",
+    "            # Concatenate this segment to the combined segment\n",
+    "            combined += audio_segment\n",
+    "\n",
+    "        # Export the combined segment to a new mp3 file\n",
+    "        # Use a BytesIO object to handle this in memory\n",
+    "        combined_mp3 = io.BytesIO()\n",
+    "        combined.export(combined_mp3, format=\"mp3\")\n",
+    "\n",
+    "        # Seek to the start so it's ready for reading\n",
+    "        combined_mp3.seek(0)\n",
+    "\n",
+    "        return combined_mp3.getvalue()"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "outputs": [],
    "source": [
     "#| export\n",
+    "def create_speech(input_text, model='tts-1', voice='alloy', progress=gr.Progress()):\n",
+    "    # Split the input text into chunks\n",
+    "    chunks = split_text(input_text)\n",
+    "\n",
+    "    # Initialize the progress bar\n",
+    "    progress(0, desc=\"Starting TTS processing...\")\n",
+    "\n",
+    "    # Initialize a list to hold the audio data of each chunk\n",
+    "    audio_data = []\n",
+    "\n",
+    "    # Create a client instance for OpenAI\n",
     "    client = openai.OpenAI()\n",
+    "\n",
+    "    # Calculate the progress increment for each chunk\n",
+    "    progress_increment = 1.0 / len(chunks)\n",
+    "\n",
+    "    # Process each chunk\n",
+    "    for i, chunk in enumerate(chunks):\n",
+    "        response = client.audio.speech.create(\n",
+    "            model=model,\n",
+    "            voice=voice,\n",
+    "            input=chunk,\n",
+    "            speed=1.0\n",
+    "        )\n",
+    "        # Append the audio content of the response to the list\n",
+    "        audio_data.append(response.content)\n",
+    "\n",
+    "        # Update the progress bar\n",
+    "        progress((i + 1) * progress_increment, desc=f\"Processing chunk {i + 1} of {len(chunks)}\")\n",
+    "\n",
+    "    # Close the client connection\n",
     "    client.close()\n",
+    "\n",
+    "    # Concatenate the audio data from all chunks\n",
+    "    combined_audio = concatenate_mp3(audio_data)\n",
+    "\n",
+    "    # Final update to the progress bar\n",
+    "    progress(1, desc=\"Processing completed\")\n",
+    "\n",
+    "    return combined_audio\n"
    ]
   },
   {
    "execution_count": null,
    "id": "4b534fe7-4337-423e-846a-1bdb7cccc4ea",
    "metadata": {},
+   "outputs": [],
    "source": [
     "#| hide\n",
     "#Notebook launch\n",
    "execution_count": null,
    "id": "cb886d45",
    "metadata": {},
+   "outputs": [],
    "source": [
     "#| export\n",
     "#.py launch\n",
    "execution_count": null,
    "id": "28e8d888-e790-46fa-bbac-4511b9ab796c",
    "metadata": {},
+   "outputs": [],
    "source": [
     "#| hide\n",
     "app.close()"

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
 # %% auto 0
-__all__ = ['secret_import_failed', 'tts_voices', 'launch_kwargs', 'create_speech', 'get_input_text_len']
 # %% app.ipynb 1
 #tts_openai_secrets.py content:
@@ -30,6 +31,8 @@ if secret_import_failed == True:
 # %% app.ipynb 3
 import gradio as gr
 import openai
 # %% app.ipynb 4
 try:
@@ -42,22 +45,116 @@ except:
 tts_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
 # %% app.ipynb 6
-def create_speech(input_text, model='tts-1', voice='alloy'):
     client = openai.OpenAI()
-    response = client.audio.speech.create(
-        model=model,
-        voice=voice,
-        input=input_text,
-        speed=1.0
-    )
     client.close()
-    return response.content
-# %% app.ipynb 7
 def get_input_text_len(input_text):
     return len(input_text)
-# %% app.ipynb 8
 with gr.Blocks(title='OpenAI TTS', head='OpenAI TTS') as app:
     gr.Markdown("# OpenAI TTS")
     gr.Markdown("Start typing below and then click **Go** to create the speech from your text. The current limit is 4,000 characters.")
@@ -75,11 +172,11 @@ with gr.Blocks(title='OpenAI TTS', head='OpenAI TTS') as app:
     clear_btn.click(fn=lambda: '', outputs=input_text)
-# %% app.ipynb 9
 launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
                  'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
-# %% app.ipynb 11
 #.py launch
 if __name__ == "__main__":
     app.launch(**launch_kwargs)

 # AUTOGENERATED! DO NOT EDIT! File to edit: app.ipynb.
 # %% auto 0
+__all__ = ['secret_import_failed', 'tts_voices', 'launch_kwargs', 'split_text', 'concatenate_mp3', 'create_speech',
+           'get_input_text_len']
 # %% app.ipynb 1
 #tts_openai_secrets.py content:
 # %% app.ipynb 3
 import gradio as gr
 import openai
+from pydub import AudioSegment
+import io
 # %% app.ipynb 4
 try:
 tts_voices = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
 # %% app.ipynb 6
def split_text(input_text, max_length=4000, lookback=1000):
    """Split *input_text* into consecutive chunks of at most *max_length* characters.

    Each split prefers a natural break found in the last *lookback* characters
    of the current window: first the last newline, otherwise the last
    ``'. '`` (period followed by a space). When neither is present the text is
    cut exactly at *max_length*. No characters are dropped, so
    ``''.join(split_text(text)) == text``.

    Args:
        input_text: The text to split.
        max_length: Maximum number of characters per chunk; must be positive.
        lookback: How many characters before *max_length* to search for a
            natural break point.

    Returns:
        A list of chunks in order. Text no longer than *max_length*
        (including the empty string) is returned as a single-element list.

    Raises:
        ValueError: If *max_length* is not positive (a zero-length window
            could never consume the text).
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    # Clamp the window start: a negative index would be interpreted by
    # str.rfind relative to the END of the string, silently emptying the
    # search window whenever lookback > max_length.
    window_start = max(0, max_length - lookback)

    chunks = []
    remaining = input_text
    while len(remaining) > max_length:
        # Default to a hard cut at max_length
        split_point = max_length

        newline_index = remaining.rfind('\n', window_start, max_length)
        if newline_index != -1:
            split_point = newline_index + 1  # Keep the newline in the current chunk
        else:
            period_index = remaining.rfind('. ', window_start, max_length)
            if period_index != -1:
                split_point = period_index + 2  # Split after the space

        chunks.append(remaining[:split_point])
        remaining = remaining[split_point:]

    # Whatever is left fits in one chunk. It is never empty after a split,
    # because a split only happens while more than max_length characters remain.
    chunks.append(remaining)
    return chunks
+# %% app.ipynb 7
def concatenate_mp3(mp3_files):
    """Join several in-memory MP3 payloads into a single MP3 payload.

    Args:
        mp3_files: Sequence of MP3-encoded byte strings, in playback order.

    Returns:
        bytes: ``b""`` for an empty sequence; the sole element unchanged (no
        re-encoding) for a one-element sequence; otherwise the MP3 produced by
        decoding every element with pydub, appending the segments in order
        and exporting the result.
    """
    # Nothing to join: avoid asking the encoder to export an empty segment
    if not mp3_files:
        return b""

    # A single payload needs no decode/re-encode round trip
    if len(mp3_files) == 1:
        return mp3_files[0]

    combined = AudioSegment.empty()
    for mp3_data in mp3_files:
        # Decode the raw bytes from memory and append to the running result
        combined += AudioSegment.from_file(io.BytesIO(mp3_data), format="mp3")

    # Export into memory rather than to a file on disk
    combined_mp3 = io.BytesIO()
    combined.export(combined_mp3, format="mp3")

    # getvalue() returns the whole buffer regardless of the stream position,
    # so no seek is required first
    return combined_mp3.getvalue()
+# %% app.ipynb 8
def create_speech(input_text, model='tts-1', voice='alloy', progress=gr.Progress()):
    """Convert *input_text* to speech and return the combined audio.

    The text is split with ``split_text``, each chunk is sent in its own
    ``client.audio.speech.create`` request, and the per-chunk audio is joined
    with ``concatenate_mp3``.

    Args:
        input_text: Text to synthesize.
        model: TTS model name forwarded to the API.
        voice: Voice name forwarded to the API.
        progress: Gradio progress tracker; it is called with the completed
            fraction and a description after each chunk.

    Returns:
        The value returned by ``concatenate_mp3`` for the collected
        ``response.content`` payloads.
    """
    # Split the input text into chunks
    chunks = split_text(input_text)
    total_chunks = len(chunks)

    # Initialize the progress bar
    progress(0, desc="Starting TTS processing...")

    # Audio data of each chunk, in order
    audio_data = []

    client = openai.OpenAI()
    try:
        for chunk_number, chunk in enumerate(chunks, start=1):
            response = client.audio.speech.create(
                model=model,
                voice=voice,
                input=chunk,
                speed=1.0
            )
            audio_data.append(response.content)

            # Update the progress bar
            progress(chunk_number / total_chunks,
                     desc=f"Processing chunk {chunk_number} of {total_chunks}")
    finally:
        # Close the client even when a request fails part-way through,
        # so a failed call does not leak the connection
        client.close()

    # Concatenate the audio data from all chunks
    combined_audio = concatenate_mp3(audio_data)

    # Final update to the progress bar
    progress(1, desc="Processing completed")

    return combined_audio
+# %% app.ipynb 9
 def get_input_text_len(input_text):
     """Return the number of characters in *input_text*."""
     return len(input_text)
+# %% app.ipynb 10
 with gr.Blocks(title='OpenAI TTS', head='OpenAI TTS') as app:
     gr.Markdown("# OpenAI TTS")
     gr.Markdown("Start typing below and then click **Go** to create the speech from your text. The current limit is 4,000 characters.")
     clear_btn.click(fn=lambda: '', outputs=input_text)
+# %% app.ipynb 11
 launch_kwargs = {'auth':('username',GRADIO_PASSWORD),
                  'auth_message':'Please log in to Mat\'s TTS App with username: username and password.'}
+# %% app.ipynb 13
 #.py launch
 if __name__ == "__main__":
     app.launch(**launch_kwargs)

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
-openai==1.3.5
-gradio==4.7.1

+openai==1.10.0
+gradio==4.16.0
+pydub==0.25.1