Andrei Kulchyk committed on
Commit
ddb099d
Β·
1 Parent(s): 01dd521

Add backbone Gradio application

Browse files
Files changed (3) hide show
  1. .gitignore +3 -1
  2. app.py +237 -0
  3. requirements.txt +2 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  __pycache__
2
  data/books
3
- .env
 
 
 
1
  __pycache__
2
  data/books
3
+ .env
4
+ venv
5
+ .python-version
app.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+
5
+ import librosa
6
+ import requests
7
+ import gradio as gr
8
+ import pandas as pd
9
+ from dotenv import load_dotenv
10
+ from openai import OpenAI
11
+
12
+
13
+ load_dotenv()
14
+
15
+
16
+ api_key = os.getenv("AIML_API_KEY")
17
+
18
+
19
+ CHARACTER_CLASSIFICATION_PROMPT = """
20
+ **Task:**
21
+ Analyze the provided story text and classify each character in the given list \
22
+ by their gender. Use `"M"` for Male and `"F"` for Female. Classify the \
23
+ characters based on contextual clues such as names, pronouns, descriptions, \
24
+ roles, and interactions within the story.
25
+
26
+ **Output Format:**
27
+ Provide the classification in a JSON object where each key is a character's \
28
+ name, and the value is `"M"` or `"F"`.
29
+
30
+ **Example Input:**
31
+ ```
32
+ ### Story
33
+ Once upon a time Alice met Bob and Charlie.
34
+
35
+ ### Characters
36
+ ["alice", "bob", "charlie"]
37
+ ```
38
+
39
+ **Example Output:**
40
+ ```json
41
+ {
42
+ "alice": "F",
43
+ "bob": "M",
44
+ "charlie": "M"
45
+ }
46
+ """
47
+
48
+
49
+ TEXT_ANNOTATION_PROMPT = """\
50
+ **Task:**
51
+ Analyze the provided text and annotate each segment by indicating whether it is \
52
+ part of the narration or spoken by a specific character. Use "Narrator" for \
53
+ narration and the character's name for dialogues. Format the annotated text in a \
54
+ clear and consistent manner, suitable for subsequent text-to-speech processing.
55
+
56
+ **Formatting Guidelines:**
57
+
58
+ - Narration: Prefix with `[Narrator]`
59
+ - Character Dialogue: Prefix with `[Character Name]`
60
+ - Multiple Characters Speaking: Prefix with `[Character Name 1] [Character Name 2] ... [Character Name N]`
61
+ - Consistent Line Breaks: Ensure each labeled segment starts on a new line for clarity.
62
+ """
63
+
64
+
65
+ with open("data/the-three-little-pigs.txt") as f:
66
+ STORY = f.read()
67
+
68
+
69
+ VOICES = pd.read_csv("all_voices.csv").query("language == 'en'")
70
+
71
+
72
+ class AudiobookBuilder:
73
+ def __init__(
74
+ self,
75
+ *,
76
+ aiml_api_key: str | None = None,
77
+ aiml_base_url: str = "https://api.aimlapi.com/v1",
78
+ eleven_api_key: str | None = None,
79
+ ) -> None:
80
+ self._aiml_api_key = aiml_api_key or os.environ["AIML_API_KEY"]
81
+ self._aiml_base_url = aiml_base_url
82
+ self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
83
+ self._default_narrator_voice = "XALcFq0WF65uNKzmpcZW"
84
+ self._eleven_api_key = eleven_api_key or os.environ["ELEVEN_API_KEY"]
85
+
86
+ def annotate_text(self, text: str) -> str:
87
+ response = self._send_request_to_llm(messages=[
88
+ {
89
+ "role": "system",
90
+ "content": TEXT_ANNOTATION_PROMPT,
91
+ },
92
+ {
93
+ "role": "user",
94
+ "content": text,
95
+ }
96
+ ])
97
+ return response["choices"][0]["message"]["content"]
98
+
99
+ def classify_characters(self, annotated_text: str, unique_characters: list[str]) -> dict:
100
+ response = self._send_request_to_llm(
101
+ messages=[
102
+ {
103
+ "role": "system",
104
+ "content": CHARACTER_CLASSIFICATION_PROMPT,
105
+ },
106
+ {
107
+ "role": "user",
108
+ "content": f"### Story\n\n{annotated_text}\n\n### Characters\n\n{unique_characters}",
109
+ },
110
+ ],
111
+ response_format={"type": "json_object"},
112
+ )
113
+ return json.loads(response["choices"][0]["message"]["content"])
114
+
115
+ def generate_audio(
116
+ self,
117
+ annotated_text: str,
118
+ character_to_voice: dict[str, str],
119
+ *,
120
+ chunk_size: int = 1024,
121
+ ) -> None:
122
+ current_character = "narrator"
123
+ with open("audiobook.mp3", "wb") as ab:
124
+ for line in annotated_text.splitlines():
125
+ cleaned_line = line.strip().lower()
126
+ if not cleaned_line:
127
+ continue
128
+ try:
129
+ current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
130
+ except:
131
+ pass
132
+ voice_id = character_to_voice[current_character]
133
+ character_text = cleaned_line[cleaned_line.rfind("]")+1:].lstrip()
134
+ fragment = self._send_request_to_tts(voice_id=voice_id, text=character_text)
135
+ for chunk in fragment.iter_content(chunk_size=chunk_size):
136
+ if chunk:
137
+ ab.write(chunk)
138
+
139
+ @staticmethod
140
+ def get_unique_characters(annotated_text: str) -> list[str]:
141
+ characters = set[str]()
142
+ for line in annotated_text.splitlines():
143
+ cleaned_line = line.strip().lower()
144
+ if not cleaned_line.startswith("["):
145
+ continue
146
+ line_characters = re.findall(r"\[[\w\s]+\]", cleaned_line)
147
+ characters = characters.union(ch[1:-1] for ch in line_characters)
148
+ return list(characters - {"narrator"})
149
+
150
+ def map_characters_to_voices(self, character_to_gender: dict[str, str]) -> dict[str, str]:
151
+ character_to_voice = {"narrator": self._default_narrator_voice}
152
+
153
+ # Damy vperyod!
154
+ f_characters = [character for character, gender in character_to_gender.items() if gender.strip().lower() == "f"]
155
+ if f_characters:
156
+ f_voices = VOICES.query("gender == 'female'").iloc[:len(f_characters)].copy()
157
+ f_voices["character"] = f_characters
158
+ character_to_voice |= f_voices.set_index("character")["voice_id"].to_dict()
159
+
160
+ m_characters = [character for character, gender in character_to_gender.items() if gender.strip().lower() == "m"]
161
+ if m_characters:
162
+ m_voices = VOICES.query("gender == 'male'").iloc[:len(m_characters)].copy()
163
+ m_voices["character"] = m_characters
164
+ character_to_voice |= m_voices.set_index("character")["voice_id"].to_dict()
165
+
166
+ return character_to_voice
167
+
168
+ def _send_request_to_llm(self, messages: list[dict], **kwargs) -> dict:
169
+ response = requests.post(
170
+ url=f"{self._aiml_base_url}/chat/completions",
171
+ headers={
172
+ "Authorization": f"Bearer {self._aiml_api_key}",
173
+ "Content-Type": "application/json",
174
+ },
175
+ data=json.dumps({
176
+ "model": "gpt-4o",
177
+ "temperature": 0.0,
178
+ "messages": messages,
179
+ "stream": False,
180
+ "max_tokens": 16_384,
181
+ **kwargs,
182
+ }),
183
+ )
184
+ response.raise_for_status()
185
+ return response.json()
186
+
187
+ def _send_request_to_tts(self, voice_id: str, text: str):
188
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
189
+ headers = {
190
+ "Accept": "audio/mpeg",
191
+ "Content-Type": "application/json",
192
+ "xi-api-key": self._eleven_api_key,
193
+ }
194
+ data = {
195
+ "text": text,
196
+ "model_id": "eleven_monolingual_v1",
197
+ "voice_settings": {
198
+ "stability": 0.5,
199
+ "similarity_boost": 0.5
200
+ }
201
+ }
202
+ response = requests.post(url, json=data, headers=headers)
203
+ response.raise_for_status()
204
+ return response
205
+
206
+
207
+ def respond(text):
208
+ builder = AudiobookBuilder()
209
+
210
+ annotated_text = builder.annotate_text(text)
211
+ unique_characters = builder.get_unique_characters(annotated_text)
212
+ character_to_gender = builder.classify_characters(text, unique_characters)
213
+ character_to_voice = builder.map_characters_to_voices(character_to_gender)
214
+ builder.generate_audio(annotated_text, character_to_voice)
215
+
216
+ audio, sr = librosa.load("audiobook.mp3", sr=None)
217
+ return (sr, audio)
218
+
219
+
220
+ with gr.Blocks(title="Audiobooks Generation") as ui:
221
+ gr.Markdown("# Audiobooks Generation")
222
+
223
+ with gr.Row(variant="panel"):
224
+ text_input = gr.Textbox(label="Enter the book text", lines=20)
225
+
226
+ with gr.Row(variant="panel"):
227
+ audio_output = gr.Audio(label="Generated audio")
228
+
229
+ submit_button = gr.Button("Submit")
230
+ submit_button.click(
231
+ fn=respond,
232
+ inputs=[text_input],
233
+ outputs=[audio_output],
234
+ )
235
+
236
+
237
+ ui.launch()
requirements.txt CHANGED
@@ -1,8 +1,10 @@
1
  langchain
2
  langchain-openai
3
  langchain-community
 
4
  jupyter
5
  openai
 
6
  elevenlabs
7
  gradio
8
  python-dotenv
 
1
  langchain
2
  langchain-openai
3
  langchain-community
4
+ librosa
5
  jupyter
6
  openai
7
+ pandas
8
  elevenlabs
9
  gradio
10
  python-dotenv