Andrei Kulchyk committed on
Commit
ddb099d
Β·
1 Parent(s): 01dd521

Add backbone Gradio application

Browse files
Files changed (3) hide show
  1. .gitignore +3 -1
  2. app.py +237 -0
  3. requirements.txt +2 -0
.gitignore CHANGED
@@ -1,3 +1,5 @@
1
  __pycache__
2
  data/books
3
- .env
 
 
 
1
  __pycache__
2
  data/books
3
+ .env
4
+ venv
5
+ .python-version
app.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+
5
+ import librosa
6
+ import requests
7
+ import gradio as gr
8
+ import pandas as pd
9
+ from dotenv import load_dotenv
10
+ from openai import OpenAI
11
+
12
+
13
+ load_dotenv()
14
+
15
+
16
+ api_key = os.getenv("AIML_API_KEY")
17
+
18
+
19
+ CHARACTER_CLASSIFICATION_PROMPT = """
20
+ **Task:**
21
+ Analyze the provided story text and classify each character in the given list \
22
+ by their gender. Use `"M"` for Male and `"F"` for Female. Classify the \
23
+ characters based on contextual clues such as names, pronouns, descriptions, \
24
+ roles, and interactions within the story.
25
+
26
+ **Output Format:**
27
+ Provide the classification in a JSON object where each key is a character's \
28
+ name, and the value is `"M"` or `"F"`.
29
+
30
+ **Example Input:**
31
+ ```
32
+ ### Story
33
+ Once upon a time Alice met Bob and Charlie.
34
+
35
+ ### Characters
36
+ ["alice", "bob", "charlie"]
37
+ ```
38
+
39
+ **Example Output:**
40
+ ```json
41
+ {
42
+ "alice": "F",
43
+ "bob": "M",
44
+ "charlie": "M"
45
+ }
46
+ """
47
+
48
+
49
+ TEXT_ANNOTATION_PROMPT = """\
50
+ **Task:**
51
+ Analyze the provided text and annotate each segment by indicating whether it is \
52
+ part of the narration or spoken by a specific character. Use "Narrator" for \
53
+ narration and the character's name for dialogues. Format the annotated text in a \
54
+ clear and consistent manner, suitable for subsequent text-to-speech processing.
55
+
56
+ **Formatting Guidelines:**
57
+
58
+ - Narration: Prefix with `[Narrator]`
59
+ - Character Dialogue: Prefix with `[Character Name]`
60
+ - Multiple Characters Speaking: Prefix with `[Character Name 1] [Character Name 2] ... [Character Name N]`
61
+ - Consistent Line Breaks: Ensure each labeled segment starts on a new line for clarity.
62
+ """
63
+
64
+
65
+ with open("data/the-three-little-pigs.txt") as f:
66
+ STORY = f.read()
67
+
68
+
69
+ VOICES = pd.read_csv("all_voices.csv").query("language == 'en'")
70
+
71
+
72
+ class AudiobookBuilder:
73
+ def __init__(
74
+ self,
75
+ *,
76
+ aiml_api_key: str | None = None,
77
+ aiml_base_url: str = "https://api.aimlapi.com/v1",
78
+ eleven_api_key: str | None = None,
79
+ ) -> None:
80
+ self._aiml_api_key = aiml_api_key or os.environ["AIML_API_KEY"]
81
+ self._aiml_base_url = aiml_base_url
82
+ self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
83
+ self._default_narrator_voice = "XALcFq0WF65uNKzmpcZW"
84
+ self._eleven_api_key = eleven_api_key or os.environ["ELEVEN_API_KEY"]
85
+
86
+ def annotate_text(self, text: str) -> str:
87
+ response = self._send_request_to_llm(messages=[
88
+ {
89
+ "role": "system",
90
+ "content": TEXT_ANNOTATION_PROMPT,
91
+ },
92
+ {
93
+ "role": "user",
94
+ "content": text,
95
+ }
96
+ ])
97
+ return response["choices"][0]["message"]["content"]
98
+
99
+ def classify_characters(self, annotated_text: str, unique_characters: list[str]) -> dict:
100
+ response = self._send_request_to_llm(
101
+ messages=[
102
+ {
103
+ "role": "system",
104
+ "content": CHARACTER_CLASSIFICATION_PROMPT,
105
+ },
106
+ {
107
+ "role": "user",
108
+ "content": f"### Story\n\n{annotated_text}\n\n### Characters\n\n{unique_characters}",
109
+ },
110
+ ],
111
+ response_format={"type": "json_object"},
112
+ )
113
+ return json.loads(response["choices"][0]["message"]["content"])
114
+
115
+ def generate_audio(
116
+ self,
117
+ annotated_text: str,
118
+ character_to_voice: dict[str, str],
119
+ *,
120
+ chunk_size: int = 1024,
121
+ ) -> None:
122
+ current_character = "narrator"
123
+ with open("audiobook.mp3", "wb") as ab:
124
+ for line in annotated_text.splitlines():
125
+ cleaned_line = line.strip().lower()
126
+ if not cleaned_line:
127
+ continue
128
+ try:
129
+ current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
130
+ except:
131
+ pass
132
+ voice_id = character_to_voice[current_character]
133
+ character_text = cleaned_line[cleaned_line.rfind("]")+1:].lstrip()
134
+ fragment = self._send_request_to_tts(voice_id=voice_id, text=character_text)
135
+ for chunk in fragment.iter_content(chunk_size=chunk_size):
136
+ if chunk:
137
+ ab.write(chunk)
138
+
139
+ @staticmethod
140
+ def get_unique_characters(annotated_text: str) -> list[str]:
141
+ characters = set[str]()
142
+ for line in annotated_text.splitlines():
143
+ cleaned_line = line.strip().lower()
144
+ if not cleaned_line.startswith("["):
145
+ continue
146
+ line_characters = re.findall(r"\[[\w\s]+\]", cleaned_line)
147
+ characters = characters.union(ch[1:-1] for ch in line_characters)
148
+ return list(characters - {"narrator"})
149
+
150
+ def map_characters_to_voices(self, character_to_gender: dict[str, str]) -> dict[str, str]:
151
+ character_to_voice = {"narrator": self._default_narrator_voice}
152
+
153
+ # Damy vperyod!
154
+ f_characters = [character for character, gender in character_to_gender.items() if gender.strip().lower() == "f"]
155
+ if f_characters:
156
+ f_voices = VOICES.query("gender == 'female'").iloc[:len(f_characters)].copy()
157
+ f_voices["character"] = f_characters
158
+ character_to_voice |= f_voices.set_index("character")["voice_id"].to_dict()
159
+
160
+ m_characters = [character for character, gender in character_to_gender.items() if gender.strip().lower() == "m"]
161
+ if m_characters:
162
+ m_voices = VOICES.query("gender == 'male'").iloc[:len(m_characters)].copy()
163
+ m_voices["character"] = m_characters
164
+ character_to_voice |= m_voices.set_index("character")["voice_id"].to_dict()
165
+
166
+ return character_to_voice
167
+
168
+ def _send_request_to_llm(self, messages: list[dict], **kwargs) -> dict:
169
+ response = requests.post(
170
+ url=f"{self._aiml_base_url}/chat/completions",
171
+ headers={
172
+ "Authorization": f"Bearer {self._aiml_api_key}",
173
+ "Content-Type": "application/json",
174
+ },
175
+ data=json.dumps({
176
+ "model": "gpt-4o",
177
+ "temperature": 0.0,
178
+ "messages": messages,
179
+ "stream": False,
180
+ "max_tokens": 16_384,
181
+ **kwargs,
182
+ }),
183
+ )
184
+ response.raise_for_status()
185
+ return response.json()
186
+
187
+ def _send_request_to_tts(self, voice_id: str, text: str):
188
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
189
+ headers = {
190
+ "Accept": "audio/mpeg",
191
+ "Content-Type": "application/json",
192
+ "xi-api-key": self._eleven_api_key,
193
+ }
194
+ data = {
195
+ "text": text,
196
+ "model_id": "eleven_monolingual_v1",
197
+ "voice_settings": {
198
+ "stability": 0.5,
199
+ "similarity_boost": 0.5
200
+ }
201
+ }
202
+ response = requests.post(url, json=data, headers=headers)
203
+ response.raise_for_status()
204
+ return response
205
+
206
+
207
+ def respond(text):
208
+ builder = AudiobookBuilder()
209
+
210
+ annotated_text = builder.annotate_text(text)
211
+ unique_characters = builder.get_unique_characters(annotated_text)
212
+ character_to_gender = builder.classify_characters(text, unique_characters)
213
+ character_to_voice = builder.map_characters_to_voices(character_to_gender)
214
+ builder.generate_audio(annotated_text, character_to_voice)
215
+
216
+ audio, sr = librosa.load("audiobook.mp3", sr=None)
217
+ return (sr, audio)
218
+
219
+
220
+ with gr.Blocks(title="Audiobooks Generation") as ui:
221
+ gr.Markdown("# Audiobooks Generation")
222
+
223
+ with gr.Row(variant="panel"):
224
+ text_input = gr.Textbox(label="Enter the book text", lines=20)
225
+
226
+ with gr.Row(variant="panel"):
227
+ audio_output = gr.Audio(label="Generated audio")
228
+
229
+ submit_button = gr.Button("Submit")
230
+ submit_button.click(
231
+ fn=respond,
232
+ inputs=[text_input],
233
+ outputs=[audio_output],
234
+ )
235
+
236
+
237
+ ui.launch()
requirements.txt CHANGED
@@ -1,8 +1,10 @@
1
  langchain
2
  langchain-openai
3
  langchain-community
 
4
  jupyter
5
  openai
 
6
  elevenlabs
7
  gradio
8
  python-dotenv
 
1
  langchain
2
  langchain-openai
3
  langchain-community
4
+ librosa
5
  jupyter
6
  openai
7
+ pandas
8
  elevenlabs
9
  gradio
10
  python-dotenv