Spaces:
Running
Running
Andrei Kulchyk
committed on
Commit
·
ddb099d
1
Parent(s):
01dd521
Add backbone Gradio application
Browse files- .gitignore +3 -1
- app.py +237 -0
- requirements.txt +2 -0
.gitignore
CHANGED
@@ -1,3 +1,5 @@
|
|
1 |
__pycache__
|
2 |
data/books
|
3 |
-
.env
|
|
|
|
|
|
1 |
__pycache__
|
2 |
data/books
|
3 |
+
.env
|
4 |
+
venv
|
5 |
+
.python-version
|
app.py
ADDED
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
|
5 |
+
import librosa
|
6 |
+
import requests
|
7 |
+
import gradio as gr
|
8 |
+
import pandas as pd
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from openai import OpenAI
|
11 |
+
|
12 |
+
|
13 |
+
# Load variables from a local .env file (if present) into the process
# environment before any of them are read below.
load_dotenv()


# AI/ML API key read from the environment; ``None`` when the variable is unset.
api_key = os.getenv("AIML_API_KEY")


# System prompt asking the LLM to label each listed character as "M" or "F"
# and to answer with a JSON object keyed by character name.
CHARACTER_CLASSIFICATION_PROMPT = """
**Task:**
Analyze the provided story text and classify each character in the given list \
by their gender. Use `"M"` for Male and `"F"` for Female. Classify the \
characters based on contextual clues such as names, pronouns, descriptions, \
roles, and interactions within the story.

**Output Format:**
Provide the classification in a JSON object where each key is a character's \
name, and the value is `"M"` or `"F"`.

**Example Input:**
```
### Story
Once upon a time Alice met Bob and Charlie.

### Characters
["alice", "bob", "charlie"]
```

**Example Output:**
```json
{
"alice": "F",
"bob": "M",
"charlie": "M"
}
```
"""


# System prompt asking the LLM to prefix every segment of the text with a
# bracketed speaker tag (``[Narrator]`` or ``[Character Name]``).
TEXT_ANNOTATION_PROMPT = """\
**Task:**
Analyze the provided text and annotate each segment by indicating whether it is \
part of the narration or spoken by a specific character. Use "Narrator" for \
narration and the character's name for dialogues. Format the annotated text in a \
clear and consistent manner, suitable for subsequent text-to-speech processing.

**Formatting Guidelines:**

- Narration: Prefix with `[Narrator]`
- Character Dialogue: Prefix with `[Character Name]`
- Multiple Characters Speaking: Prefix with `[Character Name 1] [Character Name 2] ... [Character Name N]`
- Consistent Line Breaks: Ensure each labeled segment starts on a new line for clarity.
"""


# Sample story bundled with the project. The encoding is given explicitly so
# decoding does not depend on the platform's default locale.
with open("data/the-three-little-pigs.txt", encoding="utf-8") as f:
    STORY = f.read()


# Catalogue of available voices, restricted to English-language entries.
VOICES = pd.read_csv("all_voices.csv").query("language == 'en'")
|
70 |
+
|
71 |
+
|
72 |
+
class AudiobookBuilder:
|
73 |
+
def __init__(
|
74 |
+
self,
|
75 |
+
*,
|
76 |
+
aiml_api_key: str | None = None,
|
77 |
+
aiml_base_url: str = "https://api.aimlapi.com/v1",
|
78 |
+
eleven_api_key: str | None = None,
|
79 |
+
) -> None:
|
80 |
+
self._aiml_api_key = aiml_api_key or os.environ["AIML_API_KEY"]
|
81 |
+
self._aiml_base_url = aiml_base_url
|
82 |
+
self._aiml_client = OpenAI(api_key=api_key, base_url=self._aiml_base_url)
|
83 |
+
self._default_narrator_voice = "XALcFq0WF65uNKzmpcZW"
|
84 |
+
self._eleven_api_key = eleven_api_key or os.environ["ELEVEN_API_KEY"]
|
85 |
+
|
86 |
+
def annotate_text(self, text: str) -> str:
|
87 |
+
response = self._send_request_to_llm(messages=[
|
88 |
+
{
|
89 |
+
"role": "system",
|
90 |
+
"content": TEXT_ANNOTATION_PROMPT,
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"role": "user",
|
94 |
+
"content": text,
|
95 |
+
}
|
96 |
+
])
|
97 |
+
return response["choices"][0]["message"]["content"]
|
98 |
+
|
99 |
+
def classify_characters(self, annotated_text: str, unique_characters: list[str]) -> dict:
|
100 |
+
response = self._send_request_to_llm(
|
101 |
+
messages=[
|
102 |
+
{
|
103 |
+
"role": "system",
|
104 |
+
"content": CHARACTER_CLASSIFICATION_PROMPT,
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"role": "user",
|
108 |
+
"content": f"### Story\n\n{annotated_text}\n\n### Characters\n\n{unique_characters}",
|
109 |
+
},
|
110 |
+
],
|
111 |
+
response_format={"type": "json_object"},
|
112 |
+
)
|
113 |
+
return json.loads(response["choices"][0]["message"]["content"])
|
114 |
+
|
115 |
+
def generate_audio(
|
116 |
+
self,
|
117 |
+
annotated_text: str,
|
118 |
+
character_to_voice: dict[str, str],
|
119 |
+
*,
|
120 |
+
chunk_size: int = 1024,
|
121 |
+
) -> None:
|
122 |
+
current_character = "narrator"
|
123 |
+
with open("audiobook.mp3", "wb") as ab:
|
124 |
+
for line in annotated_text.splitlines():
|
125 |
+
cleaned_line = line.strip().lower()
|
126 |
+
if not cleaned_line:
|
127 |
+
continue
|
128 |
+
try:
|
129 |
+
current_character = re.findall(r"\[[\w\s]+\]", cleaned_line)[0][1:-1]
|
130 |
+
except:
|
131 |
+
pass
|
132 |
+
voice_id = character_to_voice[current_character]
|
133 |
+
character_text = cleaned_line[cleaned_line.rfind("]")+1:].lstrip()
|
134 |
+
fragment = self._send_request_to_tts(voice_id=voice_id, text=character_text)
|
135 |
+
for chunk in fragment.iter_content(chunk_size=chunk_size):
|
136 |
+
if chunk:
|
137 |
+
ab.write(chunk)
|
138 |
+
|
139 |
+
@staticmethod
|
140 |
+
def get_unique_characters(annotated_text: str) -> list[str]:
|
141 |
+
characters = set[str]()
|
142 |
+
for line in annotated_text.splitlines():
|
143 |
+
cleaned_line = line.strip().lower()
|
144 |
+
if not cleaned_line.startswith("["):
|
145 |
+
continue
|
146 |
+
line_characters = re.findall(r"\[[\w\s]+\]", cleaned_line)
|
147 |
+
characters = characters.union(ch[1:-1] for ch in line_characters)
|
148 |
+
return list(characters - {"narrator"})
|
149 |
+
|
150 |
+
def map_characters_to_voices(self, character_to_gender: dict[str, str]) -> dict[str, str]:
|
151 |
+
character_to_voice = {"narrator": self._default_narrator_voice}
|
152 |
+
|
153 |
+
# Damy vperyod!
|
154 |
+
f_characters = [character for character, gender in character_to_gender.items() if gender.strip().lower() == "f"]
|
155 |
+
if f_characters:
|
156 |
+
f_voices = VOICES.query("gender == 'female'").iloc[:len(f_characters)].copy()
|
157 |
+
f_voices["character"] = f_characters
|
158 |
+
character_to_voice |= f_voices.set_index("character")["voice_id"].to_dict()
|
159 |
+
|
160 |
+
m_characters = [character for character, gender in character_to_gender.items() if gender.strip().lower() == "m"]
|
161 |
+
if m_characters:
|
162 |
+
m_voices = VOICES.query("gender == 'male'").iloc[:len(m_characters)].copy()
|
163 |
+
m_voices["character"] = m_characters
|
164 |
+
character_to_voice |= m_voices.set_index("character")["voice_id"].to_dict()
|
165 |
+
|
166 |
+
return character_to_voice
|
167 |
+
|
168 |
+
def _send_request_to_llm(self, messages: list[dict], **kwargs) -> dict:
|
169 |
+
response = requests.post(
|
170 |
+
url=f"{self._aiml_base_url}/chat/completions",
|
171 |
+
headers={
|
172 |
+
"Authorization": f"Bearer {self._aiml_api_key}",
|
173 |
+
"Content-Type": "application/json",
|
174 |
+
},
|
175 |
+
data=json.dumps({
|
176 |
+
"model": "gpt-4o",
|
177 |
+
"temperature": 0.0,
|
178 |
+
"messages": messages,
|
179 |
+
"stream": False,
|
180 |
+
"max_tokens": 16_384,
|
181 |
+
**kwargs,
|
182 |
+
}),
|
183 |
+
)
|
184 |
+
response.raise_for_status()
|
185 |
+
return response.json()
|
186 |
+
|
187 |
+
def _send_request_to_tts(self, voice_id: str, text: str):
|
188 |
+
url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
|
189 |
+
headers = {
|
190 |
+
"Accept": "audio/mpeg",
|
191 |
+
"Content-Type": "application/json",
|
192 |
+
"xi-api-key": self._eleven_api_key,
|
193 |
+
}
|
194 |
+
data = {
|
195 |
+
"text": text,
|
196 |
+
"model_id": "eleven_monolingual_v1",
|
197 |
+
"voice_settings": {
|
198 |
+
"stability": 0.5,
|
199 |
+
"similarity_boost": 0.5
|
200 |
+
}
|
201 |
+
}
|
202 |
+
response = requests.post(url, json=data, headers=headers)
|
203 |
+
response.raise_for_status()
|
204 |
+
return response
|
205 |
+
|
206 |
+
|
207 |
+
def respond(text):
    """Generate an audiobook for ``text`` and return it for the Gradio player.

    Args:
        text: Story text entered by the user.

    Returns:
        A ``(sample_rate, samples)`` tuple loaded from the generated
        ``audiobook.mp3`` file.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
    """
    # Reject empty input up front instead of spending LLM/TTS requests on it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert into an audiobook.")

    builder = AudiobookBuilder()

    annotated_text = builder.annotate_text(text)
    unique_characters = builder.get_unique_characters(annotated_text)
    # Classify against the annotated text: the character names were extracted
    # from its speaker tags, so that is where they are guaranteed to appear.
    # Skip the LLM call entirely when the story has no tagged characters.
    character_to_gender = (
        builder.classify_characters(annotated_text, unique_characters)
        if unique_characters
        else {}
    )
    character_to_voice = builder.map_characters_to_voices(character_to_gender)
    builder.generate_audio(annotated_text, character_to_voice)

    # sr=None asks librosa not to resample the decoded audio.
    audio, sr = librosa.load("audiobook.mp3", sr=None)
    return (sr, audio)
|
218 |
+
|
219 |
+
|
220 |
+
# Page layout: a heading, a large text box for the book text, an audio player
# for the result, and a button that runs `respond` on the entered text.
with gr.Blocks(title="Audiobooks Generation") as ui:
    gr.Markdown("# Audiobooks Generation")

    with gr.Row(variant="panel"):
        text_input = gr.Textbox(lines=20, label="Enter the book text")

    with gr.Row(variant="panel"):
        audio_output = gr.Audio(label="Generated audio")

    submit_button = gr.Button("Submit")
    # Clicking the button feeds the text box into `respond` and shows the
    # returned audio in the player.
    submit_button.click(fn=respond, inputs=[text_input], outputs=[audio_output])


# Start the Gradio server for the page defined above.
ui.launch()
|
requirements.txt
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
langchain
|
2 |
langchain-openai
|
3 |
langchain-community
|
|
|
4 |
jupyter
|
5 |
openai
|
|
|
6 |
elevenlabs
|
7 |
gradio
|
8 |
python-dotenv
|
|
|
1 |
langchain
|
2 |
langchain-openai
|
3 |
langchain-community
|
4 |
+
librosa
|
5 |
jupyter
|
6 |
openai
|
7 |
+
pandas
|
8 |
elevenlabs
|
9 |
gradio
|
10 |
python-dotenv
|