gabrielchua committed on
Commit
8ddd281
1 Parent(s): 5534c51
Files changed (3) hide show
  1. app.py +17 -2
  2. requirements.txt +2 -1
  3. utils.py +38 -22
app.py CHANGED
@@ -21,6 +21,21 @@ from pydub import AudioSegment
21
  from prompts import SYSTEM_PROMPT
22
  from utils import generate_script, generate_audio, parse_url
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  class DialogueItem(BaseModel):
26
  """A single dialogue item."""
@@ -139,7 +154,7 @@ def generate_podcast(
139
 
140
  # Get audio file path
141
  audio_file_path = generate_audio(
142
- line.text, line.speaker, language_mapping[language]
143
  )
144
  # Read the audio file into an AudioSegment
145
  audio_segment = AudioSegment.from_file(audio_file_path)
@@ -206,7 +221,7 @@ demo = gr.Interface(
206
  value="Medium (3-5 min)"
207
  ),
208
  gr.Dropdown(
209
- choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
210
  value="English",
211
  label="6. 🌐 Choose the language"
212
  ),
 
21
  from prompts import SYSTEM_PROMPT
22
  from utils import generate_script, generate_audio, parse_url
23
 
24
+ LANGUAGE_MAPPING = {
25
+ "English": "en",
26
+ "Chinese": "zh",
27
+ "French": "fr",
28
+ "German": "de",
29
+ "Hindi": "hi",
30
+ "Italian": "it",
31
+ "Japanese": "ja",
32
+ "Korean": "ko",
33
+ "Polish": "pl",
34
+ "Portuguese": "pt",
35
+ "Russian": "ru",
36
+ "Spanish": "es",
37
+ "Turkish": "tr"
38
+ }
39
 
40
  class DialogueItem(BaseModel):
41
  """A single dialogue item."""
 
154
 
155
  # Get audio file path
156
  audio_file_path = generate_audio(
157
+ line.text, line.speaker, LANGUAGE_MAPPING[language]
158
  )
159
  # Read the audio file into an AudioSegment
160
  audio_segment = AudioSegment.from_file(audio_file_path)
 
221
  value="Medium (3-5 min)"
222
  ),
223
  gr.Dropdown(
224
+ choices=list(LANGUAGE_MAPPING.keys()),
225
  value="English",
226
  label="6. 🌐 Choose the language"
227
  ),
requirements.txt CHANGED
@@ -8,4 +8,5 @@ pypdf==4.1
8
  sentry-sdk==2.5
9
  spaces==0.30.2
10
 
11
- tenacity==8.3
 
 
8
  sentry-sdk==2.5
9
  spaces==0.30.2
10
 
11
+ tenacity==8.3
12
+ git+https://github.com/suno-ai/bark.git
utils.py CHANGED
@@ -9,11 +9,13 @@ Functions:
9
 
10
  import os
11
  import requests
12
-
13
  from gradio_client import Client
14
  from openai import OpenAI
15
  from pydantic import ValidationError
16
 
 
 
 
17
  MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
18
  JINA_URL = "https://r.jina.ai/"
19
 
@@ -22,7 +24,10 @@ client = OpenAI(
22
  api_key=os.getenv("FIREWORKS_API_KEY"),
23
  )
24
 
25
- hf_client = Client("mrfakename/MeloTTS")
 
 
 
26
 
27
 
28
  def generate_script(system_prompt: str, input_text: str, output_model):
@@ -73,23 +78,34 @@ def parse_url(url: str) -> str:
73
  return response.text
74
 
75
 
76
- def generate_audio(text: str, speaker: str, language: str) -> bytes:
77
- """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
78
- if speaker == "Guest":
79
- accent = "EN-US" if language == "EN" else language
80
- speed = 0.9
81
- else: # host
82
- accent = "EN-Default" if language == "EN" else language
83
- speed = 1
84
- if language != "EN" and speaker != "Guest":
85
- speed = 1.1
86
-
87
- # Generate audio
88
- result = hf_client.predict(
89
- text=text,
90
- language=language,
91
- speaker=accent,
92
- speed=speed,
93
- api_name="/synthesize",
94
- )
95
- return result
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  import os
11
  import requests
 
12
  from gradio_client import Client
13
  from openai import OpenAI
14
  from pydantic import ValidationError
15
 
16
+ from bark import SAMPLE_RATE, generate_audio, preload_models
17
+ from scipy.io.wavfile import write as write_wav
18
+
19
  MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
20
  JINA_URL = "https://r.jina.ai/"
21
 
 
24
  api_key=os.getenv("FIREWORKS_API_KEY"),
25
  )
26
 
27
+ # hf_client = Client("mrfakename/MeloTTS")
28
+
29
+ # download and load all models
30
+ preload_models()
31
 
32
 
33
  def generate_script(system_prompt: str, input_text: str, output_model):
 
78
  return response.text
79
 
80
 
81
def generate_audio(text: str, speaker: str, language: str) -> str:
    """Synthesize one line of dialogue with Bark and save it to disk.

    Args:
        text: The dialogue line to speak.
        speaker: Speaker label. "Host (Jane)" selects Bark voice preset 1;
            any other value selects preset 3.
        language: Language code embedded in the Bark preset name
            (e.g. "en" -> "v2/en_speaker_1").

    Returns:
        Path of the audio file written to the current working directory.
    """
    # This wrapper has the same name as Bark's own generate_audio, so the
    # module-level `from bark import generate_audio` is rebound by this
    # definition. Calling `generate_audio(...)` here would re-enter the
    # wrapper and fail on the unknown `history_prompt` keyword, so import
    # Bark's function under a distinct alias.
    from bark import generate_audio as bark_generate_audio

    voice_index = "1" if speaker == "Host (Jane)" else "3"
    audio_array = bark_generate_audio(
        text,
        history_prompt=f"v2/{language}_speaker_{voice_index}",
    )

    # scipy's writer emits WAV data, so name the file accordingly rather
    # than ".mp3". The path is reused per language/speaker pair; the caller
    # is expected to read it before the next call overwrites it.
    file_path = f"audio_{language}_{speaker}.wav"
    write_wav(file_path, SAMPLE_RATE, audio_array)

    return file_path
91
+
92
+
93
+ # """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
94
+ # if speaker == "Guest":
95
+ # accent = "EN-US" if language == "EN" else language
96
+ # speed = 0.9
97
+ # else: # host
98
+ # accent = "EN-Default" if language == "EN" else language
99
+ # speed = 1
100
+ # if language != "EN" and speaker != "Guest":
101
+ # speed = 1.1
102
+
103
+ # # Generate audio
104
+ # result = hf_client.predict(
105
+ # text=text,
106
+ # language=language,
107
+ # speaker=accent,
108
+ # speed=speed,
109
+ # api_name="/synthesize",
110
+ # )
111
+ # return result