Update for new arena
app.py CHANGED
@@ -36,10 +36,8 @@ with open('ja_sentences.txt') as f:
 # Constants
 ####################################
 
-
-
-SPK3 = os.getenv('KOTOBA_SPK3')
-SPK4 = os.getenv('KOTOBA_SPK4')
+# Configure the API TTS URL here
+KOTOBA_API_URL = os.getenv('KOTOBA_API_URL', 'https://api.example.com/tts')
 
 AVAILABLE_MODELS = {
     # 'XTTSv2': 'xtts',
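The per-speaker `KOTOBA_SPK3`/`KOTOBA_SPK4` variables give way to a single `KOTOBA_API_URL`, read from the environment with `https://api.example.com/tts` as a placeholder default. A minimal sketch of a startup guard a deployment might add (not part of the diff; the check and warning are assumptions):

```python
import os

# Hypothetical guard: warn when the Space is still pointing at the placeholder
# endpoint instead of a configured KOTOBA_API_URL value.
KOTOBA_API_URL = os.getenv('KOTOBA_API_URL', 'https://api.example.com/tts')
if 'example.com' in KOTOBA_API_URL:
    print('Warning: KOTOBA_API_URL is not configured; kotoba-tts requests will fail.')
```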
@@ -57,10 +55,7 @@ AVAILABLE_MODELS = {
     # 'Parler TTS': 'parler'
     'MOE-VITS': 'moe-vits',
     'BARK': 'bark',
-
-    f'KOTOBA-SPEECH-{SPK2.upper()}': f'kotoba-speech-{SPK2.lower()}',
-    f'KOTOBA-SPEECH-{SPK3.upper()}': f'kotoba-speech-{SPK3.lower()}',
-    f'KOTOBA-SPEECH-{SPK4.upper()}': f'kotoba-speech-{SPK4.lower()}',
+    'KOTOBA-TTS': 'kotoba-tts',
     #'BLANE-TTS': 'blane-tts',
     'AMITARO-VITS': 'amitaro-vits',
     'GOOGLE-TTS': 'google-tts',
@@ -130,60 +125,12 @@ def get_db():
 
 def get_tts_file(text: str, model: str):
     url = {
-        f"kotoba-speech-{SPK1.lower()}": "https://kotoba-tech-kotoba-speech.hf.space/gradio_api/call/tts",
-        f"kotoba-speech-{SPK2.lower()}": "https://kotoba-tech-kotoba-speech.hf.space/gradio_api/call/tts",
-        f"kotoba-speech-{SPK3.lower()}": "https://kotoba-tech-kotoba-speech.hf.space/gradio_api/call/tts",
-        f"kotoba-speech-{SPK4.lower()}": "https://kotoba-tech-kotoba-speech.hf.space/gradio_api/call/tts",
         "blane-tts": "https://blane187-blane-tts.hf.space/call/get_audio_file"
     }
     headers = {
         "Content-Type": "application/json"
     }
     data = {
-        f"kotoba-speech-{SPK1.lower()}": {
-            "data": [
-                text,
-                5,
-                5,
-                "Preset voices",
-                SPK1,
-                {"path": "fam/ui/voice01_A.mp3"},
-                {"path": "fam/ui/voice01_A.mp3"}
-            ]
-        },
-        f"kotoba-speech-{SPK2.lower()}": {
-            "data": [
-                text,
-                5,
-                5,
-                "Preset voices",
-                SPK2,
-                {"path": "fam/ui/voice01_A.mp3"},
-                {"path": "fam/ui/voice01_A.mp3"}
-            ]
-        },
-        f"kotoba-speech-{SPK3.lower()}": {
-            "data": [
-                text,
-                5,
-                5,
-                "Preset voices",
-                SPK3,
-                {"path": "fam/ui/voice01_A.mp3"},
-                {"path": "fam/ui/voice01_A.mp3"}
-            ]
-        },
-        f"kotoba-speech-{SPK4.lower()}": {
-            "data": [
-                text,
-                5,
-                5,
-                "Preset voices",
-                SPK4,
-                {"path": "fam/ui/voice01_A.mp3"},
-                {"path": "fam/ui/voice01_A.mp3"}
-            ]
-        },
         "blane-tts": {
             "data": [
                 text,
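With the four Kotoba Speech speakers removed, `get_tts_file` keeps only the `blane-tts` entries in its lookup tables. The rest of the function is not shown in this hunk; based on the dictionaries above it presumably posts the per-model payload as JSON and then fetches the generated file. A minimal sketch under that assumption (`get_tts_file_sketch` and its single-element payload are illustrative, not the real implementation):

```python
import requests

def get_tts_file_sketch(text: str, model: str):
    # Lookup tables shaped like the ones in the diff, reduced to the remaining model.
    url = {"blane-tts": "https://blane187-blane-tts.hf.space/call/get_audio_file"}
    headers = {"Content-Type": "application/json"}
    data = {"blane-tts": {"data": [text]}}

    # POST the per-model payload; the real function goes on to retrieve the
    # generated audio file from whatever job information comes back.
    response = requests.post(url[model], headers=headers, json=data[model])
    response.raise_for_status()
    return response.json()
```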
@@ -398,10 +345,7 @@ model_names = {
     # 'metavoice': 'MetaVoice-1B',
     'BARK': 'BARK',
     'MOE-VITS': 'MOE-VITS',
-
-    f'KOTOBA-SPEECH-{SPK2.upper()}': 'KOTOBA-SPEECH-SPK2',
-    f'KOTOBA-SPEECH-{SPK3.upper()}': 'KOTOBA-SPEECH-SPK3',
-    f'KOTOBA-SPEECH-{SPK4.upper()}': 'KOTOBA-SPEECH-SPK4',
+    'KOTOBA-TTS': 'kotoba-tts',
     'BLANE-TTS': 'BLANE-TTS',
     'AMITARO-VITS': 'AMITARO-VITS',
     'GOOGLE-TTS': 'GOOGLE-TTS',
@@ -456,10 +400,7 @@ model_links = {
     # 'metavoice': 'https://github.com/metavoiceio/metavoice-src',
     'bark': 'https://suno-bark.hf.space/',
     'moe-vits': 'skytnt/moe-tts',
-
-    f'kotoba-speech-{SPK2.lower()}': 'https://kotoba-tech-kotoba-speech.hf.space/gradio_api/',
-    f'kotoba-speech-{SPK3.lower()}': 'https://kotoba-tech-kotoba-speech.hf.space/gradio_api/',
-    f'kotoba-speech-{SPK4.lower()}': 'https://kotoba-tech-kotoba-speech.hf.space/gradio_api/',
+    'kotoba-tts': KOTOBA_API_URL,
     'blane-tts': 'https://blane187-blane-tts.hf.space/',
     'amitaro-vits': 'https://lycoris53-vits-tts-japanese-only-amitaro.hf.space/'
 }
@@ -706,6 +647,49 @@ def doresample(path_to_wav):
 # 2x speedup (hopefully) #
 ##########################
 
+def get_kotoba_tts(text):
+    """
+    Call the Kotoba TTS API to generate speech from text.
+
+    Args:
+        text (str): The text to convert to speech
+        voice (str): The voice to use (e.g., "Newscaster (man)")
+
+    Returns:
+        str: Path to the generated audio file
+    """
+    # Request headers
+    headers = {
+        "Content-Type": "application/json"
+    }
+
+    # Request payload
+    data = {
+        "text": text,
+    }
+
+    # Create a temporary file to save the audio
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+        output_path = temp_file.name
+
+    # Make the POST request and save the response directly to the file
+    response = requests.post(
+        KOTOBA_API_URL,
+        headers=headers,
+        json=data,
+        stream=True
+    )
+
+    # Check if the request was successful
+    response.raise_for_status()
+
+    # Save the response content to the output file
+    with open(output_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
+
+    return output_path
+
 def synthandreturn(text, retry=0):
     text = text.strip()
     if len(text) > MAX_SAMPLE_TXT_LENGTH:
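The new `get_kotoba_tts` helper posts the text to `KOTOBA_API_URL`, streams the response into a temporary WAV file, and returns that file's path. A short usage sketch, assuming the endpoint is reachable and that app.py's module-level `requests` and `tempfile` imports are in place (the sample text is illustrative):

```python
import os

wav_path = get_kotoba_tts("こんにちは、今日はいい天気ですね。")
print(f"Generated audio saved to: {wav_path}")

# The helper creates the file with NamedTemporaryFile(delete=False), so the
# caller is responsible for removing it once it is no longer needed.
os.remove(wav_path)
```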
@@ -759,7 +743,11 @@ def synthandreturn(text, retry=0):
     elif model == "openai-tts":
         local_filename = '/tmp/' + str(mkuuid(None)) + '.wav'
         result = get_openai_tts(text, local_filename=local_filename)
+    elif model == "kotoba-tts":
+        result = get_kotoba_tts(text)
+        print(f"API TTS audio file: {result}")
     else:
+        # For other models that use the original approach
         result = get_tts_file(text, model)
         # URL to download the file from
         url = f"{model_links[model]}file={result}"
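One note on this branch, hedged because the diff viewer does not preserve indentation: `get_kotoba_tts` already returns a local file path, while the `get_tts_file` path still needs the `model_links`-based download URL built afterwards. A rough sketch of that split, reusing the helpers shown above (the wrapper function and the exact indentation are reconstructions, not code from the commit):

```python
def resolve_audio(text: str, model: str, model_links: dict) -> str:
    if model == "kotoba-tts":
        # Already a local WAV path written by get_kotoba_tts.
        return get_kotoba_tts(text)
    # Other models return a remote file reference that still has to be
    # downloaded from the model's Space.
    remote_path = get_tts_file(text, model)
    return f"{model_links[model]}file={remote_path}"
```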