Spaces:
Running
Running
import tempfile | |
import asyncio | |
import aiohttp | |
import dotenv | |
import os | |
import time | |
import logging | |
from src.voicevox import VoiceVoxClient | |
from src.agent import Conversation | |
from src.podcast import PodcastStudio | |
from src.aivis import start_aivis_speech, download_model | |
import gradio as gr | |
# Load settings from a local .env file (if any) before reading the environment.
dotenv.load_dotenv()

# Gemini API key read from the environment; empty string when it is not set.
# main() hides the key textbox whenever this is non-empty.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")

# Voice-model URLs handed to download_model() by download_default_models().
# The trailing comment on each entry is the voice name.
DEFAULT_MODELS = [
    "https://hub.aivis-project.com/aivm-models/a59cb814-0083-4369-8542-f51a29e72af7",  # Anneli
    "https://hub.aivis-project.com/aivm-models/4cf3e1d8-5583-41a9-a554-b2d2cda2c569",  # Anneli Whisper
    "https://hub.aivis-project.com/aivm-models/6acf95e8-11a9-414e-aa9c-6dbebf9113ca",  # F1
    "https://hub.aivis-project.com/aivm-models/25b39db7-5757-47ef-9fe4-2b7aff328a18",  # F2
    "https://hub.aivis-project.com/aivm-models/d7255c2c-ddd0-425a-808c-662cd94c7f41",  # M1
    "https://hub.aivis-project.com/aivm-models/d1a7446f-230d-4077-afdf-923eddabe53c",  # M2
    "https://hub.aivis-project.com/aivm-models/6d11c6c2-f4a4-4435-887e-23dd60f8b8dd",  # ใซใ
    "https://hub.aivis-project.com/aivm-models/e9339137-2ae3-4d41-9394-fb757a7e61e6",  # ใพใ
    "https://hub.aivis-project.com/aivm-models/eefe1fbd-d15a-49ae-bc83-fc4aaad680e1",  # ใใคใ
    "https://hub.aivis-project.com/aivm-models/5d804388-665e-4174-ab60-53d448c0d7eb",  # ่ๅฝไธป
    "https://hub.aivis-project.com/aivm-models/71e72188-2726-4739-9aa9-39567396fb2a",  # ใตใฟใตใฟ
]

# Base URL that main() waits for and uses as the initial speech endpoint.
AIVIS_ENDPOINT = "http://127.0.0.1:10101"

# Sentence templates spoken by preview_speaker_voice(); `{nickname}` is filled
# with the speaker name. The first is used for the main speaker, the second
# for the supporter.
NAVIGATOR_SAMPLE = "ใใใซใกใฏ๏ผ็งใฎๅๅใฏ {nickname} ใงใใไปๅใฏ็งใใใใใญใฃในใใใใใฒใผใใใพใใใใใใใ้กใใใพใ๏ผ"
ASSISTANT_SAMPLE = "ใใใซใกใฏ๏ผ็งใฎๅๅใฏ {nickname} ใงใใ็งใฏใตใใผใฟใผใจใใฆใใใใฒใผใฟใผใจไธ็ทใซใใใใญใฃในใใ็ใไธใใฆใใใพใใ้ ๅผตใใพใ๏ผ"
async def generate_podcast(
    voicevox_endpoint: str,
    llm_api_key: str,
    pdf_url: str,
    speaker_name: str,
    supporter_name: str,
    speaker2id: dict[str, int],
) -> tuple[str, str, object, Conversation, str, dict]:
    """Create a podcast from the document at ``pdf_url`` and save it as a WAV.

    Args:
        voicevox_endpoint: Base URL given to ``VoiceVoxClient``.
        llm_api_key: API key given to ``PodcastStudio``.
        pdf_url: Source document passed to ``create_conversation``.
        speaker_name: Dropdown label of the main speaker.
        supporter_name: Dropdown label of the supporter.
        speaker2id: Mapping from dropdown label to style id.

    Returns:
        A 6-tuple: path of the written WAV file, the blog text, the dumped
        conversation, the conversation object itself (kept as a cache for
        re-synthesis), the elapsed-time message, and a Gradio update that
        makes the target component visible.

    Raises:
        KeyError: If either speaker label is missing from ``speaker2id``.
    """
    tts_client = VoiceVoxClient(voicevox_endpoint)
    main_voice_id = speaker2id[speaker_name]
    support_voice_id = speaker2id[supporter_name]
    studio = PodcastStudio(api_key=llm_api_key, logging_level=logging.DEBUG)

    # The timer covers script generation, synthesis and the file write.
    began_at = time.time()

    blog, _, conversation = await studio.create_conversation(pdf_url)
    recording = await studio.record_podcast(
        conversation=conversation,
        voicevox_client=tts_client,
        speaker_id=main_voice_id,
        supporter_id=support_voice_id,
    )

    # delete=False: the file must outlive this call so Gradio can serve it.
    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(recording.wav)
    wav_path = wav_file.name

    elapsed_time = time.time() - began_at
    elapsed_message = f"ๅฆ็ๆ้: {elapsed_time:.2f} ็ง"

    reveal = gr.update(visible=True)
    return (
        wav_path,
        blog,
        conversation.model_dump(),
        conversation,
        elapsed_message,
        reveal,
    )
async def change_speaker(
    voicevox_endpoint: str,
    speaker_name: str,
    supporter_name: str,
    speaker2id: dict[str, int],
    conversation_cache: Conversation,
) -> tuple[str, str]:
    """Re-record a cached conversation with a different pair of voices.

    Args:
        voicevox_endpoint: Base URL given to ``VoiceVoxClient``.
        speaker_name: Dropdown label of the main speaker.
        supporter_name: Dropdown label of the supporter.
        speaker2id: Mapping from dropdown label to style id.
        conversation_cache: Conversation produced by an earlier generation.

    Returns:
        The path of the newly written WAV file and the elapsed-time message.

    Raises:
        KeyError: If either speaker label is missing from ``speaker2id``.
    """
    tts_client = VoiceVoxClient(voicevox_endpoint)
    main_voice_id = speaker2id[speaker_name]
    support_voice_id = speaker2id[supporter_name]
    # No LLM call happens here, so the studio is built with an empty key.
    studio = PodcastStudio(api_key="")

    began_at = time.time()
    recording = await studio.record_podcast(
        conversation=conversation_cache,
        voicevox_client=tts_client,
        speaker_id=main_voice_id,
        supporter_id=support_voice_id,
    )

    # delete=False: the file must outlive this call so Gradio can serve it.
    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(recording.wav)

    elapsed_time = time.time() - began_at
    return wav_file.name, f"ๅฆ็ๆ้: {elapsed_time:.2f} ็ง"
async def get_speakers(endpoint: str):
    """List every speaker style offered by ``endpoint``.

    Each style becomes one entry labelled ``"<speaker name> (<style name>)"``.

    Args:
        endpoint: Base URL given to ``VoiceVoxClient``.

    Returns:
        A pair ``(choices, speaker2id)``: the labels in discovery order and a
        dict mapping each label to its style id.
    """
    speakers = await VoiceVoxClient(endpoint).get_speakers()
    print(f"Found {len(speakers)} speakers at {endpoint}")

    labelled_ids = []
    for speaker in speakers:
        for style in speaker.styles:
            label = f"{speaker.name} ({style.name})"
            print(f"Speaker: {label}, ID: {style.id}")
            labelled_ids.append((label, style.id))

    choices = [label for label, _ in labelled_ids]
    # On a repeated label the last id wins, as with dict(zip(...)).
    speaker2id = dict(labelled_ids)
    return choices, speaker2id
async def on_endpoint_change(endpoint_text: str):
    """Refresh both speaker dropdowns after the endpoint textbox changes.

    Args:
        endpoint_text: Endpoint URL currently typed into the textbox.

    Returns:
        Three values for the main dropdown, the supporter dropdown and the
        label-to-id state. When the speaker list cannot be loaded or is
        empty, three no-op ``gr.update()`` objects are returned so the UI
        keeps its previous contents.
    """
    unchanged = (gr.update(), gr.update(), gr.update())

    try:
        speakers, speaker2id = await get_speakers(endpoint_text)
    except Exception:
        # Best effort: a bad or unreachable endpoint must not break the UI,
        # but the reason should still be visible in the logs.
        logging.exception("Failed to load speakers from %s", endpoint_text)
        return unchanged

    if not speakers:
        logging.warning("Endpoint %s reported no speakers", endpoint_text)
        return unchanged

    main_default = speakers[0]
    # With a single available voice, the supporter reuses it instead of
    # failing on speakers[1].
    supporter_default = speakers[1] if len(speakers) > 1 else speakers[0]

    return (
        gr.update(choices=speakers, value=main_default),
        gr.update(choices=speakers, value=supporter_default),
        speaker2id,
    )
async def preview_speaker_voice(
    voicevox_endpoint: str,
    speaker_name: str,
    speaker_id: int,
    is_main_speaker: bool = True,
):
    """Synthesize a short self-introduction clip for one speaker.

    Args:
        voicevox_endpoint: Base URL given to ``VoiceVoxClient``.
        speaker_name: Dropdown label; the text before the first ``(`` is
            used as the nickname inside the sample sentence.
        speaker_id: Style id used for the query and the synthesis.
        is_main_speaker: Selects ``NAVIGATOR_SAMPLE`` when true, otherwise
            ``ASSISTANT_SAMPLE``.

    Returns:
        Path of the temporary WAV file holding the clip.
    """
    tts_client = VoiceVoxClient(voicevox_endpoint)

    nickname = speaker_name.partition("(")[0].strip()
    template = NAVIGATOR_SAMPLE if is_main_speaker else ASSISTANT_SAMPLE
    sample_text = template.format(nickname=nickname)

    query = await tts_client.post_audio_query(text=sample_text, speaker=speaker_id)

    # Set the tempo-dynamics field when the query has one, else the plain
    # speed field.
    if query.tempoDynamicsScale is None:
        query.speedScale = 1.1
    else:
        query.tempoDynamicsScale = 1.1

    clip = await tts_client.post_synthesis(speaker=speaker_id, audio_query=query)

    # delete=False: the file must outlive this call so Gradio can serve it.
    wav_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    with wav_file:
        wav_file.write(clip.wav)
    return wav_file.name
async def on_change_speaker(
    voicevox_endpoint: str,
    speaker_name: str,
    speaker2id: dict[str, int],
    is_main_speaker: bool,
):
    """Produce a fresh preview clip after a speaker dropdown changes.

    Args:
        voicevox_endpoint: Base URL of the speech endpoint.
        speaker_name: Newly selected dropdown label.
        speaker2id: Mapping from dropdown label to style id.
        is_main_speaker: Whether the main speaker's sample text is used.

    Returns:
        Path of the preview WAV file from ``preview_speaker_voice``.

    Raises:
        KeyError: If ``speaker_name`` is missing from ``speaker2id``.
    """
    preview_path = await preview_speaker_voice(
        voicevox_endpoint=voicevox_endpoint,
        speaker_name=speaker_name,
        speaker_id=speaker2id[speaker_name],
        is_main_speaker=is_main_speaker,
    )
    return preview_path
async def download_default_models():
    """Download every model in ``DEFAULT_MODELS`` concurrently.

    A failed download is logged as an error and does not stop the others;
    nothing is returned.
    """
    logging.info("Downloading default models...")

    pending = [download_model(model_url) for model_url in DEFAULT_MODELS]
    # return_exceptions=True delivers failures as results instead of raising.
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    failures = [outcome for outcome in outcomes if isinstance(outcome, Exception)]
    for result in failures:
        logging.error(f"Failed to download model: {result}")
async def wait_for_endpoint(url: str, timeout: float = 30.0, interval: float = 0.5):
    """Poll ``url`` until it answers with HTTP 200.

    Args:
        url: Address to GET repeatedly.
        timeout: Overall budget in seconds for the endpoint to become ready.
        interval: Pause in seconds between two attempts.

    Raises:
        RuntimeError: If no 200 response arrived within ``timeout`` seconds.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        try:
            # Bound each attempt by the time left so that a stalled
            # connection cannot outlive the overall deadline.
            attempt_timeout = aiohttp.ClientTimeout(total=remaining)
            async with aiohttp.ClientSession(timeout=attempt_timeout) as session:
                async with session.get(url) as res:
                    if res.status == 200:
                        return
        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
            # Expected while the server is still starting; keep polling.
            logging.debug("Endpoint %s not ready yet: %r", url, exc)

        # Never sleep past the deadline.
        pause = min(interval, max(deadline - time.monotonic(), 0.0))
        await asyncio.sleep(pause)

    raise RuntimeError(f"Endpoint {url} did not become ready in {timeout}s")
def _resolve_default_speaker(preferred, speakers, fallback_index):
    """Return ``preferred`` if available, else a fallback label, else None."""
    if preferred in speakers:
        return preferred
    if not speakers:
        return None
    return speakers[min(fallback_index, len(speakers) - 1)]


async def _preview_or_none(endpoint, speaker_name, speaker2id, is_main_speaker):
    """Synthesize a start-up preview clip; return None when that is impossible."""
    if speaker_name is None:
        return None
    try:
        return await preview_speaker_voice(
            voicevox_endpoint=endpoint,
            speaker_name=speaker_name,
            speaker_id=speaker2id[speaker_name],
            is_main_speaker=is_main_speaker,
        )
    except Exception:
        # A missing preview must not prevent the UI from starting.
        logging.exception(
            "Could not synthesize the start-up preview for %s", speaker_name
        )
        return None


async def main():
    """Build the Gradio UI and launch it once the speech endpoint is up.

    Steps: wait for ``AIVIS_ENDPOINT`` to answer, load the speaker list,
    pre-render one preview clip per default speaker, declare the layout,
    wire the event handlers, and call ``demo.launch()``.

    A failure to load the speaker list or to synthesize a preview is logged
    and leaves the corresponding widgets empty instead of aborting start-up.

    Raises:
        RuntimeError: Propagated from ``wait_for_endpoint`` when the endpoint
            never becomes ready.
    """
    await wait_for_endpoint(AIVIS_ENDPOINT)
    initial_endpoint = AIVIS_ENDPOINT

    try:
        speakers, speaker2id = await get_speakers(initial_endpoint)
    except Exception:
        logging.exception("Failed to load speakers from %s", initial_endpoint)
        speakers = []
        speaker2id = {}

    # Preferred defaults; replaced by an available speaker (or None) when the
    # endpoint does not offer them, so the previews never use a made-up id.
    main_speaker_name = _resolve_default_speaker(
        "Anneli (ใใณใทใงใณ้ซใ)", speakers, 0
    )
    supporter_speaker_name = _resolve_default_speaker(
        "ใพใ (ใใผใใซ)", speakers, 1
    )

    main_speaker_preview = await _preview_or_none(
        initial_endpoint, main_speaker_name, speaker2id, True
    )
    supporter_speaker_preview = await _preview_or_none(
        initial_endpoint, supporter_speaker_name, speaker2id, False
    )

    with gr.Blocks() as demo:
        gr.Markdown(
            """
# PodcastVox (Aivis Speech)
Gemini Flash 2.5 ใจ Aivis Speech ใๅฉ็จใใฆใWeb ใตใคใใๆ ๅ ฑๆบใจใใ Podcast ใ็ๆใใใใจใใงใใพใใ
Gemini ใๅฉใใ ใใฎๅฐๆฌใฎ็ๆใฏ 2~3 ๅใงๆธใฟใพใใใ้ณๅฃฐๅๆใฎๆนใฏ Spaces ใฎใใใใ CPU ใไฝฟใใฎใงใ**15 ๅ็จๅบฆ** ใใใใพใใๆฐ้ทใซใๅพ ใกใใ ใใใ
[ใญใผใซใซ็](https://github.com/p1atdev/podcastvox) ใไฝฟ็จใใใจๆๅ ใฎ PC ใง้ณๅฃฐๅๆใใงใใใใใMacbook Air 2024 ใงใฏๅ จไฝใง 5 ๅ็จๅบฆใง็ๆใๅฏ่ฝใงใใ
## ๆณจๆ็น
**ๆ ๅ ฑใซๅบใฅใใไผ่ฉฑใ็ๆใใพใใใใใซใทใใผใทใงใณใ่ชคใฃใ่งฃ้ใ้้ใฃใๅ่ชใฎ่ชญใฟๆนใ็บ็ใใๅ ดๅใใใใพใใ็ๆใใใๅ ๅฎนใฎๆญฃ็ขบๆงใไฟก้ ผๆงใซใคใใฆใฏไฟ่จผใงใใพใใใฎใงใๆณจๆใใฆใๅฉ็จใใ ใใใ**
"""
        )
        with gr.Row():
            # Left column: endpoint, speaker selection and API key.
            with gr.Column():
                with gr.Group():
                    endpoint_text = gr.Textbox(
                        label="VOICEVOX ใจใณใใใคใณใ",
                        value=initial_endpoint,
                        placeholder=AIVIS_ENDPOINT,
                        info="VOICEVOX ๅ ใฎ REST API ใซๅฏพๅฟใใใจใณใใใคใณใใๅ ฅๅใใฆใใ ใใ",
                        visible=False,
                    )
                    with gr.Row():
                        with gr.Column():
                            speakers_dropdown = gr.Dropdown(
                                label="ใกใคใณ่ฉฑ่ ",
                                choices=speakers,
                                value=main_speaker_name,
                                multiselect=False,
                            )
                            speaker_preview_audio = gr.Audio(
                                label="ใกใคใณ่ฉฑ่ ้ณๅฃฐใใฌใใฅใผ",
                                type="filepath",
                                value=main_speaker_preview,
                            )
                        with gr.Column():
                            supporter_dropdown = gr.Dropdown(
                                label="ใตใใผใฟใผ่ฉฑ่ ",
                                choices=speakers,
                                value=supporter_speaker_name,
                                multiselect=False,
                            )
                            supporter_preview_audio = gr.Audio(
                                label="ใตใใผใฟใผ้ณๅฃฐใใฌใใฅใผ",
                                type="filepath",
                                value=supporter_speaker_preview,
                            )
                    # Label-to-id mapping shared with the event handlers.
                    speaker2id_map = gr.State(value=speaker2id)
                    # Hidden until generate_podcast returns its visible=True update.
                    change_speaker_button = gr.Button(
                        "ใใฎ่ฉฑ่ ใงๅ็ๆ",
                        variant="secondary",
                        visible=False,
                    )
                with gr.Group():
                    # Shown only when no key came from the environment.
                    llm_api_key_text = gr.Textbox(
                        label="Gemini API Key",
                        info="Podcast ใ็ๆใใใซใฏ API ใญใผใๅฟ ่ฆใงใใhttps://aistudio.google.com/apikey ใใๅๅพใงใใพใใ",
                        placeholder="Enter your Gemini API key",
                        value=GEMINI_API_KEY,
                        type="password",
                        visible=GEMINI_API_KEY == "",
                    )
            # Right column: source URL, trigger and outputs.
            with gr.Column():
                with gr.Group():
                    pdf_url_text = gr.Textbox(
                        label="ๆ ๅ ฑๆบใจใชใ Web ใตใคใ ใฎ URL (1ใคใฎใฟ)",
                        placeholder="ไพ) https://arxiv.org/pdf/2308.06721, https://example.com/index.html",
                        lines=1,
                        info="Podcast ใฎใใผใใจใชใ Web ใตใคใ ใฎ URL ใๅ ฅๅใใฆใใ ใใใHTMLใPDF ใซๅฏพๅฟใใฆใใพใใ",
                    )
                submit_button = gr.Button(
                    "็ๆ (็ด 20 ๅ็จๅบฆใใใใพใ)", variant="primary"
                )
                time_elapsed_text = gr.Markdown(
                    value="",
                )
                output_audio = gr.Audio(
                    label="Output Podcast Audio",
                    type="filepath",
                    autoplay=True,
                )
                # Holds the last generated conversation for change_speaker().
                conversation_cache = gr.State(value=None)
                with gr.Accordion("็ๆใใใใใญใฐ", open=False):
                    blog_output = gr.Markdown(
                        label="Blog Output",
                        value="็ๆใใใใใญใฐใฏใใใซ่กจ็คบใใใพใใ",
                    )
                with gr.Accordion("็ๆใใใไผ่ฉฑ", open=False):
                    conversation_output = gr.JSON(label="Conversation Output", value={})

        gr.Examples(
            examples=[
                ["https://arxiv.org/pdf/2308.06721"],
                ["https://www.aozora.gr.jp/cards/000879/files/127_15260.html"],
            ],
            inputs=[pdf_url_text],
        )

        # Endpoint changed: reload the speaker list.
        gr.on(
            triggers=[endpoint_text.change],
            fn=on_endpoint_change,
            inputs=[endpoint_text],
            outputs=[
                speakers_dropdown,
                supporter_dropdown,
                speaker2id_map,
            ],
            concurrency_limit=10,
        )
        # Submit: full generation (script + audio).
        gr.on(
            triggers=[submit_button.click],
            fn=generate_podcast,
            inputs=[
                endpoint_text,
                llm_api_key_text,
                pdf_url_text,
                speakers_dropdown,
                supporter_dropdown,
                speaker2id_map,
            ],
            outputs=[
                output_audio,
                blog_output,
                conversation_output,
                conversation_cache,
                time_elapsed_text,
                change_speaker_button,  # make visible after generation
            ],
            concurrency_limit=10,
        )
        # Re-synthesize the cached conversation with the selected voices.
        gr.on(
            triggers=[change_speaker_button.click],
            fn=change_speaker,
            inputs=[
                endpoint_text,
                speakers_dropdown,
                supporter_dropdown,
                speaker2id_map,
                conversation_cache,
            ],
            outputs=[
                output_audio,
                time_elapsed_text,
            ],
            concurrency_limit=10,
        )
        # Dropdown changes refresh the matching preview clip; the constant
        # State value selects the main or supporter sample text.
        gr.on(
            triggers=[
                speakers_dropdown.change,
            ],
            fn=on_change_speaker,
            inputs=[
                endpoint_text,
                speakers_dropdown,
                speaker2id_map,
                gr.State(value=True),
            ],
            outputs=[speaker_preview_audio],
            concurrency_limit=10,
        )
        gr.on(
            triggers=[
                supporter_dropdown.change,
            ],
            fn=on_change_speaker,
            inputs=[
                endpoint_text,
                supporter_dropdown,
                speaker2id_map,
                gr.State(value=False),
            ],
            outputs=[supporter_preview_audio],
            concurrency_limit=10,
        )

    demo.launch()
async def runner():
    """Download the default models, then run the speech engine and the web UI.

    ``start_aivis_speech`` runs in a worker thread while ``main`` runs as a
    task on the event loop; the coroutine finishes when both have finished.
    """
    await download_default_models()
    await asyncio.gather(
        asyncio.to_thread(start_aivis_speech),
        asyncio.create_task(main()),
    )
# Script entry point: run the whole start-up sequence on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(runner())