Spaces:

p1atdev
/

PodcastVox

Running

File size: 3,661 Bytes

3a09141

from tqdm import tqdm
import logging

from .agent import BloggerAgent, WriterAgent, StructureAgent, Conversation
from .fetcher import AutoFetcher
from .voicevox import VoiceVoxClient, SpeakerId, Audio


class PodcastStudio:
    """Pipeline that turns a document at a URL into a synthesized podcast.

    The work is split in two stages:

    * :meth:`create_conversation` fetches the source text and passes it
      through the blogger, writer and structure agents to obtain a
      structured conversation.
    * :meth:`record_podcast` synthesizes every turn of that conversation
      with a VOICEVOX client and joins the clips into one audio object.
    """

    def __init__(self, api_key: str, logging_level: int = logging.INFO):
        """Create the agents, the fetcher and the logger used by the studio.

        Args:
            api_key: Key forwarded unchanged to each of the three agents.
            logging_level: Level applied to this module's logger.
        """
        self.blogger = BloggerAgent(api_key=api_key)
        self.writer = WriterAgent(api_key=api_key)
        self.structure_agent = StructureAgent(api_key=api_key)

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging_level)

        self.fetcher = AutoFetcher()

    async def create_conversation(self, url: str) -> tuple[str, str, Conversation]:
        """Fetch the document at *url* and turn it into a conversation.

        Each stage logs its start and completion at INFO level and a
        truncated preview (first 100 characters) of its output at DEBUG
        level.

        Args:
            url: Location handed to the fetcher.

        Returns:
            A ``(blog, dialogue, conversation)`` tuple: the blog text
            produced from the fetched paper, the dialogue produced from the
            paper and the blog, and the structured conversation produced
            from the dialogue.
        """
        # Lazy %-style arguments: the message is only formatted when the
        # record is actually emitted at the configured level.
        self.logger.info("Fetching paper from %s...", url)
        paper = await self.fetcher.fetch(url)
        self.logger.info("Paper fetched successfully.")
        self.logger.debug("Paper content: %s...", paper[:100])

        self.logger.info("Creating blog from paper...")
        blog = await self.blogger.task(paper)
        self.logger.info("Blog created successfully.")
        self.logger.debug("%s...", blog[:100])

        self.logger.info("Creating dialogue from blog...")
        # The writer receives both the original paper and the derived blog.
        dialogue = await self.writer.task(paper, blog)
        self.logger.info("Dialogue created successfully.")
        self.logger.debug("%s...", dialogue[:100])

        self.logger.info("Structuring conversation from dialogue...")
        conversation = await self.structure_agent.task(dialogue)
        self.logger.info("Conversation structured successfully.")
        for turn in conversation.conversation:
            self.logger.debug("%s: %s...", turn.role, turn.content[:100])

        return blog, dialogue, conversation

    async def record_podcast(
        self,
        conversation: Conversation,
        voicevox_client: VoiceVoxClient,
        speaker_id: SpeakerId,
        supporter_id: SpeakerId,
        speed_scale: float = 1.1,
    ) -> Audio:
        """Synthesize every turn of *conversation* and join the clips.

        Turns are synthesized one after another, in conversation order,
        while a progress bar reports how many have been processed.

        Args:
            conversation: Structured conversation whose ``conversation``
                attribute holds the turns (each with ``role`` and
                ``content``).
            voicevox_client: Client used for the audio query, synthesis and
                wave-connection requests.
            speaker_id: Voice used for turns whose role is ``"speaker"``.
            supporter_id: Voice used for every other turn.
            speed_scale: Tempo value written to the audio query before
                synthesis. Defaults to ``1.1``.

        Returns:
            The single audio object returned by the client after connecting
            all synthesized clips.
        """

        async def _synthesize(
            voice_id: SpeakerId,
            text: str,
            progress: tqdm,
        ) -> Audio:
            """Synthesize one line of text and advance the progress bar."""
            audio_query = await voicevox_client.post_audio_query(
                text=text,
                speaker=voice_id,
            )
            # Apply the tempo to whichever field the query actually carries:
            # tempoDynamicsScale when it is populated, speedScale otherwise.
            if audio_query.tempoDynamicsScale is not None:
                audio_query.tempoDynamicsScale = speed_scale
            else:
                audio_query.speedScale = speed_scale

            audio = await voicevox_client.post_synthesis(
                speaker=voice_id,
                audio_query=audio_query,
            )
            progress.update(1)
            progress.set_postfix({"text": text[:20] + "..."})
            return audio

        turns = conversation.conversation
        audios: list[Audio] = []

        # The context manager closes the bar even if a request raises, so a
        # failed synthesis no longer leaves a dangling progress bar behind.
        with tqdm(
            total=len(turns),
            desc="Synthesizing audio",
            ncols=100,
        ) as progress_bar:
            # Each synthesis is awaited before the next one starts, so the
            # clips are collected in conversation order without re-sorting.
            for turn in turns:
                voice_id = speaker_id if turn.role == "speaker" else supporter_id
                audios.append(await _synthesize(voice_id, turn.content, progress_bar))

        # Connect the individual clips into one podcast audio.
        return await voicevox_client.post_connect_waves(
            audio_list=audios,
        )