import os
import re
import json
import shutil

import pyttsx3
from pydub import AudioSegment
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize GPT-2 model and tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# System prompt for the conversation generator
system_prompt = """Generate a conversation between Sascha and Marina based on the article content provided.
Sascha is the article writer, and Marina is the interviewer. Make it engaging and emotional, with natural
pauses (like "uh") to make it sound conversational. This is for a podcast called "The Machine Learning Engineer"."""

# TTS voice map for Sascha and Marina
speaker_voice_map = {
    "Sascha": "pyttsx3",  # Sascha uses pyttsx3 for offline TTS
    "Marina": "pyttsx3"   # Marina uses pyttsx3 for offline TTS
}

# Initialize pyttsx3 engine for offline TTS
engine = pyttsx3.init()
engine.setProperty('rate', 150)    # Speed of speech (words per minute)
engine.setProperty('volume', 0.9)  # Volume (0.0 to 1.0)

# pyttsx3 TTS function for offline TTS.
# pyttsx3 writes platform-native audio (typically WAV/AIFF), not MP3,
# so the clips are saved as .wav; merge_audios() picks them up below.
def synthesize_speech_pyttsx3(text, speaker, index):
    filename = f"audio-files/{index}_{speaker}.wav"
    engine.save_to_file(text, filename)
    engine.runAndWait()
    print(f'Audio content written to file "{filename}"')

# Function to synthesize speech based on the speaker
def synthesize_speech(text, speaker, index):
    synthesize_speech_pyttsx3(text, speaker, index)

# Function to sort filenames naturally, so clip 10 sorts after clip 2
def natural_sort_key(filename):
    return [int(text) if text.isdigit() else text
            for text in re.split(r'(\d+)', filename)]

# Function to merge the per-line audio files into one track
def merge_audios(audio_folder, output_file):
    combined = AudioSegment.empty()
    audio_files = sorted(
        [f for f in os.listdir(audio_folder) if f.endswith((".mp3", ".wav"))],
        key=natural_sort_key
    )
    for filename in audio_files:
        audio_path = os.path.join(audio_folder, filename)
        print(f"Processing: {audio_path}")
        audio = AudioSegment.from_file(audio_path)
        combined += audio
    combined.export(output_file, format="mp3")
    print(f"Merged audio saved as {output_file}")

# Function to generate a conversation using distilgpt2
def generate_conversation(article):
    input_text = f"{system_prompt}\n\n{article}\n\nSascha: "
    # Truncate the prompt so it leaves room for generation inside
    # distilgpt2's 1024-token context window
    inputs = tokenizer.encode(input_text, return_tensors="pt",
                              truncation=True, max_length=512)
    outputs = model.generate(
        inputs,
        max_length=1024,
        num_return_sequences=1,
        do_sample=True,  # sampling must be on for temperature to have any effect
        temperature=1.0,
        pad_token_id=tokenizer.eos_token_id  # GPT-2 has no pad token
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse the conversation into a list of {speaker, text} entries.
    # The capturing split keeps the "Sascha:"/"Marina:" labels, so the
    # items alternate label, utterance, label, utterance, ...
    lines = re.split(r'(Sascha:|Marina:)', generated_text)[1:]
    conversation = [
        {"speaker": lines[i].strip().rstrip(':'), "text": lines[i + 1].strip()}
        for i in range(0, len(lines) - 1, 2)
    ]
    formatted_json = json.dumps(conversation, indent=4)
    print(formatted_json)
    return conversation

# Function to generate the podcast audio
def generate_audio(conversation):
    if os.path.exists('audio-files'):
        shutil.rmtree('audio-files')
    os.makedirs('audio-files', exist_ok=True)

    for index, part in enumerate(conversation):
        synthesize_speech(part['text'], part['speaker'], index)

    output_file = "podcast.mp3"
    merge_audios("audio-files", output_file)
    return output_file

# Read the article from the file
with open('function-calling.txt', 'r') as file:
    article = file.read()

# Generate the conversation and the podcast audio
conversation = generate_conversation(article)
generate_audio(conversation)
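
# Usage note (a sketch, assuming a typical setup): pydub needs ffmpeg on the
# PATH for MP3 export, and the article is expected at ./function-calling.txt.
#   pip install pyttsx3 pydub transformers torch python-dotenv
#   python generate_podcast.py  # hypothetical filename for this script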