# Source: app.py from a Hugging Face Space (file-viewer page residue kept as a comment).
# Author: emirhanbilgic · Commit 53b808e ("Update app.py", verified) · Size: 5.37 kB
import spaces
import gradio as gr
import torch
from transformers import MarianTokenizer, MarianMTModel
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
from PyPDF2 import PdfReader
import re
import textwrap
import soundfile as SF
import numpy as np
# Device configuration: use CUDA when torch reports it as available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize models and tokenizers once at import time; the TTS model is moved
# to `device` and shared by every call to generate_single_wav_from_text.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
# NOTE(review): the feature extractor is loaded from the *mini* checkpoint while
# the model and tokenizer come from the *large* one. It is only used below to
# read the sampling rate — confirm both checkpoints share the same rate.
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")
# Sampling rate returned alongside every generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed passed to set_seed() before each generation call.
SEED = 42
# Helper function to extract text from a PDF
def pdf_to_text(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text of all pages, joined in page order without any
        separator. Pages that yield no text contribute an empty string.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Extraction stays inside the `with` so the file handle is still open
        # while the pages are read. `or ""` keeps the join from failing if a
        # page's extraction returns nothing instead of a string.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
# Helper function to split text into sentences using regex
def split_text_into_sentences(text):
    """Split *text* at every '.', '!' or '?' and return the non-empty pieces.

    The terminator characters themselves are dropped, each piece is stripped
    of surrounding whitespace, and pieces that end up empty are discarded.
    """
    fragments = (piece.strip() for piece in re.split(r'[.!?]', text))
    return [fragment for fragment in fragments if fragment]
# Translation function
@spaces.GPU(duration=120)
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate *source_text* with a Helsinki-NLP OPUS-MT Marian checkpoint.

    The text is wrapped into chunks of at most 512 characters, the chunks are
    translated in batches, and the translated chunks are concatenated.

    Args:
        source_text: Text to translate.
        source_lang: Source language code used to build the checkpoint name.
        target_lang: Target language code used to build the checkpoint name.
        batch_size: Number of chunks sent to the model per generate() call.

    Returns:
        The translated chunks, each followed by a single space (so a
        non-empty result ends with a space). When the source and target
        languages are equal, *source_text* is returned unchanged.
    """
    # Identical languages: nothing to translate, and the checkpoint name built
    # below would pair a language with itself.
    if source_lang == target_lang:
        return source_text

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    text_chunks = textwrap.wrap(source_text, 512)
    translated_chunks = []
    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # The batch is padded, so hand the attention mask to generate()
        # explicitly rather than passing the input ids alone.
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_new_tokens=512,
        )
        translated_chunks.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )
    return "".join(f"{chunk} " for chunk in translated_chunks)
# Function to preprocess the text (normalization, punctuation)
def preprocess(text):
    """Normalize *text* before it is tokenized as a TTS prompt.

    Hyphens are replaced by spaces, and a full stop is appended when the
    text does not already end with '.', '!' or '?'.

    Args:
        text: The sentence to normalize.

    Returns:
        The normalized sentence. An empty input is returned as an empty
        string instead of raising IndexError on the last-character check.
    """
    text = text.replace("-", " ")
    if text and not text.endswith((".", "!", "?")):
        text += "."
    return text
# Function to generate audio for a single sentence
@spaces.GPU(duration=120)
def generate_single_wav_from_text(sentence, description):
    """Synthesize speech for one sentence in the voice given by *description*.

    The seed is reset before every call. The stripped description is
    tokenized as the model's `input_ids`, and the preprocessed sentence as
    its `prompt_input_ids`.

    Returns:
        A `(SAMPLE_RATE, audio_arr)` tuple, where `audio_arr` is the model
        output moved to the CPU, converted to NumPy and squeezed.
    """
    set_seed(SEED)
    voice_tokens = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    text_tokens = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)
    generate_kwargs = dict(
        input_ids=voice_tokens.input_ids,
        attention_mask=voice_tokens.attention_mask,
        prompt_input_ids=text_tokens.input_ids,
        prompt_attention_mask=text_tokens.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    waveform = tts_model.generate(**generate_kwargs)
    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()
# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # File-type filters are given as extensions with a leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(
                label="Voice Description",
                lines=2,
                value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            # One audio player that receives the growing, concatenated
            # waveform; this replaces appending new Audio components to a
            # Column from inside the event handler.
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
        """Yield progressively longer results while the PDF is synthesized.

        Extracts the PDF text, optionally translates it, splits it into
        sentences and generates audio sentence by sentence.

        Yields:
            `(all_audio_data, all_text)` after every sentence, where
            `all_audio_data` is the cumulative list of
            `(sample_rate, audio_arr)` tuples and `all_text` is the
            cumulative Markdown transcript.

        Raises:
            gr.Error: If no PDF was uploaded or no sentence could be found.
        """
        if pdf_input is None:
            raise gr.Error("Please upload a PDF file first.")
        # NOTE(review): depending on the Gradio version/configuration the File
        # component passes either a path string or an object exposing `.name`;
        # accept both.
        pdf_path = getattr(pdf_input, "name", pdf_input)
        text = pdf_to_text(pdf_path)
        if translate_checkbox:
            text = translate(text, source_lang, target_lang)
        sentences = split_text_into_sentences(text)
        if not sentences:
            raise gr.Error("No text could be extracted from the PDF.")
        all_audio_data = []
        transcript_parts = []
        for sentence in sentences:
            all_audio_data.append(generate_single_wav_from_text(sentence, description))
            transcript_parts.append(f"**Sentence**: {sentence}\n\n")
            yield all_audio_data, "".join(transcript_parts)

    def run_pipeline(pdf_input, translate_checkbox, source_lang, target_lang, description):
        """Stream `(audio, markdown)` updates for the two output components.

        Each update carries every sentence generated so far joined into one
        waveform, as a `(sample_rate, data)` tuple, plus the transcript.
        """
        for audio_data_list, markdown_text in handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
            sample_rate = audio_data_list[0][0]
            # Flatten each clip so that concatenation also works if squeeze()
            # left a zero-dimensional array.
            waveform = np.concatenate(
                [np.asarray(audio_arr, dtype=np.float32).reshape(-1) for _, audio_arr in audio_data_list]
            )
            yield (sample_rate, waveform), markdown_text

    run_button.click(run_pipeline, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[audio_output, markdown_output])
demo.queue()
demo.launch(share=True)