"""Gradio app: normalize Persian text with hazm and synthesize it with Piper."""

import functools
import typing
import wave
from io import BytesIO

import gradio as gr
import hazm
import numpy as np
from huggingface_hub import hf_hub_download
from piper import PiperVoice
from transformers import pipeline

normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()
tagger_path = hf_hub_download(repo_id="gyroing/HAZM_POS_TAGGER", filename="pos_tagger.model")
tagger = hazm.POSTagger(model=tagger_path)

# Characters used by fix_words, spelled as escapes because several of them are
# invisible or combining marks that are easy to lose when editing.
_KASRA = "\u0650"  # ARABIC KASRA (combining vowel mark)
_HEH = "\u0647"  # ARABIC LETTER HEH
_ALEF = "\u0627"  # ARABIC LETTER ALEF
_ZWNJ_YEH = "\u200c\u06cc"  # ZERO WIDTH NON-JOINER + ARABIC LETTER FARSI YEH

_VOICE_REPO_ID = "gyroing/Persian-Piper-Model-gyro"
# NOTE(review): "meduim" is the file name exactly as the original code requested
# it; do not "correct" the spelling without checking the repository contents.
_VOICE_MODEL_FILE = "fa_IR-gyro-meduim.onnx"
_VOICE_CONFIG_FILE = "fa_IR-gyro-meduim.onnx.json"


def preprocess_text(text: str) -> str:
    """Split/normalize text into sentences/words with hazm.

    The text is normalized, split into sentences, each sentence is tokenized
    into words, and the words are passed through ``fix_words``.

    Args:
        text: Raw input text.

    Returns:
        A single string: the fixed words of every sentence joined by spaces.
    """
    text = normalizer.normalize(text)
    return " ".join(
        " ".join(fix_words(word_tokenizer.tokenize(sentence)))
        for sentence in sent_tokenizer.tokenize(text)
    )


def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append an explicit kasra to words whose POS tag ends in ``"Z"``.

    Presumably a tag ending in "Z" marks an ezafe construction in the tagger's
    tagset -- TODO confirm against the POS tagger model.

    For such a word that does not already end in a kasra:
      * if it ends in heh not preceded by alef, ZWNJ + yeh is appended first;
      * then the kasra is appended.

    Args:
        words: Tokens of one sentence.

    Returns:
        The tokens in the same order, with the adjustments above applied.
    """
    fixed_words = []
    for word, pos in tagger.tag(words):
        # endswith() is used instead of negative indexing so that empty tokens
        # or tags and one-character words cannot raise IndexError.
        if word and pos and pos.endswith("Z") and not word.endswith(_KASRA):
            if word.endswith(_HEH) and not word.endswith(_ALEF + _HEH):
                word += _ZWNJ_YEH
            word += _KASRA
        fixed_words.append(word)
    return fixed_words


@functools.lru_cache(maxsize=1)
def _load_voice() -> PiperVoice:
    """Resolve the model files via the Hugging Face Hub and load the voice once."""
    model_path = hf_hub_download(repo_id=_VOICE_REPO_ID, filename=_VOICE_MODEL_FILE)
    config_path = hf_hub_download(repo_id=_VOICE_REPO_ID, filename=_VOICE_CONFIG_FILE)
    return PiperVoice.load(model_path, config_path)


def synthesize_speech(text: str) -> typing.Tuple[int, np.ndarray]:
    """Synthesize ``text`` with the Piper voice.

    Args:
        text: Text to speak, passed to the voice as-is.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a 1-D ``int16`` array.
        This is the tuple form a ``gr.Audio(type="numpy")`` output accepts.
    """
    voice = _load_voice()

    # Piper writes into a wave writer, so render the WAV into memory first.
    buffer = BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono
        voice.synthesize(text, wav_file)

    # Read the frames back through the wave module so that only PCM samples
    # (not the RIFF/WAV header bytes) end up in the array.
    buffer.seek(0)
    with wave.open(buffer, "rb") as wav_in:
        sample_rate = wav_in.getframerate()
        frames = wav_in.readframes(wav_in.getnframes())

    return sample_rate, np.frombuffer(frames, dtype=np.int16)


def _on_submit(text: str) -> typing.Optional[typing.Tuple[int, np.ndarray]]:
    """Click handler: preprocess the textbox value, then synthesize it.

    Returns ``None`` for blank input so nothing is sent to the synthesizer.
    """
    if not text or not text.strip():
        return None
    return synthesize_speech(preprocess_text(text))


# Using Gradio Blocks
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using PiperVoice.")
    # The Textbox component itself is the event input; preprocessing happens
    # on its submitted value inside the click handler.
    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy")
    submit_button = gr.Button("Synthesize")
    submit_button.click(_on_submit, inputs=input_text, outputs=[output_audio])

# Run the app
blocks.launch()