# TTSNepali /
# aryamanstha's picture
# 2a70d4e verified
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech,pipeline
from datasets import Dataset, Audio,load_dataset
import os
import torch
import pandas as pd
import numpy as np
import librosa
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5HifiGan
import soundfile as sf
import matplotlib.pyplot as plt
import lakh_crore_numbers as lcn
import re
from nepali.number import nepalinumber
# Convert Nepali digits to words
def convert_to_text(text):
    """Spell out every digit in ``text`` as Nepali words.

    Two notations are handled, for both Devanagari and ASCII digits:

    * A run of digits wrapped in parentheses, e.g. ``(१२३)`` or ``(123)``,
      is treated as one whole number and rendered through
      ``lcn.format_to_nepali_words``. The parentheses themselves are kept.
    * Every other digit is read out one digit at a time; each digit word is
      followed by a single space.

    Args:
        text: The sentence to normalise.

    Returns:
        ``text`` with all digits replaced by Nepali words.
    """
    digit_words = {
        '०': 'शून्य', '१': 'एक', '२': 'दुई', '३': 'तीन',
        '४': 'चार', '५': 'पांच', '६': 'छ', '७': 'सात',
        '८': 'आठ', '९': 'नौ', '1': 'एक', '2': 'दुई',
        '3': 'तीन', '4': 'चार', '5': 'पांच', '6': 'छ',
        '7': 'सात', '8': 'आठ', '9': 'नौ', '0': 'शून्य',
    }
    bracketed_number = r'\(([०१२३४५६७८९0123456789]+)\)'

    def spell_whole_number(match):
        # int() parses Devanagari as well as ASCII decimal digits.
        words = lcn.format_to_nepali_words(int(match.group(1)))
        return '(' + words + ')'

    # Substituting on the match itself guarantees that the bracketed number
    # is the one rewritten, even if the same digits also appear earlier in
    # the sentence outside parentheses.
    text = re.sub(bracketed_number, spell_whole_number, text)

    # Whatever digits remain are spelled individually in a single pass.
    digit_table = str.maketrans(
        {digit: word + ' ' for digit, word in digit_words.items()}
    )
    return text.translate(digit_table)
import datetime
from nepali.datetime import nepalihumanize, nepalidatetime
# Convert Nepali dates to words (accepted formats: yyyy/mm/dd, yyyy-mm-dd or yyyy.mm.dd)
def nepali_date_to_words(input_text):
    """Replace Nepali (Bikram Sambat) dates in ``input_text`` with words.

    A date is four digits, two digits and two digits separated by ``-``,
    ``/`` or ``.`` and delimited by word boundaries. ``\\d`` matches
    Devanagari digits too, so ``2080-01-15`` and ``२०८०-०१-१५`` are both
    recognised. Each date becomes
    ``"<year words> साल <month name> <day words> गते"``.

    Args:
        input_text: The sentence to normalise.

    Returns:
        ``input_text`` with every recognised date spelled out.

    Raises:
        ValueError: If the month is outside 1-12 or the day outside 1-32.
    """
    date_pattern = r'\b(\d{4})[-/.](\d{2})[-/.](\d{2})\b'

    def spell_date(match):
        # int() understands Devanagari digits, so no wrapper type is needed.
        year, month, day = (int(part) for part in match.groups())
        # Bikram Sambat months can run to 32 days, so 32 must not be
        # rejected here; the exact per-month check is left to nepalidatetime.
        if not 1 <= month <= 12 or not 1 <= day <= 32:
            raise ValueError(f"Invalid date: {match.group(0)}")
        np_datetime = nepalidatetime(year, month, day)
        # The result is split on whitespace and read as
        # "<year> <month name> <day>" - presumably what threshold=0 plus
        # this format string yields; confirm against the nepali package.
        formatted = nepalihumanize(np_datetime, threshold=0, format="%Y %B %d")
        parts = formatted.split()
        year_words = lcn.format_to_nepali_words(int(parts[0]))
        day_words = lcn.format_to_nepali_words(int(parts[2]))
        return f"{year_words} साल {parts[1]} {day_words} गते"

    # re.sub rewrites exactly the spans that matched, rather than every
    # occurrence of the same characters elsewhere in the text.
    return re.sub(date_pattern, spell_date, input_text)
# Convert English dates to words (accepted formats: [yyyy/mm/dd], [yyyy-mm-dd] or [yyyy.mm.dd])
def english_date_to_words(input_text):
    """Replace bracketed English dates in ``input_text`` with Nepali words.

    Only dates written inside square brackets are touched, e.g.
    ``[2024-01-15]``, ``[2024/01/15]`` or ``[2024.01.15]``. The brackets are
    kept and their content becomes ``"<year words> <month name> <day words>"``.
    Years from 2000 onwards are read as a whole number; earlier years are
    read as ``"<century> सय <remainder>"``.

    Args:
        input_text: The sentence to normalise.

    Returns:
        ``input_text`` with every bracketed date spelled out.

    Raises:
        ValueError: If the month is outside 1-12 or the day outside 1-31.
    """
    date_pattern = r'\[(\d{4})[-/.](\d{2})[-/.](\d{2})\]'

    def spell_date(match):
        year, month, day = (int(part) for part in match.groups())
        if not 1 <= month <= 12 or not 1 <= day <= 31:
            # Report the date without its surrounding brackets.
            raise ValueError(f"Invalid date: {match.group(0)[1:-1]}")

        century, year_in_century = divmod(year, 100)
        if century >= 20:
            year_words = lcn.format_to_nepali_words(year)
        else:
            year_words = lcn.format_to_nepali_words(century) + " " + "सय"
            # A round century (e.g. 1900) has no remainder to read out.
            if year_in_century:
                year_words += " " + lcn.format_to_nepali_words(year_in_century)

        # NOTE(review): `months` is not defined in the visible part of this
        # file. It is looked up with the unpadded month number as a string
        # ("1".."12"); confirm that the table exists and uses those keys.
        month_name = months.get(str(month))
        day_words = lcn.format_to_nepali_words(day)
        return f"[{year_words} {month_name} {day_words}]"

    # re.sub rewrites only the bracketed spans; an identical date written
    # without brackets elsewhere in the text is left alone.
    return re.sub(date_pattern, spell_date, input_text)
def filter_len_text(input_length, dataset=None):
    """Keep the rows whose transcription has at least ``input_length`` words.

    Args:
        input_length: Minimum number of whitespace-separated words a
            transcription must contain to be kept.
        dataset: A pandas DataFrame with a ``transcription`` column. It is
            not modified. When omitted, a default dataset is loaded (see the
            note in the body).

    Returns:
        A new DataFrame containing only the qualifying rows, with
        ``transcription`` cast to ``str`` and the index renumbered from 0.
    """
    if dataset is None:
        # NOTE(review): the original body had both loaders commented out, so
        # `dataset` was read before assignment (UnboundLocalError). This
        # restores the later of the two commented-out sources and converts
        # it to pandas, which the operations below require - confirm that
        # this is the intended dataset.
        dataset = load_dataset('awajai/phase2dataset-tts', split='train').to_pandas()

    transcriptions = dataset['transcription'].astype(str)
    # split() with no argument already ignores leading/trailing whitespace.
    long_enough = transcriptions.str.split().str.len() >= input_length
    filtered = dataset.assign(transcription=transcriptions)[long_enough]
    filtered = filtered.reset_index(drop=True)
    print('Filtered Dataset:', len(filtered))
    return filtered
from cosine_similarity import calculate_cosine_similarity
def get_embedding(input_text):
    """Look up a reference audio entry for ``input_text``.

    The input is measured in whitespace-separated words and the dataset is
    filtered down to transcriptions that are at least that long.

    Args:
        input_text: The sentence that is about to be synthesised.

    Returns:
        ``matching_audio_arr`` (see the review note in the body).
    """
    word_count = len(input_text.split())
    dataset = filter_len_text(input_length=word_count)
    # NOTE(review): `matching_audio_arr` is never assigned in the visible
    # code, and the filtered `dataset` above is not used. The step that
    # picks a match appears to be missing, so this line raises NameError
    # unless the name is provided at module level elsewhere.
    return matching_audio_arr
def process_text(input_text):
    """Normalise ``input_text`` by spelling out its dates and numbers.

    The stages run in a fixed order: bracketed English dates, then Nepali
    dates, then any remaining digits.

    Args:
        input_text: The raw sentence.

    Returns:
        The sentence after all three normalisation stages.
    """
    normalised = input_text
    for expand in (english_date_to_words, nepali_date_to_words, convert_to_text):
        normalised = expand(normalised)
    return normalised
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_synthesiser(model_id):
    """Build the text-to-speech pipeline for ``model_id`` once and reuse it."""
    return pipeline("text-to-speech", model_id)


@lru_cache(maxsize=None)
def _default_speaker_embedding():
    """Load x-vector 7306 from the CMU ARCTIC set, with a leading batch axis."""
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def text_to_speech(input_text, language):
    """Synthesise ``input_text`` and return it in the form ``gr.Audio`` expects.

    Args:
        input_text: The text to speak. For Nepali it is first passed through
            ``process_text`` so that dates and digits are spelled out.
        language: ``"Nepali"`` selects the Nepali fine-tuned model; any other
            value selects the stock English SpeechT5 model.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is an ``int16``
        NumPy array.
    """
    if language == "Nepali":
        synthesiser = _load_synthesiser("aryamanstha/speecht5_nepali_oslr43_oslr143")
        text = process_text(input_text)
        # NOTE(review): the original built this embedding from `np_data`,
        # whose only assignment was commented out, so the Nepali branch
        # raised NameError. Until the intended source
        # (get_embedding(...)["embedding_audio_2"]) is restored, the x-vector
        # used for English is reused so the branch runs - confirm the
        # resulting voice is acceptable.
        speaker_embedding = _default_speaker_embedding()
    else:
        synthesiser = _load_synthesiser("microsoft/speecht5_tts")
        text = input_text
        speaker_embedding = _default_speaker_embedding()

    tts_output = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    # Clip before scaling so that a sample outside [-1, 1] saturates rather
    # than wrapping around when cast to int16.
    audio = np.clip(np.asarray(tts_output["audio"]), -1.0, 1.0)
    speech = (audio * 32767).astype(np.int16)
    # Prefer the rate reported by the pipeline; 16000 was the original
    # hard-coded value and remains the fallback.
    return (tts_output.get("sampling_rate", 16000), speech)
demo = gr.Interface(
gr.Textbox(lines=5, label="Input Text"),
gr.Radio(["English","Nepali"],label="Choose Language")
outputs=[gr.Audio( label="Output Audio",type="numpy")],
title="SpeechT5: Text to Speech For Nepali Language",
description="SpeechT5 is a speech-to-text model that converts text into speech. Type in the text you want to convert into speech.",