Spaces:
Running
Running
import gradio as gr | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech,pipeline | |
from datasets import Dataset, Audio,load_dataset | |
import os | |
import torch | |
import pandas as pd | |
import numpy as np | |
import librosa | |
from speechbrain.inference.speaker import EncoderClassifier | |
from transformers import SpeechT5HifiGan | |
import soundfile as sf | |
import matplotlib.pyplot as plt | |
import lakh_crore_numbers as lcn | |
import re | |
from nepali.number import nepalinumber | |
def convert_to_text(text):
    """Spell out the digits found in *text* as Nepali words.

    Both Devanagari (०-९) and ASCII (0-9) digits are recognised. Two passes
    are made over the text:

    1. A run of digits wrapped in parentheses, e.g. ``(१२३)`` or ``(123)``,
       is treated as one number: it is parsed with ``nepalinumber`` and
       replaced by ``lcn.format_to_nepali_words(int(...))``. The surrounding
       parentheses are kept.
    2. Every digit still present afterwards is replaced, one by one, by its
       Nepali name followed by a single space.

    Args:
        text: The input string.

    Returns:
        A new string with the digits replaced as described above.
    """
    digit_class = '[०१२३४५६७८९0123456789]'
    bracketed_number = re.compile(r'\((' + digit_class + r'+)\)')
    single_digit = re.compile(digit_class)
    digit_names = {
        '०': 'शून्य', '१': 'एक', '२': 'दुई', '३': 'तीन',
        '४': 'चार', '५': 'पांच', '६': 'छ', '७': 'सात',
        '८': 'आठ', '९': 'नौ', '1': 'एक', '2': 'दुई',
        '3': 'तीन', '4': 'चार', '5': 'पांच', '6': 'छ',
        '7': 'सात', '8': 'आठ', '9': 'नौ', '0': 'शून्य',
    }

    def _bracketed_to_words(match):
        # Substitute at the match site itself so that an identical,
        # unbracketed number earlier in the text is never the one rewritten.
        value = int(nepalinumber(match.group(1)))
        return "(" + lcn.format_to_nepali_words(value) + ")"

    def _digit_to_word(match):
        return digit_names[match.group()] + " "

    text = bracketed_number.sub(_bracketed_to_words, text)
    return single_digit.sub(_digit_to_word, text)
import datetime | |
from nepali.datetime import nepalihumanize, nepalidatetime | |
def nepali_date_to_words(input_text):
    """Replace Nepali dates written as digits with their spoken form.

    A date is recognised in the formats ``yyyy/mm/dd``, ``yyyy-mm-dd`` or
    ``yyyy.mm.dd``. Each match is rewritten as
    ``<year in words> साल <month name> <day in words> गते``.

    Args:
        input_text: Text that may contain dates.

    Returns:
        The text with every recognised date replaced by words.

    Raises:
        ValueError: If a matched date has a month outside 1-12 or a day
            outside 1-31.
    """
    nepali_pattern = r'\b\d{4}[-/.]\d{2}[-/.]\d{2}\b|[\u0966-\u096F]+\d{4}[-/.]\d{2}[-/.]\d{2}\b'

    def _date_in_words(match):
        date = match.group()
        year, month, day = (
            int(nepalinumber(part)) for part in re.split(r'[-/.]', date)
        )
        # Reject zero as well as too-large values before building the date.
        if not (1 <= month <= 12 and 1 <= day <= 31):
            raise ValueError(f"Invalid date: {date}")
        np_datetime = nepalidatetime(year, month, day)
        # threshold=0 is presumably what makes nepalihumanize return the
        # formatted date instead of a relative phrase -- TODO confirm.
        output = nepalihumanize(np_datetime, threshold=0, format="%Y %B %d")
        fields = output.split()
        year_words = lcn.format_to_nepali_words(int(fields[0]))
        day_words = lcn.format_to_nepali_words(int(fields[2]))
        return f"{year_words} साल {fields[1]} {day_words} गते"

    # Substituting at the match site leaves look-alike substrings that the
    # pattern itself rejected (e.g. inside a longer digit run) untouched.
    return re.sub(nepali_pattern, _date_in_words, input_text)
def english_date_to_words(input_text):
    """Replace bracketed English (Gregorian) dates with Nepali words.

    A date must be wrapped in square brackets and written as
    ``[yyyy/mm/dd]``, ``[yyyy-mm-dd]`` or ``[yyyy.mm.dd]``. The date inside
    the brackets is rewritten as ``<year> <month name> <day>``; the brackets
    themselves are kept.

    Years from 2000 onwards are read as one number. Earlier years are read
    as ``<century> सय <remainder>`` (e.g. "nineteen hundred ninety-nine");
    the remainder is omitted when it is zero.

    Args:
        input_text: Text that may contain bracketed dates.

    Returns:
        The text with every bracketed date replaced by words.

    Raises:
        ValueError: If a matched date has a month outside 1-12 or a day
            outside 1-31.
    """
    english_pattern = r'\[(\d{4}[-/.]\d{2}[-/.]\d{2})\]'
    months = {
        1: 'जनवरी', 2: 'फेब्रुअरी', 3: 'मार्च',
        4: 'अप्रिल', 5: 'मे', 6: 'जुन',
        7: 'जुलाई', 8: 'अगस्त', 9: 'सेप्टेम्बर',
        10: 'अक्टोबर', 11: 'नोवेम्बर', 12: 'डिसेम्बर',
    }

    def _date_in_words(match):
        date = match.group(1)
        year, month, day = (
            int(nepalinumber(part)) for part in re.split(r'[-/.]', date)
        )
        # The lower bound matters: month 0 has no name and would otherwise
        # end up in the output as the literal text "None".
        if not (1 <= month <= 12 and 1 <= day <= 31):
            raise ValueError(f"Invalid date: {date}")
        century, year_of_century = divmod(year, 100)
        if century >= 20:
            year_words = lcn.format_to_nepali_words(year)
        else:
            year_words = lcn.format_to_nepali_words(century) + " " + "सय"
            if year_of_century:
                year_words += " " + lcn.format_to_nepali_words(year_of_century)
        day_words = lcn.format_to_nepali_words(day)
        return f"[{year_words} {months[month]} {day_words}]"

    # Only the bracketed occurrence is rewritten; the same digits appearing
    # without brackets are left alone.
    return re.sub(english_pattern, _date_in_words, input_text)
def filter_len_text(input_length, csv_path='transcription.csv'):
    """Load transcriptions and keep those with at least *input_length* words.

    Args:
        input_length: Minimum number of whitespace-separated words a
            transcription must contain to be kept.
        csv_path: CSV file to read. It must have a ``transcription`` column.
            Defaults to ``'transcription.csv'`` in the working directory.

    Returns:
        A ``pandas.DataFrame`` holding the matching rows, re-indexed from 0.
        The number of rows kept is also printed.
    """
    dataset = pd.read_csv(csv_path)
    # Cast to str so that non-text cells can be split like any other value.
    dataset['transcription'] = dataset['transcription'].astype(str)
    word_counts = dataset['transcription'].map(lambda text: len(text.split()))
    dataset = dataset[word_counts >= input_length].reset_index(drop=True)
    print('Filtered Dataset:', len(dataset))
    return dataset
from cosine_similarity import calculate_cosine_similarity | |
def get_embedding(input_text):
    """Look up the best-matching audio for *input_text*.

    The number of whitespace-separated words in *input_text* is printed and
    used as the minimum length passed to ``filter_len_text``; the filtered
    dataset is then handed to ``calculate_cosine_similarity`` together with
    the text, and that function's result is returned unchanged.
    """
    word_count = len(input_text.split())
    print(word_count)
    candidates = filter_len_text(input_length=word_count)
    return calculate_cosine_similarity(candidates, input_text=input_text)
def process_text(input_text):
    """Normalise *input_text* by spelling out dates and numbers.

    The steps run in a fixed order: bracketed English dates first, then
    Nepali dates, and finally any digits that are still left.
    """
    normalisers = (english_date_to_words, nepali_date_to_words, convert_to_text)
    result = input_text
    for normalise in normalisers:
        result = normalise(result)
    return result
# Pipelines and speaker embeddings, built on first use and then reused.
_TTS_CACHE = {}


def _cached(key, factory):
    """Return the object stored under *key*, building it once with *factory*."""
    if key not in _TTS_CACHE:
        _TTS_CACHE[key] = factory()
    return _TTS_CACHE[key]


def _nepali_speaker_embedding():
    """Load the Nepali speaker embedding from ``embedding_audio_2.npy``."""
    return torch.tensor(np.load(file='embedding_audio_2.npy')).unsqueeze(0)


def _english_speaker_embedding():
    """Load x-vector 7306 of the CMU Arctic validation split."""
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    return torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def text_to_speech(input_text, language):
    """Synthesise speech for *input_text*.

    Args:
        input_text: The text to speak.
        language: ``"Nepali"`` selects the Nepali model and runs the text
            through ``process_text`` first; any other value selects the
            English ``microsoft/speecht5_tts`` model and uses the text as is.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is an ``int16`` NumPy
        array, the form the ``gr.Audio(type="numpy")`` output receives.
    """
    if language == "Nepali":
        model_id = "aryamanstha/speecht5_nepali_oslr43_oslr143"
        embedding_factory = _nepali_speaker_embedding
        # Normalise before touching the model so bad input fails fast.
        text = process_text(input_text)
    else:
        model_id = "microsoft/speecht5_tts"
        embedding_factory = _english_speaker_embedding
        text = input_text

    # Building a pipeline or loading the x-vector dataset is slow, so each
    # is done once per process instead of once per request.
    synthesiser = _cached(
        ("pipeline", model_id),
        lambda: pipeline("text-to-speech", model_id),
    )
    speaker_embedding = _cached(("embedding", model_id), embedding_factory)

    tts_output = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    # Scale to 16-bit PCM; assumes the pipeline returns floats in [-1, 1]
    # -- TODO confirm.
    speech = (np.array(tts_output["audio"]) * 32767).astype(np.int16)
    return (16000, speech)
# Gradio UI: a text box and a language selector feed text_to_speech, whose
# (sample_rate, samples) result is played back by the audio component.
demo = gr.Interface(
    fn=text_to_speech,
    inputs=[
        gr.Textbox(lines=5, label="Input Text"),
        gr.Radio(["English", "Nepali"], label="Choose Language"),
    ],
    outputs=[gr.Audio(label="Output Audio", type="numpy")],
    title="SpeechT5: Text to Speech For Nepali Language",
    description="SpeechT5 is a text-to-speech model that converts text into speech. Type in the text you want to convert into speech.",
)
demo.launch(share=True)