jaskaran Singh committed on
Commit
390d94d
1 Parent(s): d0a22d2
.gitattributes CHANGED
@@ -37,3 +37,6 @@ maha_tts/pretrained_models/smolie/T2S/t2s_best.pt filter=lfs diff=lfs merge=lfs
37
  maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt filter=lfs diff=lfs merge=lfs -text
38
  maha_tts/pretrained_models/hifigan/config.json filter=lfs diff=lfs merge=lfs -text
39
  maha_tts/pretrained_models/hifigan/g_02500000 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
37
  maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt filter=lfs diff=lfs merge=lfs -text
38
  maha_tts/pretrained_models/hifigan/config.json filter=lfs diff=lfs merge=lfs -text
39
  maha_tts/pretrained_models/hifigan/g_02500000 filter=lfs diff=lfs merge=lfs -text
40
+ maha_tts/pretrained_models/hifigan filter=lfs diff=lfs merge=lfs -text
41
+ maha_tts/pretrained_models/Smolie-en filter=lfs diff=lfs merge=lfs -text
42
+ maha_tts/pretrained_models/Smolie-in filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,37 +1,37 @@
1
  <div align="center">
2
 
3
  <h1>MahaTTS: An Open-Source Large Speech Generation Model in the making</h1>
4
- a Dubverse Black initiative <br> <br>
5
 
6
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-eOQqznKWwAfMdusJ_LDtDhjIyAlSMrG?usp=sharing)
7
  [![Discord Shield](https://discordapp.com/api/guilds/1162007551987171410/widget.png?style=shield)](https://discord.gg/4VGnrgpBN)
8
-
9
  </div>
10
 
11
  ------
12
 
13
  ## Description
14
- MahaTTS (Maha means 'Great' in sanskrit), is a speech generation model which is inspired from tortoise-tts, except it uses seamless M4t wav2vec2 to extract semantic tokens.
15
- Since seamless M4t wav2vec2 is trained on multilingual data, it makes this model easier to scale on multilingual data.
16
 
17
- <img width="993" alt="Screenshot 2023-11-19 at 11 53 52 PM" src="https://github.com/dubverse-ai/MahaTTS/assets/32906806/7429d3b6-3f19-4bd8-9005-ff9e16a698f8">
18
 
19
- ### Architecture
20
- | Model (Smolie) | Parameters | Model Type | Output |
21
- |:-------------------------:|:----------:|------------|:-----------------:|
22
- | Text to Semantic (M1) | 69 M | Causal LM | 10,001 Tokens |
23
- | Semantic to MelSpec(M2) | 108 M | Diffusion | 2x 80x Melspec |
24
- | Hifi Gan Vocoder | 13 M | GAN | Audio Waveform |
 
 
 
 
25
 
26
  ## Features
27
- 1. Multilinguality
 
28
  2. Realistic Prosody and intonation
29
  3. Multi-voice capabilities
30
 
31
- ## Current Progress
32
- Trained on 200 hours of LibriTTS model -> 'Smolie'
33
-
34
  ## Installation
 
35
  ```bash
36
  pip install git+https://github.com/dubverse-ai/MahaTTS.git
37
  ```
@@ -39,8 +39,88 @@ pip install git+https://github.com/dubverse-ai/MahaTTS.git
39
  ```bash
40
  pip install maha-tts
41
  ```
42
  ## Roadmap
43
- - [x] Smolie - eng
44
- - [ ] Smolie - indic
45
- - [ ] Optimizations for inference
1
  <div align="center">
2
 
3
  <h1>MahaTTS: An Open-Source Large Speech Generation Model in the making</h1>
4
+ a <a href = "https://black.dubverse.ai">Dubverse Black</a> initiative <br> <br>
5
 
6
  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-eOQqznKWwAfMdusJ_LDtDhjIyAlSMrG?usp=sharing)
7
  [![Discord Shield](https://discordapp.com/api/guilds/1162007551987171410/widget.png?style=shield)](https://discord.gg/4VGnrgpBN)
 
8
  </div>
9
 
10
  ------
11
 
12
  ## Description
 
 
13
 
14
+ MahaTTS, with Maha signifying 'Great' in Sanskrit, is a text-to-speech model developed by [Dubverse.ai](https://dubverse.ai). We drew inspiration from the [tortoise-tts](https://github.com/neonbjb/tortoise-tts) model, but our model uniquely uses Seamless M4T's wav2vec2 for semantic token extraction. Since this variant of wav2vec2 is trained on multilingual data, it makes our model easier to scale across languages.
15
 
16
+ We are providing access to pretrained model checkpoints, which are ready for inference and available for commercial use.
17
+
18
+ <img width="993" alt="MahaTTS Architecture" src="https://github.com/dubverse-ai/MahaTTS/assets/32906806/7429d3b6-3f19-4bd8-9005-ff9e16a698f8">
19
+
20
+ ## Updates
21
+
22
+ **2023-11-13**
23
+
24
+ - MahaTTS released! Open-sourced Smolie
25
+ - Join our [Discord](https://discord.gg/uFPrzBqyF2) for community support and access to new features
26
 
27
  ## Features
28
+
29
+ 1. Multilinguality (coming soon)
30
  2. Realistic Prosody and intonation
31
  3. Multi-voice capabilities
32
 
 
 
 
33
  ## Installation
34
+
35
  ```bash
36
  pip install git+https://github.com/dubverse-ai/MahaTTS.git
37
  ```
 
39
  ```bash
40
  pip install maha-tts
41
  ```
42
+
43
+ ## API Usage
44
+
45
+ ```python
46
+ !gdown --folder 1-HEc3V4f6X93I8_IfqExLfL3s8I_dXGZ -q # download speakers ref files
47
+
48
+ import torch,glob
49
+ from maha_tts import load_models,infer_tts
50
+ from scipy.io.wavfile import write
51
+ from IPython.display import Audio,display
52
+
53
+ # PATH TO THE SPEAKERS WAV FILES
54
+ speaker =['/content/infer_ref_wavs/2272_152282_000019_000001/',
55
+ '/content/infer_ref_wavs/2971_4275_000049_000000/',
56
+ '/content/infer_ref_wavs/4807_26852_000062_000000/',
57
+ '/content/infer_ref_wavs/6518_66470_000014_000002/']
58
+
59
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
60
+ diff_model,ts_model,vocoder,diffuser = load_models('Smolie',device)
61
+ print('Using:',device)
62
+
63
+ speaker_num = 0 # @param ["0", "1", "2", "3"] {type:"raw"}
64
+ text = "I freakin love how Elon came to life the moment they started talking about gaming and specifically diablo, you can tell that he didn't want that part of the discussion to end, while Lex to move on to the next subject! Once a true gamer, always a true gamer!" # @param {type:"string"}
65
+
66
+ ref_clips = glob.glob(speaker[speaker_num]+'*.wav')
67
+ audio,sr = infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder)
68
+
69
+ write('/content/test.wav',sr,audio)
70
+ ```
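The `Audio` and `display` imports in the snippet above can also be used to preview the result inline when running in Colab or Jupyter; this is an optional addition, not part of the original example:

```python
# Preview the generated clip inline (Colab / Jupyter only).
# `audio` and `sr` come from the infer_tts(...) call above.
display(Audio(audio, rate=sr))

# Or reload the file that was written to disk.
display(Audio('/content/test.wav'))
```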
71
  ## Roadmap
72
+ - [x] Smolie - eng (trained on 200 hours of LibriTTS)
73
+ - [ ] Smolie - indic (training on Indian languages, coming soon)
74
+ - [ ] Optimizations for inference (looking for contributors, check issues)
75
+
76
+ ## Some Generated Samples
77
+ 0 -> "I seriously laughed so much hahahaha (seals with headphones...) and appreciate both the interviewer and the subject. Major respect for two extraordinary humans - and in this time of gratefulness, I'm thankful for you both and this forum!"
78
+
79
+ 1 -> "I freakin love how Elon came to life the moment they started talking about gaming and specifically diablo, you can tell that he didn't want that part of the discussion to end, while Lex to move on to the next subject! Once a true gamer, always a true gamer!"
80
+
81
+ 2 -> "hello there! how are you?" (this one didn't work well; the M1 model hallucinated)
82
+
83
+ 3 -> "Who doesn't love a good scary story, something to send a chill across your skin in the middle of summer's heat or really, any other time? And this year, we're celebrating the two hundredth birthday of one of the most famous scary stories of all time: Frankenstein."
84
+
85
+
86
+
87
+ https://github.com/dubverse-ai/MahaTTS/assets/32906806/462ee134-5d8c-43c8-a425-3b6cabd2ff85
88
+
89
+
90
+
91
+
92
+ https://github.com/dubverse-ai/MahaTTS/assets/32906806/40c62402-7f65-4a35-b739-d8b8a082ad62
93
+
94
+
95
+
96
+ https://github.com/dubverse-ai/MahaTTS/assets/32906806/f0a9628c-ef81-450d-ab82-2f4c4626864e
97
+
98
+
99
+
100
+ https://github.com/dubverse-ai/MahaTTS/assets/32906806/15476151-72ea-410d-bcdc-177433df7884
101
+
102
+
103
+ ## Technical Details
104
+
105
+ ### Model Params
106
+ | Model (Smolie) | Parameters | Model Type | Output |
107
+ |:-------------------------:|:----------:|------------|:-----------------:|
108
+ | Text to Semantic (M1) | 69 M | Causal LM | 10,001 Tokens |
109
+ | Semantic to MelSpec(M2) | 108 M | Diffusion | 2x 80x Melspec |
110
+ | HiFi-GAN Vocoder          | 13 M       | GAN        | Audio Waveform    |
111
+
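To make the table concrete, the sketch below mirrors how `infer_tts` chains the three stages internally in `maha_tts/inference.py`. Only `load_models`, `load_diffuser`, and `infer_tts` are exported from the package, so treat the staged calls as an illustrative walkthrough of internal helpers rather than a supported API:

```python
import glob, torch
from maha_tts import load_models
from maha_tts.inference import generate_semantic_tokens, infer_mel, infer_wav, get_ref_mels
from maha_tts.text.cleaners import english_cleaners
from maha_tts.utils.audio import normalize_tacotron_mel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
diff_model, ts_model, vocoder, diffuser = load_models('Smolie', device)

text = english_cleaners("Hello there! How are you?")
ref_clips = glob.glob('/content/infer_ref_wavs/2272_152282_000019_000001/*.wav')
ref_mels = get_ref_mels(ref_clips)

# M1 (Causal LM): text -> discrete semantic tokens from a ~10k-entry vocabulary
sem_tok, _ = generate_semantic_tokens(text, ts_model, ref_mels,
                                      top_p=0.8, top_k=5, device=device)

# M2 (Diffusion): semantic tokens + speaker reference -> 80-bin mel spectrogram;
# the frame count converts the 16 kHz token rate to the 22.05 kHz / hop-256 mel rate
n_frames = int(((sem_tok.shape[-1] * 320 / 16000) * 22050 / 256) + 1)
mel = infer_mel(diff_model, n_frames, sem_tok.unsqueeze(0) + 1,
                normalize_tacotron_mel(ref_mels), diffuser)

# HiFi-GAN vocoder: mel spectrogram -> audio waveform at config.sampling_rate
audio = infer_wav(mel, vocoder)
```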
112
+ ### Languages Supported
113
+ | Language | Status |
114
+ | --- | :---: |
115
+ | English (en) | ✅ |
116
+
117
+ ## License
118
+
119
+ MahaTTS is licensed under the Apache 2.0 License.
120
+
121
+ ## 🙏 Appreciation
122
 
123
+ - [tortoise-tts](https://github.com/neonbjb/tortoise-tts)
124
+ - [Seamless M4T](https://github.com/facebookresearch/seamless_communication), [AudioLM](https://arxiv.org/abs/2209.03143), and many other ground-breaking papers that enabled the development of MahaTTS
125
+ - [Diffusion training](https://github.com/openai/guided-diffusion) for training the diffusion model
126
+ - [Hugging Face](https://huggingface.co/docs/transformers/index) for related training and inference code
maha_tts/__init__.py CHANGED
@@ -1 +1,3 @@
1
- from .inference import load_models,load_diffuser,infer_tts
 
 
 
1
+ from maha_tts.inference import load_models,load_diffuser,infer_tts
2
+ from maha_tts.config import config
3
+ __version__ = '1.0.0'
maha_tts/config.py CHANGED
@@ -5,8 +5,9 @@ class config:
5
  seed_value = 3407
6
 
7
  # Text to Semantic
8
- t2s_position = 2048
9
-
 
10
  # Semantic to acoustic
11
  sa_timesteps_max = 1000
12
 
 
5
  seed_value = 3407
6
 
7
  # Text to Semantic
8
+ t2s_position = 4096
9
+ langs = ['english','tamil', 'telugu', 'punjabi', 'marathi', 'hindi', 'gujarati', 'bengali', 'assamese']
10
+ lang_index = {i:j for j,i in enumerate(langs)}
11
  # Semantic to acoustic
12
  sa_timesteps_max = 1000
13
 
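For clarity, the `lang_index` comprehension added above keys each language name to its position in `langs`; a quick illustrative check:

```python
from maha_tts.config import config

# {'english': 0, 'tamil': 1, 'telugu': 2, 'punjabi': 3, 'marathi': 4,
#  'hindi': 5, 'gujarati': 6, 'bengali': 7, 'assamese': 8}
print(config.lang_index)
print(config.lang_index['hindi'])  # -> 5
```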
maha_tts/inference.py CHANGED
@@ -1,7 +1,8 @@
1
- import torch,glob,os
2
  import numpy as np
3
  import torch.nn.functional as F
4
 
 
5
  from librosa.filters import mel as librosa_mel_fn
6
  from scipy.io.wavfile import write
7
  from scipy.special import softmax
@@ -11,10 +12,12 @@ from maha_tts.models.vocoder import load_vocoder_model,infer_wav
11
  from maha_tts.utils.audio import denormalize_tacotron_mel,normalize_tacotron_mel,load_wav_to_torch,dynamic_range_compression
12
  from maha_tts.utils.stft import STFT
13
  from maha_tts.utils.diffusion import SpacedDiffusion,get_named_beta_schedule,space_timesteps
14
- from maha_tts.text.symbols import labels,text_labels,code_labels,text_enc,text_dec,code_enc,code_dec
15
  from maha_tts.text.cleaners import english_cleaners
16
  from maha_tts.config import config
17
 
 
 
18
  stft_fn = STFT(config.filter_length, config.hop_length, config.win_length)
19
 
20
  mel_basis = librosa_mel_fn(
@@ -23,13 +26,52 @@ mel_basis = librosa_mel_fn(
23
  mel_basis = torch.from_numpy(mel_basis).float()
24
 
25
  model_dirs= {
26
- 'Smolie':'asdf',
27
- 'hifigan':'asdf'
 
 
 
 
28
  }
29
 
30
- def download_model(name):
31
- pass
33
 
34
  def load_models(name,device=torch.device('cpu')):
35
  '''
@@ -51,10 +93,10 @@ def load_models(name,device=torch.device('cpu')):
51
 
52
  assert name in model_dirs, "no model name "+name
53
 
54
- checkpoint_diff = 'maha_tts/pretrained_models/'+str(name)+'/S2A/s2a_latest.pt'
55
- checkpoint_ts = 'maha_tts/pretrained_models/'+str(name)+'/T2S/t2s_best.pt'
56
- checkpoint_voco = 'maha_tts/pretrained_models/hifigan/g_02500000'
57
- voco_config_path = 'maha_tts/pretrained_models/hifigan/config.json'
58
 
59
  # for i in [checkpoint_diff,checkpoint_ts,checkpoint_voco,voco_config_path]:
60
  if not os.path.exists(checkpoint_diff) or not os.path.exists(checkpoint_ts):
@@ -64,15 +106,16 @@ def load_models(name,device=torch.device('cpu')):
64
  download_model('hifigan')
65
 
66
  diff_model = load_diff_model(checkpoint_diff,device)
67
- ts_model = load_TS_model(checkpoint_ts,device)
68
  vocoder = load_vocoder_model(voco_config_path,checkpoint_voco,device)
69
  diffuser = load_diffuser()
70
 
71
  return diff_model,ts_model,vocoder,diffuser
72
 
73
- def infer_mel(model,timeshape,code,ref_mel,diffuser,temperature=0.1):
74
  device = next(model.parameters()).device
75
  code = code.to(device)
 
76
  output_shape = (1,80,timeshape)
77
  noise = torch.randn(output_shape, device=code.device) * temperature
78
  mel = diffuser.p_sample_loop(model, output_shape, noise=noise,
@@ -84,17 +127,18 @@ def generate_semantic_tokens(
84
  text,
85
  model,
86
  ref_mels,
 
87
  temp = 0.7,
88
  top_p= None,
89
- top_k= None,
90
  n_tot_steps = 1000,
91
  device = None
92
  ):
93
  semb = []
94
  with torch.no_grad():
95
- for n in range(n_tot_steps):
96
- x = get_inputs(text,semb,ref_mels,device)
97
- _,result = model(**x)
98
  relevant_logits = result[0,:,-1]
99
  if top_p is not None:
100
  # faster to convert to numpy
@@ -125,9 +169,13 @@ def generate_semantic_tokens(
125
  semb = torch.tensor([int(i) for i in semb[:-1]])
126
  return semb,result
127
 
128
- def get_inputs(text,semb=[],ref_mels=[],device=torch.device('cpu')):
129
  text = text.lower()
130
- text_ids=[text_enc['<S>']]+[text_enc[i] for i in text.strip()]+[text_enc['<E>']]
 
 
 
 
131
  semb_ids=[code_enc['<SST>']]+[code_enc[i] for i in semb]#+[tok_enc['<EST>']]
132
 
133
  input_ids = text_ids+semb_ids
@@ -166,7 +214,7 @@ def get_mel(filepath):
166
  energy = torch.norm(magnitudes, dim=1).squeeze(0)
167
  return melspec,list(energy)
168
 
169
- def infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder):
170
  '''
171
  Generate audio from the given text using a text-to-speech (TTS) pipeline.
172
 
@@ -193,6 +241,7 @@ def infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder):
193
  Example usage:
194
  audio, sampling_rate = infer_tts("Hello, how are you?", ref_clips, diffuser, diff_model, ts_model, vocoder)
195
  '''
 
196
  text = english_cleaners(text)
197
  ref_mels = get_ref_mels(ref_clips)
198
  with torch.no_grad():
@@ -200,20 +249,21 @@ def infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder):
200
  text,
201
  ts_model,
202
  ref_mels,
 
203
  temp = 0.7,
204
  top_p= 0.8,
205
  top_k= 5,
206
  n_tot_steps = 1000,
207
- device = None
208
  )
209
  mel = infer_mel(diff_model,int(((sem_tok.shape[-1] * 320 / 16000) * 22050/256)+1),sem_tok.unsqueeze(0) + 1,
210
- ref_mels,diffuser,temperature=1.0)
211
 
212
  audio = infer_wav(mel,vocoder)
213
 
214
  return audio,config.sampling_rate
215
 
216
- def load_diffuser(timesteps = 100, gudiance=3):
217
  '''
218
  Load and configure a diffuser for denoising and guidance in the diffusion model.
219
 
@@ -227,10 +277,10 @@ def load_diffuser(timesteps = 100, gudiance=3):
227
  Description:
228
  The `load_diffuser` function initializes a diffuser with specific settings for denoising and guidance.
229
  '''
230
- betas = get_named_beta_schedule('cosine',config.sa_timesteps_max)
231
  diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [timesteps]), model_mean_type='epsilon',
232
  model_var_type='learned_range', loss_type='rescaled_mse', betas=betas,
233
- conditioning_free=True, conditioning_free_k=gudiance)
234
  diffuser.training=False
235
  return diffuser
236
 
 
1
+ import torch,glob,os,requests
2
  import numpy as np
3
  import torch.nn.functional as F
4
 
5
+ from tqdm import tqdm
6
  from librosa.filters import mel as librosa_mel_fn
7
  from scipy.io.wavfile import write
8
  from scipy.special import softmax
 
12
  from maha_tts.utils.audio import denormalize_tacotron_mel,normalize_tacotron_mel,load_wav_to_torch,dynamic_range_compression
13
  from maha_tts.utils.stft import STFT
14
  from maha_tts.utils.diffusion import SpacedDiffusion,get_named_beta_schedule,space_timesteps
15
+ from maha_tts.text.symbols import labels,text_labels,text_labels_en,code_labels,text_enc,text_dec,code_enc,code_dec,text_enc_en,text_dec_en
16
  from maha_tts.text.cleaners import english_cleaners
17
  from maha_tts.config import config
18
 
19
+ DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser('~'), '.cache', 'maha_tts', 'models')
20
+ # DEFAULT_MODELS_DIR = '/Users/jaskaransingh/Desktop/MahaTTS/models/'  # local development override; keep commented so the cache path above is used
21
  stft_fn = STFT(config.filter_length, config.hop_length, config.win_length)
22
 
23
  mel_basis = librosa_mel_fn(
 
26
  mel_basis = torch.from_numpy(mel_basis).float()
27
 
28
  model_dirs= {
29
+ 'Smolie':['https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/smolie/S2A/s2a_latest.pt',
30
+ 'https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/smolie/T2S/t2s_best.pt'],
31
+ 'Smolie-en':[''],
32
+ 'Smolie-in':[''],
33
+ 'hifigan':['https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/hifigan/g_02500000',
34
+ 'https://huggingface.co/Dubverse/MahaTTS/resolve/main/maha_tts/pretrained_models/hifigan/config.json']
35
  }
36
 
37
+ def download_file(url, filename):
38
+ response = requests.get(url, stream=True)
39
+ total_size = int(response.headers.get('content-length', 0))
40
+
41
+ # Check if the response was successful (status code 200)
42
+ response.raise_for_status()
43
+
44
+ with open(filename, 'wb') as file, tqdm(
45
+ desc=filename,
46
+ total=total_size,
47
+ unit='B',
48
+ unit_scale=True,
49
+ unit_divisor=1024,
50
+ ) as bar:
51
+ for data in response.iter_content(chunk_size=1024):
52
+ # Write data to the file
53
+ file.write(data)
54
+ # Update the progress bar
55
+ bar.update(len(data))
56
 
57
+ print(f"Download complete: {filename}\n")
58
+
59
+ def download_model(name):
60
+ print('Downloading ',name," ....")
61
+ checkpoint_diff = os.path.join(DEFAULT_MODELS_DIR,name,'s2a_latest.pt')
62
+ checkpoint_ts = os.path.join(DEFAULT_MODELS_DIR,name,'t2s_best.pt')
63
+ checkpoint_voco = os.path.join(DEFAULT_MODELS_DIR,'hifigan','g_02500000')
64
+ voco_config_path = os.path.join(DEFAULT_MODELS_DIR,'hifigan','config.json')
65
+
66
+ os.makedirs(os.path.join(DEFAULT_MODELS_DIR,name),exist_ok=True)
67
+
68
+ if name == 'hifigan':
69
+ download_file(model_dirs[name][0],checkpoint_voco)
70
+ download_file(model_dirs[name][1],voco_config_path)
71
+
72
+ else:
73
+ download_file(model_dirs[name][0],checkpoint_diff)
74
+ download_file(model_dirs[name][1],checkpoint_ts)
75
 
76
  def load_models(name,device=torch.device('cpu')):
77
  '''
 
93
 
94
  assert name in model_dirs, "no model name "+name
95
 
96
+ checkpoint_diff = os.path.join(DEFAULT_MODELS_DIR,name,'s2a_latest.pt')
97
+ checkpoint_ts = os.path.join(DEFAULT_MODELS_DIR,name,'t2s_best.pt')
98
+ checkpoint_voco = os.path.join(DEFAULT_MODELS_DIR,'hifigan','g_02500000')
99
+ voco_config_path = os.path.join(DEFAULT_MODELS_DIR,'hifigan','config.json')
100
 
101
  # for i in [checkpoint_diff,checkpoint_ts,checkpoint_voco,voco_config_path]:
102
  if not os.path.exists(checkpoint_diff) or not os.path.exists(checkpoint_ts):
 
106
  download_model('hifigan')
107
 
108
  diff_model = load_diff_model(checkpoint_diff,device)
109
+ ts_model = load_TS_model(checkpoint_ts,device,name)
110
  vocoder = load_vocoder_model(voco_config_path,checkpoint_voco,device)
111
  diffuser = load_diffuser()
112
 
113
  return diff_model,ts_model,vocoder,diffuser
114
 
115
+ def infer_mel(model,timeshape,code,ref_mel,diffuser,temperature=1.0):
116
  device = next(model.parameters()).device
117
  code = code.to(device)
118
+ ref_mel =ref_mel.to(device)
119
  output_shape = (1,80,timeshape)
120
  noise = torch.randn(output_shape, device=code.device) * temperature
121
  mel = diffuser.p_sample_loop(model, output_shape, noise=noise,
 
127
  text,
128
  model,
129
  ref_mels,
130
+ language=None,
131
  temp = 0.7,
132
  top_p= None,
133
+ top_k= 1,
134
  n_tot_steps = 1000,
135
  device = None
136
  ):
137
  semb = []
138
  with torch.no_grad():
139
+ for n in tqdm(range(n_tot_steps)):
140
+ x = get_inputs(text,semb,ref_mels,device,model.name)
141
+ _,result = model(**x,language=language)
142
  relevant_logits = result[0,:,-1]
143
  if top_p is not None:
144
  # faster to convert to numpy
 
169
  semb = torch.tensor([int(i) for i in semb[:-1]])
170
  return semb,result
171
 
172
+ def get_inputs(text,semb=[],ref_mels=[],device=torch.device('cpu'),name = 'Smolie-in'):
173
  text = text.lower()
174
+ if name=='Smolie-en':
175
+ text_ids=[text_enc_en['<S>']]+[text_enc_en[i] for i in text.strip()]+[text_enc_en['<E>']]
176
+ else:
177
+ text_ids=[text_enc['<S>']]+[text_enc[i] for i in text.strip()]+[text_enc['<E>']]
178
+
179
  semb_ids=[code_enc['<SST>']]+[code_enc[i] for i in semb]#+[tok_enc['<EST>']]
180
 
181
  input_ids = text_ids+semb_ids
 
214
  energy = torch.norm(magnitudes, dim=1).squeeze(0)
215
  return melspec,list(energy)
216
 
217
+ def infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder,language=None):
218
  '''
219
  Generate audio from the given text using a text-to-speech (TTS) pipeline.
220
 
 
241
  Example usage:
242
  audio, sampling_rate = infer_tts("Hello, how are you?", ref_clips, diffuser, diff_model, ts_model, vocoder)
243
  '''
244
+ device = next(ts_model.parameters()).device
245
  text = english_cleaners(text)
246
  ref_mels = get_ref_mels(ref_clips)
247
  with torch.no_grad():
 
249
  text,
250
  ts_model,
251
  ref_mels,
252
+ language,
253
  temp = 0.7,
254
  top_p= 0.8,
255
  top_k= 5,
256
  n_tot_steps = 1000,
257
+ device = device
258
  )
259
  mel = infer_mel(diff_model,int(((sem_tok.shape[-1] * 320 / 16000) * 22050/256)+1),sem_tok.unsqueeze(0) + 1,
260
+ normalize_tacotron_mel(ref_mels),diffuser,temperature=0.5)
261
 
262
  audio = infer_wav(mel,vocoder)
263
 
264
  return audio,config.sampling_rate
265
 
266
+ def load_diffuser(timesteps = 100, guidance=3):
267
  '''
268
  Load and configure a diffuser for denoising and guidance in the diffusion model.
269
 
 
277
  Description:
278
  The `load_diffuser` function initializes a diffuser with specific settings for denoising and guidance.
279
  '''
280
+ betas = get_named_beta_schedule('linear',config.sa_timesteps_max)
281
  diffuser = SpacedDiffusion(use_timesteps=space_timesteps(1000, [timesteps]), model_mean_type='epsilon',
282
  model_var_type='learned_range', loss_type='rescaled_mse', betas=betas,
283
+ conditioning_free=True, conditioning_free_k=guidance)
284
  diffuser.training=False
285
  return diffuser
286
 
maha_tts/models/__init__.py DELETED
File without changes
maha_tts/models/autoregressive.py DELETED
@@ -1,135 +0,0 @@
1
- '''
2
- Inspiration taken from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/models/autoregressive.py
3
- '''
4
- import os,sys
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- import torch.optim as optim
9
- import functools
10
-
11
- from typing import Any
12
- from torch.utils.data import Dataset,DataLoader
13
- from transformers import GPT2Tokenizer,GPT2Config, GPT2Model, GPT2LMHeadModel
14
- from tqdm import tqdm
15
- from maha_tts.config import config
16
- from maha_tts.text.symbols import labels,code_labels,text_labels
17
- from maha_tts.models.modules import GST
18
-
19
- def null_position_embeddings(range, dim):
20
- return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)
21
-
22
- class TS_model(nn.Module):
23
- def __init__(self,n_embed = 512, n_layer = 16, n_head = 8):
24
- super(TS_model,self).__init__()
25
-
26
- self.vocab_size=len(labels)
27
- self.n_positions=config.t2s_position
28
- self.n_embed=n_embed
29
- self.n_layer=n_layer
30
- self.n_head=n_head
31
-
32
- self.config = GPT2Config(vocab_size=self.vocab_size,n_positions=self.n_positions,n_embd=self.n_embed,n_layer=self.n_layer,n_head=self.n_head)
33
- self.gpt = GPT2Model(self.config)
34
- del self.gpt.wpe
35
- self.gpt.wpe = functools.partial(null_position_embeddings, dim=self.n_embed)
36
- # Built-in token embeddings are unused.
37
- del self.gpt.wte
38
- self.GST = GST(model_channels=self.n_embed,num_heads=self.n_head,in_channels=config.n_mel_channels,k=1)
39
- self.text_head = nn.Linear(self.n_embed,len(text_labels))
40
- self.code_head = nn.Linear(self.n_embed,len(code_labels))
41
-
42
- self.text_positional_embed = LearnedPositionEmbeddings(self.n_positions,self.n_embed)
43
- self.code_positional_embed = LearnedPositionEmbeddings(self.n_positions,self.n_embed)
44
-
45
- self.text_embed = nn.Embedding(len(text_labels),self.n_embed)
46
- self.code_embed = nn.Embedding(len(code_labels),self.n_embed)
47
- self.final_norm = nn.LayerNorm(self.n_embed)
48
-
49
- def get_speaker_latent(self, ref_mels):
50
- ref_mels = ref_mels.unsqueeze(1) if len(
51
- ref_mels.shape) == 3 else ref_mels
52
-
53
- conds = []
54
- for j in range(ref_mels.shape[1]):
55
- conds.append(self.GST(ref_mels[:, j,:,:]))
56
-
57
- conds = torch.cat(conds, dim=-1)
58
- conds = conds.mean(dim=-1)
59
-
60
- return conds.unsqueeze(1)
61
-
62
- def forward(self,text_ids,codes_ids = None,speaker_embed=None,ref_clips=None,return_loss = False):
63
- assert speaker_embed is not None or ref_clips is not None
64
- text_embed = self.text_embed(text_ids)
65
- text_embed += self.text_positional_embed(text_embed)
66
-
67
- code_embed = None
68
- code_probs= None
69
-
70
- if codes_ids is not None:
71
- code_embed = self.code_embed(codes_ids)
72
- code_embed+= self.code_positional_embed(code_embed)
73
-
74
- if ref_clips is not None:
75
- speaker_embed = self.get_speaker_latent(ref_clips)
76
-
77
- text_embed,code_embed = self.get_logits(speaker_embed=speaker_embed,text_embed=text_embed,code_embed=code_embed)
78
-
79
- text_probs = self.text_head(text_embed).permute(0,2,1)
80
-
81
- if codes_ids is not None:
82
- code_probs = self.code_head(code_embed).permute(0,2,1)
83
-
84
- if return_loss:
85
- loss_text = F.cross_entropy(text_probs[:,:,:-1], text_ids[:,1:].long(), reduce=False)
86
- loss_mel = F.cross_entropy(code_probs[:,:,:-1], codes_ids[:,1:].long(), reduce=False)
87
- return loss_text,loss_mel,code_probs
88
-
89
- return text_probs,code_probs
90
-
91
-
92
- def get_logits(self,speaker_embed,text_embed,code_embed=None):
93
-
94
- if code_embed is not None:
95
- embed = torch.cat([speaker_embed,text_embed,code_embed],dim=1)
96
- else:
97
- embed = torch.cat([speaker_embed,text_embed],dim=1)
98
-
99
- gpt_output = self.gpt(inputs_embeds=embed, return_dict=True)
100
- enc = gpt_output.last_hidden_state[:, 1:]
101
- enc = self.final_norm(enc)
102
- if code_embed is not None:
103
- return enc[:,:text_embed.shape[1]],enc[:,-code_embed.shape[1]:]
104
-
105
- return enc[:,:text_embed.shape[1]],None
106
-
107
- class LearnedPositionEmbeddings(nn.Module):
108
- def __init__(self, seq_len, model_dim, init=.02):
109
- super().__init__()
110
- self.emb = nn.Embedding(seq_len, model_dim)
111
- # Initializing this way is standard for GPT-2
112
- self.emb.weight.data.normal_(mean=0.0, std=init)
113
-
114
- def forward(self, x):
115
- sl = x.shape[1]
116
- return self.emb(torch.arange(0, sl, device=x.device))
117
-
118
- def get_fixed_embedding(self, ind, dev):
119
- return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)
120
-
121
- def load_TS_model(checkpoint,device):
122
- sem_model= TS_model(n_embed = 512, n_layer = 16, n_head = 8)
123
- sem_model.load_state_dict(torch.load(checkpoint,map_location=torch.device('cpu')),strict=False)
124
- sem_model.eval().to(device)
125
-
126
- return sem_model
127
-
128
- if __name__ == '__main__':
129
- model=TS_model(n_embed = 256, n_layer = 6, n_head = 4)
130
-
131
- text_ids = torch.randint(0,100,(5,20))
132
- code_ids = torch.randint(0,100,(5,200))
133
- speaker_embed = torch.randn((5,1,256))
134
-
135
- output=model(text_ids=text_ids,speaker_embed=speaker_embed,codes_ids=code_ids,return_loss=True)
 
 
maha_tts/models/diff_model.py DELETED
@@ -1,303 +0,0 @@
1
- '''
2
- inspiration taken from https://github.com/neonbjb/tortoise-tts/blob/main/tortoise/models/diffusion_decoder.py
3
- '''
4
- import sys
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- import math
9
-
10
- from maha_tts.config import config
11
- from torch import autocast
12
- from maha_tts.models.modules import QuartzNetBlock,AttentionBlock,mySequential,normalization,SCBD,SqueezeExcite,GST
13
-
14
- def timestep_embedding(timesteps, dim, max_period=10000):
15
- """
16
- Create sinusoidal timestep embeddings.
17
-
18
- :param timesteps: a 1-D Tensor of N indices, one per batch element.
19
- These may be fractional.
20
- :param dim: the dimension of the output.
21
- :param max_period: controls the minimum frequency of the embeddings.
22
- :return: an [N x dim] Tensor of positional embeddings.
23
- """
24
- half = dim // 2
25
- freqs = torch.exp(
26
- -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
27
- ).to(device=timesteps.device)
28
- args = timesteps[:, None].float() * freqs[None]
29
- embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
30
- if dim % 2:
31
- embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
32
- return embedding
33
-
34
- class TimestepBlock(nn.Module):
35
- def forward(self, x, emb):
36
- """
37
- Apply the module to `x` given `emb` timestep embeddings.
38
- """
39
-
40
- class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
41
- def forward(self, x, emb):
42
- for layer in self:
43
- if isinstance(layer, TimestepBlock):
44
- x = layer(x, emb)
45
- else:
46
- x = layer(x)
47
- return x
48
-
49
- class QuartzNetBlock(TimestepBlock):
50
- '''Similar to Resnet block with Batchnorm and dropout, and using Separable conv in the middle.
51
- if its the last layer,set se = False and separable = False, and use a projection layer on top of this.
52
- '''
53
- def __init__(self,nin,nout,emb_channels,kernel_size=3,dropout=0.1,R=1,se=True,ratio=8,separable=False,bias=True,use_scale_shift_norm=True):
54
- super(QuartzNetBlock,self).__init__()
55
- self.use_scale_shift_norm = use_scale_shift_norm
56
- self.se=se
57
- self.in_layers = mySequential(
58
- nn.Conv1d(nin,nout,kernel_size=1,padding='same',bias=bias),
59
- normalization(nout) #nn.BatchNorm1d(nout,eps)
60
- )
61
-
62
- self.residual=mySequential(
63
- nn.Conv1d(nin,nout,kernel_size=1,padding='same',bias=bias),
64
- normalization(nout) #nn.BatchNorm1d(nout,eps)
65
- )
66
-
67
- nin=nout
68
- model=[]
69
-
70
- self.emb_layers = nn.Sequential(
71
- nn.SiLU(),
72
- nn.Linear(
73
- emb_channels,
74
- 2 * nout if use_scale_shift_norm else nout,
75
- ),
76
- )
77
-
78
- for i in range(R-1):
79
- model.append(SCBD(nin,nout,kernel_size,dropout,bias=bias))
80
- nin=nout
81
-
82
- if separable:
83
- model.append(SCBD(nin,nout,kernel_size,dropout,rd=False,bias=bias))
84
- else:
85
- model.append(SCBD(nin,nout,kernel_size,dropout,rd=False,separable=False,bias=bias))
86
-
87
- self.model=mySequential(*model)
88
- if self.se:
89
- self.se_layer=SqueezeExcite(nin,ratio)
90
-
91
- self.mout= mySequential(nn.SiLU(),nn.Dropout(dropout))
92
-
93
- def forward(self,x,emb,mask=None):
94
- x_new=self.in_layers(x)
95
- emb = self.emb_layers(emb)
96
- while len(emb.shape) < len(x_new.shape):
97
- emb = emb[..., None]
98
- scale, shift = torch.chunk(emb, 2, dim=1)
99
- x_new = x_new * (1 + scale) + shift
100
- y,_=self.model(x_new)
101
-
102
- if self.se:
103
- y,_=self.se_layer(y,mask)
104
- y+=self.residual(x)
105
- y=self.mout(y)
106
-
107
- return y
108
-
109
- class QuartzAttn(TimestepBlock):
110
- def __init__(self, model_channels, dropout, num_heads):
111
- super().__init__()
112
- self.resblk = QuartzNetBlock(model_channels, model_channels, model_channels,dropout=dropout,use_scale_shift_norm=True)
113
- self.attn = AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True)
114
-
115
- def forward(self, x, time_emb):
116
- y = self.resblk(x, time_emb)
117
- return self.attn(y)
118
-
119
- class QuartzNet9x5(nn.Module):
120
- def __init__(self,model_channels,num_heads,enable_fp16=False):
121
- super(QuartzNet9x5,self).__init__()
122
- self.enable_fp16 = enable_fp16
123
-
124
- self.conv1=QuartzNetBlock(model_channels,model_channels,model_channels,kernel_size=3,dropout=0.1,R=3)
125
- kernels=[5,7,9,13,15,17]
126
- quartznet=[]
127
- attn=[]
128
- for i in kernels:
129
- quartznet.append(QuartzNetBlock(model_channels,model_channels,model_channels,kernel_size=i,dropout=0.1,R=5,se=True))
130
- attn.append(AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True))
131
- kernels=[21,23,25]
132
- quartznet.append(QuartzNetBlock(model_channels,model_channels,model_channels,kernel_size=21,dropout=0.1,R=5,se=True))
133
- attn.append(AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True))
134
-
135
- for i in kernels[1:]:
136
- quartznet.append(QuartzNetBlock(model_channels,model_channels,model_channels,kernel_size=i,dropout=0.1,R=5,se=True))
137
- attn.append(AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True))
138
- self.quartznet= nn.ModuleList(quartznet)
139
- self.attn = nn.ModuleList(attn)
140
- self.conv3=nn.Conv1d(model_channels, model_channels, 1, padding='same')
141
-
142
-
143
- def forward(self, x, time_emb):
144
- x = self.conv1(x,time_emb)
145
- # with autocast(x.device.type, enabled=self.enable_fp16):
146
- for n,(layer,attn) in enumerate(zip(self.quartznet,self.attn)):
147
- x = layer(x,time_emb) #256 dim
148
- x = attn(x)
149
- x = self.conv3(x.float())
150
- return x
151
-
152
- class DiffModel(nn.Module):
153
-
154
- def __init__(
155
- self,
156
- input_channels=80,
157
- output_channels=160,
158
- model_channels=512,
159
- num_heads=8,
160
- dropout=0.0,
161
- multispeaker = True,
162
- condition_free_per=0.1,
163
- training = False,
164
- ar_active = False,
165
- in_latent_channels = 10004
166
- ):
167
-
168
- super().__init__()
169
- self.input_channels = input_channels
170
- self.model_channels = model_channels
171
- self.output_channels = output_channels
172
- self.num_heads = num_heads
173
- self.dropout = dropout
174
- self.condition_free_per = condition_free_per
175
- self.training = training
176
- self.multispeaker = multispeaker
177
- self.ar_active = ar_active
178
- self.in_latent_channels = in_latent_channels
179
-
180
- if not self.ar_active:
181
- self.code_emb = nn.Embedding(config.semantic_model_centroids+1,model_channels)
182
- self.code_converter = mySequential(
183
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
184
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
185
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
186
- )
187
- else:
188
- self.code_converter = mySequential(
189
- nn.Conv1d(self.in_latent_channels, model_channels, 3, padding=1),
190
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
191
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
192
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
193
- AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
194
- )
195
- if self.multispeaker:
196
- self.GST = GST(model_channels,num_heads)
197
-
198
- self.code_norm = normalization(model_channels)
199
- self.time_norm = normalization(model_channels)
200
- self.noise_norm = normalization(model_channels)
201
- self.code_time_norm = normalization(model_channels)
202
-
203
- # self.code_latent = []
204
- self.time_embed = mySequential(
205
- nn.Linear(model_channels, model_channels),
206
- nn.SiLU(),
207
- nn.Linear(model_channels, model_channels),)
208
-
209
- self.input_block = nn.Conv1d(input_channels,model_channels,3,1,1)
210
- self.unconditioned_embedding = nn.Parameter(torch.randn(1,model_channels,1))
211
-
212
- self.code_time = TimestepEmbedSequential(QuartzAttn(model_channels, dropout, num_heads),QuartzAttn(model_channels, dropout, num_heads),QuartzAttn(model_channels, dropout, num_heads))
213
- self.layers = QuartzNet9x5(model_channels,num_heads)
214
-
215
- self.out = nn.Sequential(
216
- normalization(model_channels),
217
- nn.SiLU(),
218
- nn.Conv1d(model_channels, output_channels, 3, padding=1),
219
- )
220
-
221
- def get_speaker_latent(self, ref_mels):
222
- ref_mels = ref_mels.unsqueeze(1) if len(
223
- ref_mels.shape) == 3 else ref_mels
224
-
225
- conds = []
226
- for j in range(ref_mels.shape[1]):
227
- conds.append(self.GST(ref_mels[:, j,:,:]))
228
-
229
- conds = torch.cat(conds, dim=-1)
230
- conds = conds.mean(dim=-1)
231
-
232
- return conds.unsqueeze(2)
233
-
234
- def forward(self ,x,t,code_emb,ref_clips=None,speaker_latents=None,conditioning_free=False):
235
- time_embed = self.time_norm(self.time_embed(timestep_embedding(t.unsqueeze(-1),self.model_channels)).permute(0,2,1)).squeeze(2)
236
- if conditioning_free:
237
- code_embed = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1])
238
- else:
239
- if not self.ar_active:
240
- code_embed = self.code_norm(self.code_converter(self.code_emb(code_emb).permute(0,2,1)))
241
- else:
242
- code_embed = self.code_norm(self.code_converter(code_emb))
243
- if self.multispeaker:
244
- assert speaker_latents is not None or ref_clips is not None
245
- if ref_clips is not None:
246
- speaker_latents = self.get_speaker_latent(ref_clips)
247
- cond_scale, cond_shift = torch.chunk(speaker_latents, 2, dim=1)
248
- code_embed = code_embed * (1 + cond_scale) + cond_shift
249
- if self.training and self.condition_free_per > 0:
250
- unconditioned_batches = torch.rand((code_embed.shape[0], 1, 1),
251
- device=code_embed.device) < self.condition_free_per
252
- code_embed = torch.where(unconditioned_batches, self.unconditioned_embedding.repeat(code_embed.shape[0], 1, 1),
253
- code_embed)
254
-
255
- expanded_code_emb = F.interpolate(code_embed, size=x.shape[-1], mode='nearest') #try different modes
256
-
257
- x_cond = self.code_time_norm(self.code_time(expanded_code_emb,time_embed))
258
-
259
- x = self.noise_norm(self.input_block(x))
260
- x += x_cond
261
- x = self.layers(x, time_embed)
262
- out = self.out(x)
263
- return out
264
-
265
- def load_diff_model(checkpoint,device,model_channels=512,ar_active=False,len_code_labels=10004):
266
- diff_model = DiffModel(input_channels=80,
267
- output_channels=160,
268
- model_channels=512,
269
- num_heads=8,
270
- dropout=0.15,
271
- condition_free_per=0.15,
272
- multispeaker=True,
273
- training=False,
274
- ar_active=ar_active,
275
- in_latent_channels = len_code_labels)
276
-
277
- # diff_model.load_state_dict(torch.load('/content/LibriTTS_fp64_10k/S2A/_latest.pt',map_location=torch.device('cpu')),strict=True)
278
- diff_model.load_state_dict(torch.load(checkpoint,map_location=torch.device('cpu')),strict=True)
279
- diff_model=diff_model.eval().to(device)
280
- return diff_model
281
-
282
-
283
- if __name__ == '__main__':
284
-
285
- device = torch.device('cpu')
286
- diff_model = DiffModel(input_channels=80,
287
- output_channels=160,
288
- model_channels=1024,
289
- num_heads=8,
290
- dropout=0.1,
291
- num_layers=8,
292
- enable_fp16=True,
293
- condition_free_per=0.1,
294
- multispeaker=True,
295
- training=True).to(device)
296
-
297
- batch_Size = 32
298
- timeseries = 800
299
- from torchinfo import summary
300
- summary(diff_model, input_data={'x': torch.randn(batch_Size, 80, timeseries).to(device),
301
- 'ref_clips': torch.randn(batch_Size,3, 80, timeseries).to(device),
302
- 't':torch.LongTensor(size=[batch_Size,]).to(device),
303
- 'code_emb':torch.randint(0,201,(batch_Size,timeseries)).to(device)})
 
 
maha_tts/models/modules.py DELETED
@@ -1,406 +0,0 @@
1
- import torch,math
2
- import torch.nn as nn
3
- import torch.nn.functional as F
4
- import torch.nn.init as init
5
- from einops import rearrange, repeat
6
-
7
- def zero_module(module):
8
- """
9
- Zero out the parameters of a module and return it.
10
- Using it for Zero Convolutions
11
- """
12
- for p in module.parameters():
13
- p.detach().zero_()
14
- return module
15
-
16
-
17
- class GroupNorm32(nn.GroupNorm):
18
- def forward(self, x):
19
- return super().forward(x.float()).type(x.dtype)
20
-
21
-
22
- def normalization(channels):
23
- """
24
- Make a standard normalization layer. of groups ranging from 2 to 32.
25
-
26
- :param channels: number of input channels.
27
- :return: an nn.Module for normalization.
28
- """
29
- groups = 32
30
- if channels <= 16:
31
- groups = 8
32
- elif channels <= 64:
33
- groups = 16
34
- while channels % groups != 0:
35
- groups = int(groups / 2)
36
- assert groups > 2
37
- return GroupNorm32(groups, channels)
38
-
39
-
40
- class mySequential(nn.Sequential):
41
- '''Using this to pass mask variable to nn layers
42
- '''
43
- def forward(self, *inputs):
44
- for module in self._modules.values():
45
- if type(inputs) == tuple:
46
- inputs = module(*inputs)
47
- else:
48
- inputs = module(inputs)
49
- return inputs
50
-
51
- class SepConv1D(nn.Module):
52
- '''Depth wise separable Convolution layer with mask
53
- '''
54
- def __init__(self,nin,nout,kernel_size,stride=1,dilation=1,padding_mode='same',bias=True):
55
- super(SepConv1D,self).__init__()
56
- self.conv1=nn.Conv1d(nin, nin, kernel_size=kernel_size, stride=stride,groups=nin,dilation=dilation,padding=padding_mode,bias=bias)
57
- self.conv2=nn.Conv1d(nin,nout,kernel_size=1,stride=1,padding=padding_mode,bias=bias)
58
-
59
- def forward(self,x,mask=None):
60
- if mask is not None:
61
- x = x * mask.unsqueeze(1).to(device=x.device)
62
- x=self.conv1(x)
63
- x=self.conv2(x)
64
- return x,mask
65
-
66
- class Conv1DBN(nn.Module):
67
- def __init__(self,nin,nout,kernel_size,stride=1,dilation=1,dropout=0.1,padding_mode='same',bias=False):
68
- super(Conv1DBN,self).__init__()
69
- self.conv1=nn.Conv1d(nin, nout, kernel_size=kernel_size, stride=stride,padding=padding_mode,dilation=dilation,bias=bias)
70
- self.bn=nn.BatchNorm1d(nout)
71
- self.drop=nn.Dropout(dropout)
72
-
73
- def forward(self,x,mask=None):
74
- if mask is not None:
75
- x = x * mask.unsqueeze(1).to(device=x.device)
76
- x=self.conv1(x)
77
- x=self.bn(x)
78
- x=F.relu(x)
79
- x=self.drop(x)
80
- return x,mask
81
-
82
- class Conv1d(nn.Module):
83
- '''normal conv1d with mask
84
- '''
85
- def __init__(self,nin,nout,kernel_size,padding,bias=True):
86
- super(Conv1d,self).__init__()
87
- self.l=nn.Conv1d(nin,nout,kernel_size,padding=padding,bias=bias)
88
- def forward(self,x,mask):
89
- if mask is not None:
90
- x = x * mask.unsqueeze(1).to(device=x.device)
91
- y=self.l(x)
92
- return y,mask
93
-
94
- class SqueezeExcite(nn.Module):
95
- '''Let the CNN decide how to add across channels
96
- '''
97
- def __init__(self,nin,ratio=8):
98
- super(SqueezeExcite,self).__init__()
99
- self.nin=nin
100
- self.ratio=ratio
101
-
102
- self.fc=mySequential(
103
- nn.Linear(nin,nin//ratio,bias=True),nn.SiLU(inplace=True),nn.Linear(nin//ratio,nin,bias=True)
104
- )
105
-
106
- def forward(self,x,mask=None):
107
- if mask is None:
108
- mask = torch.ones((x.shape[0],x.shape[-1]),dtype=torch.bool).to(x.device)
109
- mask=~mask
110
- x=x.float()
111
- x.masked_fill_(mask.unsqueeze(1), 0.0)
112
- mask=~mask
113
- y = (torch.sum(x, dim=-1, keepdim=True) / mask.unsqueeze(1).sum(dim=-1, keepdim=True)).type(x.dtype)
114
- # y=torch.mean(x,-1,keepdim=True)
115
- y=y.transpose(1, -1)
116
- y=self.fc(y)
117
- y=torch.sigmoid(y)
118
- y=y.transpose(1, -1)
119
- y= x * y
120
- return y,mask
121
-
122
-
123
-
124
- class SCBD(nn.Module):
125
- '''SeparableConv1D + Batchnorm + Dropout, Generally use it for middle layers and resnet
126
- '''
127
- def __init__(self,nin,nout,kernel_size,p=0.1,rd=True,separable=True,bias=True):
128
- super(SCBD,self).__init__()
129
- if separable:
130
- self.SC=SepConv1D(nin,nout,kernel_size,bias=bias)
131
- else:
132
- self.SC=Conv1d(nin,nout,kernel_size,padding='same',bias=bias)
133
-
134
- if rd: #relu and Dropout
135
- self.mout=mySequential(normalization(nout),nn.SiLU(), # nn.BatchNorm1d(nout,eps)
136
- nn.Dropout(p))
137
- else:
138
- self.mout=normalization(nout) # nn.BatchNorm1d(nout,eps)
139
-
140
- def forward(self,x,mask=None):
141
- if mask is not None:
142
- x = x * mask.unsqueeze(1).to(device=x.device)
143
- x,_= self.SC(x,mask)
144
- y = self.mout(x)
145
- return y,mask
146
-
147
- class QuartzNetBlock(nn.Module):
148
- '''Similar to Resnet block with Batchnorm and dropout, and using Separable conv in the middle.
149
- if its the last layer,set se = False and separable = False, and use a projection layer on top of this.
150
- '''
151
- def __init__(self,nin,nout,kernel_size,dropout=0.1,R=5,se=False,ratio=8,separable=False,bias=True):
152
- super(QuartzNetBlock,self).__init__()
153
- self.se=se
154
- self.residual=mySequential(
155
- nn.Conv1d(nin,nout,kernel_size=1,padding='same',bias=bias),
156
- normalization(nout) #nn.BatchNorm1d(nout,eps)
157
- )
158
- model=[]
159
-
160
- for i in range(R-1):
161
- model.append(SCBD(nin,nout,kernel_size,dropout,eps=0.001,bias=bias))
162
- nin=nout
163
-
164
- if separable:
165
- model.append(SCBD(nin,nout,kernel_size,dropout,eps=0.001,rd=False,bias=bias))
166
- else:
167
- model.append(SCBD(nin,nout,kernel_size,dropout,eps=0.001,rd=False,separable=False,bias=bias))
168
- self.model=mySequential(*model)
169
-
170
- if self.se:
171
- self.se_layer=SqueezeExcite(nin,ratio)
172
-
173
- self.mout= mySequential(nn.SiLU(),nn.Dropout(dropout))
174
-
175
- def forward(self,x,mask=None):
176
- if mask is not None:
177
- x = x * mask.unsqueeze(1).to(device=x.device)
178
- y,_=self.model(x,mask)
179
- if self.se:
180
- y,_=self.se_layer(y,mask)
181
- y+=self.residual(x)
182
- y=self.mout(y)
183
- return y,mask
184
-
185
- class QKVAttentionLegacy(nn.Module):
186
- """
187
- A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
188
- """
189
-
190
- def __init__(self, n_heads):
191
- super().__init__()
192
- self.n_heads = n_heads
193
-
194
- def forward(self, qkv, mask=None, rel_pos=None):
195
- """
196
- Apply QKV attention.
197
-
198
- :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
199
- :return: an [N x (H * C) x T] tensor after attention.
200
- """
201
- bs, width, length = qkv.shape
202
- assert width % (3 * self.n_heads) == 0
203
- ch = width // (3 * self.n_heads)
204
- q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
205
- scale = 1 / math.sqrt(math.sqrt(ch))
206
- weight = torch.einsum(
207
- "bct,bcs->bts", q * scale, k * scale
208
- ) # More stable with f16 than dividing afterwards
209
- if rel_pos is not None:
210
- weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(bs * self.n_heads, weight.shape[-2], weight.shape[-1])
211
- weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
212
- if mask is not None:
213
- # The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs.
214
- mask = mask.repeat(self.n_heads, 1).unsqueeze(1)
215
- weight = weight * mask
216
- a = torch.einsum("bts,bcs->bct", weight, v)
217
-
218
- return a.reshape(bs, -1, length)
219
-
220
- class AttentionBlock(nn.Module):
221
- """
222
- An attention block that allows spatial positions to attend to each other.
223
-
224
- Originally ported from here, but adapted to the N-d case.
225
- https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
226
- """
227
-
228
- def __init__(
229
- self,
230
- channels,
231
- num_heads=1,
232
- num_head_channels=-1,
233
- do_checkpoint=True,
234
- relative_pos_embeddings=False,
235
- ):
236
- super().__init__()
237
- self.channels = channels
238
- self.do_checkpoint = do_checkpoint
239
- if num_head_channels == -1:
240
- self.num_heads = num_heads
241
- else:
242
- assert (
243
- channels % num_head_channels == 0
244
- ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
245
- self.num_heads = channels // num_head_channels
246
- self.norm = normalization(channels)
247
- self.qkv = nn.Conv1d(channels, channels * 3, 1)
248
- # split heads before split qkv
249
- self.attention = QKVAttentionLegacy(self.num_heads)
250
-
251
- self.proj_out = zero_module(nn.Conv1d(channels, channels, 1)) # no effect of attention in the inital stages.
252
- # if relative_pos_embeddings:
253
- self.relative_pos_embeddings = RelativePositionBias(scale=(channels // self.num_heads) ** .5, causal=False, heads=num_heads, num_buckets=32, max_distance=64) #need to read about this, vit and swin transformers
254
- # self.relative_pos_embeddings = FixedPositionalEmbedding(dim=channels)
255
- # else:
256
- # self.relative_pos_embeddings = None
257
-
258
- def forward(self, x, mask=None):
259
- b, c, *spatial = x.shape
260
- x = x.reshape(b, c, -1)
261
- qkv = self.qkv(self.norm(x))
262
- h = self.attention(qkv, mask, self.relative_pos_embeddings)
263
- h = self.proj_out(h)
264
- return (x + h).reshape(b, c, *spatial)
265
-
266
- class AbsolutePositionalEmbedding(nn.Module):
267
- def __init__(self, dim, max_seq_len):
268
- super().__init__()
269
- self.scale = dim ** -0.5
270
- self.emb = nn.Embedding(max_seq_len, dim)
271
-
272
- def forward(self, x):
273
- n = torch.arange(x.shape[1], device=x.device)
274
- pos_emb = self.emb(n)
275
- pos_emb = rearrange(pos_emb, 'n d -> () n d')
276
- return pos_emb * self.scale
277
-
278
-
279
- class FixedPositionalEmbedding(nn.Module):
280
- def __init__(self, dim):
281
- super().__init__()
282
- inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim))
283
- self.register_buffer('inv_freq', inv_freq)
284
-
285
- def forward(self, x, seq_dim=1, offset=0):
286
- t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset
287
- sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq)
288
- emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1)
289
- return rearrange(emb, 'n d -> () n d')
290
-
291
- class RelativePositionBias(nn.Module):
292
- def __init__(self, scale, causal=False, num_buckets=32, max_distance=128, heads=8):
293
- super().__init__()
294
- self.scale = scale
295
- self.causal = causal
296
- self.num_buckets = num_buckets
297
- self.max_distance = max_distance
298
- self.relative_attention_bias = nn.Embedding(num_buckets, heads)
299
-
300
- @staticmethod
301
- def _relative_position_bucket(relative_position, causal=True, num_buckets=32, max_distance=128):
302
- ret = 0
303
- n = -relative_position
304
- if not causal:
305
- num_buckets //= 2
306
- ret += (n < 0).long() * num_buckets
307
- n = torch.abs(n)
308
- else:
309
- n = torch.max(n, torch.zeros_like(n))
310
-
311
- max_exact = num_buckets // 2
312
- is_small = n < max_exact
313
-
314
- val_if_large = max_exact + (
315
- torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
316
- ).long()
317
- val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
318
-
319
- ret += torch.where(is_small, n, val_if_large)
320
- return ret
321
-
322
- def forward(self, qk_dots):
323
- i, j, device = *qk_dots.shape[-2:], qk_dots.device
324
- q_pos = torch.arange(i, dtype=torch.long, device=device)
325
- k_pos = torch.arange(j, dtype=torch.long, device=device)
326
- rel_pos = k_pos[None, :] - q_pos[:, None]
327
- rp_bucket = self._relative_position_bucket(rel_pos, causal=self.causal, num_buckets=self.num_buckets,
328
- max_distance=self.max_distance)
329
- values = self.relative_attention_bias(rp_bucket)
330
- bias = rearrange(values, 'i j h -> () h i j')
331
- return qk_dots + (bias * self.scale)
332
-
333
-
334
-
335
- class MultiHeadAttention(nn.Module):
336
- '''
337
- only for GST
338
- input:
339
- query --- [N, T_q, query_dim]
340
- key --- [N, T_k, key_dim]
341
- output:
342
- out --- [N, T_q, num_units]
343
- '''
344
- def __init__(self, query_dim, key_dim, num_units, num_heads):
345
- super().__init__()
346
- self.num_units = num_units
347
- self.num_heads = num_heads
348
- self.key_dim = key_dim
349
-
350
- self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
351
- self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
352
- self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
353
-
354
- def forward(self, query, key):
355
- querys = self.W_query(query) # [N, T_q, num_units]
356
- keys = self.W_key(key) # [N, T_k, num_units]
357
- values = self.W_value(key)
358
-
359
- split_size = self.num_units // self.num_heads
360
- querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
361
- keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
362
- values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
363
-
364
- # score = softmax(QK^T / (d_k ** 0.5))
365
- scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
366
- scores = scores / (self.key_dim ** 0.5)
367
- scores = F.softmax(scores, dim=3)
368
-
369
- # out = score * V
370
- out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
371
- out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
372
-
373
- return out
374
-
375
-
376
- class GST(nn.Module):
377
- def __init__(self,model_channels=512,num_heads=8,in_channels=80,k=2):
378
- super(GST,self).__init__()
379
- self.model_channels=model_channels
380
- self.num_heads=num_heads
381
-
382
- self.reference_encoder=nn.Sequential(
383
- nn.Conv1d(in_channels,model_channels,3,padding=1,stride=2),
384
- nn.Conv1d(model_channels, model_channels*k,3,padding=1,stride=2),
385
- AttentionBlock(model_channels*k, num_heads, relative_pos_embeddings=True, do_checkpoint=False),
386
- AttentionBlock(model_channels*k, num_heads, relative_pos_embeddings=True, do_checkpoint=False),
387
- AttentionBlock(model_channels*k, num_heads, relative_pos_embeddings=True, do_checkpoint=False),
388
- AttentionBlock(model_channels*k, num_heads, relative_pos_embeddings=True, do_checkpoint=False),
389
- AttentionBlock(model_channels*k, num_heads, relative_pos_embeddings=True, do_checkpoint=False)
390
- )
391
-
392
- def forward(self,x):
393
- x=self.reference_encoder(x)
394
- return x
395
-
396
-
397
- if __name__ == '__main__':
398
- device = torch.device('cpu')
399
- m = GST(512,10).to(device)
400
- mels = torch.rand((16,80,1000)).to(device)
401
-
402
- o = m(mels)
403
- print(o.shape,'final output')
404
-
405
- from torchinfo import summary
406
- summary(m, input_data={'x': torch.randn(16,80,500).to(device)})
 
 
maha_tts/models/vocoder.py DELETED
@@ -1,342 +0,0 @@
1
- '''
2
- copde from https://github.com/jik876/hifi-gan/blob/master/models.py
3
- '''
4
-
5
- import json,os
6
- import torch
7
- import torch.nn.functional as F
8
- import torch.nn as nn
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
11
- # from utils import init_weights, get_padding
12
-
13
- LRELU_SLOPE = 0.1
14
-
15
- class AttrDict(dict):
16
- def __init__(self, *args, **kwargs):
17
- super(AttrDict, self).__init__(*args, **kwargs)
18
- self.__dict__ = self
19
-
20
- def init_weights(m, mean=0.0, std=0.01):
21
- classname = m.__class__.__name__
22
- if classname.find("Conv") != -1:
23
- m.weight.data.normal_(mean, std)
24
-
25
-
26
- def apply_weight_norm(m):
27
- classname = m.__class__.__name__
28
- if classname.find("Conv") != -1:
29
- weight_norm(m)
30
-
31
-
32
- def get_padding(kernel_size, dilation=1):
33
- return int((kernel_size*dilation - dilation)/2)
34
-
35
-
36
- class ResBlock1(torch.nn.Module):
37
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
38
- super(ResBlock1, self).__init__()
39
- self.h = h
40
- self.convs1 = nn.ModuleList([
41
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
42
- padding=get_padding(kernel_size, dilation[0]))),
43
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
44
- padding=get_padding(kernel_size, dilation[1]))),
45
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
46
- padding=get_padding(kernel_size, dilation[2])))
47
- ])
48
- self.convs1.apply(init_weights)
49
-
50
- self.convs2 = nn.ModuleList([
51
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
52
- padding=get_padding(kernel_size, 1))),
53
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
54
- padding=get_padding(kernel_size, 1))),
55
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
56
- padding=get_padding(kernel_size, 1)))
57
- ])
58
- self.convs2.apply(init_weights)
59
-
60
- def forward(self, x):
61
- for c1, c2 in zip(self.convs1, self.convs2):
62
- xt = F.leaky_relu(x, LRELU_SLOPE)
63
- xt = c1(xt)
64
- xt = F.leaky_relu(xt, LRELU_SLOPE)
65
- xt = c2(xt)
66
- x = xt + x
67
- return x
68
-
69
- def remove_weight_norm(self):
70
- for l in self.convs1:
71
- remove_weight_norm(l)
72
- for l in self.convs2:
73
- remove_weight_norm(l)
74
-
75
-
76
- class ResBlock2(torch.nn.Module):
77
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
78
- super(ResBlock2, self).__init__()
79
- self.h = h
80
- self.convs = nn.ModuleList([
81
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
82
- padding=get_padding(kernel_size, dilation[0]))),
83
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
84
- padding=get_padding(kernel_size, dilation[1])))
85
- ])
86
- self.convs.apply(init_weights)
87
-
88
- def forward(self, x):
89
- for c in self.convs:
90
- xt = F.leaky_relu(x, LRELU_SLOPE)
91
- xt = c(xt)
92
- x = xt + x
93
- return x
94
-
95
- def remove_weight_norm(self):
96
- for l in self.convs:
97
- remove_weight_norm(l)
98
-
99
-
100
- class Generator(torch.nn.Module):
101
- def __init__(self, h):
102
- super(Generator, self).__init__()
103
- self.h = h
104
- self.num_kernels = len(h.resblock_kernel_sizes)
105
- self.num_upsamples = len(h.upsample_rates)
106
- self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
107
- resblock = ResBlock1 if h.resblock == '1' else ResBlock2
108
-
109
- self.ups = nn.ModuleList()
110
- for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
111
- self.ups.append(weight_norm(
112
- ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
113
- k, u, padding=(k-u)//2)))
114
-
115
- self.resblocks = nn.ModuleList()
116
- for i in range(len(self.ups)):
117
- ch = h.upsample_initial_channel//(2**(i+1))
118
- for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
119
- self.resblocks.append(resblock(h, ch, k, d))
120
-
121
- self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
122
- self.ups.apply(init_weights)
123
- self.conv_post.apply(init_weights)
124
-
125
- def forward(self, x):
126
- x = self.conv_pre(x)
127
- for i in range(self.num_upsamples):
128
- x = F.leaky_relu(x, LRELU_SLOPE)
129
- x = self.ups[i](x)
130
- xs = None
131
- for j in range(self.num_kernels):
132
- if xs is None:
133
- xs = self.resblocks[i*self.num_kernels+j](x)
134
- else:
135
- xs += self.resblocks[i*self.num_kernels+j](x)
136
- x = xs / self.num_kernels
137
- x = F.leaky_relu(x)
138
- x = self.conv_post(x)
139
- x = torch.tanh(x)
140
-
141
- return x
142
-
143
- def remove_weight_norm(self):
144
- # print('Removing weight norm...')
145
- for l in self.ups:
146
- remove_weight_norm(l)
147
- for l in self.resblocks:
148
- l.remove_weight_norm()
149
- remove_weight_norm(self.conv_pre)
150
- remove_weight_norm(self.conv_post)
151
-
152
-
153
- class DiscriminatorP(torch.nn.Module):
154
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
155
- super(DiscriminatorP, self).__init__()
156
- self.period = period
157
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
158
- self.convs = nn.ModuleList([
159
- norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
160
- norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
161
- norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
162
- norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
163
- norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
164
- ])
165
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
166
-
167
- def forward(self, x):
168
- fmap = []
169
-
170
- # 1d to 2d
171
- b, c, t = x.shape
172
- if t % self.period != 0: # pad first
173
- n_pad = self.period - (t % self.period)
174
- x = F.pad(x, (0, n_pad), "reflect")
175
- t = t + n_pad
176
- x = x.view(b, c, t // self.period, self.period)
177
-
178
- for l in self.convs:
179
- x = l(x)
180
- x = F.leaky_relu(x, LRELU_SLOPE)
181
- fmap.append(x)
182
- x = self.conv_post(x)
183
- fmap.append(x)
184
- x = torch.flatten(x, 1, -1)
185
-
186
- return x, fmap
187
-
188
-
189
- class MultiPeriodDiscriminator(torch.nn.Module):
190
- def __init__(self):
191
- super(MultiPeriodDiscriminator, self).__init__()
192
- self.discriminators = nn.ModuleList([
193
- DiscriminatorP(2),
194
- DiscriminatorP(3),
195
- DiscriminatorP(5),
196
- DiscriminatorP(7),
197
- DiscriminatorP(11),
198
- ])
199
-
200
- def forward(self, y, y_hat):
201
- y_d_rs = []
202
- y_d_gs = []
203
- fmap_rs = []
204
- fmap_gs = []
205
- for i, d in enumerate(self.discriminators):
206
- y_d_r, fmap_r = d(y)
207
- y_d_g, fmap_g = d(y_hat)
208
- y_d_rs.append(y_d_r)
209
- fmap_rs.append(fmap_r)
210
- y_d_gs.append(y_d_g)
211
- fmap_gs.append(fmap_g)
212
-
213
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
214
-
215
-
216
- class DiscriminatorS(torch.nn.Module):
217
- def __init__(self, use_spectral_norm=False):
218
- super(DiscriminatorS, self).__init__()
219
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
220
- self.convs = nn.ModuleList([
221
- norm_f(Conv1d(1, 128, 15, 1, padding=7)),
222
- norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
223
- norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
224
- norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
225
- norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
226
- norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
227
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
228
- ])
229
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
230
-
231
- def forward(self, x):
232
- fmap = []
233
- for l in self.convs:
234
- x = l(x)
235
- x = F.leaky_relu(x, LRELU_SLOPE)
236
- fmap.append(x)
237
- x = self.conv_post(x)
238
- fmap.append(x)
239
- x = torch.flatten(x, 1, -1)
240
-
241
- return x, fmap
242
-
243
-
244
- class MultiScaleDiscriminator(torch.nn.Module):
245
- def __init__(self):
246
- super(MultiScaleDiscriminator, self).__init__()
247
- self.discriminators = nn.ModuleList([
248
- DiscriminatorS(use_spectral_norm=True),
249
- DiscriminatorS(),
250
- DiscriminatorS(),
251
- ])
252
- self.meanpools = nn.ModuleList([
253
- AvgPool1d(4, 2, padding=2),
254
- AvgPool1d(4, 2, padding=2)
255
- ])
256
-
257
- def forward(self, y, y_hat):
258
- y_d_rs = []
259
- y_d_gs = []
260
- fmap_rs = []
261
- fmap_gs = []
262
- for i, d in enumerate(self.discriminators):
263
- if i != 0:
264
- y = self.meanpools[i-1](y)
265
- y_hat = self.meanpools[i-1](y_hat)
266
- y_d_r, fmap_r = d(y)
267
- y_d_g, fmap_g = d(y_hat)
268
- y_d_rs.append(y_d_r)
269
- fmap_rs.append(fmap_r)
270
- y_d_gs.append(y_d_g)
271
- fmap_gs.append(fmap_g)
272
-
273
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
274
-
275
-
276
- def feature_loss(fmap_r, fmap_g):
277
- loss = 0
278
- for dr, dg in zip(fmap_r, fmap_g):
279
- for rl, gl in zip(dr, dg):
280
- loss += torch.mean(torch.abs(rl - gl))
281
-
282
- return loss*2
283
-
284
-
285
- def discriminator_loss(disc_real_outputs, disc_generated_outputs):
286
- loss = 0
287
- r_losses = []
288
- g_losses = []
289
- for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
290
- r_loss = torch.mean((1-dr)**2)
291
- g_loss = torch.mean(dg**2)
292
- loss += (r_loss + g_loss)
293
- r_losses.append(r_loss.item())
294
- g_losses.append(g_loss.item())
295
-
296
- return loss, r_losses, g_losses
297
-
298
-
299
- def generator_loss(disc_outputs):
300
- loss = 0
301
- gen_losses = []
302
- for dg in disc_outputs:
303
- l = torch.mean((1-dg)**2)
304
- gen_losses.append(l)
305
- loss += l
306
-
307
- return loss, gen_losses
308
-
309
- def load_checkpoint(filepath, device):
310
- assert os.path.isfile(filepath)
311
- checkpoint_dict = torch.load(filepath, map_location=device)
312
- return checkpoint_dict
313
-
314
- def load_vocoder_model(config_path,checkpoint_path,device):
315
- # config_file = os.path.join(os.path.split(checkpoint_file)[0], 'config.json')
316
- with open(config_path) as f:
317
- data = f.read()
318
-
319
- global h
320
- json_config = json.loads(data)
321
- h = AttrDict(json_config)
322
-
323
- torch.manual_seed(h.seed)
324
-
325
- generator = Generator(h).to(device)
326
-
327
- state_dict_g = load_checkpoint(checkpoint_path, device)
328
- generator.load_state_dict(state_dict_g['generator'])
329
-
330
- generator.eval()
331
- generator.remove_weight_norm()
332
-
333
- return generator
334
-
335
- def infer_wav(mel,generator):
336
- MAX_WAV_VALUE =32768.0
337
- with torch.no_grad():
338
- y_g_hat = generator(mel)
339
- audio = y_g_hat.squeeze()
340
- audio = audio * MAX_WAV_VALUE
341
- audio = audio.cpu().numpy().astype('int16')
342
- return audio
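
For context, the module removed here wrapped the stock HiFi-GAN generator and exposed `load_vocoder_model` and `infer_wav`. A rough sketch of how that API was used, against the deleted signatures; the paths are placeholders and the 22050 Hz rate is an assumption taken from typical HiFi-GAN configs, not from this repo:

```python
import torch
from scipy.io.wavfile import write

# Sketch against the deleted maha_tts/models/vocoder.py API (load_vocoder_model / infer_wav).
from maha_tts.models.vocoder import load_vocoder_model, infer_wav

device = torch.device('cpu')
generator = load_vocoder_model('path/to/hifigan/config.json',   # placeholder config path
                               'path/to/hifigan/generator.pt',  # placeholder checkpoint path
                               device)

mel = torch.rand(1, 80, 500).to(device)   # [batch, n_mels, frames] mel spectrogram
audio = infer_wav(mel, generator)         # int16 numpy waveform scaled by 32768
write('vocoder_out.wav', 22050, audio)    # sampling rate assumed, comes from the HiFi-GAN config
```
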
maha_tts/pretrained_models/.DS_Store CHANGED
Binary files a/maha_tts/pretrained_models/.DS_Store and b/maha_tts/pretrained_models/.DS_Store differ
 
maha_tts/pretrained_models/Smolie-en/.DS_Store ADDED
Binary file (6.15 kB)
 
maha_tts/pretrained_models/Smolie-en/s2a_latest.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1cb3aef9bebda0535dce135de3ae5f23f62ec3890eed87469dfe4a9a07f0f98
3
+ size 1720934888
maha_tts/pretrained_models/{smolie/T2S → Smolie-en}/t2s_best.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67a10c3bf12a8bca3dd67075ccbfbd79887b244109bd9c96013b0f348d9e2570
3
- size 276146627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6be1b489366ebbd35e55404be875804d380b0430587319f67e592da7ba1b5240
3
+ size 276143363
maha_tts/pretrained_models/Smolie-in/.DS_Store ADDED
Binary file (6.15 kB)
 
maha_tts/pretrained_models/Smolie-in/s2a_latest.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce73f611d8071f69111b71363fdad9465d84c4431fc26dc6b6de4595591c3305
3
+ size 1720934441
maha_tts/pretrained_models/{smolie/S2A/s2a_latest.pt → Smolie-in/t2s_best.pt} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf359fab98b047ef89d79a99a78fee9c38880e307630d3b3af7bc9cb170f366b
3
- size 432971673
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c867f8a11f364b4cf543b42335e0a7f0450078693c66539296de1adcf2f27e6
3
+ size 823446386
maha_tts/text/cleaners.py CHANGED
@@ -135,8 +135,8 @@ def transliteration_cleaners(text):
135
 
136
  def english_cleaners(text):
137
  '''Pipeline for English text, including number and abbreviation expansion.'''
138
- text = convert_to_ascii(text)
139
- text = lowercase(text)
140
  text = expand_numbers(text)
141
  text = expand_abbreviations(text)
142
  text = collapse_whitespace(text)
 
135
 
136
  def english_cleaners(text):
137
  '''Pipeline for English text, including number and abbreviation expansion.'''
138
+ # text = convert_to_ascii(text)
139
+ # text = lowercase(text)
140
  text = expand_numbers(text)
141
  text = expand_abbreviations(text)
142
  text = collapse_whitespace(text)
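
With the ASCII-conversion and lowercasing steps commented out, non-Latin and accented characters now survive cleaning while number/abbreviation expansion and whitespace collapsing still run. A small illustrative call (output depends on the module's expansion tables):

```python
from maha_tts.text.cleaners import english_cleaners

# Illustrative only: case and accented characters now pass through unchanged.
print(english_cleaners("Chapter 2,   señor."))
```
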
maha_tts/text/symbols.py CHANGED
@@ -2,12 +2,18 @@ import sys
2
  from maha_tts.config import config
3
 
4
  labels=" abcdefghijklmnopqrstuvwxyz.,:;'()?!\""
5
- labels=" !\"'(),-.:;?[]abcdefghijklmnopqrstuvwxyzàâèéêü’“”"
 
 
6
  labels= [i for i in labels]
 
7
 
8
  text_labels = [i for i in labels]
9
  text_labels+='<S>','<E>','<PAD>'
10
 
 
 
 
11
  code_labels= [str(i) for i in range(config.semantic_model_centroids)]
12
  labels+=code_labels
13
  code_labels+='<SST>','<EST>','<PAD>'
@@ -21,6 +27,10 @@ tok_dec = {i:j for i,j in enumerate(labels)}
21
  text_enc = {j:i for i,j in enumerate(text_labels)}
22
  text_dec = {i:j for i,j in enumerate(text_labels)}
23
 
 
 
 
 
24
  #code encdec
25
  code_enc = {j:i for i,j in enumerate(code_labels)}
26
  code_dec = {i:j for i,j in enumerate(code_labels)}
 
2
  from maha_tts.config import config
3
 
4
  labels=" abcdefghijklmnopqrstuvwxyz.,:;'()?!\""
5
+ labels_en=" !\"'(),-.:;?[]abcdefghijklmnopqrstuvwxyzàâèéêü’“”"
6
+ labels='''ଊతూിਮ০य़లഢਪਟକఝૂएड‌`यঢअచଢ଼ਧ—ତলશರଖच,பવड़ષंಈಮਤਇଥkखഗబ= इਸಣਹછ™ୟ.ोೀৎುഊଳંർਘମഴఙसଗൃlଝਜఇഓਐভയಅಠభാടਔಒ೧পஜaૅૠএଲ৯eകँ৭àৱऊટഒਗহિేயీെஈଓഭೊাੌಙ१ଈःസठખm‘ొऍಿcശrట।ऱଋઘਛெਬಂङಹஞ਼ભ১"એੂചಸગಷ়ଁമಓtஒઉಪs్-pଛ›ढ+ಆ'বનধৰউીଅઝ੍ೂʼൂఔfતषഖঢ়৬ਖक़ਵషணझപળଔઞੇವௗઁത২xెഥख़iটਲધಔೇீથ*ഝॅঃஓूఒীనਜ਼எુுహौ९ൗౌফഔોhஔণంफ़ఋçଯઊൽଆ’ୁைഛ२&ঁണ़ైৌআஆোਠਭजொમळಘஷഏি/ચਾ“ਯ$ଐീवऩ८ઢఛఎেథഠ[औಳରथୃൈಝnজਥऑଷੱल೯wओଵढ़மവरడఊbೖਈૃपdêଉఐ;ै ఢ ઔકচ৩‎ਊൾഉਕ೦ಏj€:ਦಗાളੁशफുழൻಊगફఏఅ?णറഘಞ४ಡಫଠ್ড೨ൊঞमਂસૉॉઅരஙલঘନ്ఠॄvઋృষऎகೕଘઆఞലେূஊఉૈദఫఈदকज़!ధઠవଞறಟਖ਼ਫ਼ইਢഡঠஃஸୂटঅହఆளోईৃಜ॥(ઈଏੀഈक્গ ಚಢഹೃिஏಯyশேଡೋੈਣડఃഷഇਸ਼நখಋோনૐਏgहৗೈृவੰଜग़ੋ୍)ൌరమൺংञਓપయധஇോ५ઃಲళঊತॽ­ന…ঙಭाಇउਅଶরઓି্ூমuపബ\ૌଟबਆुಕଫதছ३దਿದణஐௌ்ৈqఘலહಾ०ಛঐிওऋ‍ి৮ेਨଇүଧഞಶéਚ्৫ୋశఓદঈୀ৪ପüুങਗ਼ઑજথఖঝಐऽਰାആജीઇੜ]आବଡ଼ഫಥుಎણଃયछஅેஹംଢબoদഎగଭాേഅঋসഐಃzਡಬਝன–உಖಉഃযସୈೆకॐನഋয়సசଙড়ୱऒऐઐतଂாতરâèनಧ॑டঔभர”​జ৷ਫଣଚଦधघೌୌਉ'''
7
+
8
  labels= [i for i in labels]
9
+ labels_en= [i for i in labels_en]
10
 
11
  text_labels = [i for i in labels]
12
  text_labels+='<S>','<E>','<PAD>'
13
 
14
+ text_labels_en = [i for i in labels_en]
15
+ text_labels_en+='<S>','<E>','<PAD>'
16
+
17
  code_labels= [str(i) for i in range(config.semantic_model_centroids)]
18
  labels+=code_labels
19
  code_labels+='<SST>','<EST>','<PAD>'
 
27
  text_enc = {j:i for i,j in enumerate(text_labels)}
28
  text_dec = {i:j for i,j in enumerate(text_labels)}
29
 
30
+
31
+ text_enc_en = {j:i for i,j in enumerate(text_labels_en)}
32
+ text_dec_en = {i:j for i,j in enumerate(text_labels_en)}
33
+
34
  #code encdec
35
  code_enc = {j:i for i,j in enumerate(code_labels)}
36
  code_dec = {i:j for i,j in enumerate(code_labels)}
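
The new `*_en` tables added above map the English character set to integer ids and back. A minimal sketch of how they behave (names taken directly from the updated `maha_tts/text/symbols.py`):

```python
from maha_tts.text.symbols import text_enc_en, text_dec_en, text_labels_en

ids = [text_enc_en[c] for c in "hello!"]          # characters -> ids
print(ids)
print(''.join(text_dec_en[i] for i in ids))       # ids -> "hello!"
print(len(text_labels_en))                        # English charset + <S>, <E>, <PAD>
```
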
maha_tts/utils/audio.py CHANGED
@@ -6,8 +6,8 @@ from scipy.signal import get_window
6
  from scipy.io.wavfile import read
7
  from maha_tts.config import config
8
 
9
- TACOTRON_MEL_MAX = 2.3143386840820312
10
- TACOTRON_MEL_MIN = -11.512925148010254
11
 
12
 
13
  def denormalize_tacotron_mel(norm_mel):
 
6
  from scipy.io.wavfile import read
7
  from maha_tts.config import config
8
 
9
+ TACOTRON_MEL_MAX = 2.4
10
+ TACOTRON_MEL_MIN = -11.5130
11
 
12
 
13
  def denormalize_tacotron_mel(norm_mel):
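
Only the bounds change in this hunk; the body of `denormalize_tacotron_mel` is not shown. For reference, the conventional Tacotron-style mapping between a [-1, 1] normalized mel and the [MIN, MAX] log-mel range looks like the sketch below; the formulas are an assumption, not copied from the repo:

```python
import torch

TACOTRON_MEL_MAX = 2.4
TACOTRON_MEL_MIN = -11.5130

def normalize_tacotron_mel(mel):
    # assumed convention (not shown in this hunk): [MIN, MAX] -> [-1, 1]
    return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1

def denormalize_tacotron_mel(norm_mel):
    # assumed convention: [-1, 1] -> [MIN, MAX]
    return ((norm_mel + 1) / 2) * (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN) + TACOTRON_MEL_MIN

mel = torch.empty(1, 80, 100).uniform_(TACOTRON_MEL_MIN, TACOTRON_MEL_MAX)
assert torch.allclose(denormalize_tacotron_mel(normalize_tacotron_mel(mel)), mel, atol=1e-5)
```
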
ref_clips/2971_4275_000003_000007.wav DELETED
Binary file (392 kB)
 
ref_clips/2971_4275_000020_000001.wav DELETED
Binary file (386 kB)
 
ref_clips/2971_4275_000023_000010.wav DELETED
Binary file (435 kB)
 
ref_clips/2971_4275_000049_000000.wav DELETED
Binary file (366 kB)
 
ref_clips/2971_4275_000049_000004.wav DELETED
Binary file (321 kB)
 
ref_clips/2971_4275_000050_000000.wav DELETED
Binary file (385 kB)
 
requirements.txt ADDED
@@ -0,0 +1,46 @@
1
+ annotated-types==0.6.0
2
+ audioread==3.0.1
3
+ certifi==2023.11.17
4
+ cffi==1.16.0
5
+ charset-normalizer==3.3.2
6
+ decorator==5.1.1
7
+ einops==0.7.0
8
+ filelock==3.13.1
9
+ fsspec==2023.10.0
10
+ huggingface-hub==0.19.4
11
+ idna==3.4
12
+ inflect==7.0.0
13
+ Jinja2==3.1.2
14
+ joblib==1.3.2
15
+ lazy_loader==0.3
16
+ librosa==0.10.1
17
+ llvmlite==0.41.1
18
+ MarkupSafe==2.1.3
19
+ mpmath==1.3.0
20
+ msgpack==1.0.7
21
+ networkx==3.2.1
22
+ numba==0.58.1
23
+ numpy==1.26.2
24
+ packaging==23.2
25
+ platformdirs==4.0.0
26
+ pooch==1.8.0
27
+ pycparser==2.21
28
+ pydantic==2.5.1
29
+ pydantic_core==2.14.3
30
+ PyYAML==6.0.1
31
+ regex==2023.10.3
32
+ requests==2.31.0
33
+ safetensors==0.4.0
34
+ scikit-learn==1.3.2
35
+ scipy==1.11.3
36
+ soundfile==0.12.1
37
+ soxr==0.3.7
38
+ sympy==1.12
39
+ threadpoolctl==3.2.0
40
+ tokenizers==0.15.0
41
+ torch==2.1.1
42
+ tqdm==4.66.1
43
+ transformers==4.35.2
44
+ typing_extensions==4.8.0
45
+ Unidecode==1.3.7
46
+ urllib3==2.1.0
setup.py ADDED
@@ -0,0 +1,23 @@
1
+ import os
2
+ from setuptools import setup, find_packages
3
+
4
+ __version__ = '1.0.0'
5
+ cwd = os.path.dirname(os.path.abspath(__file__))
6
+ # requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines()
7
+
8
+ setup(
9
+ name='maha_tts',
10
+ version=__version__,
11
+
12
+ url='https://github.com/dubverse-ai/MahaTTS/tree/main',
13
+ author='Dubverse AI',
14
+ author_email='jaskaran@dubverse.ai',
15
+ install_requires = [
16
+ 'einops',
17
+ 'transformers',
18
+ 'unidecode',
19
+ 'inflect'
20
+ ],
21
+ packages=find_packages(),
22
+ py_modules=['maha_tts'],
23
+ )
tts.py CHANGED
@@ -1,14 +1,16 @@
1
  import torch,glob
2
- from maha_tts import load_diffuser,load_models,infer_tts
3
  from scipy.io.wavfile import write
4
 
5
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
6
  print('Using:',device)
7
  text = 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.'
8
- ref_clips = glob.glob('/Users/jaskaransingh/Desktop/NeuralSpeak/ref_clips/*.wav')
 
 
9
  # print(len(ref_clips))
10
 
11
  # diffuser = load_diffuser()
12
- diff_model,ts_model,vocoder,diffuser = load_models('Smolie',device)
13
- audio,sr = infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder)
14
  write('test.wav',sr,audio)
 
1
  import torch,glob
2
+ from maha_tts import load_diffuser,load_models,infer_tts,config
3
  from scipy.io.wavfile import write
4
 
5
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
6
  print('Using:',device)
7
  text = 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.'
8
+ langauge = 'english'
9
+ language = torch.tensor(config.lang_index[langauge]).to(device).unsqueeze(0)
10
+ ref_clips = glob.glob('models/Smolie-en/ref_clips/part0_1_1/*.wav')
11
  # print(len(ref_clips))
12
 
13
  # diffuser = load_diffuser()
14
+ diff_model,ts_model,vocoder,diffuser = load_models('Smolie-in',device)
15
+ audio,sr = infer_tts(text,ref_clips,diffuser,diff_model,ts_model,vocoder,language)
16
  write('test.wav',sr,audio)
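
Putting the updated entry point together, a minimal sketch of multilingual inference with the renamed checkpoints. The `'english'` key of `config.lang_index` and the `infer_tts(..., language)` signature come from the diff above; the reference-clip path is a placeholder, and `'Smolie-en'` as a `load_models` key is an assumption based on the renamed checkpoint folders:

```python
import torch, glob
from maha_tts import load_models, infer_tts, config
from scipy.io.wavfile import write

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

text = 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition.'
language = torch.tensor(config.lang_index['english']).to(device).unsqueeze(0)
ref_clips = glob.glob('ref_clips/*.wav')   # placeholder: a few short clips of the target speaker

# 'Smolie-en' for the English checkpoint, 'Smolie-in' for the Indic one (per the rename above)
diff_model, ts_model, vocoder, diffuser = load_models('Smolie-en', device)
audio, sr = infer_tts(text, ref_clips, diffuser, diff_model, ts_model, vocoder, language)
write('test.wav', sr, audio)
```
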