wasmdashai committed on
Commit 1f30c42 · verified · 1 Parent(s): 9c2fa2a

Update app.py

Files changed (1)
  1. app.py +158 -154
app.py CHANGED
@@ -1,176 +1,180 @@
- import gradio as gr
- import spaces
- import torch
- from transformers import AutoTokenizer,VitsModel
- import os
- import numpy as np

  token=os.environ.get("key_")
- print(token)
- #tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)
- models= {}
-
- import noisereduce as nr
-
  import torch
- from typing import Any, Callable, Optional, Tuple, Union,Iterator

- import torch.nn as nn  # Import the missing module
- def remove_noise_nr(audio_data,sr=16000):
-     reduced_noise = nr.reduce_noise(y=audio_data,hop_length=256, sr=sr)
      return reduced_noise

  def _inference_forward_stream(
      self,
-     input_ids: Optional[torch.Tensor] = None,
-     attention_mask: Optional[torch.Tensor] = None,
-     speaker_embeddings: Optional[torch.Tensor] = None,
-     output_attentions: Optional[bool] = None,
-     output_hidden_states: Optional[bool] = None,
-     return_dict: Optional[bool] = None,
-     padding_mask: Optional[torch.Tensor] = None,
-     chunk_size: int = 32,  # Chunk size for streaming output
-     is_streaming: bool = True,
- ) -> Iterator[torch.Tensor]:
-     """Generates speech waveforms in a streaming fashion."""
-     if attention_mask is not None:
-         padding_mask = attention_mask.unsqueeze(-1).float()
-     else:
-         padding_mask = torch.ones_like(input_ids).unsqueeze(-1).float()
-
-
-
-     text_encoder_output = self.text_encoder(
-         input_ids=input_ids,
-         padding_mask=padding_mask,
-         attention_mask=attention_mask,
-         output_attentions=output_attentions,
-         output_hidden_states=output_hidden_states,
-         return_dict=return_dict,
      )
-     hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
-     hidden_states = hidden_states.transpose(1, 2)
-     input_padding_mask = padding_mask.transpose(1, 2)
-
-     prior_means = text_encoder_output[1] if not return_dict else text_encoder_output.prior_means
-     prior_log_variances = text_encoder_output[2] if not return_dict else text_encoder_output.prior_log_variances
-
-     if self.config.use_stochastic_duration_prediction:
-         log_duration = self.duration_predictor(
-             hidden_states,
-             input_padding_mask,
-             speaker_embeddings,
-             reverse=True,
-             noise_scale=self.noise_scale_duration,
-         )
-     else:
-         log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
-
-     length_scale = 1.0 / self.speaking_rate
-     duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
-     predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
-
-
-     # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
-     indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
-     output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
-     output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
-
-     # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
-     attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
-     batch_size, _, output_length, input_length = attn_mask.shape
-     cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
-     indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
-     valid_indices = indices.unsqueeze(0) < cum_duration
-     valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
-     padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
-     attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
-
-     # Expand prior distribution
-     prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
-     prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
-
-     prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
-     latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
-
-     spectrogram = latents * output_padding_mask
-     if is_streaming:
-
-         for i in range(0, spectrogram.size(-1), chunk_size):
-             with torch.no_grad():
-                 wav=self.decoder(spectrogram[:,:,i : i + chunk_size] ,speaker_embeddings)
-                 yield wav.squeeze().cpu().numpy()
-     else:
-
-         wav=self.decoder(spectrogram,speaker_embeddings)
-         yield wav.squeeze().cpu().numpy()
- @spaces.GPU
- def get_model(name_model):
-     global models
-     if name_model in models:
-         if name_model=='wasmdashai/vits-en-v1':
-             tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-en-v1",token=token)
-         else:
-             tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)

-

-
-         return models[name_model],tokenizer
-     models[name_model]=VitsModel.from_pretrained(name_model,token=token).cuda()
-
-
-
      models[name_model].decoder.apply_weight_norm()
-     # torch.nn.utils.weight_norm(self.decoder.conv_pre)
-     # torch.nn.utils.weight_norm(self.decoder.conv_post)
      for flow in models[name_model].flow.flows:
          torch.nn.utils.weight_norm(flow.conv_pre)
          torch.nn.utils.weight_norm(flow.conv_post)
-
-     if name_model=='wasmdashai/vits-en-v1':
-         tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vits-en-v1",token=token)
-     else:
-         tokenizer = AutoTokenizer.from_pretrained("wasmdashai/vtk",token=token)

-     return models[name_model],tokenizer


- zero = torch.Tensor([0]).cuda()
- print(zero.device)  # <-- 'cpu' 🤔
- import torch
- TXT="""السلام عليكم ورحمة الله وبركاتة يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس """
- @spaces.GPU
- def modelspeech(text=TXT,name_model="wasmdashai/vits-ar-sa-huba-v2",speaking_rate=16000):
-     model,tokenizer=get_model(name_model)

-     inputs = tokenizer(text, return_tensors="pt")
-
-     model.speaking_rate=speaking_rate
-     with torch.no_grad():
-         wav=list(_inference_forward_stream(model,input_ids=inputs.input_ids.cuda(),attention_mask=inputs.attention_mask.cuda(),speaker_embeddings= None,is_streaming=False))[0]
-     # with torch.no_grad():
-     #     wav = model(input_ids=inputs["input_ids"].cuda()).waveform.cpu().numpy().reshape(-1)#.detach()
-
-     return (model.config.sampling_rate,remove_noise_nr(wav))

  model_choices = gr.Dropdown(
-     choices=[
-
-         "wasmdashai/vits-ar-sa-huba-v1",
-         "wasmdashai/vits-ar-sa-huba-v2",
-
-         "wasmdashai/vits-ar-sa-A",
-         "wasmdashai/vits-ar-ye-sa",
-         "wasmdashai/vits-ar-sa-M-v1",
-         'wasmdashai/vits-en-v1'
-
-
-     ],
-     label="اختر النموذج",
-     value="wasmdashai/vits-ar-sa-huba-v2",
- )
-
- demo = gr.Interface(fn=modelspeech, inputs=["text",model_choices,gr.Slider(0.1, 1, step=0.1,value=0.8)], outputs=["audio"])
  demo.queue()
- demo.launch()
 
+

  token=os.environ.get("key_")
+ import gradio as gr
  import torch
+ import soundfile as sf
+ import os
+ import numpy as np
+ import noisereduce as nr
+ from typing import Optional, Iterator
+ import torch.nn as nn
+ from transformers import AutoTokenizer, VitsModel  # make sure these are imported
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ # Pick the device (CPU or GPU)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print("✅ Running on:", device)
+
+
+ token=os.environ.get("key_")
+ models = {}

+ # Noise-reduction filter
+ def remove_noise_nr(audio_data, sr=16000):
+     reduced_noise = nr.reduce_noise(y=audio_data, hop_length=256, sr=sr)
      return reduced_noise

+
  def _inference_forward_stream(
      self,
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     speaker_embeddings: torch.Tensor = None,
+     chunk_size: int = 32,
+     is_streaming: bool = True
+ ):
+     import torch.nn as nn
+
+     padding_mask = attention_mask.unsqueeze(-1).float() if attention_mask is not None else torch.ones_like(input_ids).unsqueeze(-1).float()
+
+     text_encoder_output = self.text_encoder(
+         input_ids=input_ids,
+         padding_mask=padding_mask,
+         attention_mask=attention_mask
+     )
+
+     hidden_states = text_encoder_output[0]
+     hidden_states = hidden_states.transpose(1, 2)
+     input_padding_mask = padding_mask.transpose(1, 2)
+
+     prior_means = text_encoder_output[1]
+     prior_log_variances = text_encoder_output[2]
+
+     # Duration prediction
+     if self.config.use_stochastic_duration_prediction:
+         log_duration = self.duration_predictor(
+             hidden_states, input_padding_mask, speaker_embeddings, reverse=True, noise_scale=self.noise_scale_duration
          )
+     else:
+         log_duration = self.duration_predictor(hidden_states, input_padding_mask, speaker_embeddings)
+
+     length_scale = 1.0 / self.speaking_rate
+     duration = torch.ceil(torch.exp(log_duration) * input_padding_mask * length_scale)
+     predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()

+     indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
+     output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
+     output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
+
+     attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
+     batch_size, _, output_length, input_length = attn_mask.shape
+     cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
+     indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
+     valid_indices = indices.unsqueeze(0) < cum_duration
+     valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
+     padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
+     attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+
+     prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
+     prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
+
+     prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
+     latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
+     spectrogram = latents * output_padding_mask
+
+     if is_streaming:
+         for i in range(0, spectrogram.size(-1), chunk_size):
+             with torch.no_grad():
+                 yield spectrogram[:, :, i: i + chunk_size]
+
+     else:
+
+         yield spectrogram
+
+
+ def get_model(name_model):
+     global models
+     if name_model in models:
+         tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
+         return models[name_model], tokenizer
+
+     models[name_model] = VitsModel.from_pretrained(name_model, token=token)
      models[name_model].decoder.apply_weight_norm()
      for flow in models[name_model].flow.flows:
          torch.nn.utils.weight_norm(flow.conv_pre)
          torch.nn.utils.weight_norm(flow.conv_post)

+     tokenizer = AutoTokenizer.from_pretrained(name_model, token=token)
+     return models[name_model], tokenizer
+
+ TXT = """السلام عليكم ورحمة الله وبركاته يا هلا وسهلا ومراحب بالغالي اخباركم طيبين ان شاء الله ارحبوا على العين والراس"""
+ def process_chunk(chunk_id, spectrogram_chunk, speaker_embeddings, decoder):
+     with torch.no_grad():
+         wav = decoder(torch.tensor(spectrogram_chunk), speaker_embeddings)
+         wav = wav.squeeze().cpu().numpy()
+     file_path = f"audio_chunks/chunk_{chunk_id}.wav"
+     sf.write(file_path, wav, samplerate=16000)
+     return file_path
+
+
+ def modelspeech(text=TXT, name_model="wasmdashai/vits-ar-sa-huba-v2", speaking_rate=0.9):
+     os.makedirs("audio_chunks", exist_ok=True)
+     model, tokenizer = get_model(name_model)
+     model.config.sampling_rate=16000
+     text = ask_ai(text)
+     inputs = tokenizer(text, return_tensors="pt").to(device)
+
+
+     model.speaking_rate = speaking_rate
+     chunk_files = []

+     with ThreadPoolExecutor(max_workers=8) as executor:
+         futures = []
+         chunk_id = 0

+         for spectrogram_chunk in _inference_forward_stream(
+             model,
+             input_ids=inputs.input_ids,
+             attention_mask=inputs.attention_mask,
+             speaker_embeddings=None,
+             is_streaming=True,
+             chunk_size=32
+         ):
+             futures.append(executor.submit(process_chunk, chunk_id, spectrogram_chunk, None, model.decoder))
+             chunk_id += 1
+
+         for future in as_completed(futures):
+             chunk_files.append(future.result())
+
+
+
+     chunk_files.sort(key=lambda x: int(x.split("_")[-1].split(".")[0]))
+
+
+     all_audio = np.concatenate([sf.read(f)[0] for f in chunk_files])

+
+     return (model.config.sampling_rate, remove_noise_nr(all_audio))
+

  model_choices = gr.Dropdown(
+     choices=[
+         "wasmdashai/vits-ar-sa-huba-v1",
+         "wasmdashai/vits-ar-sa-huba-v2",
+         "wasmdashai/vits-ar-sa-A",
+         "wasmdashai/vits-ar-ye-sa",
+         "wasmdashai/vits-ar-sa-M-v2",
+         'wasmdashai/vits-en-v1'
+     ],
+     label="اختر النموذج",
+     value="wasmdashai/vits-ar-sa-huba-v2",
+ )
+
+ demo = gr.Interface(
+     fn=modelspeech,
+     inputs=["text", model_choices, gr.Slider(0.1, 1, step=0.1, value=0.8)],
+     outputs=[gr.Audio(autoplay=True)]
+ )
+
  demo.queue()
+ demo.launch(debug=True)
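The updated app.py replaces the old blocking, single-pass decode with a fan-out/fan-in pipeline: `_inference_forward_stream` yields fixed-size spectrogram chunks, a `ThreadPoolExecutor` decodes each chunk and writes it to `audio_chunks/`, and `modelspeech` re-sorts the chunk files by index and concatenates them before denoising. Below is a minimal, self-contained sketch of that pattern; `decode_chunk` is a hypothetical stand-in for `model.decoder` and a plain NumPy array stands in for the VITS spectrogram tensor, so this only illustrates the concurrency shape, not the Space's actual model calls.

```python
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def decode_chunk(chunk: np.ndarray) -> np.ndarray:
    # Hypothetical stand-in for model.decoder(chunk, speaker_embeddings):
    # here we simply flatten the chunk into a 1-D "waveform".
    return chunk.reshape(-1)

def synthesize(spectrogram: np.ndarray, chunk_size: int = 32, workers: int = 8) -> np.ndarray:
    """Decode fixed-size spectrogram chunks in parallel, then stitch them back in order."""
    chunks = [spectrogram[:, i:i + chunk_size]
              for i in range(0, spectrogram.shape[-1], chunk_size)]
    results = {}
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Remember each future's chunk index so the output can be reordered.
        futures = {executor.submit(decode_chunk, c): idx for idx, c in enumerate(chunks)}
        for future in as_completed(futures):
            results[futures[future]] = future.result()
    # Reassemble in chunk order, not completion order.
    return np.concatenate([results[i] for i in range(len(chunks))])

if __name__ == "__main__":
    spec = np.random.randn(80, 100)  # (mel_bins, frames) dummy spectrogram
    print(synthesize(spec).shape)
```

Note the explicit reorder by chunk index: `as_completed` returns futures in completion order, which is why the committed code sorts `chunk_files` by the numeric suffix of each file name before concatenating the audio.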