Update README.md
README.md (CHANGED)
@@ -118,6 +118,125 @@ IPython.display.Audio(audio,rate=24000)

torchaudio.save("audio.wav", audio, sample_rate=24000)
```

### Simple News-Reader for Local languages

The example below scrapes a news article in a local language, splits it into short chunks, and reads it aloud with the `YarnGPT-local` model.

```python
# clone the YarnGPT repo to get access to the `audiotokenizer`
!git clone https://github.com/saheedniyi02/yarngpt.git

# install the required libraries
!pip install outetts uroman trafilatura pydub

# import the required packages
import os
import re
import json
import torch
import inflect
import random
import requests
import trafilatura
import uroman as ur
import numpy as np
import torchaudio
import IPython
from pydub import AudioSegment
from pydub.effects import normalize
from transformers import AutoModelForCausalLM, AutoTokenizer
from outetts.wav_tokenizer.decoder import WavTokenizer
from yarngpt.audiotokenizer import AudioTokenizer, AudioTokenizerForLocal

# download the `WavTokenizer` config and checkpoint
!wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
!wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt

# paths to the model and the files downloaded above (a Colab-style /content working directory is assumed)
tokenizer_path = "saheedniyi/YarnGPT-local"
wav_tokenizer_config_path = "/content/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
wav_tokenizer_model_path = "/content/wavtokenizer_large_speech_320_24k.ckpt"

audio_tokenizer = AudioTokenizerForLocal(
    tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
)

model = AutoModelForCausalLM.from_pretrained(tokenizer_path, torch_dtype="auto").to(audio_tokenizer.device)

# split text into sentences, then into chunks of at most `word_limit` words
def split_text_into_chunks(text, word_limit=25):
    sentences = [sentence.strip() for sentence in text.split(".") if sentence.strip()]
    chunks = []
    for sentence in sentences:
        chunks.append(".")
        sentence_splitted = sentence.split(" ")
        num_words = len(sentence_splitted)
        start_index = 0
        if num_words > word_limit:
            while start_index < num_words:
                end_index = min(num_words, start_index + word_limit)
                chunks.append(" ".join(sentence_splitted[start_index:end_index]))
                start_index = end_index
        else:
            chunks.append(sentence)
    return chunks

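# example: split_text_into_chunks("The rain fell. Roads were flooded.")
# returns ['.', 'The rain fell', '.', 'Roads were flooded'];
# the '.' entries become short pauses in the reading loop further down.
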
# reduce the speed of the audio; output for the local languages tends to be fast
def speed_change(sound, speed=0.9):
    # Manually override the frame_rate. This tells the computer how many
    # samples to play per second.
    sound_with_altered_frame_rate = sound._spawn(sound.raw_data, overrides={
        "frame_rate": int(sound.frame_rate * speed)
    })
    # Convert the sound with the altered frame rate back to a standard frame
    # rate so that regular playback programs work correctly; they often only
    # know how to play audio at standard frame rates (like 44.1 kHz).
    return sound_with_altered_frame_rate.set_frame_rate(sound.frame_rate)

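# with the numbers used here: a 24 kHz clip at speed=0.9 is first tagged as
# 21,600 Hz (24,000 * 0.9) and then resampled back to 24 kHz, so it plays
# about 11% slower (and slightly lower in pitch).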

# fetch a news article, extract its text and split it into chunks
page = requests.get("https://alaroye.org/a-maa-too-fo-ipinle-ogun-mo-omo-egbe-okunkun-meje-lowo-ti-te-bayii-omolola/")
content = trafilatura.extract(page.text)
chunks = split_text_into_chunks(content)

# generate the audio codes chunk by chunk
all_codes = []
for i, chunk in enumerate(chunks):
    print(i)
    print("\n")
    print(chunk)
    if chunk == ".":
        # add 0.5 seconds of silence whenever we encounter a full stop
        all_codes.extend([453] * 38)
    else:
        prompt = audio_tokenizer.create_prompt(chunk, lang="yoruba", speaker_name="igbo_female1")
        input_ids = audio_tokenizer.tokenize_prompt(prompt)
        output = model.generate(
            input_ids=input_ids,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4000,
            num_beams=5,
        )
        codes = audio_tokenizer.get_codes(output)
        all_codes.extend(codes)

audio = audio_tokenizer.get_audio(all_codes)

# display the output
IPython.display.Audio(audio, rate=24000)

# save the audio
torchaudio.save("news1.wav", audio, sample_rate=24000)

# convert the file to an `AudioSegment` object for further processing
audio_dub = AudioSegment.from_file("news1.wav")

# reduce the audio speed
slowed_audio = speed_change(audio_dub, 0.9)
```
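
To keep the slowed-down read, the `AudioSegment` returned by `speed_change` (assigned to `slowed_audio` above) can be written back to disk with pydub and previewed like the other outputs. A minimal sketch, with `news1_slow.wav` as an example filename:

```python
# save the slowed-down audio and preview it in the notebook
slowed_audio.export("news1_slow.wav", format="wav")
IPython.display.Audio("news1_slow.wav")
```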

## Model Description

- **Developed by:** [Saheedniyi](https://linkedin.com/in/azeez-saheed)