saheedniyi committed on
Commit
abbff0d
·
verified ·
1 Parent(s): 120f59d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +119 -0
README.md CHANGED
@@ -118,6 +118,125 @@ IPython.display.Audio(audio,rate=24000)
118
  torchaudio.save(f"audio.wav", audio, sample_rate=24000)
119
  ```
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  ## Model Description
122
 
123
  - **Developed by:** [Saheedniyi](https://linkedin.com/in/azeez-saheed)
 
118
  torchaudio.save(f"audio.wav", audio, sample_rate=24000)
119
  ```
120
 
121
+ ### Simple News-Reader for Local languages
122
+ ```python
123
+ # clone the YarnGPT repo to get access to the `audiotokenizer`
124
+ !git clone https://github.com/saheedniyi02/yarngpt.git
125
+
126
+
127
+ # install some necessary libraries
128
+ !pip install outetts uroman trafilatura pydub
129
+
130
+
131
+ # import the required packages
132
+ import os
133
+ import re
134
+ import json
135
+ import torch
136
+ import inflect
137
+ import random
138
+ import requests
139
+ import trafilatura
140
+ import inflect
141
+ import uroman as ur
142
+ import numpy as np
143
+ import torchaudio
144
+ import IPython
145
+ from pydub import AudioSegment
146
+ from pydub.effects import normalize
147
+ from transformers import AutoModelForCausalLM, AutoTokenizer
148
+ from outetts.wav_tokenizer.decoder import WavTokenizer
149
+ from yarngpt.audiotokenizer import AudioTokenizer,AudioTokenizerForLocal
150
+
151
+ # download the `WavTokenizer` files
152
+ !wget https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml
153
+ !wget https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_24k.ckpt
154
+
155
# Hugging Face repo id; it is used both to build the audio tokenizer and to
# load the causal language model below.
tokenizer_path = "saheedniyi/YarnGPT-local"

# The `wget` commands above save the WavTokenizer files into the current
# working directory. Resolve the paths from there instead of hard-coding
# Colab's `/content/` so the snippet also runs outside Colab (on Colab the
# working directory is `/content`, so the resulting paths are unchanged).
wav_tokenizer_config_path = os.path.abspath(
    "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml"
)
wav_tokenizer_model_path = os.path.abspath(
    "wavtokenizer_large_speech_320_24k.ckpt"
)

# Note the argument order: the model checkpoint comes before the config file.
audio_tokenizer = AudioTokenizerForLocal(
    tokenizer_path, wav_tokenizer_model_path, wav_tokenizer_config_path
)

# Load the model weights and move them to the device the tokenizer selected.
model = AutoModelForCausalLM.from_pretrained(
    tokenizer_path, torch_dtype="auto"
).to(audio_tokenizer.device)
165
+
166
+ # Split text into chunks
167
def split_text_into_chunks(text, word_limit=25):
    """Split ``text`` into short chunks, each preceded by a pause marker.

    The text is cut into sentences at every ``"."``. Before each non-empty
    sentence a ``"."`` entry is appended to the result; the caller treats that
    entry as a request for a short silence. A sentence of at most
    ``word_limit`` words is emitted unchanged; a longer sentence is broken
    into consecutive pieces of at most ``word_limit`` words each.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        word_limit: Maximum number of words per emitted chunk (default 25).

    Returns:
        A list of strings alternating ``"."`` markers and text chunks.

    Raises:
        ValueError: If ``word_limit`` is smaller than 1 (the original loop
            never advanced in that case and would spin forever).
    """
    if word_limit < 1:
        raise ValueError("word_limit must be a positive integer")
    if not text:
        return []

    chunks = []
    for raw_sentence in text.split("."):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        chunks.append(".")
        # Split on any whitespace so newlines and repeated spaces neither hide
        # word boundaries nor count as empty "words".
        words = sentence.split()
        if len(words) <= word_limit:
            chunks.append(sentence)
            continue
        for start in range(0, len(words), word_limit):
            chunks.append(" ".join(words[start:start + word_limit]))
    return chunks
183
+
184
+ # Slow the audio down: output for the local languages tends to be too fast.
185
def speed_change(sound, speed=0.9):
    """Return a copy of ``sound`` played back at ``speed`` times its rate.

    Args:
        sound: The audio to re-time. The function reads ``frame_rate`` and
            ``raw_data`` from it and calls its ``_spawn`` and
            ``set_frame_rate`` methods (the snippet passes a pydub
            ``AudioSegment``).
        speed: Playback-rate multiplier; values below 1 slow the audio down,
            values above 1 speed it up. Must be greater than 0.

    Returns:
        Whatever ``set_frame_rate`` returns for the re-timed audio, i.e. the
        altered sound converted back to the original frame rate.

    Raises:
        ValueError: If ``speed`` is not greater than 0.
    """
    if speed <= 0:
        raise ValueError("speed must be greater than 0")

    # Re-label the same raw samples with a scaled frame rate. This tells the
    # player how many samples to play per second, which changes the speed.
    altered_rate = int(sound.frame_rate * speed)
    retimed = sound._spawn(sound.raw_data, overrides={"frame_rate": altered_rate})

    # Convert back to the original (standard) frame rate so regular playback
    # programs, which often only handle standard rates, play it correctly.
    return retimed.set_frame_rate(sound.frame_rate)
195
+
196
+
197
# Download a Yoruba news article and extract its main text.
page = requests.get(
    "https://alaroye.org/a-maa-too-fo-ipinle-ogun-mo-omo-egbe-okunkun-meje-lowo-ti-te-bayii-omolola/",
    timeout=30,  # do not hang forever on an unresponsive server
)
page.raise_for_status()  # fail early instead of trying to read an error page
content = trafilatura.extract(page.text)
if not content:
    # trafilatura returns None when it cannot find any main text.
    raise ValueError("trafilatura could not extract any text from the page")
chunks = split_text_into_chunks(content)


# Generate audio codes chunk by chunk and concatenate them.
all_codes = []
for i, chunk in enumerate(chunks):
    print(i)
    print("\n")
    print(chunk)
    if chunk == ".":
        # add silence for 0.5 seconds if we encounter a full stop
        # (presumably 453 is the tokenizer's silence code - confirm)
        all_codes.extend([453] * 38)
    else:
        # NOTE(review): the language is "yoruba" but the speaker is
        # "igbo_female1" - confirm this pairing is intentional.
        prompt = audio_tokenizer.create_prompt(
            chunk, lang="yoruba", speaker_name="igbo_female1"
        )
        input_ids = audio_tokenizer.tokenize_prompt(prompt)
        output = model.generate(
            input_ids=input_ids,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4000,
            num_beams=5,
        )
        codes = audio_tokenizer.get_codes(output)
        all_codes.extend(codes)


# Decode the collected codes into a waveform.
audio = audio_tokenizer.get_audio(all_codes)

# display the output; an explicit display() call is needed because a bare
# expression is only rendered when it is the last statement of a cell
IPython.display.display(IPython.display.Audio(audio, rate=24000))

# save audio
torchaudio.save("news1.wav", audio, sample_rate=24000)

# convert file to an `AudioSegment` object for further processing
audio_dub = AudioSegment.from_file("news1.wav")

# reduce audio speed and keep the result (speed_change returns a new object
# rather than modifying `audio_dub` in place)
slowed_audio = speed_change(audio_dub, 0.9)
slowed_audio.export("news1_slow.wav", format="wav")
237
+ ```
238
+
239
+
240
  ## Model Description
241
 
242
  - **Developed by:** [Saheedniyi](https://linkedin.com/in/azeez-saheed)