import gradio as gr
import torch
from transformers import BarkModel
from optimum.bettertransformer import BetterTransformer


# Load Bark in half precision to reduce memory footprint.
model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)


from transformers import AutoProcessor
# The processor tokenizes text and attaches the voice-preset conditioning.
processor = AutoProcessor.from_pretrained("suno/bark")


# Swap attention for BetterTransformer's fused kernels; the un-transformed
# model is discarded (keep_original_model=False).
model = BetterTransformer.transform(model, keep_original_model=False)


# Offload idle Bark sub-models between generation stages to save GPU memory.
# NOTE(review): this presumably requires a CUDA device — confirm it is a
# no-op / safe on CPU-only hosts before shipping.
model.enable_cpu_offload()


import numpy as np
from scipy.io.wavfile import write as write_wav
import wave
|
def split_text_into_sentences(text):
    """Split *text* into sentences on terminal punctuation.

    Generalized from the original, which only recognized ``.`` as a
    terminator: ``!`` and ``?`` now also end a sentence. Behavior on
    period-only input is unchanged. Whitespace is collapsed to single
    spaces (via ``str.split``), and any trailing fragment without a
    terminator is returned as a final sentence.

    Parameters
    ----------
    text : str
        Free-form text; may be empty.

    Returns
    -------
    list[str]
        Sentences in order of appearance; empty list for empty input.
    """
    sentences = []
    current = []
    for word in text.split():
        current.append(word)
        # A word ending in terminal punctuation closes the sentence.
        if word.endswith(('.', '!', '?')):
            sentences.append(' '.join(current))
            current = []
    # Keep any trailing text that never hit a terminator.
    if current:
        sentences.append(' '.join(current))
    return sentences
|
|
def join_wav_files(input_files, output_file):
    """Concatenate several wav files into ``output_file``.

    The wav parameters (channels, sample width, frame rate) are copied
    from the first input; every input is assumed to share them. Raises
    ``IndexError`` if ``input_files`` is empty.
    """
    # Borrow the audio parameters from the first file.
    with wave.open(input_files[0], 'rb') as head:
        wav_params = head.getparams()

    with wave.open(output_file, 'wb') as sink:
        sink.setparams(wav_params)
        # Stream each input's raw frames into the output, in order.
        for path in input_files:
            with wave.open(path, 'rb') as src:
                sink.writeframes(src.readframes(src.getnframes()))
|
|
|
|
def infer(text_prompt):
    """Full pipeline for the UI: split the prompt into sentence chunks,
    synthesize each chunk as a wav, and stitch them into one file.

    Returns the path of the combined wav ('full_story.wav').
    """
    print("""
—
Cutting text in chunks
—
""")

    chunks = split_text_into_sentences(text_prompt)

    # One wav file per chunk, written to disk by generate().
    chunk_paths = generate(chunks, "wav")
    print(chunk_paths)

    stitched = 'full_story.wav'
    join_wav_files(chunk_paths, stitched)
    return stitched
|
|
|
|
def generate(text_prompt, out_type):
    """Synthesize speech for ``text_prompt`` with Bark.

    Parameters
    ----------
    text_prompt : str | list[str]
        Text (or list of text chunks) to synthesize.
    out_type : str
        "numpy" -> return a list of (sampling_rate, audio_array) tuples
        (the pair format Gradio's Audio component accepts);
        "wav"   -> write ``output_<i>.wav`` files and return their paths.

    Returns
    -------
    list
        One entry per generated waveform, format per ``out_type``.
    """
    # Tokenize with a fixed speaker preset and move tensors to the model device.
    inputs = processor(text_prompt, voice_preset="v2/en_speaker_6").to(device)

    with torch.inference_mode():
        speech_output = model.generate(**inputs)

    input_waves = []
    for i, speech_out in enumerate(speech_output):
        audio_array = speech_out.cpu().numpy().squeeze()
        print(f'AUDIO_ARRAY: {audio_array}')

        sampling_rate = model.generation_config.sample_rate
        print(f'sampling_rate: {sampling_rate}')

        if out_type == "numpy":
            # BUG FIX: list.append takes exactly one argument; the original
            # called append(sampling_rate, audio_array) which raises
            # TypeError. Append the (rate, array) pair as a tuple instead.
            input_waves.append((sampling_rate, audio_array))
        elif out_type == "wav":
            # Scale the float waveform (assumed in [-1, 1] — TODO confirm
            # Bark's output range) to 16-bit PCM before writing.
            audio_data = np.int16(audio_array * 32767)
            write_wav(f"output_{i}.wav", sampling_rate, audio_data)
            input_waves.append(f"output_{i}.wav")
    return input_waves
|
|
|
|
# Minimal Gradio UI: a text prompt, a submit button, and an audio player
# wired to infer(), which returns the path of the stitched wav.
with gr.Blocks() as demo:
    with gr.Column():
        prompt = gr.Textbox(label="prompt")
        submit_btn = gr.Button("Submit")
        audio_out = gr.Audio()
        submit_btn.click(fn=infer, inputs=[prompt], outputs=[audio_out])


demo.launch()
| |