DanLeBossDeESGI committed on
Commit
beb5f8e
1 Parent(s): be80b10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -34
app.py CHANGED
@@ -1,5 +1,5 @@
1
- import streamlit as st
2
  import torch
 
3
  from diffusers import AudioLDMPipeline
4
  from transformers import AutoProcessor, ClapModel
5
 
@@ -22,21 +22,6 @@ processor = AutoProcessor.from_pretrained("sanchit-gandhi/clap-htsat-unfused-m-f
22
 
23
  generator = torch.Generator(device)
24
 
25
- # Streamlit app setup
26
- st.set_page_config(
27
- page_title="Text to Music",
28
- page_icon="🎵",
29
- )
30
-
31
- text_input = st.text_input("Input text", "A hammer is hitting a wooden surface")
32
- negative_prompt = st.text_input("Negative prompt", "low quality, average quality")
33
-
34
- st.markdown("### Configuration")
35
- seed = st.number_input("Seed", value=45)
36
- duration = st.slider("Duration (seconds)", 2.5, 10.0, 5.0, 2.5)
37
- guidance_scale = st.slider("Guidance scale", 0.0, 4.0, 2.5, 0.5)
38
- n_candidates = st.slider("Number waveforms to generate", 1, 3, 3, 1)
39
-
40
  def score_waveforms(text, waveforms):
41
  inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
42
  inputs = {key: inputs[key].to(device) for key in inputs}
@@ -47,24 +32,39 @@ def score_waveforms(text, waveforms):
47
  waveform = waveforms[most_probable]
48
  return waveform
49
 
50
- if st.button("Submit"):
51
- if text_input is None:
52
- st.error("Please provide a text input.")
 
 
 
 
 
 
 
 
 
 
53
  else:
54
- waveforms = pipe(
55
- text_input,
56
- audio_length_in_s=duration,
57
- guidance_scale=guidance_scale,
58
- num_inference_steps=100,
59
- negative_prompt=negative_prompt,
60
- num_waveforms_per_prompt=n_candidates if n_candidates else 1,
61
- generator=generator.manual_seed(int(seed)),
62
- )["audios"]
63
 
64
- if waveforms.shape[0] > 1:
65
- waveform = score_waveforms(text_input, waveforms)
66
- else:
67
- waveform = waveforms[0]
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- # Spécifiez le taux d'échantillonnage (sample_rate) et le format audio
70
- st.audio(waveform, format="audio/wav", sample_rate=16000)
 
 
1
  import torch
2
+ import gradio as gr
3
  from diffusers import AudioLDMPipeline
4
  from transformers import AutoProcessor, ClapModel
5
 
 
22
 
23
  generator = torch.Generator(device)
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def score_waveforms(text, waveforms):
26
  inputs = processor(text=text, audios=list(waveforms), return_tensors="pt", padding=True)
27
  inputs = {key: inputs[key].to(device) for key in inputs}
 
32
  waveform = waveforms[most_probable]
33
  return waveform
34
 
35
def text_to_music(text_input, negative_prompt, seed, duration, guidance_scale, n_candidates):
    """Generate audio for a text prompt and return it for a Gradio audio output.

    Calls the module-level ``pipe`` with 100 inference steps, seeding the
    shared ``generator`` with ``seed``. When more than one candidate waveform
    comes back, ``score_waveforms`` selects the one to keep; otherwise the
    single candidate is used as-is.

    Args:
        text_input: Prompt describing the audio to generate.
        negative_prompt: Text forwarded to the pipeline as ``negative_prompt``.
        seed: Seed for the generator; cast to ``int`` before use.
        duration: Forwarded as ``audio_length_in_s``.
        guidance_scale: Forwarded as ``guidance_scale``.
        n_candidates: Number of waveforms to request; a falsy value means 1.

    Returns:
        A ``(sample_rate, waveform)`` tuple, the shape a Gradio ``Audio``
        output with ``type="numpy"`` accepts.
    """
    # Same rate the previous Streamlit version passed to st.audio().
    sample_rate = 16000

    # UI sliders may deliver floats; the pipeline wants an integer count.
    num_waveforms = int(n_candidates) if n_candidates else 1

    waveforms = pipe(
        text_input,
        audio_length_in_s=duration,
        guidance_scale=guidance_scale,
        num_inference_steps=100,
        negative_prompt=negative_prompt,
        num_waveforms_per_prompt=num_waveforms,
        generator=generator.manual_seed(int(seed)),
    )["audios"]

    if waveforms.shape[0] > 1:
        waveform = score_waveforms(text_input, waveforms)
    else:
        waveform = waveforms[0]

    # The pipeline may already hand back NumPy arrays (which have no
    # .detach()); only convert when the result is actually a tensor.
    if hasattr(waveform, "detach"):
        waveform = waveform.detach().cpu().numpy()

    return sample_rate, waveform
 
 
 
 
 
 
52
 
53
# Gradio UI: six input widgets mapped positionally onto text_to_music's
# parameters, and one audio player for the result.
#
# Components are taken from the top-level ``gr`` namespace with ``value=``
# for the initial value; the ``gr.inputs`` / ``gr.outputs`` namespaces and
# their ``default=`` keyword are deprecated.
#
# ``live`` is intentionally left off: each run performs 100 inference steps,
# so generation should start on an explicit submit rather than on every
# slider or keystroke change.
iface = gr.Interface(
    fn=text_to_music,
    inputs=[
        gr.Textbox(label="Input text", value="A hammer is hitting a wooden surface"),
        gr.Textbox(label="Negative prompt", value="low quality, average quality"),
        gr.Number(label="Seed", value=45),
        gr.Slider(label="Duration (seconds)", minimum=2.5, maximum=10.0, value=5.0, step=0.1),
        gr.Slider(label="Guidance scale", minimum=0.0, maximum=4.0, value=2.5, step=0.1),
        gr.Slider(label="Number waveforms to generate", minimum=1, maximum=3, value=3, step=1),
    ],
    outputs=gr.Audio(label="Generated Audio", type="numpy"),
    title="Text to Music",
    description="Convert text into music using a pre-trained model.",
    theme="default",
)

iface.launch()