sweetcocoa committed on
Commit
71a2b8b
1 Parent(s): db4880c

refactor ui

Browse files
Files changed (3) hide show
  1. app.py +54 -91
  2. requirements.txt +2 -2
  3. utils.py +21 -0
app.py CHANGED
@@ -2,16 +2,17 @@ import os
2
  import binascii
3
  import warnings
4
 
5
- import torch
6
  import librosa
7
  import numpy as np
8
- import pytube as pt # to download the youtube videos as audios
9
- import gradio as gr
10
- import soundfile as sf # to make the stereo mix
11
 
12
  from pytube.exceptions import VideoUnavailable
13
  from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
14
 
 
15
 
16
  yt_video_dir = "./yt_dir"
17
  outputs_dir = "./midi_wav_outputs"
@@ -24,7 +25,7 @@ processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano")
24
  composers = model.generation_config.composer_to_feature_token.keys()
25
 
26
 
27
- def get_audio_from_yt_video(yt_link):
28
  try:
29
  yt = pt.YouTube(yt_link)
30
  t = yt.streams.filter(only_audio=True)
@@ -40,55 +41,43 @@ def get_audio_from_yt_video(yt_link):
40
  def inference(file_uploaded, composer):
41
  # to save the native sampling rate of the file, sr=None is used, but this can cause some silent errors where the
42
  # generated output will not be upto the desired quality. If that happens please consider switching sr to 44100 Hz.
43
- waveform, sr = librosa.load(file_uploaded, sr=None)
44
 
45
- inputs = processor(audio=waveform, sampling_rate=sr, return_tensors="pt").to(device)
46
  model_output = model.generate(input_features=inputs["input_features"], composer=composer)
47
  tokenizer_output = processor.batch_decode(
48
  token_ids=model_output.to("cpu"), feature_extractor_output=inputs.to("cpu")
49
  )["pretty_midi_objects"]
50
 
51
- return prepare_output_file(tokenizer_output, sr)
52
 
53
 
54
- def prepare_output_file(tokenizer_output, sr:int):
55
  # Add some random values so that no two file names are same
56
- output_file_name = "output_" + binascii.hexlify(os.urandom(8)).decode()
57
  midi_output = os.path.join(outputs_dir, output_file_name + ".mid")
58
 
59
  # write the .mid and its wav files
60
  tokenizer_output[0].write(midi_output)
61
- midi_wav:np.ndarray = tokenizer_output[0].fluidsynth(sr)
62
- wav_output:str = midi_output.replace(".mid", ".wav")
63
- sf.write(wav_output, midi_wav, samplerate=sr)
64
-
65
- return wav_output, wav_output, midi_output
66
-
67
-
68
- def get_stereo(pop_path, midi, pop_scale=0.5):
69
- pop_y, sr = librosa.load(pop_path, sr=None)
70
- midi_y, _ = librosa.load(midi.name, sr=None)
71
 
 
72
  if len(pop_y) > len(midi_y):
73
  midi_y = np.pad(midi_y, (0, len(pop_y) - len(midi_y)))
74
  elif len(pop_y) < len(midi_y):
75
  pop_y = np.pad(pop_y, (0, -len(pop_y) + len(midi_y)))
76
- stereo = np.stack((midi_y, pop_y * pop_scale))
77
-
78
- stereo_mix_path = pop_path.replace("output", "output_stereo_mix")
79
- sf.write(
80
- file=stereo_mix_path,
81
- data=stereo.T,
82
- samplerate=sr,
83
- format="wav",
84
- )
85
 
86
- return stereo_mix_path, stereo_mix_path
87
 
88
 
89
- # Thanks a lot to "https://huggingface.co/Taithrah" for this theme.
90
- # taken from https://huggingface.co/spaces/NoCrypt/miku
91
- block = gr.Blocks(theme="Taithrah/Minimal")
92
 
93
  with block:
94
  gr.HTML(
@@ -114,67 +103,48 @@ with block:
114
  """
115
  )
116
  with gr.Group():
117
- with gr.Row(equal_height=True):
118
- with gr.Column():
119
- file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
120
- with gr.Column():
121
- with gr.Row():
122
- yt_link = gr.Textbox(
123
- label="Enter YouTube Link of the Video", autofocus=True, lines=3
 
 
 
 
 
124
  )
125
- yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
126
-
127
- yt_audio_path = gr.Audio(
128
- label="Audio Extracted from the YouTube Video", interactive=False
129
- )
130
- yt_btn.click(
131
- get_audio_from_yt_video,
132
- inputs=[yt_link],
133
- outputs=[yt_audio_path, file_uploaded],
134
- )
135
 
136
  with gr.Group():
137
- with gr.Column():
138
- composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
139
- generate_btn = gr.Button("Generate")
140
-
 
141
  with gr.Row().style(mobile_collapse=False, equal_height=True):
142
- wav_output2 = gr.File(label="Download the Generated MIDI (.wav)")
143
  wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
 
 
 
 
144
  midi_output = gr.File(label="Download the Generated MIDI (.mid)")
145
  generate_btn.click(
146
  inference,
147
  inputs=[file_uploaded, composer],
148
- outputs=[wav_output1, wav_output2, midi_output],
149
  )
150
 
151
- with gr.Group():
152
- gr.HTML(
153
- """
154
- <div> <h3> <center> Get the Stereo Mix from the Pop Music and Generated MIDI </h3> </div>
155
- """
156
- )
157
- pop_scale = (
158
- gr.Slider(
159
- 0,
160
- 1,
161
- value=0.5,
162
- label="Choose the ratio between Pop and MIDI",
163
- info="1.0 = Only Pop, 0.0=Only MIDI",
164
- interactive=True,
165
- ),
166
- )
167
- stereo_btn = gr.Button("Get Stereo Mix")
168
- with gr.Row():
169
- stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
170
- stereo_mix2 = gr.File(label="Download the Stereo Mix")
171
-
172
- stereo_btn.click(
173
- get_stereo,
174
- inputs=[file_uploaded, wav_output2, pop_scale[0]],
175
- outputs=[stereo_mix1, stereo_mix2],
176
- )
177
-
178
  with gr.Group():
179
  gr.Examples(
180
  [
@@ -182,16 +152,9 @@ with block:
182
  ],
183
  fn=inference,
184
  inputs=[file_uploaded, composer],
185
- outputs=[wav_output1, wav_output2, midi_output],
186
  cache_examples=True,
187
  )
188
- gr.HTML(
189
- """
190
- <div class="footer">
191
- <center>The design for this Space is taken from <a href="https://huggingface.co/spaces/NoCrypt/miku"> NoCrypt/miku </a>
192
- </div>
193
- """
194
- )
195
 
196
  gr.HTML(
197
  """
 
2
  import binascii
3
  import warnings
4
 
5
+ import gradio as gr
6
  import librosa
7
  import numpy as np
8
+ import torch
9
+ import pretty_midi
10
+ import pytube as pt
11
 
12
  from pytube.exceptions import VideoUnavailable
13
  from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor
14
 
15
+ from utils import mp3_write, normalize
16
 
17
  yt_video_dir = "./yt_dir"
18
  outputs_dir = "./midi_wav_outputs"
 
25
  composers = model.generation_config.composer_to_feature_token.keys()
26
 
27
 
28
+ def get_audio_from_yt_video(yt_link: str):
29
  try:
30
  yt = pt.YouTube(yt_link)
31
  t = yt.streams.filter(only_audio=True)
 
41
def inference(file_uploaded, composer):
    """Generate a piano arrangement for an audio file and return the output paths.

    Args:
        file_uploaded: Path of the audio file to arrange (passed to ``librosa.load``).
        composer: Arranger name forwarded to ``model.generate``.

    Returns:
        Whatever ``prepare_output_file`` returns for the decoded MIDI objects,
        the sampling rate and the loaded waveform.
    """
    # sr=None keeps the file's native sampling rate. This can silently degrade
    # the generated output; if that happens, consider loading at 44100 Hz instead.
    waveform, sample_rate = librosa.load(file_uploaded, sr=None)

    features = processor(audio=waveform, sampling_rate=sample_rate, return_tensors="pt")
    features = features.to(device)
    generated_tokens = model.generate(
        input_features=features["input_features"], composer=composer
    )

    # Decoding is done on CPU copies of both the tokens and the extracted features.
    decoded = processor.batch_decode(
        token_ids=generated_tokens.to("cpu"),
        feature_extractor_output=features.to("cpu"),
    )
    midi_objects = decoded["pretty_midi_objects"]

    return prepare_output_file(midi_objects, sample_rate, waveform)
53
 
54
 
55
def prepare_output_file(
    tokenizer_output: "list[pretty_midi.PrettyMIDI]",
    sr: int,
    pop_y: np.ndarray,
    pop_scale: float = 0.5,
):
    """Write the generated MIDI, its synthesized audio, and a stereo mix to disk.

    Args:
        tokenizer_output: Sequence of decoded MIDI objects; only the first
            element is used (it must provide ``write`` and ``fluidsynth``).
        sr: Sampling rate used to synthesize the MIDI and to encode the MP3s.
        pop_y: 1-D waveform of the original audio, mixed into the second
            channel of the stereo file.
        pop_scale: Gain applied to ``pop_y`` before mixing. Defaults to 0.5.

    Returns:
        ``(midi_mp3_path, midi_mp3_path, midi_path, stereo_path, stereo_path)``.
        The MP3 paths are repeated because each one feeds both an audio player
        and a download widget in the UI.
    """
    # Random suffix so that no two output files share a name.
    output_file_name = "p2p_" + binascii.hexlify(os.urandom(8)).decode()
    midi_output = os.path.join(outputs_dir, output_file_name + ".mid")

    # Derive sibling paths from the stem rather than str.replace(".mid", ...),
    # which would also rewrite a ".mid" occurring earlier in the path.
    stem, _ = os.path.splitext(midi_output)
    midi_y_path: str = stem + ".mp3"
    stereo_path: str = stem + ".mix.mp3"

    # Write the .mid file and its synthesized audio.
    midi = tokenizer_output[0]
    midi.write(midi_output)
    midi_y: np.ndarray = midi.fluidsynth(sr)
    mp3_write(midi_y_path, sr, normalize(midi_y), normalized=True)

    # Zero-pad the shorter signal so both channels have the same length.
    target_len = max(len(pop_y), len(midi_y))
    midi_y = np.pad(midi_y, (0, target_len - len(midi_y)))
    pop_y = np.pad(pop_y, (0, target_len - len(pop_y)))

    # Channel 0: generated piano, channel 1: attenuated original audio.
    stereo = np.stack((midi_y, pop_y * pop_scale))

    # mp3_write expects (samples, channels), hence the transpose.
    mp3_write(stereo_path, sr, normalize(stereo.T), normalized=True)

    return midi_y_path, midi_y_path, midi_output, stereo_path, stereo_path
78
 
79
 
80
+ block = gr.Blocks()
 
 
81
 
82
  with block:
83
  gr.HTML(
 
103
  """
104
  )
105
  with gr.Group():
106
+ with gr.Column():
107
+ with gr.Blocks() as audio_select:
108
+ with gr.Tab("Upload Audio"):
109
+ file_uploaded = gr.Audio(label="Upload an audio", type="filepath")
110
+ with gr.Tab("YouTube url"):
111
+ with gr.Row():
112
+ yt_link = gr.Textbox(
113
+ label="Enter YouTube Link of the Video", autofocus=True, lines=3
114
+ )
115
+ yt_btn = gr.Button("Download Audio from YouTube Link", size="lg")
116
+ yt_audio_path = gr.Audio(
117
+ label="Audio Extracted from the YouTube Video", interactive=False
118
  )
119
+ yt_btn.click(
120
+ get_audio_from_yt_video,
121
+ inputs=[yt_link],
122
+ outputs=[yt_audio_path, file_uploaded],
123
+ )
124
+ with gr.Column():
125
+ composer = gr.Dropdown(label="Arranger", choices=composers, value="composer1")
126
+ generate_btn = gr.Button("Generate")
 
 
127
 
128
  with gr.Group():
129
+ gr.HTML(
130
+ """
131
+ <div> <h3> <center> Listen to the generated MIDI. </h3> </div>
132
+ """
133
+ )
134
  with gr.Row().style(mobile_collapse=False, equal_height=True):
135
+ stereo_mix1 = gr.Audio(label="Listen to the Stereo Mix")
136
  wav_output1 = gr.Audio(label="Listen to the Generated MIDI")
137
+
138
+ with gr.Row():
139
+ stereo_mix2 = gr.File(label="Download the Stereo Mix (.mp3")
140
+ wav_output2 = gr.File(label="Download the Generated MIDI (.mp3)")
141
  midi_output = gr.File(label="Download the Generated MIDI (.mid)")
142
  generate_btn.click(
143
  inference,
144
  inputs=[file_uploaded, composer],
145
+ outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
146
  )
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  with gr.Group():
149
  gr.Examples(
150
  [
 
152
  ],
153
  fn=inference,
154
  inputs=[file_uploaded, composer],
155
+ outputs=[wav_output1, wav_output2, midi_output, stereo_mix1, stereo_mix2],
156
  cache_examples=True,
157
  )
 
 
 
 
 
 
 
158
 
159
  gr.HTML(
160
  """
requirements.txt CHANGED
@@ -3,8 +3,8 @@ librosa
3
  pretty-midi==0.2.9
4
  essentia==2.1b6.dev1034
5
  pyFluidSynth==1.3.0
6
- git+https://github.com/huggingface/transformers
7
  pytube
8
  gradio
9
  resampy
10
- soundfile
 
3
  pretty-midi==0.2.9
4
  essentia==2.1b6.dev1034
5
  pyFluidSynth==1.3.0
6
+ transformers
7
  pytube
8
  gradio
9
  resampy
10
+ pydub
utils.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pydub
3
+
4
+
5
def mp3_write(f: str, sr: int, x: np.ndarray, normalized: bool = False):
    """Encode a NumPy waveform as a 256 kbit/s MP3 file.

    Args:
        f: Destination path passed to ``AudioSegment.export``.
        sr: Sampling rate (frame rate) of ``x``.
        x: Audio samples. A 2-D array with two columns is written as stereo;
            anything else is written as mono.
        normalized: If True, ``x`` holds floats in [-1, 1) and is scaled to
            the 16-bit range; otherwise ``x`` is taken as 16-bit sample values.
    """
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    samples = x * 2**15 if normalized else x

    # Clip before casting: a sample of exactly 1.0 scales to 32768, which does
    # not fit in int16 and would otherwise wrap around to a loud negative value.
    limits = np.iinfo(np.int16)
    y = np.clip(samples, limits.min, limits.max).astype(np.int16)

    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    # export() returns the opened output file; close it so the handle is not leaked.
    out_file = song.export(f, format="mp3", bitrate="256k")
    out_file.close()
13
+
14
+
15
def normalize(audio: np.ndarray, min_y: float = -1.0, max_y: float = 1.0, eps: float = 1e-8):
    """Min-max rescale ``audio`` into the open interval (min_y, max_y).

    Args:
        audio: Array of samples (any shape).
        min_y: Lower bound of the target range.
        max_y: Upper bound of the target range.
        eps: Margin by which the target range is shrunk on both sides, so the
            result never reaches ``min_y`` or ``max_y`` exactly.

    Returns:
        A new array with the same shape as ``audio``. The smallest input maps
        to ``min_y + eps`` and the largest to ``max_y - eps``. A constant
        signal has no range to stretch and maps to the midpoint of the target
        range. An empty array is returned as an empty float array.
    """
    max_y -= eps
    min_y += eps

    # max()/min() raise on empty arrays; there is nothing to rescale anyway.
    if audio.size == 0:
        return audio.astype(float)

    amax = audio.max()
    amin = audio.min()
    span = amax - amin

    # Constant (e.g. silent) input: avoid 0/0 producing NaNs.
    if span == 0:
        return np.full(audio.shape, (max_y + min_y) / 2.0)

    return (max_y - min_y) * (audio - amin) / span + min_y