jhtonyKoo commited on
Commit
07846b5
1 Parent(s): 077a11b

Update inference/mastering_transfer.py

Files changed (1)
  1. inference/mastering_transfer.py +51 -62
inference/mastering_transfer.py CHANGED
@@ -1,7 +1,6 @@
 """
 Inference code of music style transfer
 of the work "Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"
-
 Process: converts the mastering style of the input music recording to that of the reference music.
 Files inside the target directory should be organized as follows:
 "path_to_data_directory"/"song_name_#1"/input.wav
@@ -112,73 +111,64 @@ class Mastering_Style_Transfer_Inference:
         # normalized input
         output_name_tag = 'output' if self.args.normalize_input else 'output_notnormed'
 
-        input_aud = load_wav_segment(input_track_path)
-        reference_aud = load_wav_segment(reference_track_path)
+        input_aud = load_wav_segment(input_track_path, axis=0)
+        reference_aud = load_wav_segment(reference_track_path, axis=0)
 
         # input_stems, reference_stems, dir_name
         print(f"---inference file name : {dir_name[0]}---")
         cur_out_dir = dir_name[0].replace(self.target_dir, self.output_dir)
         os.makedirs(cur_out_dir, exist_ok=True)
-        ''' stem-level inference '''
-        inst_outputs = []
-        for cur_inst_idx, cur_inst_name in enumerate(self.args.instruments):
-            print(f'\t{cur_inst_name}...')
-            ''' segmentize whole songs into batch '''
-            if len(input_stems[0][cur_inst_idx][0]) > self.args.segment_length:
-                cur_inst_input_stem = self.batchwise_segmentization(input_stems[0][cur_inst_idx], \
-                                                                    dir_name[0], \
-                                                                    segment_length=self.args.segment_length, \
-                                                                    discard_last=False)
-            else:
-                cur_inst_input_stem = [input_stems[:, cur_inst_idx]]
-            if len(reference_stems[0][cur_inst_idx][0]) > self.args.segment_length*2:
-                cur_inst_reference_stem = self.batchwise_segmentization(reference_stems[0][cur_inst_idx], \
-                                                                        dir_name[0], \
-                                                                        segment_length=self.args.segment_length_ref, \
-                                                                        discard_last=False)
-            else:
-                cur_inst_reference_stem = [reference_stems[:, cur_inst_idx]]
-
-            ''' inference '''
-            # first extract the reference style embedding
-            infered_ref_data_list = []
-            for cur_ref_data in cur_inst_reference_stem:
-                cur_ref_data = cur_ref_data.to(self.device)
-                # Effects Encoder inference
-                with torch.no_grad():
-                    self.models["effects_encoder"].eval()
-                    reference_feature = self.models["effects_encoder"](cur_ref_data)
-                infered_ref_data_list.append(reference_feature)
-            # compute the average of the extracted embeddings
-            infered_ref_data = torch.stack(infered_ref_data_list)
-            infered_ref_data_avg = torch.mean(infered_ref_data.reshape(infered_ref_data.shape[0]*infered_ref_data.shape[1], infered_ref_data.shape[2]), axis=0)
-
-            # mixing style converter
-            infered_data_list = []
-            for cur_data in cur_inst_input_stem:
-                cur_data = cur_data.to(self.device)
-                with torch.no_grad():
-                    self.models["mastering_converter"].eval()
-                    infered_data = self.models["mastering_converter"](cur_data, infered_ref_data_avg.unsqueeze(0))
-                infered_data_list.append(infered_data.cpu().detach())
-
-            # combine back to whole song
-            for cur_idx, cur_batch_infered_data in enumerate(infered_data_list):
-                cur_infered_data_sequential = torch.cat(torch.unbind(cur_batch_infered_data, dim=0), dim=-1)
-                fin_data_out = cur_infered_data_sequential if cur_idx==0 else torch.cat((fin_data_out, cur_infered_data_sequential), dim=-1)
-            # final output of current instrument
-            fin_data_out_inst = fin_data_out[:, :input_stems[0][cur_inst_idx].shape[-1]].numpy()
-
-            inst_outputs.append(fin_data_out_inst)
-            # save output of each instrument
-            if self.args.save_each_inst:
-                sf.write(os.path.join(cur_out_dir, f"{cur_inst_name}_{output_name_tag}.wav"), fin_data_out_inst.transpose(-1, -2), self.args.sample_rate, 'PCM_16')
+        ''' segmentize whole songs into batch '''
+        if input_aud.shape[1] > self.args.segment_length:
+            cur_inst_input_stem = self.batchwise_segmentization(input_aud, \
+                                                                dir_name[0], \
+                                                                segment_length=self.args.segment_length, \
+                                                                discard_last=False)
+        else:
+            cur_inst_input_stem = [input_aud.unsqueeze(0)]
+        if reference_aud.shape[1] > self.args.segment_length*2:
+            cur_inst_reference_stem = self.batchwise_segmentization(reference_aud, \
+                                                                    dir_name[0], \
+                                                                    segment_length=self.args.segment_length_ref, \
+                                                                    discard_last=False)
+        else:
+            cur_inst_reference_stem = [reference_aud.unsqueeze(0)]
+
+        ''' inference '''
+        # first extract the reference style embedding
+        infered_ref_data_list = []
+        for cur_ref_data in cur_inst_reference_stem:
+            cur_ref_data = cur_ref_data.to(self.device)
+            # Effects Encoder inference
+            with torch.no_grad():
+                self.models["effects_encoder"].eval()
+                reference_feature = self.models["effects_encoder"](cur_ref_data)
+            infered_ref_data_list.append(reference_feature)
+        # compute the average of the extracted embeddings
+        infered_ref_data = torch.stack(infered_ref_data_list)
+        infered_ref_data_avg = torch.mean(infered_ref_data.reshape(infered_ref_data.shape[0]*infered_ref_data.shape[1], infered_ref_data.shape[2]), axis=0)
+
+        # mastering style converter
+        infered_data_list = []
+        for cur_data in cur_inst_input_stem:
+            cur_data = cur_data.to(self.device)
+            with torch.no_grad():
+                self.models["mastering_converter"].eval()
+                infered_data = self.models["mastering_converter"](cur_data, infered_ref_data_avg.unsqueeze(0))
+            infered_data_list.append(infered_data.cpu().detach())
+
+        # combine back to whole song
+        for cur_idx, cur_batch_infered_data in enumerate(infered_data_list):
+            cur_infered_data_sequential = torch.cat(torch.unbind(cur_batch_infered_data, dim=0), dim=-1)
+            fin_data_out = cur_infered_data_sequential if cur_idx==0 else torch.cat((fin_data_out, cur_infered_data_sequential), dim=-1)
+        # final output, trimmed back to the input's original length
+        fin_data_out_mastered = fin_data_out[:, :input_aud.shape[-1]].numpy()
+
         # remix
-        fin_data_out_mix = sum(inst_outputs)
-        fin_output_path = os.path.join(cur_out_dir, f"mixture_{output_name_tag}.wav")
-        sf.write(fin_output_path, fin_data_out_mix.transpose(-1, -2), self.args.sample_rate, 'PCM_16')
+        fin_output_path = os.path.join(cur_out_dir, "remastered_output.wav")
+        sf.write(fin_output_path, fin_data_out_mastered.transpose(-1, -2), self.args.sample_rate, 'PCM_16')
 
         return fin_output_path
 
 
     # Inference whole song
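The helper self.batchwise_segmentization is called in both branches above, but its definition is not part of this diff. Below is a minimal sketch of what such a helper presumably does, so the segment_length / discard_last arguments are easier to follow; the function name, the batch_size parameter, and the zero-padding strategy are assumptions, not the repository's actual implementation:

import torch
import torch.nn.functional as F

def batchwise_segmentization_sketch(audio, segment_length, batch_size=8, discard_last=False):
    # audio: [channels, samples] waveform tensor of a whole song (assumed layout)
    channels, total_length = audio.shape
    remainder = total_length % segment_length
    if remainder and discard_last:
        audio = audio[:, :total_length - remainder]            # drop the partial tail
    elif remainder:
        audio = F.pad(audio, (0, segment_length - remainder))  # zero-pad the tail
    # [channels, n_seg * segment_length] -> [n_seg, channels, segment_length]
    segments = audio.reshape(channels, -1, segment_length).permute(1, 0, 2)
    # group segments into batches so a whole song is processed chunk by chunk
    return [segments[i:i + batch_size] for i in range(0, segments.shape[0], batch_size)]

Zero-padding the tail is consistent with the trimming step in the diff: fin_data_out[:, :input_aud.shape[-1]] cuts the inferred output back to the input's original length.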
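The reference style embedding is averaged over every segment of the reference track, and the inferred segments are then concatenated back along time. The shapes are easy to misread, so here is a toy check of both steps (the batch size of 8, 2-channel audio, and embedding width 2048 are placeholders, not the models' actual dimensions). One caveat: torch.stack requires every element of infered_ref_data_list to have the same batch size, so the reshape-then-mean only works when the last batch is full; torch.cat(infered_ref_data_list, dim=0).mean(dim=0) would be the safer general form.

import torch

# averaging: two batches of 8 per-segment style embeddings
emb_batches = [torch.randn(8, 2048), torch.randn(8, 2048)]
stacked = torch.stack(emb_batches)                              # [2, 8, 2048]
style_avg = stacked.reshape(-1, stacked.shape[-1]).mean(dim=0)  # [2048], one vector per song

# recombination: each inferred batch is [batch, channels, segment_length];
# unbind along the batch dim and concatenate along time
batch_out = torch.randn(8, 2, 44100)
sequential = torch.cat(torch.unbind(batch_out, dim=0), dim=-1)
assert sequential.shape == (2, 8 * 44100)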
@@ -375,4 +365,3 @@ def set_up_mastering(start_point_in_second=0, duration_in_second=30):
     args.cfg_converter = configs['TCN']['default']
 
     return args
-
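Putting the pieces together, a hypothetical driver could look like the sketch below. Only set_up_mastering's signature is shown in this diff; the constructor call and the name of the inference entry point are assumptions based on the identifiers visible above:

# hypothetical usage sketch, not code from this repository
args = set_up_mastering(start_point_in_second=0, duration_in_second=30)

mastering = Mastering_Style_Transfer_Inference(args)  # assumed constructor signature
output_path = mastering.inference(                    # assumed method name
    input_track_path="target_dir/song_name_#1/input.wav",
    reference_track_path="target_dir/song_name_#1/reference.wav",
)
print(output_path)  # e.g. ".../remastered_output.wav"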
 