kjysmu committed on
Commit
f1a5bcb
1 Parent(s): 0e18a50

modify app

Files changed (1)
  1. app.py +119 -79
app.py CHANGED
@@ -78,6 +78,8 @@ flatsharpDic = {
     'Bb':'A#'
 }
 
+chordList = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
+
 max_conseq_N = 0
 max_conseq_chord = 2
 tempo = 120
@@ -89,6 +91,17 @@ min_velocity = 49 # Minimum velocity value in the output range
 max_velocity = 112 # Maximum velocity value in the output range
 
 
+# def get_video_duration(file_path):
+#     try:
+#         clip = VideoFileClip(file_path)
+#         duration = clip.duration
+#         clip.close()
+#         return duration
+#     except Exception as e:
+#         print(f"An error occurred: {e}")
+#         return None
+
+
 def split_video_into_frames(video, frame_dir):
     output_path = os.path.join(frame_dir, f"%03d.jpg")
     cmd = f"ffmpeg -i {video} -vf \"select=bitor(gte(t-prev_selected_t\,1)\,isnan(prev_selected_t))\" -vsync 0 -qmin 1 -q:v 1 {output_path}"
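The hunk above adds a `get_video_duration` helper but leaves it commented out. Below is a minimal runnable sketch of the same duration guard, assuming `moviepy` is installed; the 300-second limit mirrors the check added later in this commit and is not a constant defined in this hunk.

```python
# Sketch only: mirrors the commented-out helper added above; not part of the commit.
from moviepy.editor import VideoFileClip  # assumes moviepy is available

def get_video_duration(file_path):
    """Return the video duration in seconds, or None if the file cannot be opened."""
    try:
        clip = VideoFileClip(file_path)
        duration = clip.duration
        clip.close()
        return duration
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Usage matching the (also commented-out) guard added later in generate():
duration = get_video_duration("example.mp4")  # hypothetical local file
if duration is not None and duration >= 300:
    raise ValueError("We only support duration of video less than 300 seconds")
```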
@@ -390,77 +403,6 @@ class Video2music:
 
     def generate(self, video, primer, key):
 
-        feature_dir = Path("./feature")
-        output_dir = Path("./output")
-        if feature_dir.exists():
-            shutil.rmtree(str(feature_dir))
-        if output_dir.exists():
-            shutil.rmtree(str(output_dir))
-
-        feature_dir.mkdir(parents=True)
-        output_dir.mkdir(parents=True)
-
-        frame_dir = feature_dir / "vevo_frame"
-
-        #video features
-        semantic_dir = feature_dir / "vevo_semantic"
-        emotion_dir = feature_dir / "vevo_emotion"
-        scene_dir = feature_dir / "vevo_scene"
-        scene_offset_dir = feature_dir / "vevo_scene_offset"
-        motion_dir = feature_dir / "vevo_motion"
-
-        frame_dir.mkdir(parents=True)
-        semantic_dir.mkdir(parents=True)
-        emotion_dir.mkdir(parents=True)
-        scene_dir.mkdir(parents=True)
-        scene_offset_dir.mkdir(parents=True)
-        motion_dir.mkdir(parents=True)
-
-        #music features
-        chord_dir = feature_dir / "vevo_chord"
-        loudness_dir = feature_dir / "vevo_loudness"
-        note_density_dir = feature_dir / "vevo_note_density"
-
-        chord_dir.mkdir(parents=True)
-        loudness_dir.mkdir(parents=True)
-        note_density_dir.mkdir(parents=True)
-
-        split_video_into_frames(video, frame_dir)
-        gen_semantic_feature(frame_dir, semantic_dir)
-        gen_emotion_feature(frame_dir, emotion_dir)
-        gen_scene_feature(video, scene_dir, frame_dir)
-        gen_scene_offset_feature(scene_dir, scene_offset_dir)
-        gen_motion_feature(video, motion_dir)
-
-        feature_scene_offset = get_scene_offset_feature(scene_offset_dir)
-        feature_motion = get_motion_feature(motion_dir)
-        feature_emotion = get_emotion_feature(emotion_dir)
-        feature_semantic = get_semantic_feature(semantic_dir)
-
-        # cuda
-        feature_scene_offset = feature_scene_offset.to(self.device)
-        feature_motion = feature_motion.to(self.device)
-        feature_emotion = feature_emotion.to(self.device)
-
-        feature_scene_offset = feature_scene_offset.unsqueeze(0)
-        feature_motion = feature_motion.unsqueeze(0)
-        feature_emotion = feature_emotion.unsqueeze(0)
-
-        feature_semantic = feature_semantic.to(self.device)
-        feature_semantic_list = []
-        feature_semantic = torch.unsqueeze(feature_semantic, 0)
-        feature_semantic_list.append( feature_semantic.to(self.device) )
-        #feature_semantic_list.append( feature_semantic )
-
-        if "major" in key:
-            feature_key = torch.tensor([0])
-            feature_key = feature_key.float()
-        elif "minor" in key:
-            feature_key = torch.tensor([1])
-            feature_key = feature_key.float()
-
-        feature_key = feature_key.to(self.device)
-
         with open('dataset/vevo_meta/chord.json') as json_file:
             chordDic = json.load(json_file)
         with open('dataset/vevo_meta/chord_inv.json') as json_file:
@@ -504,14 +446,30 @@ class Video2music:
                 pChord = pChord[0:type_idx] + ":maj6"
             if pChord[type_idx+1:] == "M7":
                 pChord = pChord[0:type_idx] + ":maj7"
-            if pChord[type_idx+1:] == "":
+            if pChord[type_idx+1:] == "" or pChord[type_idx+1:] == "maj" or pChord[type_idx+1:] == "M":
                 pChord = pChord[0:type_idx]
 
             print("pchord is ", pChord)
-            chordID = chordDic[pChord]
-            primerCID.append(chordID)
+            if pChord not in chordDic:
+                raise gr.Error("Not Supported Chord Type!")
 
             chord_arr = pChord.split(":")
+
+            trans = traspose_key_dic[key]
+            trasindex = (chordList.index( chord_arr[0] ) - trans) % 12
+
+            if len(chord_arr) == 1:
+                pChordTrans = chordList[trasindex]
+            elif len(chord_arr) == 2:
+                pChordTrans = chordList[trasindex] + ":" + chord_arr[1]
+
+            print(pChordTrans)
+
+
+            chordID = chordDic[pChordTrans]
+            primerCID.append(chordID)
+            chord_arr = pChordTrans.split(":")
+
             if len(chord_arr) == 1:
                 chordRootID = chordRootDic[chord_arr[0]]
                 primerCID_root.append(chordRootID)
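The hunk above is the core of this commit: before a primer chord is looked up in `chordDic`, its root is transposed from the user-selected key into the model's internal key space using `chordList` and `traspose_key_dic`. The self-contained sketch below reproduces that arithmetic; the `traspose_key_dic` offsets shown are illustrative assumptions, since that dictionary is defined elsewhere in app.py and is not visible in this diff.

```python
# Standalone sketch of the transposition added above. chordList matches the diff;
# the traspose_key_dic values here are assumed for illustration only.
chordList = ['C','C#','D','D#','E','F','F#','G','G#','A','A#','B']
traspose_key_dic = {"C major": 0, "A minor": 0, "E minor": 4}  # assumed semitone offsets

def transpose_primer_chord(p_chord: str, key: str) -> str:
    """Shift the root of a chord like 'D:min7' by the key's offset, wrapping mod 12."""
    chord_arr = p_chord.split(":")
    trans = traspose_key_dic[key]
    trasindex = (chordList.index(chord_arr[0]) - trans) % 12
    if len(chord_arr) == 1:
        return chordList[trasindex]
    return chordList[trasindex] + ":" + chord_arr[1]

print(transpose_primer_chord("D:min7", "E minor"))  # 'A#:min7' under the assumed offset of 4
print(transpose_primer_chord("C", "C major"))       # 'C' (offset 0 leaves the root unchanged)
```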
@@ -537,6 +495,84 @@ class Video2music:
         primerCID_attr = primerCID_attr.to(torch.long)
         primerCID_attr = primerCID_attr.to(self.device)
 
+        # duration = get_video_duration(video)
+
+        # if duration >= 300:
+        #     raise gr.Error("We only support duration of video less than 300 seconds")
+
+        feature_dir = Path("./feature")
+        output_dir = Path("./output")
+        if feature_dir.exists():
+            shutil.rmtree(str(feature_dir))
+        if output_dir.exists():
+            shutil.rmtree(str(output_dir))
+
+        feature_dir.mkdir(parents=True)
+        output_dir.mkdir(parents=True)
+
+        frame_dir = feature_dir / "vevo_frame"
+
+        #video features
+        semantic_dir = feature_dir / "vevo_semantic"
+        emotion_dir = feature_dir / "vevo_emotion"
+        scene_dir = feature_dir / "vevo_scene"
+        scene_offset_dir = feature_dir / "vevo_scene_offset"
+        motion_dir = feature_dir / "vevo_motion"
+
+        frame_dir.mkdir(parents=True)
+        semantic_dir.mkdir(parents=True)
+        emotion_dir.mkdir(parents=True)
+        scene_dir.mkdir(parents=True)
+        scene_offset_dir.mkdir(parents=True)
+        motion_dir.mkdir(parents=True)
+
+        #music features
+        chord_dir = feature_dir / "vevo_chord"
+        loudness_dir = feature_dir / "vevo_loudness"
+        note_density_dir = feature_dir / "vevo_note_density"
+
+        chord_dir.mkdir(parents=True)
+        loudness_dir.mkdir(parents=True)
+        note_density_dir.mkdir(parents=True)
+
+        split_video_into_frames(video, frame_dir)
+        gen_semantic_feature(frame_dir, semantic_dir)
+        gen_emotion_feature(frame_dir, emotion_dir)
+        gen_scene_feature(video, scene_dir, frame_dir)
+        gen_scene_offset_feature(scene_dir, scene_offset_dir)
+        gen_motion_feature(video, motion_dir)
+
+        feature_scene_offset = get_scene_offset_feature(scene_offset_dir)
+        feature_motion = get_motion_feature(motion_dir)
+        feature_emotion = get_emotion_feature(emotion_dir)
+        feature_semantic = get_semantic_feature(semantic_dir)
+
+        # cuda
+        feature_scene_offset = feature_scene_offset.to(self.device)
+        feature_motion = feature_motion.to(self.device)
+        feature_emotion = feature_emotion.to(self.device)
+
+        feature_scene_offset = feature_scene_offset.unsqueeze(0)
+        feature_motion = feature_motion.unsqueeze(0)
+        feature_emotion = feature_emotion.unsqueeze(0)
+
+        feature_semantic = feature_semantic.to(self.device)
+        feature_semantic_list = []
+        feature_semantic = torch.unsqueeze(feature_semantic, 0)
+        feature_semantic_list.append( feature_semantic.to(self.device) )
+        #feature_semantic_list.append( feature_semantic )
+
+        if "major" in key:
+            feature_key = torch.tensor([0])
+            feature_key = feature_key.float()
+        elif "minor" in key:
+            feature_key = torch.tensor([1])
+            feature_key = feature_key.float()
+
+        feature_key = feature_key.to(self.device)
+
+
+
         # self.model.eval()
         # self.modelReg.eval()
 
@@ -616,6 +652,7 @@ class Video2music:
         midi_chords = voice(midi_chords_orginal)
         trans = traspose_key_dic[key]
 
+
         for i, chord in enumerate(midi_chords):
             if densitylist[i] == 0:
                 if len(chord) >= 4:
@@ -727,6 +764,9 @@ def gradio_generate2(input_youtube, input_primer, input_key):
     youtube_dir.mkdir(parents=True)
 
     yObject = YouTube(input_youtube)
+    if yObject.length >= 300:
+        raise gr.Error("We only support duration of video less than 300 seconds")
+
     yObject_stream = yObject.streams.get_by_resolution("240p")
     fname = yObject.video_id +".mp4"
     if yObject_stream == None:
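For the YouTube path, the hunk above rejects long videos before any stream is downloaded, using pytube's `YouTube.length` (duration in seconds). A standalone sketch of that guard, assuming `pytube` and `gradio` are installed:

```python
# Sketch of the early length check added above; not part of the commit itself.
from pytube import YouTube
import gradio as gr

def assert_short_enough(url: str, limit_s: int = 300) -> None:
    """Raise a Gradio-visible error when the YouTube video is limit_s seconds or longer."""
    y_object = YouTube(url)
    if y_object.length >= limit_s:
        raise gr.Error("We only support duration of video less than 300 seconds")
```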
@@ -813,11 +853,11 @@ with gr.Blocks(css=css) as demo:
         # with gr.Column(visible=True) as colA:
         with gr.Column(visible=True) as rowA:
             with gr.Row():
-                input_video = gr.Video(label="Input Video")
+                input_video = gr.Video(label="Input Video", max_length=299)
             with gr.Row():
                 with gr.Row():
-                    input_primer = gr.Textbox(label="Input Primer", value="C Am F G")
-                    input_key = gr.Dropdown(choices=["C major", "A minor"], value="C major", label="Input Key")
+                    input_primer = gr.Textbox(label="Input Primer", value="C Am F G", info="Supported types: dim, sus4, min7(m7), min(m), sus2, aug, dim7, maj6(M6), hdim7, 7, min6(m6), maj7(M7)")
+                    input_key = gr.Dropdown(choices=all_key_names, value="C major", label="Input Key")
             with gr.Row():
                 btn = gr.Button("Generate")
 
@@ -826,8 +866,8 @@ with gr.Blocks(css=css) as demo:
                 input_video_yt = gr.Textbox(label="YouTube URL")
             with gr.Row():
                 with gr.Row():
-                    input_primer_yt = gr.Textbox(label="Input Primer", value="C Am F G")
-                    input_key_yt = gr.Dropdown(choices=["C major", "A minor"], value="C major", label="Input Key")
+                    input_primer_yt = gr.Textbox(label="Input Primer", value="C Am F G", info="Supported types: dim, sus4, min7(m7), min(m), sus2, aug, dim7, maj6(M6), hdim7, 7, min6(m6), maj7(M7)")
+                    input_key_yt = gr.Dropdown(choices=all_key_names, value="C major", label="Input Key")
             with gr.Row():
                 btn_yt = gr.Button("Generate")
 
 