RexChan committed on
Commit
9854ac0
1 Parent(s): eeb58fa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
3
+ from transformers import pipeline
4
+ import librosa
5
+ import torch
6
+ from spleeter.separator import Separator
7
+ from pydub import AudioSegment
8
+ from IPython.display import Audio
9
+ import os
10
+ import accelerate
11
+
12
+
13
# --- Streamlit page setup ---
st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song")
st.header("Cantonese Song Sentiment Analyzer")

# Upload widget: accepts a single mp3 song.
input_file = st.file_uploader("upload a song in mp3 format", type="mp3")
if input_file is not None:
    st.write("File uploaded successfully!")
    st.write(input_file)
else:
    st.write("No file uploaded.")
button_click = st.button("Run Analysis", type="primary")


# Output directory for the spleeter vocal/accompaniment separation step.
# BUG FIX: the original `output_file = os.path.isdir("")` assigned the
# *boolean* False (os.path.isdir returns bool), which was then passed to
# Separator.separate_to_file as an output path. Use a path string instead.
# NOTE(review): "" makes spleeter write under the current working
# directory (e.g. ./test1/vocals.wav) — confirm against the path that
# audio_preprocess expects to read back.
output_file = ""
29
+
30
+ # preprocess and crop audio file
31
def audio_preprocess(file_name='/test1/vocals.wav', start_ms=60000, end_ms=110000):
    """Separate vocals from the uploaded song and return a cropped segment.

    Parameters
    ----------
    file_name : str or file-like object readable by pydub.AudioSegment
        Audio to crop. Defaults to the vocals stem path; the caller in
        ``main`` actually passes the uploaded file object.
    start_ms, end_ms : int
        Crop window in milliseconds. Generalized from the former
        hard-coded 60 s–110 s window; the defaults preserve the old
        behavior exactly.

    Returns
    -------
    pydub.AudioSegment
        The cropped audio, kept in memory (not exported to disk).
    """
    # Split the song into vocals + accompaniment stems.
    # NOTE(review): this uses the module-level `input_file` and
    # `output_file` globals rather than the `file_name` argument —
    # confirm that is intended.
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(input_file, output_file)

    # pydub slices in milliseconds, so [start_ms:end_ms] crops the window.
    audio = AudioSegment.from_file(file_name)
    return audio[start_ms:end_ms]
49
+
50
+
51
+ # ASR transcription
52
def asr_model(processed_audio):
    """Transcribe Cantonese vocals with a fine-tuned Whisper model.

    Parameters
    ----------
    processed_audio : str, file-like object, or pydub.AudioSegment
        Audio to transcribe. BUG FIX: ``main`` passes the pydub
        AudioSegment returned by ``audio_preprocess``, which
        ``librosa.load`` cannot read directly — such inputs are first
        exported to a temporary wav file.

    Returns
    -------
    str
        The decoded transcription (song lyrics).
    """
    import tempfile

    # librosa.load needs a path or file handle; duck-type on `.export`
    # to detect an in-memory pydub AudioSegment and write it out first.
    if hasattr(processed_audio, "export"):
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            audio_source = tmp.name
        processed_audio.export(audio_source, format="wav")
    else:
        audio_source = processed_audio

    # Resample to 16 kHz, the rate Whisper models expect.
    y, sr = librosa.load(audio_source, sr=16000)

    # Load the fine-tuned Cantonese Whisper checkpoint.
    MODEL_NAME = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    model = WhisperForConditionalGeneration.from_pretrained(
        MODEL_NAME, low_cpu_mem_usage=True
    )

    # Clear forced/suppressed decoder tokens so the model decodes freely.
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False

    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    gout = model.generate(
        input_features=processed_in.input_features,
        output_scores=True,
        return_dict_in_generate=True,
    )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")

    return transcription
81
+
82
+
83
+
84
+
85
+ # sentiment analysis
86
def senti_model(transcription):
    """Classify the sentiment of the transcribed lyrics.

    Runs a multilingual DistilBERT sentiment classifier over the
    transcription and returns a human-readable summary string with the
    predicted label and its confidence percentage.
    """
    classifier = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    top = classifier(transcription)[0]
    display = (
        f"Sentiment Analysis shows that this song is {top['label']}. "
        f"Confident level of this analysis is {top['score']*100:.1f}%."
    )
    print(display)
    return display
97
+
98
+
99
+
100
+
101
+ # main
102
def main():
    """Run the full pipeline: separate vocals, transcribe, analyze sentiment.

    Reads the module-level `input_file` upload, shows the sentiment
    summary, and offers a playback button for the uploaded song.
    """
    processed_audio = audio_preprocess(input_file)
    transcription = asr_model(processed_audio)
    final_result = senti_model(transcription)
    st.write(final_result)

    # BUG FIX: the original referenced an undefined `audio_data` dict,
    # which raised NameError as soon as "Play Audio" was clicked. Play
    # the uploaded mp3 instead.
    # NOTE(review): a nested st.button inside a button-driven rerun is
    # reset on the next interaction — confirm the desired UX.
    if st.button("Play Audio"):
        st.audio(input_file, format="audio/mp3", start_time=0)
116
+
117
+
118
+
119
+
120
# Entry point: only run the pipeline after the user clicks "Run Analysis".
if __name__ == '__main__' and button_click:
    main()