import os

import librosa
import streamlit as st
from pydub import AudioSegment
from spleeter.separator import Separator
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

# Streamlit setup
st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song")
st.header("Cantonese Song Sentiment Analyzer")

# Optional file-upload flow, kept for reference:
# input_file = st.file_uploader("Upload a song in mp3 format", type="mp3")
# if input_file is not None:
#     st.write("File uploaded successfully!")
# else:
#     st.write("No file uploaded.")

button_click = st.button("Run Analysis", type="primary")

# Load the bundled demo song that sits next to this script.
BASE_DIR = os.path.dirname(__file__)
input_file = os.path.join(BASE_DIR, 'test1.mp3')


# Preprocess: separate the vocals from the music, then crop a clip.
def audio_preprocess():
    # Spleeter needs a destination directory; with the default filename
    # format it writes stems to <destination>/<input name>/vocals.wav
    # and <destination>/<input name>/accompaniment.wav.
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(input_file, BASE_DIR)
    vocals_path = os.path.join(BASE_DIR, 'test1', 'vocals.wav')

    # Crop the vocal track; pydub slices in milliseconds.
    start_time = 60000   # 60 s
    end_time = 110000    # 110 s

    audio = AudioSegment.from_file(vocals_path)
    cropped_audio = audio[start_time:end_time]

    # Save the cropped vocal clip for the ASR step.
    cropped_path = os.path.join(BASE_DIR, 'cropped_vocals.wav')
    cropped_audio.export(cropped_path, format='wav')
    return cropped_path


# ASR transcription
def asr_model(cropped_path):
    # Whisper expects 16 kHz mono input.
    y, sr = librosa.load(cropped_path, sr=16000)

    # Cantonese fine-tuned Whisper model
    MODEL_NAME = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, low_cpu_mem_usage=True)
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False

    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    gout = model.generate(
        input_features=processed_in.input_features,
        output_scores=True,
        return_dict_in_generate=True,
    )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]

    # Show the transcribed lyrics in the app.
    st.write(f"Song lyrics = {transcription}")
    return transcription


# Sentiment analysis on the transcribed lyrics
def senti_model(transcription):
    pipe = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    final_result = pipe(transcription)
    st.write(
        f"Sentiment analysis shows that this song is {final_result[0]['label']}. "
        f"Confidence level of this analysis is {final_result[0]['score']*100:.1f}%."
    )
    return final_result


# Main pipeline: separate and crop the vocals, transcribe, then classify.
def main():
    cropped_path = audio_preprocess()
    transcription = asr_model(cropped_path)
    senti_model(transcription)

    # Render a player for the analyzed vocal clip.
    st.audio(cropped_path, format="audio/wav")


if __name__ == '__main__':
    if button_click:
        main()
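

# Usage note (a sketch, not part of the original script): as a Streamlit app
# this file is launched via the Streamlit CLI rather than plain `python`,
# e.g.
#
#     streamlit run app.py
#
# where `app.py` is an assumed filename for this file. It expects `test1.mp3`
# to sit next to the script, and the streamlit, transformers, librosa,
# spleeter, and pydub packages to be installed (pydub and spleeter also need
# ffmpeg available for mp3 decoding).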