Spaces:
Sleeping
Sleeping
File size: 2,890 Bytes
74372b3 8ab8537 74372b3 66e3963 74372b3 efed6b6 74372b3 ec9ef72 74372b3 52548b6 6545b3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import streamlit as st
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import librosa
import torch
from spleeter.separator import Separator
from pydub import AudioSegment
from IPython.display import Audio
import os
import accelerate
# Streamlit setup: page title and the header shown at the top of the app.
st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
st.header("Cantonese Song Sentiment Analyzer")
# Module-level paths for the song being analyzed.
# `input_file` starts out empty; `output_file` is the directory handed to
# Spleeter as its destination (it is a directory, despite the name).
input_file = ""
output_file = "/content/"
# preprocess and crop audio file
def audio_preprocess(source_path=None, output_dir=None, start_ms=60000, end_ms=110000):
    """Isolate the vocal stem of a song and save a cropped excerpt of it.

    The song is split with Spleeter's 2-stem model, then the slice
    ``[start_ms, end_ms)`` of the vocal stem is exported as
    ``cropped_vocals.wav`` inside ``output_dir``.

    Args:
        source_path: Path of the song to process. ``None`` falls back to the
            module-level ``input_file``.
        output_dir: Directory that receives Spleeter's output and the cropped
            excerpt. ``None`` falls back to the module-level ``output_file``.
        start_ms: Start of the excerpt in milliseconds (default 60000, 1:00).
        end_ms: End of the excerpt in milliseconds (default 110000, 1:50).

    Returns:
        Path of the exported excerpt.

    Raises:
        ValueError: If no source path is available, or if the crop window is
            negative or empty.
    """
    if source_path is None:
        source_path = input_file
    if output_dir is None:
        output_dir = output_file

    # Fail before the (slow) separation step when the inputs cannot work.
    if not source_path:
        raise ValueError("audio_preprocess needs the path of a song to process")
    if start_ms < 0 or end_ms <= start_ms:
        raise ValueError(
            f"invalid crop window: start_ms={start_ms}, end_ms={end_ms}"
        )

    # separate music and vocal
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(source_path, output_dir)

    # Spleeter's default layout is <destination>/<input stem>/<instrument>.wav,
    # so derive the folder from the input instead of assuming it is "test1".
    song_stem = os.path.splitext(os.path.basename(source_path))[0]
    vocals_path = os.path.join(output_dir, song_stem, 'vocals.wav')

    # Crop the audio (pydub slices are expressed in milliseconds)
    vocals = AudioSegment.from_file(vocals_path)
    cropped_path = os.path.join(output_dir, 'cropped_vocals.wav')
    vocals[start_ms:end_ms].export(cropped_path, format='wav')  # save vocal audio file
    return cropped_path
# ASR transcription
def asr_model(audio_path='/content/cropped_vocals.wav',
              model_name="RexChan/ISOM5240-whisper-small-zhhk_1"):
    """Transcribe a vocal excerpt with a Whisper checkpoint.

    Args:
        audio_path: Audio file to transcribe. Defaults to the excerpt that
            ``audio_preprocess`` exports, so the two steps agree on one path
            regardless of the current working directory.
        model_name: Hugging Face model id used for both the processor and
            the generation model.

    Returns:
        The decoded transcription of the first (only) sequence, as a string.

    Raises:
        FileNotFoundError: If ``audio_path`` does not exist.
    """
    # Check up front so the error names the missing excerpt explicitly.
    if not os.path.isfile(audio_path):
        raise FileNotFoundError(
            f"vocal excerpt not found at {audio_path!r}; run audio_preprocess first"
        )

    # load audio file (resampled to 16 kHz)
    waveform, sample_rate = librosa.load(audio_path, sr=16000)

    # ASR model
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name, low_cpu_mem_usage=True)
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False

    # NOTE(review): Whisper's feature extractor appears to work on 30 s
    # windows, so an excerpt longer than that may be only partly transcribed;
    # confirm against the crop length used upstream.
    features = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    generated = model.generate(
        input_features=features.input_features,
        output_scores=True, return_dict_in_generate=True
    )
    transcription = processor.batch_decode(generated.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")
    return transcription
# sentiment analysis
def senti_model(transcription):
    """Classify the sentiment of the transcribed lyrics.

    Builds a ``text-classification`` pipeline, runs it on ``transcription``,
    prints the label and score of the first result, and returns the
    pipeline's output unchanged.

    Args:
        transcription: Lyrics text to classify.

    Returns:
        The pipeline output; its first element exposes ``'label'`` and
        ``'score'``.
    """
    classifier = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    final_result = classifier(transcription)

    # Report only the first prediction.
    top_prediction = final_result[0]
    label = top_prediction['label']
    confidence_pct = top_prediction['score'] * 100
    print(f"Sentiment Analysis shows that this song is {label}. Confident level of this analysis is {confidence_pct:.1f}%.")
    return final_result
# main
def main():
    """Drive the app: upload, vocal separation, transcription, sentiment.

    Renders the uploader on every run, and only starts the analysis once a
    file is present and "Run Analysis" has been clicked. The upload is saved
    into the ``output_file`` directory and its path is published through the
    module-level ``input_file``, which ``audio_preprocess`` reads when called
    without arguments. Results and the cropped vocal excerpt are shown in the
    page.
    """
    global input_file

    uploaded = st.file_uploader("upload a song in mp3 format", type="mp3")  # upload song
    if uploaded is None:
        st.write("No file uploaded.")
        return
    st.write("File uploaded successfully!")

    # The button lives next to the uploader so the uploader is still rendered
    # on the rerun that Streamlit triggers after a file is chosen.
    if not st.button("Run Analysis"):
        return

    # Spleeter is given a filesystem path, whereas the uploader returns a
    # file-like object, so persist the upload before processing.
    # basename() drops any directory components from the client-side name.
    os.makedirs(output_file, exist_ok=True)
    saved_path = os.path.join(output_file, os.path.basename(uploaded.name))
    with open(saved_path, "wb") as song:
        song.write(uploaded.getbuffer())
    input_file = saved_path

    with st.spinner("Analyzing the song..."):
        audio_preprocess()
        transcription = asr_model()
        final_result = senti_model(transcription)

    # Show the results in the page, not only on stdout.
    top_prediction = final_result[0]
    st.write(f"Song lyrics = {transcription}")
    st.write(
        f"Sentiment Analysis shows that this song is {top_prediction['label']}. "
        f"Confidence level of this analysis is {top_prediction['score'] * 100:.1f}%."
    )

    # Offer the analyzed excerpt for playback. The player is rendered directly
    # (it has its own play control) because a nested "Play Audio" button would
    # trigger a rerun in which "Run Analysis" is no longer pressed.
    cropped_path = os.path.join(output_file, "cropped_vocals.wav")
    with open(cropped_path, "rb") as clip:
        st.audio(clip.read(), format="audio/wav", start_time=0)


if __name__ == '__main__':
    main()