# Spaces: Sleeping
# File size: 4,892 Bytes
# 5195825 fd326aa 8c57520 5195825 8c57520 646e64c 8c57520 5195825 fd326aa 8c57520 fd326aa 8c57520 fd326aa 8c57520 fd326aa 8c57520 5195825 8c57520 cae734b 8c57520 5195825 ae5932c 5195825 ae5932c 5195825 cae734b 5195825 fd326aa 5195825 d303e1d 5195825
# (line-number gutter 1-170 from the web file viewer)
import streamlit as st
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import librosa
import torch
from spleeter.separator import Separator
from pydub import AudioSegment
from IPython.display import Audio
import os
import accelerate
#import pyaudio
import numpy as np
# Create PyAudio object
#p = pyaudio.PyAudio()
# Stream buffer size (frames_per_buffer); also the slice length, in bytes,
# written to the playback stream on each pass of the playback loop.
CHUNK_SIZE = 1024
# Sample rate, in Hz, used when opening the audio streams.
SAMPLING_RATE = 16_000
# Shared byte buffer of separated vocals: written by audio_preprocess(),
# consumed by py_audio().
vocals_data = b""
# preprocess and crop audio file
def audio_preprocess(input_file):  # , in_data, frame_count, time_info, status):
    """Separate the vocal stem from a buffer of 16-bit PCM samples.

    The buffer is decoded as int16 samples, run through Spleeter's
    ``spleeter:2stems`` model, and the ``vocals`` stem is flattened and
    re-encoded as int16 bytes.  The result is also stored in the
    module-level ``vocals_data`` buffer, which ``py_audio`` reads.

    Args:
        input_file: Bytes-like object holding int16 samples; it is handed
            directly to ``np.frombuffer``.
            NOTE(review): ``main`` in this file passes a file path string,
            which ``np.frombuffer`` rejects with ``TypeError`` -- confirm
            the intended input type.

    Returns:
        bytes: The separated vocals as int16 samples.
    """
    global vocals_data

    # Interpret the raw buffer as signed 16-bit samples.
    audio_array = np.frombuffer(input_file, dtype=np.int16)

    # NOTE(review): the array is passed on 1-D and as int16; check the
    # Spleeter documentation for the waveform shape/dtype it expects.
    stems = Separator('spleeter:2stems').separate(audio_array)

    # Keep only the vocal stem, serialised back to int16 bytes.
    vocals_data = stems['vocals'].flatten().astype(np.int16).tobytes()

    # The previous return statement referenced `pyaudio` (its import is
    # commented out) and `processed_audio` (never assigned), so every call
    # ended in NameError.  The caller assigns a single value, so return the
    # separated vocals directly.
    return vocals_data
# audio processing 2?
def py_audio():
    """Play the buffered vocals through PyAudio, then crop what remains.

    Opens one stream with both input and output enabled and a second
    output-only playback stream.  While the first stream is active,
    ``vocals_data`` is written to the playback stream in ``CHUNK_SIZE``-byte
    slices.  Afterwards the remaining buffer is wrapped in a pydub
    ``AudioSegment`` and the 60 s - 110 s window is returned.

    NOTE(review): this function cannot run as the file stands: ``pyaudio``
    is not imported, the PyAudio instance ``p`` is commented out, and
    ``process_audio`` is not defined anywhere in view.  Those names are left
    untouched here; restore them (or remove this function) once the intended
    design is confirmed.
    NOTE(review): the playback loop drains ``vocals_data``, so very little
    audio is left to crop afterwards -- confirm this is intended.

    Returns:
        pydub.AudioSegment: The cropped segment (60000 ms to 110000 ms).
    """
    # The loop below rebinds the module-level buffer; without this
    # declaration the first read of `vocals_data` raises UnboundLocalError.
    global vocals_data

    # Open stream for recording
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLING_RATE,
                    input=True, output=True,
                    frames_per_buffer=CHUNK_SIZE,
                    stream_callback=process_audio)
    try:
        stream.start_stream()

        # Create stream for playback
        playback_stream = p.open(format=pyaudio.paInt16, channels=1,
                                 rate=SAMPLING_RATE, output=True)
        try:
            # Play processed data in real time, one chunk per pass.
            while stream.is_active():
                if len(vocals_data) >= CHUNK_SIZE:
                    playback_stream.write(vocals_data[:CHUNK_SIZE])
                    vocals_data = vocals_data[CHUNK_SIZE:]
        finally:
            playback_stream.stop_stream()
            playback_stream.close()
    finally:
        # Release the audio device even if playback fails part-way.
        stream.stop_stream()
        stream.close()
        p.terminate()

    # `vocals_data` holds raw int16 PCM bytes, not an encoded file, so build
    # the segment from raw data instead of AudioSegment.from_file().
    # Assumes mono audio at SAMPLING_RATE, matching how the streams above
    # are opened -- TODO confirm.
    audio = AudioSegment(
        data=vocals_data,
        sample_width=2,
        frame_rate=SAMPLING_RATE,
        channels=1,
    )

    # Crop window in milliseconds (pydub slices by ms).
    start_time = 60000   # 60 seconds
    end_time = 110000    # 110 seconds
    processed_audio = audio[start_time:end_time]
    # .export('cropped_vocals.wav', format='wav')  # save vocal audio file
    return processed_audio
# ASR transcription
def asr_model(processed_audio):
    """Transcribe an audio source with the Whisper checkpoint used by this app.

    Args:
        processed_audio: Audio source passed directly to ``librosa.load``.

    Returns:
        The decoded text of the first generated sequence.
    """
    # Load the audio, resampled to 16 kHz.
    waveform, sample_rate = librosa.load(processed_audio, sr=16000)

    # Processor (feature extractor + tokenizer) and model share a checkpoint.
    checkpoint = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(checkpoint)
    model = WhisperForConditionalGeneration.from_pretrained(
        checkpoint,
        low_cpu_mem_usage=True,
    )

    # Generation settings applied on the model config.
    config = model.config
    config.forced_decoder_ids = None
    config.suppress_tokens = []
    config.use_cache = False

    features = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_features
    generated = model.generate(
        input_features=features,
        output_scores=True,
        return_dict_in_generate=True,
    )

    decoded = processor.batch_decode(generated.sequences, skip_special_tokens=True)
    transcription = decoded[0]

    # Echo the result to the console.
    print(f"Song lyrics = {transcription}")
    return transcription
# sentiment analysis
def senti_model(transcription):
    """Classify the sentiment of the lyrics and build a one-line summary.

    Args:
        transcription: Text to classify.

    Returns:
        A sentence naming the first result's label and its score as a
        percentage with one decimal place.
    """
    classifier = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )

    # Only the first entry of the pipeline output is reported.
    top = classifier(transcription)[0]
    label = top['label']
    confidence_pct = top['score'] * 100

    display = f"Sentiment Analysis shows that this song is {label}. Confident level of this analysis is {confidence_pct:.1f}%."
    print(display)
    return display
# main
def main(input_file):
    """Run separation, transcription and sentiment analysis, then render.

    Args:
        input_file: The song to analyse, forwarded to ``audio_preprocess``
            and also handed to ``st.audio`` for playback.

    NOTE(review): ``asr_model`` passes its argument to ``librosa.load``, so
    whatever ``audio_preprocess`` returns must be loadable that way --
    confirm the hand-off format between the two steps.
    """
    separated_audio = audio_preprocess(input_file)

    # Previously `asr_model(processed_audio)`: that name is never bound in
    # this function, so the call raised NameError.  Feed the preprocessing
    # result forward instead.
    transcription = asr_model(separated_audio)

    final_result = senti_model(transcription)
    st.write(final_result)

    # The old code wrapped playback in `st.button("Play Audio")` and read an
    # undefined `audio_data` dict.  A button nested inside another button's
    # branch is never acted on (the click reruns the script and the outer
    # button is False again), and the audio widget has its own play control,
    # so render the player directly.
    # Assumes the input is an MP3, as the hard-coded path and the uploader's
    # `type="mp3"` suggest -- TODO confirm.
    st.audio(input_file, format="audio/mpeg", start_time=0)
if __name__ == '__main__':
    # Streamlit page setup
    st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
    st.header("Cantonese Song Sentiment Analyzer")

    # A fixed test track is used for now; the uploader is disabled:
    # input_file = st.file_uploader("upload a song in mp3 format", type="mp3")  # upload song
    input_file = "/test1.mp3"

    # Report whether there is a file to work on.
    if input_file is None:
        st.write("No file uploaded.")
    else:
        st.write("File uploaded successfully!")
        st.write(input_file)

    # Run the whole pipeline only when the primary button is clicked.
    if st.button("Run Analysis", type="primary"):
        main(input_file=input_file)
# |