import streamlit as st
import torchaudio
import io
import matplotlib.pyplot as plt
import time # Import the time module
from audio_recorder_streamlit import audio_recorder
from trainer import SpeechLLMLightning
import re
import json
import whisper
import re
from transformers import AutoProcessor
# Function to load the model and tokenizer
def plot_mel_spectrogram(mel_spec):
plt.figure(figsize=(10, 4))
plt.imshow(mel_spec.squeeze().cpu().numpy(), aspect='auto', origin='lower')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel Spectrogram')
plt.tight_layout()
st.pyplot(plt)
def get_or_load_model():
if 'model' not in st.session_state or 'tokenizer' not in st.session_state or 'processor' not in st.session_state:
ckpt_path = "checkpoints/pretrained_checkpoint.ckpt"
model = SpeechLLMLightning()
# .load_from_checkpoint(ckpt_path)
tokenizer = model.llm_tokenizer
model.eval()
model.freeze()
model.to('cuda')
st.session_state.model = model
st.session_state.tokenizer = tokenizer
st.session_state.processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
return st.session_state.model, st.session_state.tokenizer, st.session_state.processor
def extract_dictionary(input_string):
json_str_match = re.search(r'\{.*\}', input_string)
if not json_str_match:
print(input_string)
return "No valid JSON found."
json_str = json_str_match.group(0)
json_str = re.sub(r'(?<=\{|\,)\s*([^\"{}\[\]\s]+)\s*:', r'"\1":', json_str) # Fix unquoted keys
json_str = re.sub(r',\s*([\}\]])', r'\1', json_str) # Remove trailing commas
try:
data_dict = json.loads(json_str)
return data_dict
except json.JSONDecodeError as e:
return f"Error parsing JSON: {str(e)}"
pre_speech_prompt = '''Instruction:
Give me the following information about the speech [Transcript, Gender, Age, Emotion, Accent]
Input:
'''
post_speech_prompt = f'''
Output:'''
# Function to generate a response from the model
def generate_response(mel, pre_speech_prompt, post_speech_prompt, model, tokenizer):
output_prompt = '\n'
pre_tokenized_ids = tokenizer(pre_speech_prompt, padding="do_not_pad", return_tensors='pt', truncation=False, add_special_tokens=False)["input_ids"]
post_tokenized_ids = tokenizer(post_speech_prompt, padding="do_not_pad", return_tensors='pt', truncation=False, add_special_tokens=False)["input_ids"]
output_tokenized_ids = tokenizer(output_prompt, padding="do_not_pad", return_tensors='pt', truncation=False, add_special_tokens=False)["input_ids"]
combined_embeds, atts, label_ids = model.encode(mel.cuda(), pre_tokenized_ids.cuda(), post_tokenized_ids.cuda(), output_tokenized_ids.cuda())
start_time = time.time() # Record start time
out = model.llm_model.generate(
inputs_embeds=combined_embeds,
max_new_tokens=2000,
).cpu().tolist()[0]
end_time = time.time() # Record end time
latency = (end_time - start_time) * 1000 # Calculate latency in milliseconds
output_text = tokenizer.decode(out, skip_special_tokens=True)
return output_text, latency
def extract_prediction_values(self, input_string):
json_str_match = re.search(r'\s*\{.*?\}\s*', input_string)
try:
json_str = json_str_match.group(0)
except:
json_str = '{}'
return self.extract_dictionary(json_str)
# Load model and tokenizer once and store them in session_state
model, tokenizer, processor = get_or_load_model()
# Streamlit UI components
st.title("Multi-Modal Speech LLM")
st.write("Record an audio file to get its transcription and other metadata.")
pre_prompt = st.text_area("Pre Speech Prompt:", value=pre_speech_prompt, height=150)
post_prompt = st.text_area("Post Speech Prompt:", value=post_speech_prompt, height=100)
# Audio recording
audio_data = audio_recorder(sample_rate=16000)
# Transcription process
if audio_data is not None:
with st.spinner('Transcribing...'):
try:
# Load audio data into a tensor
audio_buffer = io.BytesIO(audio_data)
st.audio(audio_data, format='audio/wav', start_time=0)
wav_tensor, sample_rate = torchaudio.load(audio_buffer)
wav_tensor = wav_tensor.to('cuda')
audio = wav_tensor.mean(0)
mel = whisper.log_mel_spectrogram(audio)
plot_mel_spectrogram(mel)
audio = processor(audio.squeeze(), return_tensors="pt", sampling_rate=16000).input_values
# Process audio to get transcription
prediction, latency = generate_response(audio.cuda(), pre_prompt, post_prompt, model, tokenizer)
pred_dict = extract_dictionary(prediction)
user_utterance = '' + pred_dict['Transcript']
# Display the transcription and latency
st.success('Transcription Complete')
st.text_area("LLM Output:", value=pred_dict, height=200, max_chars=None)
st.write(f"Latency in CPU: {latency:.2f} ms")
except Exception as e:
st.error(f"An error occurred during transcription: {e}")