import json
import os

import gradio as gr
import openai
import torch  # was missing: torch.argmax is used below
from transformers import AutoModelForCTC, AutoProcessor

# Load the ASR model once at module import so each request only runs inference.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")

openai.api_key = os.environ.get('OPENAI_KEY')


def classify_audio(audio):
    """Transcribe an uploaded audio clip and classify it as scam/spam/safe.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        The ``(sample_rate, data)`` pair produced by a Gradio Audio input
        with ``type="numpy"``.

    Returns
    -------
    tuple[str, str, str]
        ``(transcription, classification, reasons)`` for the three output
        textboxes.
    """
    # Gradio's numpy audio type yields (sample_rate, waveform); the processor
    # needs the raw waveform (and the rate, so it can warn on a mismatch).
    # NOTE(review): wav2vec2 expects 16 kHz input — browser uploads at other
    # rates are not resampled here; verify against real uploads.
    sample_rate, waveform = audio
    input_values = processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Inference only — disable autograd to avoid building a gradient graph.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding; batch_decode returns a list, and we sent a single
    # clip, so take the first (and only) transcription string.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    messages = [
        {"role": "system", "content": "Is this chat a scam, spam or is safe? Only answer in JSON format with 'classification': '' as string and 'reasons': '' as the most plausible reasons why. The reason should be explaning to the potential victim why the conversation is probably a scam"},
        {"role": "user", "content": transcription},
    ]

    # Ask GPT-4 for a JSON verdict on the transcribed conversation.
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages,
    )
    raw_reply = response.choices[0].message['content']

    # The model is prompted to reply in JSON, but an LLM can deviate;
    # fall back gracefully instead of crashing the UI on a parse error.
    try:
        parsed = json.loads(raw_reply)
        decision = parsed["classification"]
        reasons = parsed["reasons"]
    except (json.JSONDecodeError, KeyError):
        decision = "unknown"
        reasons = raw_reply

    return transcription, decision, reasons


gr.Interface(
    fn=classify_audio,
    inputs=gr.inputs.Audio(source="upload", type="numpy"),
    outputs=[
        gr.outputs.Textbox(label="Transcription"),
        gr.outputs.Textbox(label="Classification"),
        gr.outputs.Textbox(label="Reason"),
    ],
    live=True,
).launch()