Update app.py
c2d51d1
import gradio as gr
import time
import openai
import json
import os

import torch
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForCTC

# Load the wav2vec2 acoustic model and its processor for speech-to-text
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
# asr_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-robust-ft-libri-960h")

# The OpenAI API key is read from the OPENAI_KEY environment variable
openai.api_key = os.environ.get('OPENAI_KEY')
def classify_audio(audio):
    # Transcribe the audio to text
    # audio_transcript = asr_pipeline(audio)["text"]
    # audio_transcript = audio_transcript.lower()

    # Gradio's "numpy" audio type yields a (sample_rate, samples) tuple; the
    # wav2vec2 processor expects a mono float waveform at 16 kHz, so uploads
    # at other rates may need resampling for best results
    sample_rate, speech = audio
    if speech.ndim > 1:
        speech = speech.mean(axis=1)  # down-mix stereo to mono
    speech = speech.astype("float32")
    input_values = processor(speech, sampling_rate=16000, return_tensors="pt", padding="longest").input_values

    # retrieve logits
    logits = model(input_values).logits

    # take argmax and decode
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    messages = [
        {"role": "system", "content": "Is this conversation a scam, spam, or safe? Answer only in JSON format with 'classification': '' as a string and 'reasons': '' as the most plausible reasons why. The reasons should explain to the potential victim why the conversation is probably a scam."},
        {"role": "user", "content": transcription},
    ]

    # Call the OpenAI API to generate a response
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=messages
    )

    # Extract the generated text and parse the JSON answer
    result = json.loads(response.choices[0].message['content'])

    # Get the decision and reasons from the JSON dictionary
    decision = result["classification"]
    reasons = result["reasons"]

    # Return the transcription and the prediction
    return transcription, decision, reasons
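
# --- Optional local sanity check (a sketch, not part of the original Space) ---
# Runs only when app.py is executed directly and a file named "sample.wav"
# happens to sit next to it; both the filename and the soundfile dependency
# are illustrative assumptions, not requirements of the app.
if __name__ == "__main__" and os.path.exists("sample.wav"):
    import soundfile as sf
    speech, sr = sf.read("sample.wav")
    print(classify_audio((sr, speech)))
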
# Build the Gradio UI: audio in, transcription + classification + reasons out
gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(source="upload", type="numpy"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Classification"),
        gr.Textbox(label="Reason"),
    ],
    live=True,
).launch()