import gradio as gr
import json
import os

import openai
import torch
from transformers import AutoProcessor, AutoModelForCTC, pipeline

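# Load the wav2vec2 CTC checkpoint used for speech-to-text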
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-large-robust-ft-libri-960h")

# asr_pipeline = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-robust-ft-libri-960h")

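# The OpenAI key is read from the OPENAI_KEY environment variable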
openai.api_key = os.environ.get('OPENAI_KEY')

def classify_audio(audio):
    # Transcribe the audio to text
    # audio_transcript = asr_pipeline(audio)["text"]
    # audio_transcript = audio_transcript.lower()

    # Gradio's numpy audio input arrives as a (sample_rate, samples) tuple
    sample_rate, samples = audio

    # Convert to mono float32 in [-1, 1]; this wav2vec2 checkpoint expects 16 kHz audio,
    # so clips recorded at other rates would need to be resampled first
    samples = samples.astype("float32")
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    samples /= max(abs(samples).max(), 1e-9)

    input_values = processor(samples, return_tensors="pt", padding="longest").input_values

    # Retrieve logits, take the argmax over the vocabulary, and decode to text
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

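    # Ask the chat model to classify the transcript; the prompt requests a strict JSON reply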
    messages = [
        {"role": "system", "content": "Is this chat a scam, spam, or safe? Answer only in JSON format with 'classification': '' as a string and 'reasons': '' as the most plausible reasons why. The reasons should explain to the potential victim why the conversation is probably a scam."},
        {"role": "user", "content": transcription},
    ]

    # Call the OpenAI API to generate a response
    response = openai.ChatCompletion.create(
        model="gpt-4",  # Replace with the actual GPT-4 model ID
        messages=messages
    )

    # Extract the generated text and parse the JSON reply
    content = response.choices[0].message["content"]
    result = json.loads(content)

    # Get the decision and reasons from the JSON dictionary
    decision = result["classification"]
    reasons = result["reasons"]

    # Return the transcription and the prediction as a dictionary
    return transcription, decision, reasons

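# Simple Gradio UI: upload an audio clip, show the transcript, the classification, and the reasons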
gr.Interface(
    fn=classify_audio,
    inputs=gr.inputs.Audio(source="upload", type="numpy"),
    outputs=[
        gr.outputs.Textbox(label="Transcription"),
        gr.outputs.Textbox(label="Classification"),
        gr.outputs.Textbox(label="Reason"),
    ],
    live=True
).launch()