import torch
import gradio as gr

import models as MOD
import process_data as PD
from transformers import pipeline
from manipulate_model.utils import get_config_and_model, infere

# Registry of available spoof-detection models: each entry carries the EER
# decision threshold, the preprocessing function (in process_data), a note
# for the UI, the model class (in models), and the checkpoint file.
model_master = {
    "SSL-AASIST (Trained on ASV-Spoof5)": {
        "eer_threshold": 3.3330237865448,
        "data_process_func": "process_ssl_assist_input",
        "note": "This model is trained only on ASVSpoof 2024 training data.",
        "model_class": "Model",
        "model_checkpoint": "ssl_aasist_epoch_7.pth",
    },
    "AASIST": {
        "eer_threshold": 1.8018419742584229,
        "data_process_func": "process_assist_input",
        "note": "This model is trained on ASVSpoof 2024 training data.",
        "model_class": "AASIST_Model",
        "model_checkpoint": "orig_aasist_epoch_1.pth",
    },
}

# Load the default spoof-detection model (SSL-AASIST) on CPU.
model = MOD.Model(None, "cpu")
model.load_state_dict(torch.load("ssl_aasist_epoch_7.pth", map_location="cpu"))
model.eval()
loaded_model = "SSL-AASIST (Trained on ASV-Spoof5)"

manipulate_config, manipulate_model = get_config_and_model()


def process(file, type):
    """Run the selected spoof-detection model on an audio file."""
    global model
    global loaded_model

    inp = getattr(PD, model_master[type]["data_process_func"])(file)

    # Swap the cached model only when the user selects a different one.
    if loaded_model != type:
        model = getattr(MOD, model_master[type]["model_class"])(None, "cpu")
        model.load_state_dict(
            torch.load(model_master[type]["model_checkpoint"], map_location="cpu")
        )
        model.eval()
        loaded_model = type

    op = model(inp).detach().squeeze()[1].item()
    response_text = (
        "Decision score: {}\n"
        "Decision threshold: {}\n"
        "Notes: 1. Any score below the threshold is indicative of fake.\n"
        "2. {}".format(op, model_master[type]["eer_threshold"], model_master[type]["note"])
    )
    return response_text


demo = gr.Blocks()

file_proc = gr.Interface(
    fn=process,
    inputs=[
        gr.Audio(sources=["upload"], label="Audio file", type="filepath"),
        gr.Radio(
            ["SSL-AASIST (Trained on ASV-Spoof5)", "AASIST"],
            label="Select Model",
            type="value",
        ),
    ],
    outputs="text",
    title="Find the Fake: Analyze 'Real' or 'Fake'.",
    description="Analyze fake or real with a click of a button. Upload a .wav or .flac file.",
    examples=[
        ["./bonafide.flac", "SSL-AASIST (Trained on ASV-Spoof5)"],
        ["./fake.flac", "SSL-AASIST (Trained on ASV-Spoof5)"],
        ["./bonafide.flac", "AASIST"],
        ["./fake.flac", "AASIST"],
    ],
    cache_examples=True,
    allow_flagging="never",
)

#####################################################################################
# ASR interface

pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-large-v3",
    chunk_length_s=30,
    device="cpu",
)


def transcribe(inputs):
    """Transcribe an audio file and identify its language with Whisper."""
    if inputs is None:
        raise gr.Error(
            "No audio file submitted! Please upload or record an audio file "
            "before submitting your request."
        )

    op = pipe(
        inputs,
        batch_size=8,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=False,
        return_language=True,
    )
    lang = op["chunks"][0]["language"]
    text = op["text"]
    return lang, text


transcribe_proc = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Speech file (<30s)",
            max_length=30,
            sources=["microphone", "upload"],
            show_download_button=True,
        )
    ],
    outputs=[
        gr.Text(label="Predicted Language", info="Language identification is performed automatically."),
        gr.Text(label="Predicted transcription", info="Best hypothesis."),
    ],
    title="Transcribe Anything.",
    description="Automatic language identification and transcription by Whisper Large V3. Upload a .wav or .flac file.",
    allow_flagging="never",
)
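# Quick local sanity check (a sketch, not part of the app): the two callbacks
# above can be exercised directly with the bundled example files, without
# launching the UI. Uncomment to run:
#
#   print(process("./bonafide.flac", "AASIST"))
#   print(transcribe("./bonafide.flac"))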
#############################################################################################
# Manipulation detection interface

def detect_manipulation(inputs):
    """Run the manipulation-detection model on an audio file."""
    global manipulate_model
    global manipulate_config
    out = infere(manipulate_model, inputs, manipulate_config)
    out = out.tolist()
    return str(out)


manipulate_proc = gr.Interface(
    fn=detect_manipulation,
    inputs=[
        gr.Audio(
            type="filepath",
            label="Speech file (<30s)",
            max_length=30,
            sources=["microphone", "upload"],
            show_download_button=True,
        )
    ],
    outputs=[
        gr.Text(label="Predicted manipulations", info="Manipulation detection is performed automatically."),
    ],
    title="Find the manipulated segments",
    description="Automatic manipulation detection. Upload an audio file.",
    allow_flagging="never",
)

# Combine the three interfaces into a tabbed app.
with demo:
    gr.TabbedInterface(
        [file_proc, transcribe_proc, manipulate_proc],
        ["Analyze Audio File", "Transcribe Audio File", "Manipulation Detection"],
    )

demo.queue(max_size=10)
demo.launch(share=True)
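# Direct-call sketch for the manipulation detector (assumes, as the Gradio
# callback above does, that `infere` accepts a file path). Uncomment to try:
#
#   print(detect_manipulation("./bonafide.flac"))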