File size: 5,374 Bytes
45a4975
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from flask import Flask, render_template, request, jsonify
from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForXVector
import torchaudio
import torch
import io
import librosa
from scipy.spatial.distance import cosine
import numpy as np
import os
# brew install ffmpeg
# pip install flask transformers librosa torch torchaudio

# Flask application object; static assets are served under /static.
app = Flask(__name__, static_url_path='/static')

# Reference recording compared against uploads in /compare_audio.
# https://www.youtube.com/watch?v=NjR6TyHgAho first 30s
mp3_file_path = "arnold.mp3"

# Reference recording compared against uploads in /compare_audio2.
# https://neets.ai/ "With great power comes great responsibility"
mp3_file_path2 = 'arnold2.wav'

# Flags are read once at import time; a missing flag file aborts startup.
with open("flag1.txt") as f:
    flag1 = f.read()
with open("flag2.txt") as f:
    flag2 = f.read()

# Load feature extractor and model. A local "model" directory, when present,
# is used instead of the hub checkpoint name.
themodel = "model" if os.path.exists("model") else "microsoft/unispeech-sat-large-sv"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(themodel)
model = UniSpeechSatForXVector.from_pretrained(themodel)

def preprocess_audio(audio_data):
    """Decode an audio source into a mono, 16 kHz NumPy waveform.

    Args:
        audio_data: Any source accepted by ``torchaudio.load``; callers in
            this file pass either a file path or an uploaded file object.

    Returns:
        The samples as a NumPy array with singleton dimensions squeezed out.
    """
    target_rate = 16000
    samples, source_rate = torchaudio.load(audio_data)

    # More than one entry along dim 0 means multi-channel audio: average
    # the channels down to a single one.
    channel_count = samples.shape[0]
    if channel_count > 1:
        samples = samples.mean(dim=0, keepdim=True)

    # Resample only when the source is not already at the target rate.
    if source_rate != target_rate:
        resampler = torchaudio.transforms.Resample(source_rate, target_rate)
        samples = resampler(samples)

    return samples.squeeze().numpy()

@app.route('/')
def index():
    """Render the ``index.html`` template for the root URL."""
    return render_template('index.html')

@app.route('/chal2')
def chal2():
    """Render the ``chal2.html`` template for the ``/chal2`` URL."""
    return render_template('chal2.html')

# Hugging Face doesn't run an API for us anymore, so processing data against a model needs to be done locally now.
# https://www.ktskumar.com/2021/12/introduction-to-voice-authentication-using-javascript/
# https://hf.space/gradioiframe/microsoft/unispeech-speaker-verification/api/predict
# You can buy something similar through Azure; perhaps Microsoft just wanted to commercialize this.
# Normalized reference embeddings keyed by file path, so a reference
# recording is decoded and embedded only once per process.
_reference_embeddings = {}


def _speaker_embedding(waveform):
    """Return the L2-normalized speaker embedding for a preprocessed waveform."""
    inputs = feature_extractor(waveform, return_tensors="pt")
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        embeddings = model(**inputs).embeddings
    return torch.nn.functional.normalize(embeddings, dim=-1).cpu()


def _reference_embedding(path):
    """Return the embedding of the recording at *path*, computing it on first use."""
    if path not in _reference_embeddings:
        _reference_embeddings[path] = _speaker_embedding(preprocess_audio(path))
    return _reference_embeddings[path]


@app.route('/compare_audio', methods=['POST'])
def compare_audio():
    """Compare an uploaded recording against the challenge 1 reference voice.

    Reads the uploaded file from the ``audio_data`` form field, embeds it with
    the speaker-verification model and computes the cosine similarity
    (rounded to three decimals) with the embedding of ``mp3_file_path``.

    Returns:
        A JSON response. ``{'result': ...}`` holds an HTML snippet with either
        the failure message or the success message including ``flag1``.
        ``{'error': ...}`` is returned when any step raises.
    """
    threshold = 0.89  # Adjust the threshold as needed
    try:
        # Get the recorded audio file from the frontend and embed it.
        recorded_audio = request.files['audio_data']
        recorded_embedding = _speaker_embedding(preprocess_audio(recorded_audio))

        # The reference embedding is cached after the first request.
        reference_embedding = _reference_embedding(mp3_file_path)

        similarity = torch.nn.functional.cosine_similarity(
            recorded_embedding, reference_embedding, dim=-1
        ).item()
        similarity = round(similarity, 3)

        # Grant access only on a positive comparison so that a NaN similarity
        # (every comparison with NaN is False) fails closed instead of
        # falling through to the success branch.
        if similarity >= threshold:
            result = (
                f"Good job! Match: {similarity!s}<br>{flag1}"
                "<br><a href='/chal2'>Click here to open the next challenge</a>"
            )
        else:
            result = (
                f"Authorization Failed! {similarity!s} < {threshold:.3f}"
                "<br>Do your best Terminator impression"
            )

        return jsonify({'result': result})
    except Exception as e:
        # Top-level request boundary: log with traceback, answer generically.
        app.logger.exception("Caught: %s", e)
        return jsonify({'error': 'An error occurred during audio comparison. Im fragile please dont abuse.' })

def extract_mfcc(audio_bytes):
    """Compute 13 MFCC coefficients for an encoded audio clip.

    Args:
        audio_bytes: Raw bytes of an audio file; handed to
            ``preprocess_audio2`` for decoding.

    Returns:
        The array produced by ``librosa.feature.mfcc`` for the preprocessed
        waveform, using a 16000 Hz sample rate.
    """
    return librosa.feature.mfcc(
        y=preprocess_audio2(audio_bytes),
        sr=16000,
        n_mfcc=13,
    )

def preprocess_audio2(audio_bytes):
    """Decode audio bytes into a mono, 16 kHz, silence-trimmed NumPy waveform.

    Args:
        audio_bytes: Raw bytes of an audio file in a format that
            ``torchaudio.load`` can decode.

    Returns:
        The samples as a NumPy array with singleton dimensions squeezed out,
        after leading and trailing silence (below 20 dB relative to the peak)
        has been removed.
    """
    # Load the audio bytes into a torchaudio waveform
    waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))

    # Ensure the audio has a single channel (mono)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample the audio to 16kHz if needed
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

    # librosa works on NumPy arrays: convert before trimming rather than
    # handing it a torch.Tensor and relying on implicit conversion, which
    # librosa versions that validate the input type reject.
    samples = waveform.numpy()

    # Trim silence at beginning and end
    trimmed, _ = librosa.effects.trim(samples, top_db=20)

    return trimmed.squeeze()

@app.route('/compare_audio2', methods=['POST'])
def compare_audio2():
    """Compare an uploaded recording against the challenge 2 reference clip.

    Reads the uploaded file from the ``audio_data`` form field, extracts MFCCs
    from it and from ``mp3_file_path2``, averages each along axis 1 and
    computes ``1 - cosine distance`` between the two mean vectors, rounded to
    three decimals.

    Returns:
        A JSON response. ``{'result': ...}`` holds an HTML snippet with either
        the failure message or the success message including ``flag2``.
        ``{'error': ...}`` is returned when any step raises.
    """
    threshold = 0.940
    try:
        recorded_audio = request.files['audio_data'].read()
        # Close the reference file deterministically instead of leaking the
        # handle until garbage collection.
        with open(mp3_file_path2, 'rb') as reference_file:
            mp3_audio = reference_file.read()

        # Compare similarity between audio
        mfcc1 = extract_mfcc(recorded_audio)
        mfcc2 = extract_mfcc(mp3_audio)
        similarity = 1 - cosine(np.mean(mfcc1, axis=1), np.mean(mfcc2, axis=1))
        similarity = round(similarity, 3)

        # Grant access only on a positive comparison so that a NaN similarity
        # (every comparison with NaN is False) fails closed instead of
        # falling through to the success branch.
        if similarity >= threshold:
            result = f"Good job! Match: {similarity!s}<br>{flag2}"
        else:
            result = (
                f"Authorization Failed! {similarity!s} < {threshold:.3f}"
                "<br>Say: 'With great power comes great responsibility' as Arnold Schwarzenegger"
            )

        return jsonify({'result': result})
    except Exception as e:
        # Top-level request boundary: log with traceback, answer generically.
        app.logger.exception("Caught: %s", e)
        return jsonify({'error': 'An error occurred during audio comparison. Im fragile please dont abuse.'})

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=8080, debug=True)