Fixed audio
app.py
CHANGED
@@ -27,7 +27,6 @@ client = Groq(api_key=GROQ_API_KEY)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# -----------------------------
 # TEXT DETECTION
 # -----------------------------
 def run_hf_detector(text, model_id="roberta-base-openai-detector"):
@@ -154,43 +153,100 @@ def analyze_video(video_path):
 # AUDIO DETECTION
 # -----------------------------
 class AudioCNNRNN(nn.Module):
-    def __init__(self,lstm_hidden_size=128,num_classes=2):
-        super().__init__()
+    def __init__(self, lstm_hidden_size=128, num_classes=2):
+        super(AudioCNNRNN, self).__init__()
         self.cnn = nn.Sequential(
-            nn.Conv2d(1,32,3,1,1),
-            nn.
+            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
         )
-        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size,batch_first=True)
-        self.fc = nn.Linear(lstm_hidden_size,num_classes)
-
-
-
-
-
+        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, batch_first=True)
+        self.fc = nn.Linear(lstm_hidden_size, num_classes)
+
+    def forward(self, x):
+        batch_size, seq_len, c, h, w = x.size()
+        c_in = x.view(batch_size * seq_len, c, h, w)
+        features = self.cnn(c_in)
+        features = features.mean(dim=[2, 3])
+        features = features.view(batch_size, seq_len, -1)
+        lstm_out, _ = self.lstm(features)
+        out = self.fc(lstm_out[:, -1, :])
+        return out
 
 def extract_mel_spectrogram(audio_path, sr=16000, n_mels=64):
-    waveform,
-    mel_spec = librosa.feature.melspectrogram(waveform,sr,n_mels=n_mels)
-
-
-
-
-
+    waveform, sample_rate = librosa.load(audio_path, sr=sr)
+    mel_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
+    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+    return mel_spec_db
+
+def slice_spectrogram(mel_spec, slice_size=128, step=64):
+    slices = []
+    for start in range(0, mel_spec.shape[1] - slice_size, step):
+        slice_ = mel_spec[:, start:start + slice_size]
+        slices.append(slice_)
+    return slices
 def analyze_audio(audio_path):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
+
+    model = AudioCNNRNN()
+    model.eval()
+    model.to(device)
+
     mel_spec = extract_mel_spectrogram(audio_path)
-
-
-
-
-
-
-
-
-
-
-
+    mel_slices = slice_spectrogram(mel_spec, slice_size=128, step=64)
+
+    if len(mel_slices) == 0:
+        raise RuntimeError("No mel slices generated. Check audio length.")
+
+    tensor_slices = [torch.tensor(s).unsqueeze(0) for s in mel_slices]
+    data = torch.stack(tensor_slices)
+    data = data.unsqueeze(0)
+    data = data.to(device)
+
+    with torch.no_grad():
+        outputs = model(data)
+    logits = outputs
+
+    temperature = 3.0
+    probabilities = torch.nn.functional.softmax(logits / temperature, dim=-1)
+
+    ai_probability = probabilities[0][0].item()
+    human_probability = probabilities[0][1].item()
+
+    diff = abs(ai_probability - human_probability)
+    if diff >= 0.7:
+        confidence = "High"
+    elif diff >= 0.3:
+        confidence = "Medium"
+    else:
+        confidence = "Low"
+
+    prompt = f"""
+    You are an AI audio analysis expert.
+    The detector outputs:
+    - AI-generated probability: {ai_probability:.4f}
+    - Human-generated probability: {human_probability:.4f}
+    - Confidence level: {confidence}
+
+    Give a short, human-readable explanation (1-2 sentences) of why the audio was likely classified as {'AI-generated' if ai_probability > human_probability else 'human-generated'}.
+    Base it on audio cues such as tone, pitch patterns, unnatural pauses, synthesis artifacts, or other hints you might infer.
+    Avoid repeating probabilities; focus on the reasoning.
+    """
+
+    response = client.chat.completions.create(
+        model="llama-3.3-70b-versatile",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.6,
+    )
+
+    return {
+        "ai_probability": ai_probability,
+        "confidence": confidence,
+        "explanation": response.choices[0].message.content.strip()
+    }
 
 # -----------------------------
 # GRADIO UI
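Note on the tensor bookkeeping in the new analyze_audio path: each mel slice is (n_mels, slice_size), the slices are stacked into a sequence, and a batch dimension is added, which gives the (batch, seq_len, channels, height, width) layout that AudioCNNRNN.forward unpacks. A minimal shape check with a synthetic spectrogram standing in for a real file (the 300-frame length is an arbitrary assumption):

import numpy as np
import torch

n_mels, slice_size, step = 64, 128, 64
# Synthetic stand-in for extract_mel_spectrogram() output; 300 frames is arbitrary.
mel = np.random.randn(n_mels, 300).astype(np.float32)

# Same slicing as slice_spectrogram() and the same stacking as analyze_audio().
slices = [mel[:, s:s + slice_size] for s in range(0, mel.shape[1] - slice_size, step)]
data = torch.stack([torch.tensor(x).unsqueeze(0) for x in slices]).unsqueeze(0)
print(data.shape)  # torch.Size([1, 3, 1, 64, 128]) -> (batch, seq_len, c, h, w)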
@@ -202,61 +258,17 @@ def format_text_results(text):
 
 def format_image_results(image):
     res = analyze_image(image)
-    return f"### Image Detection\nAI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}\nExplanation: {res['explanation']}"
+    return f"### Image Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"
 
 def format_video_results(video_file):
     res = analyze_video(video_file)
     if "error" in res: return res["error"]
-    return f"### Video Detection\nAI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}\nExplanation: {res['explanation']}"
+    return f"### Video Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"
 
 def format_audio_results(audio_file):
     res = analyze_audio(audio_file)
-    return f"### Audio Detection\nAI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}\nExplanation: {res['explanation']}"
-
-# with gr.Blocks() as app:
-# home = gr.Column(visible=True)
-# with home:
-# gr.Markdown("## AI Multi-Modal Detector")
-# with gr.Row():
-# t_btn = gr.Button("Text")
-# i_btn = gr.Button("Image")
-# v_btn = gr.Button("Video")
-# a_btn = gr.Button("Audio")
-
-# text_page = gr.Column(visible=False)
-# with text_page:
-# inp = gr.Textbox(lines=5, placeholder="Paste text...", label="Text")
-# out = gr.Markdown()
-# gr.Button("Analyze").click(format_text_results, inputs=inp, outputs=out)
-# gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,text_page])
-
-# image_page = gr.Column(visible=False)
-# with image_page:
-# inp = gr.Image(type="pil")
-# out = gr.Markdown()
-# gr.Button("Analyze").click(format_image_results, inputs=inp, outputs=out)
-# gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,image_page])
-
-# video_page = gr.Column(visible=False)
-# with video_page:
-# inp = gr.Video()
-# out = gr.Markdown()
-# gr.Button("Analyze").click(format_video_results, inputs=inp, outputs=out)
-# gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,video_page])
-
-# audio_page = gr.Column(visible=False)
-# with audio_page:
-# inp = gr.Audio(type="filepath")
-# out = gr.Markdown()
-# gr.Button("Analyze").click(format_audio_results, inputs=inp, outputs=out)
-# gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,audio_page])
-
-# t_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,text_page])
-# i_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,image_page])
-# v_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,video_page])
-# a_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,audio_page])
-
-# app.launch(share=True)
+    return f"### Audio Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"
+
 with gr.Blocks() as app:
     # Home Page
     home_page = gr.Column(visible=True)
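Note on the format_* changes: the only difference is single "\n" separators becoming "\n\n". gr.Markdown renders Markdown, where a lone newline does not start a new paragraph, so the doubled newlines are what put Confidence and Explanation on their own lines. A small illustration with made-up values:

# Made-up result values; only the separators differ.
res = {"ai_probability": 0.9123, "confidence": "High"}
one_line = f"AI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}"     # renders on a single line
two_paras = f"AI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}"  # renders as separate paragraphs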
@@ -337,4 +349,7 @@ with gr.Blocks() as app:
     analyze_video_btn.click(format_video_results, inputs=video_input, outputs=video_output)
     analyze_audio_btn.click(format_audio_results, inputs=audio_input, outputs=audio_output)
 
-app.launch(share=True, debug=True)
+app.launch(share=True, debug=True)
+
+
+
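Worked example of the new confidence logic in analyze_audio (illustrative logits only, not real model output): the logits are divided by temperature 3.0 before softmax, which softens the distribution, and the confidence label is bucketed by the gap between the two class probabilities.

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, -1.0]])      # illustrative values only
probs = F.softmax(logits / 3.0, dim=-1)   # temperature scaling flattens the probabilities
ai_p, human_p = probs[0][0].item(), probs[0][1].item()
gap = abs(ai_p - human_p)
confidence = "High" if gap >= 0.7 else "Medium" if gap >= 0.3 else "Low"
print(round(ai_p, 3), round(human_p, 3), confidence)  # 0.731 0.269 Medium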