satyahaha committed (verified)
Commit ff9e1fa · 1 Parent(s): 1e00889

Fixed audio

Files changed (1):
  1. app.py +95 -80
app.py CHANGED
@@ -27,7 +27,6 @@ client = Groq(api_key=GROQ_API_KEY)
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# -----------------------------
 # TEXT DETECTION
 # -----------------------------
 def run_hf_detector(text, model_id="roberta-base-openai-detector"):
@@ -154,43 +153,100 @@ def analyze_video(video_path):
 # AUDIO DETECTION
 # -----------------------------
 class AudioCNNRNN(nn.Module):
-    def __init__(self,lstm_hidden_size=128,num_classes=2):
-        super().__init__()
+    def __init__(self, lstm_hidden_size=128, num_classes=2):
+        super(AudioCNNRNN, self).__init__()
         self.cnn = nn.Sequential(
-            nn.Conv2d(1,32,3,1,1), nn.ReLU(), nn.MaxPool2d(2),
-            nn.Conv2d(32,64,3,1,1), nn.ReLU(), nn.MaxPool2d(2)
+            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
+            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2),
         )
-        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size,batch_first=True)
-        self.fc = nn.Linear(lstm_hidden_size,num_classes)
-    def forward(self,x):
-        b,s,c,h,w = x.size()
-        x = self.cnn(x.view(b*s,c,h,w)).mean(dim=[2,3]).view(b,s,-1)
-        out,_ = self.lstm(x)
-        return self.fc(out[:,-1,:])
+        self.lstm = nn.LSTM(input_size=64, hidden_size=lstm_hidden_size, batch_first=True)
+        self.fc = nn.Linear(lstm_hidden_size, num_classes)
+
+    def forward(self, x):
+        batch_size, seq_len, c, h, w = x.size()
+        c_in = x.view(batch_size * seq_len, c, h, w)
+        features = self.cnn(c_in)
+        features = features.mean(dim=[2, 3])
+        features = features.view(batch_size, seq_len, -1)
+        lstm_out, _ = self.lstm(features)
+        out = self.fc(lstm_out[:, -1, :])
+        return out
 
 def extract_mel_spectrogram(audio_path, sr=16000, n_mels=64):
-    waveform,_ = librosa.load(audio_path,sr=sr)
-    mel_spec = librosa.feature.melspectrogram(waveform,sr,n_mels=n_mels)
-    return librosa.power_to_db(mel_spec,ref=np.max)
-
-def slice_spectrogram(mel_spec,slice_size=128,step=64):
-    return [mel_spec[:,i:i+slice_size] for i in range(0, mel_spec.shape[1]-slice_size, step)]
-
+    waveform, sample_rate = librosa.load(audio_path, sr=sr)
+    mel_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
+    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+    return mel_spec_db
+
+def slice_spectrogram(mel_spec, slice_size=128, step=64):
+    slices = []
+    for start in range(0, mel_spec.shape[1] - slice_size, step):
+        slice_ = mel_spec[:, start:start + slice_size]
+        slices.append(slice_)
+    return slices
 def analyze_audio(audio_path):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = AudioCNNRNN().to(device).eval()
+
+    model = AudioCNNRNN()
+    model.eval()
+    model.to(device)
+
     mel_spec = extract_mel_spectrogram(audio_path)
-    slices = slice_spectrogram(mel_spec)
-    if not slices: return {"ai_probability":0,"confidence":"Low","explanation":"Audio too short."}
-    data = torch.stack([torch.tensor(s).unsqueeze(0) for s in slices]).unsqueeze(0).to(device)
-    with torch.no_grad(): logits = model(data)
-    probabilities = torch.nn.functional.softmax(logits/3.0, dim=-1)[0]
-    ai_prob,human_prob = probabilities[0].item(),probabilities[1].item()
-    diff = abs(ai_prob-human_prob)
-    confidence = "High" if diff>=0.7 else "Medium" if diff>=0.3 else "Low"
-    prompt = f"Audio AI:{ai_prob:.4f} Human:{human_prob:.4f} Confidence:{confidence}. Explain reasoning."
-    response = client.chat.completions.create(model="llama-3.3-70b-versatile", messages=[{"role":"user","content":prompt}], temperature=0.6)
-    return {"ai_probability":ai_prob,"confidence":confidence,"explanation":response.choices[0].message.content.strip()}
+    mel_slices = slice_spectrogram(mel_spec, slice_size=128, step=64)
+
+    if len(mel_slices) == 0:
+        raise RuntimeError("No mel slices generated. Check audio length.")
+
+    tensor_slices = [torch.tensor(s).unsqueeze(0) for s in mel_slices]
+    data = torch.stack(tensor_slices)
+    data = data.unsqueeze(0)
+    data = data.to(device)
+
+    with torch.no_grad():
+        outputs = model(data)
+        logits = outputs
+
+    temperature = 3.0
+    probabilities = torch.nn.functional.softmax(logits / temperature, dim=-1)
+
+    ai_probability = probabilities[0][0].item()
+    human_probability = probabilities[0][1].item()
+
+    diff = abs(ai_probability - human_probability)
+    if diff >= 0.7:
+        confidence = "High"
+    elif diff >= 0.3:
+        confidence = "Medium"
+    else:
+        confidence = "Low"
+
+    prompt = f"""
+    You are an AI audio analysis expert.
+    The detector outputs:
+    - AI-generated probability: {ai_probability:.4f}
+    - Human-generated probability: {human_probability:.4f}
+    - Confidence level: {confidence}
+
+    Give a short, human-readable explanation (1-2 sentences) of why the audio was likely classified as {'AI-generated' if ai_probability > human_probability else 'human-generated'}.
+    Base it on audio cues such as tone, pitch patterns, unnatural pauses, synthesis artifacts, or other hints you might infer.
+    Avoid repeating probabilities; focus on the reasoning.
+    """
+
+    response = client.chat.completions.create(
+        model="llama-3.3-70b-versatile",
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.6,
+    )
+
+    return {
+        "ai_probability": ai_probability,
+        "confidence": confidence,
+        "explanation": response.choices[0].message.content.strip()
+    }
 
 # -----------------------------
 # GRADIO UI
@@ -202,61 +258,17 @@ def format_text_results(text):
 
 def format_image_results(image):
     res = analyze_image(image)
-    return f"### Image Detection\nAI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}\nExplanation: {res['explanation']}"
+    return f"### Image Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"
 
 def format_video_results(video_file):
     res = analyze_video(video_file)
     if "error" in res: return res["error"]
-    return f"### Video Detection\nAI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}\nExplanation: {res['explanation']}"
+    return f"### Video Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"
 
 def format_audio_results(audio_file):
     res = analyze_audio(audio_file)
-    return f"### Audio Detection\nAI Probability: {res['ai_probability']:.4f}\nConfidence: {res['confidence']}\nExplanation: {res['explanation']}"
-
-# with gr.Blocks() as app:
-#     home = gr.Column(visible=True)
-#     with home:
-#         gr.Markdown("## AI Multi-Modal Detector")
-#         with gr.Row():
-#             t_btn = gr.Button("Text")
-#             i_btn = gr.Button("Image")
-#             v_btn = gr.Button("Video")
-#             a_btn = gr.Button("Audio")
-
-#     text_page = gr.Column(visible=False)
-#     with text_page:
-#         inp = gr.Textbox(lines=5, placeholder="Paste text...", label="Text")
-#         out = gr.Markdown()
-#         gr.Button("Analyze").click(format_text_results, inputs=inp, outputs=out)
-#         gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,text_page])
-
-#     image_page = gr.Column(visible=False)
-#     with image_page:
-#         inp = gr.Image(type="pil")
-#         out = gr.Markdown()
-#         gr.Button("Analyze").click(format_image_results, inputs=inp, outputs=out)
-#         gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,image_page])
-
-#     video_page = gr.Column(visible=False)
-#     with video_page:
-#         inp = gr.Video()
-#         out = gr.Markdown()
-#         gr.Button("Analyze").click(format_video_results, inputs=inp, outputs=out)
-#         gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,video_page])
-
-#     audio_page = gr.Column(visible=False)
-#     with audio_page:
-#         inp = gr.Audio(type="filepath")
-#         out = gr.Markdown()
-#         gr.Button("Analyze").click(format_audio_results, inputs=inp, outputs=out)
-#         gr.Button("Back").click(lambda: (gr.update(visible=True), gr.update(visible=False)), outputs=[home,audio_page])
-
-#     t_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,text_page])
-#     i_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,image_page])
-#     v_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,video_page])
-#     a_btn.click(lambda: (gr.update(visible=False), gr.update(visible=True)), outputs=[home,audio_page])
-
-# app.launch(share=True)
+    return f"### Audio Detection\nAI Probability: {res['ai_probability']:.4f}\n\nConfidence: {res['confidence']}\n\nExplanation: {res['explanation']}"
+
 with gr.Blocks() as app:
     # Home Page
     home_page = gr.Column(visible=True)
@@ -337,4 +349,7 @@ with gr.Blocks() as app:
     analyze_video_btn.click(format_video_results, inputs=video_input, outputs=video_output)
     analyze_audio_btn.click(format_audio_results, inputs=audio_input, outputs=audio_output)
 
-app.launch(share=True, debug=True)
+app.launch(share=True, debug=True)
+
+
+
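For reference, a minimal shape-check sketch of the new audio path (not part of the commit): it assumes `AudioCNNRNN` and `slice_spectrogram` from the diff above are in scope, and feeds a synthetic random spectrogram instead of a real file, so no audio input or Groq key is needed and the untrained model's scores are arbitrary.

```python
import numpy as np
import torch

# Synthetic stand-in for extract_mel_spectrogram(path): 64 mel bands x 500 frames.
mel_spec = np.random.randn(64, 500).astype(np.float32)

# slice_spectrogram -> list of (64, 128) windows with a hop of 64 frames.
slices = slice_spectrogram(mel_spec, slice_size=128, step=64)
print(len(slices))  # 6 windows for 500 frames

# Shape the model expects: (batch, seq_len, channels, height, width).
data = torch.stack([torch.tensor(s).unsqueeze(0) for s in slices]).unsqueeze(0)
print(data.shape)  # torch.Size([1, 6, 1, 64, 128])

model = AudioCNNRNN().eval()
with torch.no_grad():
    logits = model(data)  # (1, 2); weights are untrained here, so values are arbitrary
probs = torch.softmax(logits / 3.0, dim=-1)
print(probs)
```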