"""Gradio app that flags hate speech in an uploaded audio file.

The audio is scored by a remote audio classifier, transcribed locally with
Vosk, and the transcript is scored by two remote text classifiers (English
and Hinglish) plus a profanity word list loaded from a CSV file.
"""

import json
import logging
import wave

import gradio as gr
import pandas as pd
from gradio_client import Client, handle_file
from vosk import KaldiRecognizer, Model

logger = logging.getLogger(__name__)

# Remote Gradio apps queried through gradio_client.
clientEngText = Client("dj-dawgs-ipd/IPD-Text-English-Finetune")
clientHingText = Client("dj-dawgs-ipd/IPD-Text-Hinglish")
clientAud = Client("dj-dawgs-ipd/IPD_Audio_HuBERT")

profanity_df = pd.read_csv('Hinglish_Profanity_List.csv', encoding='utf-8')
profanity_hn = profanity_df['profanity_hn']

# Lookup set built from the column *values*.  `word in profanity_hn` would
# test the Series' index labels instead, so it never matches a word.
# Entries are lower-cased because transcripts are lower-cased before lookup.
_PROFANITY_WORDS = frozenset(
    profanity_hn.dropna().astype(str).str.strip().str.lower()
)

vosk_model = Model(lang="en-us")

# A classifier verdict counts as hate only above this confidence.
HATE_THRESHOLD = 0.6
# Only this many leading transcript characters are sent to the text models.
_MAX_TEXT_CHARS = 200
# Number of audio frames fed to the recognizer per read.
_FRAMES_PER_CHUNK = 4000

# Labels each model uses for its "not hate" class.
_ENG_NEUTRAL_LABEL = "NEITHER"
_HING_NEUTRAL_LABEL = "NAG"
_AUDIO_HATE_LABEL = 'Hate Speech\n'

# Alternative Whisper-based transcriber, currently disabled:
#
# import whisper
#
# def stt_whisper(file_path):
#     model = whisper.load_model("base")
#     try:
#         result = model.transcribe(file_path)
#         return result["text"]
#     except Exception as e:
#         print(e)
#         return ""


def stt_vosk(file_path):
    """Transcribe a WAV file with the module-level Vosk model.

    Args:
        file_path: Path to a file readable by the ``wave`` module.

    Returns:
        The recognized text, or ``""`` if the file cannot be read or
        recognition fails (best-effort; the failure is logged).
    """
    try:
        # The context manager closes the file even if recognition raises.
        with wave.open(file_path, "rb") as wf:
            rec = KaldiRecognizer(vosk_model, wf.getframerate())
            rec.SetWords(True)
            rec.SetPartialWords(True)
            # readframes() returns b"" once the file is exhausted.
            for chunk in iter(lambda: wf.readframes(_FRAMES_PER_CHUNK), b""):
                rec.AcceptWaveform(chunk)
        return json.loads(rec.FinalResult())["text"]
    except Exception:
        logger.exception("Vosk transcription failed for %r", file_path)
        return ""


def extract_text(audio_path):
    """Return the lower-cased Vosk transcript of ``audio_path``."""
    return stt_vosk(audio_path).lower()


def _hate_score(label, confidence, neutral_label):
    """Turn a (label, confidence) verdict into a score for the hate class.

    If the model predicted its neutral label, the confidence refers to
    "not hate", so the complement is returned.
    """
    return confidence if label != neutral_label else 1 - confidence


def predict_hate_speech(audio_path):
    """Classify an audio file as hate / not hate.

    Args:
        audio_path: Filesystem path of the uploaded audio file.

    Returns:
        A dict with the keys ``prediction``, ``language``, ``label``,
        ``confidence`` and ``hate_text``.  A profanity-list hit takes
        precedence over the classifier verdicts.

    Raises:
        gr.Error: If no audio file was supplied.
    """
    if not audio_path:
        raise gr.Error("Please upload an audio file.")

    audResult = clientAud.predict(
        audio_path=handle_file(audio_path),
        api_name="/predict"
    )
    # NOTE(review): assumes the audio app returns a dict rendered as a string
    # with single quotes; they are swapped so json can parse it - confirm.
    audResult = json.loads(audResult.replace("'", '"'))

    stt_text = extract_text(audio_path)
    snippet = stt_text[:_MAX_TEXT_CHARS]

    engResult = clientEngText.predict(text=snippet, api_name="/predict")
    hingResult = clientHingText.predict(text=snippet, api_name="/predict")

    profanityFound = [
        word for word in stt_text.split() if word in _PROFANITY_WORDS
    ]
    if profanityFound:
        return {
            'prediction': 'hate',
            'language': 'Hindi',
            'label': 'Profanity found',
            'confidence': None,
            'hate_text': ",".join(profanityFound),
        }

    eng_label, eng_score = engResult[0], engResult[1]
    hing_label, hing_score = hingResult[0], hingResult[1]
    aud_label = audResult['Classification']
    aud_score = audResult['Confidence']

    isHate = (
        (eng_label != _ENG_NEUTRAL_LABEL and eng_score > HATE_THRESHOLD)
        or (hing_label != _HING_NEUTRAL_LABEL and hing_score > HATE_THRESHOLD)
        or (aud_label == _AUDIO_HATE_LABEL and aud_score > HATE_THRESHOLD)
    )
    if not isHate:
        return {
            'prediction': 'not_hate',
            'language': None,
            'label': None,
            'confidence': None,
            'hate_text': None,
        }

    engConf = _hate_score(eng_label, eng_score, _ENG_NEUTRAL_LABEL)
    # Uses the same neutral label ("NAG") as the Hinglish check above.
    hingConf = _hate_score(hing_label, hing_score, _HING_NEUTRAL_LABEL)
    audConf = aud_score if aud_label == _AUDIO_HATE_LABEL else 1 - aud_score
    confidence = (engConf + hingConf + audConf) / 3

    return {
        'prediction': 'hate',
        'language': 'English' if engConf > hingConf else 'Hinglish',
        'label': None,
        'confidence': confidence,
        'hate_text': stt_text,
    }


iface = gr.Interface(
    fn=predict_hate_speech,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.JSON(),
    title="Hate Speech Audio Pipeline",
    description="Upload an audio file to detect potential hate speech content.",
    examples=[
        ["hate_1.wav"],
        ["hate_2.wav"]
    ],
    allow_flagging="manual"
)

if __name__ == "__main__":
    iface.launch()