Spaces:

BALAKA
/

ProfanityGuard

Runtime error

App Files Files Community

BALAKA committed on Jun 3, 2023

Commit

cd4c5a1

1 Parent(s): d0fd718

Create app.py

Browse files

Files changed (1) hide show

app.py +163 -0

app.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import gradio as gr
+import numpy as np
+import librosa
+import soundfile as sf
+import requests
+import torch
+import torchaudio
+import math
+import os
+from glob import glob
+from pytube import YouTube
+from transformers import (
+    Wav2Vec2CTCTokenizer,
+    Wav2Vec2FeatureExtractor,
+    Wav2Vec2Processor,
+    Wav2Vec2ForCTC,
+    TrainingArguments,
+    Trainer,
+    pipeline
+)
+# Module-level setup, executed once at import time.
+# `processor` is what the transcription code calls to turn raw audio into model
+# inputs and to decode predicted ids back into text.
+# NOTE(review): the processor and the model are loaded from different repos
+# ("airesearch/..." vs "BALAKA/...") — confirm that their vocabularies match.
+processor = Wav2Vec2Processor.from_pretrained("airesearch/wav2vec2-large-xlsr-53-th")
+# CTC acoustic model whose `.logits` are arg-maxed into token ids.
+model = Wav2Vec2ForCTC.from_pretrained("BALAKA/wav2vec2-large-xlsr-53-thai")
+# Root Gradio container; the UI is populated in the `with demo:` block below.
+demo = gr.Blocks()
+def check(sentence):
+    found = []
+    negative = ["กระดอ", "กระทิง", "กระสัน", "กระหรี่", "กรีด", "กวนส้นตีน", "กะหรี่", "กินขี้ปี้เยี่ยว", "ขายตัว", "ขี้", "ขโมย", "ข่มขืน", "ควย", "ควาย", "คอขาด", "ฆ่า", "จังไร", "จัญไร", "ฉิบหาย", "ฉี่", "ชั่ว", "ชาติหมา", "ชิงหมาเกิด", "ชิบหาย", "ช้างเย็ด", "ดาก", "ตอแหล", "ตัดหัว", "ตัดหำ", "ตาย", "ตีกัน", "ทรมาน", "ทาส", "ทุเรศ", "นรก", "บีบคอ", "ปากหมา", "ปี้กัน", "พ่อง", "พ่อมึง", "ฟักยู", "ฟาย", "ยัดแม่", "ยิงกัน", "ระยำ", "ดอกทอง", "โสเภณี", "ล่อกัน", "ศพ", "สถุล",
+                "สทุน", "สัด", "สันดาน", "สัส", "สาด", "ส้นตีน", "หน้าตัวเมืย", "ส้นตีน", "หมอย", "หรรม", "หัวแตก", "หำ", "หน้าหี", "น่าหี", "อนาจาร", "อัปปรี", "อีช้าง", "อีปลาวาฬ", "อีสัด", "อีหน้าหี", "อีหมา", "ห่า", "อับปรี", "เฆี่ยน", "เงี่ยน", "เจี๊ยว", "เชี่ย", "เด้า", "เผด็จการ", "เยี่ยว", "เย็ด", "เลือด", "เสือก", "เหล้า", "เหี้ย", "เอากัน", "แดก", "แตด", "แทง", "แม่ง", "แม่มึง", "แรด", "โคตร", "โง่", "โป๊", "โรคจิต", "ใจหมา", "ไอเข้", "ไอ้ขึ้หมา", "ไอ้บ้า", "ไอ้หมา", "เวร", "เวน"]
+    negative = list(dict.fromkeys(negative))
+    for i in negative:
+        if sentence.find(i) != -1:
+            found.append(i)
+    return found
+def resample(file_path):
+    speech_array, sampling_rate = torchaudio.load(file_path)
+    resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
+    return resampler(speech_array)[0].numpy()
+def tran_script(file_path):
+    if type(file_path) == 'str':
+        speech = resample(file_path)
+        inputs = processor(speech, sampling_rate=16_000,
+                           return_tensors="pt", padding=True)
+        logits = model(inputs.input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        predicted_sentence = processor.batch_decode(predicted_ids)
+        return predicted_sentence
+    else:
+        now_path = glob('/content/split_*.mp3')
+        sentence = []
+        for i in range(file_path - 1):
+            now_path = f'/content/split_{i+1}.mp3'
+            speech = resample(now_path)
+            inputs = processor(speech, sampling_rate=16_000,
+                               return_tensors="pt", padding=True)
+            logits = model(inputs.input_values).logits
+            predicted_ids = torch.argmax(logits, dim=-1)
+            predicted_sentence = processor.batch_decode(predicted_ids)
+            sentence.append(predicted_sentence)
+        return sentence
+def split_file(file_path):
+    speech, sample_rate = librosa.load(file_path)
+    buffer = 5 * sample_rate
+    samples_total = len(speech)
+    samples_wrote = 0
+    counter = 1
+    while samples_wrote < samples_total:
+        if buffer > (samples_total - samples_wrote):
+            buffer = samples_total - samples_wrote
+        block = speech[samples_wrote: (samples_wrote + buffer)]
+        out_filename = "split_" + str(counter) + ".mp3"
+        sf.write(out_filename, block, sample_rate)
+        counter += 1
+        samples_wrote += buffer
+    return counter
+def process(file_path):
+    if librosa.get_duration(filename=file_path) <= 5:
+        sentence = tran_script(file_path)
+        sentence = str(sentence).replace(' ', '').strip("[]grt")
+        return '[0.00-0.05] found : ' + check(sentence)
+    counter = split_file(file_path)
+    sentence = tran_script(counter)
+    result = ''
+    for index, item in enumerate(sentence):
+        now_sentence = item[0]
+        now_sentence = str(item).replace(' ', '').strip("[]grt")
+        now_sentence = check(now_sentence)
+        if now_sentence:
+            time = (index)*5
+            minutes = math.floor(time / 60)
+            hours = math.floor(minutes/60)
+            seconds = time % 60
+            minutes = str(minutes).zfill(2)
+            hours = str(hours).zfill(2)
+            fist_seconds = str(seconds).zfill(2)
+            last_seconds = str(seconds+5).zfill(2)
+            text = f'found at {hours}h {minutes}m {fist_seconds}-{last_seconds}seconds found {now_sentence}'
+            result += text + '\n'
+    return result
+def youtube_loader(link):
+    yt = YouTube(str(link))
+    video = yt.streams.filter(only_audio=True).first()
+    out_file = video.download(output_path='mp3')
+    os.rename(out_file, 'youtube.mp3')
+    return process('/content/mp3/youtube.mp3')
+def twitch_loader(link):
+    os.system(f"twitch-dl download -q audio_only {link} --output twitch.wav")
+    return process('/content/twitch.wav')
+with demo:
+    gr.Markdown("Select your input type.")
+    with gr.Tabs():
+        with gr.TabItem("From your voice."):
+            with gr.Row():
+                voice = gr.Audio(source="microphone", type="filepath",
+                                 optional=True, labe="Start record your voice here.")
+                voice_output = gr.Textbox()
+            text_button1 = gr.Button("Flip")
+        with gr.TabItem("From your file."):
+            with gr.Row():
+                file_input = gr.Audio(type="filepath", optional=True, labe="Drop your audio file here.")
+                file_output = gr.Textbox()
+            text_button4 = gr.Button("Flip")
+        with gr.TabItem("From youtube"):
+            with gr.Row():
+                youtube_input = gr.Textbox(
+                    label="Insert your youtube link here.", placeholder='https://www.youtube.com/watch?v=dQw4w9WgXcQ')
+                youtube_output = gr.Textbox()
+            text_button2 = gr.Button("Flip")
+        with gr.TabItem("From twitch"):
+            with gr.Row():
+                twitch_input = gr.Textbox(label="Insert your twitch link or ID here.",
+                                          placeholder='https://www.twitch.tv/videos/1823056925 or 1823056925')
+                twitch_output = gr.Textbox()
+            text_button3 = gr.Button("Flip")
+    text_button1.click(process, inputs=voice, outputs=voice_output)
+    text_button2.click(youtube_loader, inputs=youtube_input,
+                       outputs=youtube_output)
+    text_button3.click(twitch_loader, inputs=twitch_input,
+                       outputs=twitch_output)
+    text_button4.click(process, inputs=file_input,
+                       outputs=file_output)
+demo.launch(share=True, enable_queue=True)