Spaces:

mpc001
/

auto_avsr

Runtime error

File size: 3,824 Bytes

a35325d
 
c2d564e
a35325d
 
63e2ee7
 
 
 
 
 
 
 
 
 
 
ccfc7d3
63e2ee7
 
 
7b95e93
63e2ee7
 
 
 
 
 
7b95e93
 
 
63e2ee7
ccfc7d3
63e2ee7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2d564e
 
a35325d
d577e1e
 
 
a35325d
 
 
c2d564e
 
 
 
 
 
a35325d
c2d564e
 
a35325d
 
 
63e2ee7
a35325d
 
63e2ee7
4c034e2
a35325d
 
 
 
63e2ee7
557008d

import os
import shlex
import subprocess
from uuid import uuid4

import gradio as gr

from pipelines.pipeline import InferencePipeline

# HTML header for the demo page: a centered <h1> with the demo name followed
# by a short tagline. Rendered as the first element of the layout through
# gr.HTML(TITLE).
TITLE = """
    <div style="text-align: center; max-width: 650px; margin: 0 auto;">
        <div
        style="
            display: inline-flex;
            align-items: center;
            gap: 0.8rem;
            font-size: 1.75rem;
        "
        >
        <h1 style="font-weight: 900; margin-bottom: 7px;">
            Auto-AVSR: Audio-Visual Speech Recognition
        </h1>
        </div>
        <p style="margin-bottom: 10px; font-size: 94%">
        Want to recognize content in a noisy environment?<br>Our Auto-AVSR models are here to transcribe your answers from audio or visual information!
        </p>
    </div>
"""

# HTML footer for the demo page: links to the training code, the paper and a
# Colab notebook, plus the non-commercial usage notice. Rendered as the last
# element of the layout through gr.HTML(ARTICLE).
ARTICLE = """
<div style="text-align: center; max-width: 650px; margin: 0 auto;">
    <p>
    Want to look into models? You can find our [<a href="https://github.com/mpc001/auto_avsr">training code</a>] and [<a href="https://arxiv.org/abs/2303.14307">paper</a>].
    </p>
    <p>
    The inference is performed on the CPU. You can also run on <a href="https://colab.research.google.com/drive/1jfb6e4xxhXHbmQf-nncdLno1u0b4j614?usp=sharing">Colab GPU</a>
    </p>
    <p>
    We share this demo only for non-commercial purposes.
    </p>
</div>
"""

# Custom stylesheet handed to gr.Blocks(css=CSS). It styles links, defines a
# "spin" keyframe animation, and contains rules for #col-container and the
# #share-btn / #share-btn-container elements.
# NOTE(review): none of the components created in this file set those element
# ids, so most of these rules look unused here (presumably carried over from
# another demo) - confirm before removing.
CSS = """
    #col-container {margin-left: auto; margin-right: auto;}
    a {text-decoration-line: underline; font-weight: 600;}
    .animate-spin {
        animation: spin 1s linear infinite;
    }
    @keyframes spin {
        from { transform: rotate(0deg); }
        to { transform: rotate(360deg); }
    }
    #share-btn-container {
        display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
    }
    #share-btn {
        all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
    }
    #share-btn * {
        all: unset;
    }
    #share-btn-container div:nth-child(-n+2){
        width: auto !important;
        min-height: 0px !important;
    }
    #share-btn-container .wrap {
        display: none !important;
    }
"""

# ffmpeg options that fn() places between the input path and the output path
# of its ffmpeg call, i.e. they apply to the re-encoded copy: log errors only
# (-loglevel error), overwrite without prompting (-y), 25 fps (-r 25),
# yuv420p pixel format (-pix_fmt yuv420p) and MP4 container (-f mp4).
FFMPEG_COMMAND = "-loglevel error -y -r 25 -pix_fmt yuv420p -f mp4"

# (dropdown label, config file) for every model offered by the demo, in the
# order in which the pipelines are constructed at import time.
_MODEL_CONFIGS = (
    ("VSR(mediapipe)", "./configs/LRS3_V_WER19.1.ini"),
    ("ASR", "./configs/LRS3_A_WER1.0.ini"),
    ("AVSR(mediapipe)", "./configs/LRS3_AV_WER0.9.ini"),
)

# One InferencePipeline per model label, built once at import time. Every
# pipeline is created with the same settings: CPU device, face tracking
# enabled and the "mediapipe" detector.
pipelines = {
    label: InferencePipeline(config_path, device="cpu", face_track=True, detector="mediapipe")
    for label, config_path in _MODEL_CONFIGS
}

def fn(pipeline_type, filename):
    """Transcribe the speech in an uploaded video with the selected model.

    The upload is first re-encoded by ffmpeg (with the options in
    ``FFMPEG_COMMAND``) into a uniquely named MP4 under ``./tmp``. That copy
    is then run through the chosen pipeline: landmark processing, data
    loading and model inference. The temporary copy is deleted before
    returning.

    Args:
        pipeline_type: Key into the module-level ``pipelines`` dict.
        filename: Path of the uploaded video file.

    Returns:
        The value returned by the selected pipeline's ``model.infer``.

    Raises:
        ValueError: If no video file was provided.
        KeyError: If ``pipeline_type`` is not a key of ``pipelines``.
        RuntimeError: If ffmpeg cannot be started or exits with an error.
    """
    if not filename:
        raise ValueError("No input video was provided.")

    # Resolve the model first so an invalid choice fails before any transcoding.
    selected_pipeline_instance = pipelines[pipeline_type]

    directory = "./tmp"
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(directory, exist_ok=True)
    dst_filename = os.path.join(directory, str(uuid4())[:8] + ".mp4")

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and rules out command injection via the filename.
    command = ["ffmpeg", "-i", str(filename), *shlex.split(FFMPEG_COMMAND), dst_filename]
    try:
        try:
            subprocess.run(command, check=True)
        except (OSError, subprocess.CalledProcessError) as err:
            # Surface the conversion failure here instead of letting the
            # pipeline stumble over a missing or truncated file later on.
            raise RuntimeError(f"ffmpeg failed to convert {filename!r}") from err

        landmarks = selected_pipeline_instance.process_landmarks(dst_filename, landmarks_filename=None)
        data = selected_pipeline_instance.dataloader.load_data(dst_filename, landmarks)
        return selected_pipeline_instance.model.infer(data)
    finally:
        # Best-effort cleanup so ./tmp does not grow with every request; the
        # file may not exist if ffmpeg failed before writing anything.
        try:
            os.remove(dst_filename)
        except OSError:
            pass

# Build the demo page: header, model selector, video upload, prediction box,
# submit button wired to fn(), and footer. The app is launched at import time.
demo = gr.Blocks(css=CSS)

with demo:
    gr.HTML(TITLE)
    # gr.Dropdown is the current component class; the legacy gr.inputs
    # namespace is deprecated in Gradio 3 and no longer present in Gradio 4.
    dropdown_list = gr.Dropdown(choices=["ASR", "VSR(mediapipe)", "AVSR(mediapipe)"], label="model")
    video_file = gr.Video(label="INPUT VIDEO", include_audio=True)
    text = gr.Textbox(label="PREDICTION")
    btn = gr.Button("Submit")
    # .style() only exists on older Gradio releases; apply the full-width
    # styling where available instead of failing where the method is gone.
    if hasattr(btn, "style"):
        btn.style(full_width=True)
    # fn receives (selected model label, uploaded video path) and its return
    # value is shown in the prediction textbox.
    btn.click(fn, inputs=[dropdown_list, video_file], outputs=text)
    gr.HTML(ARTICLE)
demo.launch()