# uk-en-translator
#
# Running on Zero
#
# File size: 12,262 Bytes

import sys
import time

try:
    import spaces
except ImportError:
    print("ZeroGPU is not available, skipping...")

    class spaces:  # noqa: N801 - deliberately shadows the missing module's name
        """No-op stand-in for the `spaces` package when it is not installed.

        The inference functions in this file are decorated with `@spaces.GPU`;
        without this fallback the module would fail with a NameError as soon
        as the first decorator is evaluated.
        """

        @staticmethod
        def GPU(func=None, **_kwargs):
            """Return the decorated function unchanged.

            Supports both the bare form (`@spaces.GPU`) and the parametrised
            form (`@spaces.GPU(duration=...)`).
            """
            if func is None:
                return lambda wrapped: wrapped
            return func

import torch
import torchaudio
import gradio as gr
import torchaudio.transforms as T
import polars as pl

from importlib.metadata import version
from gradio.utils import is_zero_gpu_space
from gradio.themes import Base

from paddleocr import PaddleOCR
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModelForCTC,
    Wav2Vec2BertProcessor,
)

# Runtime detection: are we inside a ZeroGPU Space, and is CUDA usable?
use_zero_gpu = is_zero_gpu_space()
use_cuda = torch.cuda.is_available()

# The installed `spaces` version is only looked up when ZeroGPU is detected.
spaces_version = version("spaces") if use_zero_gpu else "N/A"
print(
    "ZeroGPU is available, changing inference call."
    if use_zero_gpu
    else "ZeroGPU is not available, skipping..."
)
print(f"Spaces version: {spaces_version}")

# bfloat16 is used on both CUDA and CPU; only the device string differs.
torch_dtype = torch.bfloat16
device = "cuda" if use_cuda else "cpu"
if use_cuda:
    print("CUDA is available, setting correct `device` variable.")

# Config
model_name = "Yehor/kulyk-uk-en"  # repo id passed to from_pretrained for the translation model and tokenizer
concurrency_limit = 5  # passed as `concurrency_limit` to every "Translate" click handler
min_duration = 0.5  # shortest accepted audio upload, in seconds
max_duration = 60  # longest accepted audio upload, in seconds
current_theme = Base()  # Gradio theme shared by every tab's Blocks

# Translation model and its tokenizer (both come from `model_name`).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    device_map=device,
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Speech recognition: CTC model plus its processor, loaded from one checkpoint.
asr_model_name = "Yehor/w2v-bert-uk-v2.1-bf16"
audio_model = AutoModelForCTC.from_pretrained(
    asr_model_name,
    torch_dtype=torch_dtype,
    device_map=device,
)
processor = Wav2Vec2BertProcessor.from_pretrained(asr_model_name)

# OCR: Ukrainian recognition with the optional document pre-processing
# stages switched off.
ocr_options = {
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}
ocr_model = PaddleOCR(lang="uk", **ocr_options)

# Examples offered in the UI: sample sentences for the text tab and six
# numbered sample files each for the audio and image tabs.
examples_text = [
    "WP: F-16 навряд чи суттєво змінять ситуацію на полі бою",
    "Над Україною збито ракету та 7 із 8 «Шахедів»",
    "Олімпійські ігри 2024. Розклад змагань українських спортсменів на 28 липня",
    "Кампанія Гарріс зібрала понад 200 мільйонів доларів менш ніж за тиждень",
    "За тиждень НБУ продав майже 800 мільйонів доларів на міжбанківському ринку",
    "Париж 2024. День 2: Текстова трансляція",
]
examples_audio = [f"example_{index}.wav" for index in range(1, 7)]
examples_image = [f"example_{index}.jpg" for index in range(1, 7)]

# Page title: passed to gr.Blocks(title=...) and used as the heading of
# `description_head`.
title = "UK-EN Translator"

# Markdown (with an embedded <img>) rendered on the "Authors" tab.
authors_table = """
## Authors

Follow them on social networks and **contact** if you need any help or have any questions:

| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
|-------------------------------------------------------------------------------------------------|
| https://t.me/smlkw in Telegram                                                                  |
| https://x.com/yehor_smoliakov at X                                                              |
| https://github.com/egorsmkv at GitHub                                                           |
| https://huggingface.co/Yehor at Hugging Face                                                    |
| or use egorsmkv@gmail.com                                                                       |
""".strip()

# Markdown intro rendered at the top of the text, audio and image tabs.
description_head = f"""
# {title}

This space translates your text, audio, image from Ukrainian to English using [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en) model. Also, check [EN-UK Translator](https://huggingface.co/spaces/Yehor/en-uk-translator) out.
""".strip()


# Markdown for the environment tab: interpreter / torch settings and the
# models this app loads. The kulyk-uk-en link points at the same repository
# as `model_name` (it previously pointed at the reverse-direction
# kulyk-en-uk model).
tech_env = f"""
#### Environment

- Python: {sys.version}
- Torch device: {device}
- Torch dtype: {torch_dtype}

#### Models

- [kulyk-uk-en](https://huggingface.co/Yehor/kulyk-uk-en)
- [wav2vec2-bert](https://huggingface.co/Yehor/w2v-bert-uk-v2.1-bf16)
- [PaddleOCR](https://huggingface.co/PaddlePaddle/eslav_PP-OCRv5_mobile_rec)
""".strip()

# Markdown listing the installed versions of the main libraries, looked up
# via importlib.metadata at import time.
tech_libraries = f"""
#### Libraries

- torch: {version("torch")}
- torchaudio: {version("torchaudio")}
- transformers: {version("transformers")}
- accelerate: {version("accelerate")}
- gradio: {version("gradio")}
""".strip()


def translate(text: str) -> str:
    """Run one piece of text through the translation model.

    The text is wrapped in a single-turn chat prompt, decoded greedily and
    only the newly generated tokens are returned.

    Args:
        text: The text to translate.

    Returns:
        The decoded model output with surrounding whitespace stripped.
    """
    # NOTE(review): the instruction says "to Ukrainian" although this app
    # translates Ukrainian to English. Presumably this is the prompt the
    # model was fine-tuned with - confirm against the model card before
    # changing it.
    messages = [
        {"role": "user", "content": "Translate the text to Ukrainian:\n" + text}
    ]

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    input_ids = input_ids.to(model.device)

    # Greedy search (no sampling) with a mild repetition penalty.
    generation = model.generate(
        input_ids,
        max_new_tokens=2048,
        do_sample=False,
        repetition_penalty=1.05,
    )

    # Skip the first `input_ids.shape[1]` positions (the prompt) so that only
    # the continuation is decoded.
    completion_ids = generation[:, input_ids.shape[1] :]
    decoded = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

    return decoded[0].strip()


@spaces.GPU
def inference_text(text, progress=gr.Progress()):
    """Translate pasted text line by line and return the results as a table.

    Args:
        text: Contents of the text box. It is split on newlines and every
            non-blank line is translated separately.
        progress: Gradio progress tracker used to report per-sentence
            progress.

    Returns:
        A polars DataFrame with one row per translated line and the columns
        `sentence`, `translated_text` and `elapsed_time` (seconds, rounded
        to two decimals).

    Raises:
        gr.Error: If `text` is empty or contains only whitespace.
    """
    # Split and filter first, so whitespace-only input is rejected as well
    # instead of silently producing an empty table.
    stripped_lines = (line.strip() for line in (text or "").split("\n"))
    non_empty_sentences = [line for line in stripped_lines if line]
    if not non_empty_sentences:
        raise gr.Error("Please paste your text.")

    progress(0, desc="Translating...")

    results = []
    for sentence in progress.tqdm(
        non_empty_sentences, desc="Translating...", unit="sentence"
    ):
        t0 = time.time()
        translated_text = translate(sentence)
        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


@spaces.GPU
def inference_audio(audio, progress=gr.Progress()):
    """Transcribe an uploaded audio file and translate the transcript.

    Args:
        audio: Path of the uploaded audio file.
        progress: Gradio progress tracker used to report per-sentence
            progress.

    Returns:
        A polars DataFrame with the columns `sentence`, `translated_text`
        and `elapsed_time`, one row per non-blank transcript line.

    Raises:
        gr.Error: If no file was given, or its duration is outside
            `min_duration`..`max_duration` seconds.
    """
    if not audio:
        raise gr.Error("Please paste your audio file.")

    progress(0, desc="Translating...")

    # Validate the clip length from the file's metadata before loading it.
    info = torchaudio.info(audio)
    duration = info.num_frames / info.sample_rate

    if duration < min_duration:
        raise gr.Error(
            f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds."
        )
    if duration > max_duration:
        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")

    waveform, sr = torchaudio.load(audio)

    # Average the channels of multi-channel audio into a single channel.
    if info.num_channels > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The processor below is called with sampling_rate=16_000, so resample
    # anything recorded at a different rate.
    if info.sample_rate != 16_000:
        to_16k = T.Resample(sr, 16_000, dtype=waveform.dtype)
        waveform = to_16k(waveform)

    samples = waveform.squeeze().numpy()

    features = processor([samples], sampling_rate=16_000).input_features
    features = torch.tensor(features).to(device, dtype=torch_dtype)

    with torch.inference_mode():
        logits = audio_model(features).logits

    # Take the highest-scoring id at every position and let the processor
    # decode the ids to text.
    predicted_ids = logits.argmax(dim=-1)
    predictions = processor.batch_decode(predicted_ids)

    print("Predictions:", predictions)

    text = "\n".join(predictions) if predictions else "-"

    print("Text:", text)

    stripped_lines = (line.strip() for line in text.split("\n"))
    non_empty_sentences = [line for line in stripped_lines if line]

    results = []
    for sentence in progress.tqdm(
        non_empty_sentences, desc="Translating...", unit="sentence"
    ):
        started = time.time()
        translated_text = translate(sentence)
        elapsed_time = round(time.time() - started, 2)

        row = {
            "sentence": sentence,
            "translated_text": translated_text,
            "elapsed_time": elapsed_time,
        }
        results.append(row)

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


@spaces.GPU
def inference_image(image, progress=gr.Progress()):
    """Run OCR on an uploaded image and translate the recognized text.

    All recognized text is joined into a single string and translated in one
    call.

    Args:
        image: Path of the uploaded image file (the image tab uses
            `gr.Image(type="filepath")`).
        progress: Gradio progress tracker used to report progress.

    Returns:
        A polars DataFrame with the columns `sentence`, `translated_text`
        and `elapsed_time` (seconds, rounded to two decimals).

    Raises:
        gr.Error: If no image path was given, or OCR recognized no text.
    """
    # A missing upload and a non-path value get the same message, so both
    # checks are done in one place before any work starts.
    if not image or not isinstance(image, str):
        raise gr.Error("Please paste your image file.")

    progress(0, desc="Translating...")

    predictions = ocr_model.predict(image)

    # One string per OCR result, then everything joined into a single text.
    recognized = [" ".join(prediction["rec_texts"]) for prediction in predictions]
    text = " ".join(recognized).strip()

    print("Text:", text)

    # Do not send an empty prompt to the translation model; the text and
    # audio handlers likewise never translate blank sentences.
    if not text:
        raise gr.Error("No text was recognized in the image.")

    results = []
    sentences = [text]

    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
        t0 = time.time()
        translated_text = translate(sentence)
        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


def create_app():
    """Build the text-translation tab and return its Blocks."""
    with gr.Blocks(
        title=title,
        analytics_enabled=False,
        theme=current_theme,
    ) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        # The results table is created before the input box so it is laid
        # out above it.
        result_table = gr.DataFrame(label="Translated text")
        text_input = gr.Textbox(label="Text", autofocus=True, lines=5)

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_text,
            inputs=text_input,
            outputs=result_table,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example",
                inputs=text_input,
                examples=examples_text,
            )

    return tab


def create_audio_app():
    """Build the audio-translation tab and return its Blocks."""
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        result_table = gr.DataFrame(label="Translated text")

        # Upload-only input; the handler receives the file path.
        audio_input = gr.Audio(label="Audio file", sources="upload", type="filepath")

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_audio,
            inputs=audio_input,
            outputs=result_table,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example",
                inputs=audio_input,
                examples=examples_audio,
            )

        duration_note = f"> Due to resource limitations, audio duration **must not** exceed **{max_duration}** seconds."
        gr.Markdown(duration_note)

    return tab


def create_image_app():
    """Build the image-translation tab and return its Blocks."""
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)
        gr.Markdown("## Usage")

        result_table = gr.DataFrame(label="Translated text")

        # Upload-only input; the handler receives the file path.
        image_input = gr.Image(label="Image file", sources="upload", type="filepath")

        translate_button = gr.Button("Translate")
        translate_button.click(
            inference_image,
            inputs=image_input,
            outputs=result_table,
            concurrency_limit=concurrency_limit,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example",
                inputs=image_input,
                examples=examples_image,
            )

    return tab


def create_env():
    """Build the tab showing the environment, models and library versions."""
    with gr.Blocks(theme=current_theme) as env_tab:
        for section in (tech_env, tech_libraries):
            gr.Markdown(section)

    return env_tab


def create_authors():
    """Build the tab that renders the authors table."""
    authors_tab = gr.Blocks(theme=current_theme)
    with authors_tab:
        gr.Markdown(authors_table)

    return authors_tab


def create_demo():
    """Assemble all tabs into one tabbed interface and return it."""
    # (tab label, factory) pairs, in display order. The factories are called
    # in this same order.
    tab_specs = [
        ("✍️ Text", create_app),
        ("🔊 Audio", create_audio_app),
        ("👀 Image", create_image_app),
        ("👥 Authors", create_authors),
        ("📦 Environment, Models, and Libraries", create_env),
    ]

    interfaces = [build_tab() for _, build_tab in tab_specs]
    labels = [label for label, _ in tab_specs]

    return gr.TabbedInterface(interfaces, tab_names=labels)


# Script entry point: build the tabbed interface, then call queue() and
# launch() on it. Nothing here runs when the module is imported.
if __name__ == "__main__":
    demo = create_demo()
    demo.queue()
    demo.launch()