File size: 4,229 Bytes
3e6ffc5
c153533
3e6ffc5
c153533
3e6ffc5
 
 
 
27ca4a9
fe02c49
 
 
 
0c7be31
 
 
706408b
 
 
0c7be31
4661832
fe02c49
 
706408b
 
 
 
 
 
 
 
 
 
 
 
fe02c49
 
b10cb1c
 
 
 
 
 
 
fe02c49
706408b
fe02c49
706408b
fe02c49
 
 
 
 
 
 
 
b10cb1c
fe02c49
 
0c7be31
 
 
 
fe02c49
0c7be31
 
 
 
 
 
 
 
 
fe02c49
0c7be31
 
fe02c49
 
 
 
0c7be31
fe02c49
 
 
3e6ffc5
 
 
 
 
 
 
0c7be31
3e6ffc5
0c7be31
3e6ffc5
200d05c
3e6ffc5
 
0c7be31
3e6ffc5
0c7be31
 
 
200d05c
4661832
0c7be31
c153533
fe02c49
200d05c
fe02c49
0c7be31
fe02c49
 
0c7be31
 
eadcb10
0c7be31
 
 
 
 
eadcb10
0c7be31
 
 
 
 
3e6ffc5
0c7be31
 
 
3e6ffc5
0c7be31
 
3e6ffc5
 
 
 
4661832
3e6ffc5
 
fe02c49
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
"""
File: app.py

Description: Gradio app that translates text with Facebook M2M100 (418M) and Google Translate.

Author: Didier Guillevic
Date: 2024-09-07
"""
import spaces
import logging
# Module-level logger used by the functions below; basicConfig configures the
# root logger so that INFO-level messages are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

import gradio as gr
import langdetect

from deep_translator import GoogleTranslator
from model_spacy import nlp_xx

import model_translation
# Single M2M100 wrapper created once at import time and reused by
# translate_text for every request.
m2m100 = model_translation.ModelM2M100()


def translate_with_Helsinki(
        chunks, src_lang, tgt_lang, input_max_length, output_max_length) -> str:
    """Translate the chunks with the Helsinki model.

    Args:
        chunks: sequence of text chunks; each chunk is translated on its own.
        src_lang: source language code, used to select the tokenizer/model.
        tgt_lang: target language code. Currently not used by this function.
        input_max_length: truncation length given to the tokenizer.
        output_max_length: ``max_length`` given to ``model.generate``.

    Returns:
        The translated chunks joined with newlines, an "ISSUE: ..." message
        when there is no model for ``src_lang``, or an empty string when
        ``chunks`` is empty.
    """
    # NOTE(review): `translation` is not bound by any import visible in this
    # file (only `model_translation` is imported) -- confirm which module is
    # meant before calling this function.
    if src_lang not in translation.src_langs:
        return (
            f"ISSUE: currently no model for language '{src_lang}'. "
             "If wrong language, please specify language."
        )

    # Nothing to translate: return early instead of failing on chunks[0]
    # below (and avoid fetching a model for no work).
    if not chunks:
        return ''

    logger.info("LANG: %s, TEXT: %s...", src_lang, chunks[0][:50])
    tokenizer, model = translation.get_tokenizer_model_for_src_lang(src_lang)

    # NOTE: The 'fa' (Persian) model has multiple target languages to choose from.
    # We need to specify the desired language among: fra ita por ron spa
    #   https://huggingface.co/Helsinki-NLP/opus-mt-tc-big-fa-itc
    # Prepend text with >>fra<< in order to translate in French.
    prefix = ">>fra<< " if src_lang == 'fa' else ""

    translated_chunks = []
    for chunk in chunks:
        inputs = tokenizer(
            prefix + chunk, return_tensors="pt", max_length=input_max_length,
            truncation=True, padding="longest").to(model.device)
        outputs = model.generate(**inputs, max_length=output_max_length)
        # A single input is passed per call, so keep the first decoded item.
        translated_chunks.append(
            tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])

    return '\n'.join(translated_chunks)


def translate_text(
        text: str,
        src_lang: str,
        tgt_lang: str
    ) -> tuple[str, str]:
    """Translate the given text with both M2M100 and Google Translate.

    Args:
        text: the text to translate.
        src_lang: source language code. When empty, None or "auto", the
            language is detected from the text with langdetect.
        tgt_lang: target language code. Falls back to 'en' when it is not one
            of ``model_translation.tgt_language_codes`` values.

    Returns:
        A pair ``(m2m100_translation, google_translation)``. Both are empty
        strings when ``text`` is empty or only whitespace.
    """
    # Nothing to translate; also avoids running language detection on text
    # that contains nothing to detect from.
    if not text or not text.strip():
        return ("", "")

    # src_lang among the supported languages?
    # - make sure src_lang is not None
    if not src_lang or src_lang == "auto":
        src_lang = langdetect.detect(text)
    if src_lang not in model_translation.language_codes.values():
        # Only reported: the translation is still attempted below.
        logger.error(
            "Language detected %s not among supported language", src_lang)

    # tgt_lang: make sure it is not None. Default to 'en' if not set.
    if tgt_lang not in model_translation.tgt_language_codes.values():
        tgt_lang = 'en'

    # translate
    translated_text_m2m100 = m2m100.translate(text, src_lang, tgt_lang)
    # Honour the requested target language for Google Translate as well.
    # NOTE(review): assumes the tgt_language_codes values are codes that
    # GoogleTranslator accepts -- confirm against model_translation.
    translated_text_google_translate = GoogleTranslator(
        source='auto', target=tgt_lang).translate(text=text)

    return (
        translated_text_m2m100,
        translated_text_google_translate
    )


#
# User interface
#
with gr.Blocks() as demo:

    # Page title
    gr.Markdown("""
        ## Text translation v0.0.3
    """)

    # Text entered by the user
    input_text = gr.Textbox(
        label="Text to translate",
        placeholder="Enter text to translate",
        lines=5,
    )

    # One result box per translation engine
    output_text_m2m100 = gr.Textbox(label="Facebook m2m100 (418M)", lines=4)
    output_text_google_translate = gr.Textbox(label="Google Translate", lines=4)

    # Language selectors, shown side by side
    with gr.Row():
        src_lang = gr.Dropdown(
            label="Source language",
            choices=model_translation.language_codes.items(),
            value="auto",
        )
        tgt_lang = gr.Dropdown(
            label="Target language",
            choices=model_translation.tgt_language_codes.items(),
            value="en",
        )

    # Clicking the button runs translate_text and fills both result boxes
    translate_btn = gr.Button("Translate")
    translate_btn.click(
        fn=translate_text,
        inputs=[input_text, src_lang, tgt_lang],
        outputs=[output_text_m2m100, output_text_google_translate],
    )

    # Collapsed-by-default documentation panel
    with gr.Accordion("Documentation", open=False):
        gr.Markdown("""
            - Models: serving Facebook M2M100 (418M) and Google Translate.
        """)

# Start the app only when this file is run as a script
if __name__ == "__main__":
    demo.launch()