File size: 5,397 Bytes
30a32d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import torch
import json
import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import HfApi
from datetime import datetime

# Runtime configuration — every value is injected via environment variables
# (e.g. Hugging Face Space secrets/variables); each may be None if unset.
MODEL_ID = os.environ.get("MODEL_ID")  # Hub repo id of the translation model
DATASET_REPO = os.environ.get("DATASET_REPO")  # dataset repo that collects vote feedback
DESCRIPTION = os.environ.get("DESCRIPTION")  # markdown rendered at the top of the UI
PROMPT = os.environ.get("PROMPT")  # template with {direction} and {sample} placeholders (see translate())

# Generate on GPU when available; the model is moved here in the __main__ block.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
api = HfApi()  # Hub client used by vote() to upload feedback records

# JavaScript executed once when the Gradio app loads: shows a terms-of-use /
# data-collection notice before the user interacts with the service.
on_load="""
async()=>{
    alert("Before using the service, users must agree to the following terms:\\n\\nPlease note that the model presented here is an experimental tool that is still being developed and improved.\\n\\nMeasures have been taken during the model creation process to minimizing the risk of generating vulgar, prohibited or inappropriate content. However, in rare cases, unwanted content may be generated. If you encounter any content that is deemed inappropriate or violates our policies, please contact us to report it. Your information will enable us to take further steps to improve and develop the model to make it safe and user-friendly.\\n\\nYou must not use the model for illegal, harmful, violent, racist or sexual purposes. Please do not send any private information. The website collects user dialogue data and reserves the right to distribute it under the Creative Commons Attribution (CC-BY) or similar license.");
}
"""

@spaces.GPU()
def model_gen(prompt):
    """Greedy-decode up to 1024 new tokens for *prompt* and return only the
    newly generated text (prompt tokens are sliced off before decoding)."""
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[1]
    output_ids = model.generate(
        **encoded,
        max_new_tokens=1024,
        do_sample=False,
        repetition_penalty=1.1,
    )
    continuation = output_ids[0][prompt_len:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

def translate(option, text):
    """Translate *text* in the direction chosen in the UI dropdown.

    Fills the PROMPT template with the human-readable direction and the
    user's text, then delegates generation to model_gen().
    """
    direction = (
        "English to Macedonian"
        if option == "EN → MK"
        else "Macedonian to English"
    )
    filled_prompt = PROMPT.format(direction=direction, sample=text)
    return model_gen(filled_prompt)

def vote(input_data, output_data, vote):
    """Persist one piece of user feedback to the feedback dataset repo.

    Uploads a JSON record {input_data, output_data, vote} to DATASET_REPO
    under translate/<YYYY-MM-DD>/<unix_timestamp>.json. The upload runs as a
    background future so the UI callback returns immediately.
    """
    # Take a single snapshot of "now" so the day folder and the timestamp
    # file name can never disagree (the original called datetime.now() twice,
    # which could straddle a midnight boundary).
    now = datetime.now()
    day = now.strftime("%Y-%m-%d")
    timestamp = now.timestamp()
    record = json.dumps(
        {"input_data": input_data, "output_data": output_data, "vote": vote},
        indent=2,
        ensure_ascii=False,
    ).encode('utf-8')
    api.upload_file(
        path_or_fileobj=record,
        path_in_repo=f"translate/{day}/{timestamp}.json",
        repo_id=DATASET_REPO,
        repo_type="dataset",
        # Was the placeholder f"L" — use a message that identifies the event.
        commit_message=f"Vote: {vote}",
        run_as_future=True,
    )

# UI definition. Two-column layout: direction picker + input text + submit on
# the left, translated text + three feedback buttons on the right.
# `js=on_load` runs the terms-of-use alert once when the page loads.
with gr.Blocks(theme="default", js=on_load) as demo:
    gr.Markdown(DESCRIPTION, line_breaks=True)
    
    with gr.Row(variant="panel"):
        with gr.Column():
            direction = gr.Dropdown(["MK → EN", "EN → MK"], 
                                     label="Translation direction", 
                                     info="Pick the translation direction", 
                                     value="EN → MK")
            input_data = gr.Textbox(label="Input Text", 
                                    lines=10)
            translate_btn = gr.Button("Submit",
                                      variant="primary", 
                                      scale=1)

        with gr.Column():
            output_data = gr.Textbox(label="Translated Text", 
                                     lines=15)
            with gr.Row():
                # Each feedback button is paired with a hidden Text component
                # holding its flag value ("GOOD"/"OK"/"BAD"); the flag is
                # passed to vote() as its third argument on click.
                good_translation_btn = gr.Button("Good translation",
                                                 variant="primary")
                good_translate_flag = gr.Text(value="GOOD", visible=False)

                ok_translation_btn = gr.Button("OK translation",
                                                 variant="primary")
                ok_translate_flag = gr.Text(value="OK", visible=False)

                bad_translation_btn = gr.Button("Bad translation",
                                                 variant="primary")
                bad_translate_flag = gr.Text(value="BAD", visible=False)

        # Wiring: Submit runs the translation; each feedback button uploads a
        # vote record (no UI output, hence the empty outputs list).
        translate_btn.click(translate,
                            inputs=[direction, input_data],
                            outputs=output_data)
        good_translation_btn.click(vote,
                                   inputs=[input_data, output_data, good_translate_flag],
                                   outputs=[])
        ok_translation_btn.click(vote,
                                 inputs=[input_data, output_data, ok_translate_flag],
                                 outputs=[])
        bad_translation_btn.click(vote,
                                  inputs=[input_data, output_data, bad_translate_flag],
                                  outputs=[])

if __name__ == "__main__":
    # Load the tokenizer and model once at startup; model_gen() reads these
    # module-level globals at request time.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained( 
                                                MODEL_ID,
                                                device_map=device,
                                                torch_dtype='auto'
                                                )
    
    # Bound the request queue, then block serving the app.
    demo.queue(max_size=50).launch()