File size: 2,029 Bytes
fc5ed00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import time
import nltk

from transformers import pipeline

nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def detect_language(text,LID):
    predictions = LID.predict(text)
    detected_lang_code = predictions[0][0].replace("__label__", "")
    return detected_lang_code

def translation(model_name, 
                sentence_mode, selection_mode, 
                source, target, 
                text, 
                flores_codes, 
                model_dict, device):
    start_time = time.time()

    # Determine the source language
    if selection_mode == "Auto-detect":
        detected_lang_code = detect_language(text)
        flores_source_code = detected_lang_code
        source_code = flores_source_code
    else:
        if source == "Auto-detect":  # Make sure we don't use "Auto-detect" as a key
            return {'error': "Source language cannot be 'Auto-detect' when selection mode is manual."}
        source_code = flores_codes.get(source)
        if not source_code:
            return {'error': f"Source language {source} not found in flores_codes."}


    target_code = flores_codes[target]
    model = model_dict[model_name + '_model']
    tokenizer = model_dict[model_name + '_tokenizer']

    translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=source_code, tgt_lang=target_code, device=device)

    if sentence_mode == "Sentence-wise":
        sentences = sent_tokenize(text)
        translated_sentences = []
        for sentence in sentences:
            translated_sentence = translator(sentence, max_length=400)[0]['translation_text']
            translated_sentences.append(translated_sentence)
        output = ' '.join(translated_sentences)
    else:
        output = translator(text, max_length=400)[0]['translation_text']

    end_time = time.time()

    result = {
        'inference_time': end_time - start_time,
        'source_language': source_code,
        'target_language': target_code,
        'result': output
    }
    return result