import time

import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

nltk.download('punkt')


def detect_language(text, LID):
    # fastText LID returns labels like '__label__eng_Latn'; strip the prefix
    # to obtain the FLORES-200 language code.
    predictions = LID.predict(text)
    detected_lang_code = predictions[0][0].replace("__label__", "")
    return detected_lang_code


def translation(model_name, sentence_mode, selection_mode, source, target,
                text, flores_codes, model_dict, device, LID=None):
    start_time = time.time()

    # Determine the source language.
    if selection_mode == "Auto-detect":
        if LID is None:
            return {'error': "Auto-detect requires a language-identification (LID) model."}
        source_code = detect_language(text, LID)
    else:
        if source == "Auto-detect":
            # Make sure we don't use "Auto-detect" as a key.
            return {'error': "Source language cannot be 'Auto-detect' when selection mode is manual."}
        source_code = flores_codes.get(source)
        if not source_code:
            return {'error': f"Source language {source} not found in flores_codes."}

    target_code = flores_codes[target]

    model = model_dict[model_name + '_model']
    tokenizer = model_dict[model_name + '_tokenizer']

    translator = pipeline('translation',
                          model=model,
                          tokenizer=tokenizer,
                          src_lang=source_code,
                          tgt_lang=target_code,
                          device=device)

    if sentence_mode == "Sentence-wise":
        # Translate sentence by sentence and re-join the results.
        sentences = sent_tokenize(text)
        translated_sentences = []
        for sentence in sentences:
            translated_sentence = translator(sentence, max_length=400)[0]['translation_text']
            translated_sentences.append(translated_sentence)
        output = ' '.join(translated_sentences)
    else:
        output = translator(text, max_length=400)[0]['translation_text']

    end_time = time.time()

    result = {
        'inference_time': end_time - start_time,
        'source_language': source_code,
        'target_language': target_code,
        'result': output,
    }
    return result
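

# Minimal usage sketch, under stated assumptions: the NLLB checkpoint name,
# the fastText LID model path, and the language names in `flores_codes`
# below are illustrative choices, not defined by the code above.
if __name__ == "__main__":
    import fasttext
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    checkpoint = "facebook/nllb-200-distilled-600M"  # assumed translation checkpoint
    model_dict = {
        "nllb-distilled-600M_model": AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        "nllb-distilled-600M_tokenizer": AutoTokenizer.from_pretrained(checkpoint),
    }
    flores_codes = {"English": "eng_Latn", "French": "fra_Latn"}  # FLORES-200 codes
    LID = fasttext.load_model("lid218e.bin")  # assumed local path to a fastText LID model

    result = translation(
        model_name="nllb-distilled-600M",
        sentence_mode="Sentence-wise",
        selection_mode="Auto-detect",
        source="Auto-detect",
        target="French",
        text="Hello world. How are you today?",
        flores_codes=flores_codes,
        model_dict=model_dict,
        device=-1,  # CPU; pass a GPU index if one is available
        LID=LID,
    )
    print(result)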