|
from tqdm import tqdm |
|
from deep_translator import GoogleTranslator |
|
from itertools import chain |
|
import copy |
|
from .language_configuration import fix_code_language, INVERTED_LANGUAGES |
|
from .logging_setup import logger |
|
import re |
|
import json |
|
import time |
|
|
|
# Names accepted by translate_text() as `translation_process`. Each engine
# comes in a "<name>_batch" flavour and a plain one-segment-at-a-time flavour;
# "disable_translation" returns the segments untouched.
TRANSLATION_PROCESS_OPTIONS = [
    "google_translator_batch",
    "google_translator",
    "gpt-3.5-turbo-0125_batch",
    "gpt-3.5-turbo-0125",
    "gpt-4-turbo-preview_batch",
    "gpt-4-turbo-preview",
    "disable_translation",
]

# Same choices without the "_batch" flavours, in the same relative order.
# NOTE(review): presumably offered for document translation - the consumer
# of this list is not visible in this file.
DOCS_TRANSLATION_PROCESS_OPTIONS = [
    option
    for option in TRANSLATION_PROCESS_OPTIONS
    if not option.endswith("_batch")
]
|
|
|
|
|
def translate_iterative(segments, target, source=None):
    """
    Translate text segments individually to the specified language.

    Parameters:
    - segments (list): A list of dictionaries with 'text' as a key for
        segment text.
    - target (str): Target language code.
    - source (str, optional): Source language code. Defaults to None, in
        which case "auto" is passed to the translator.

    Returns:
    - list: A deep copy of `segments` whose 'text' values hold the
        translated text. The input list is not modified.

    Notes:
    - Translates each segment using Google Translate, one request per
        segment, with surrounding whitespace stripped before sending.
    - If the translator does not return a string for a segment, the
        stripped original text is kept for that segment.

    Example:
    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
    translated_segments = translate_iterative(segments, 'es')
    """
    segments_ = copy.deepcopy(segments)

    if not source:
        logger.debug("No source language")
        source = "auto"

    translator = GoogleTranslator(source=source, target=target)

    for segment in tqdm(segments_):
        text = segment["text"].strip()
        translated_line = translator.translate(text)
        # NOTE(review): GoogleTranslator.translate appears able to hand back
        # None for some inputs it leaves unchanged (e.g. punctuation-only
        # text) - confirm against deep_translator. Storing None would break
        # every later `.strip()` on the segment, so keep the source text.
        if not isinstance(translated_line, str):
            translated_line = text
        segment["text"] = translated_line

    return segments_
|
|
|
|
|
def verify_translate(
    segments,
    segments_copy,
    translated_lines,
    target,
    source
):
    """
    Write translated lines into `segments_copy`, or retranslate everything
    when the line count does not match.

    Parameters:
    - segments (list): Original segments; used for the length check and as
        input to the fallback translation.
    - segments_copy (list): Segment dictionaries that receive the translated
        'text' values (modified in place).
    - translated_lines (list): Translated strings, one per segment.
    - target (str): Target language code, forwarded to the fallback.
    - source (str): Source language code, forwarded to the fallback.

    Returns:
    - list: `segments_copy` with cleaned translations (tabs and newlines
        removed, surrounding whitespace stripped) when the counts match;
        otherwise the result of `translate_iterative(segments, target,
        source)`.
    """
    if len(segments) != len(translated_lines):
        # Count mismatch: positions can no longer be trusted, start over.
        logger.error(
            "The translation failed, switching to google_translate iterative. "
            f"{len(segments), len(translated_lines)}"
        )
        return translate_iterative(segments, target, source)

    for index, segment in enumerate(segments_copy):
        new_text = translated_lines[index]
        logger.debug(f"{segment['text']} >> {new_text.strip()}")
        cleaned = new_text.replace("\t", "").replace("\n", "")
        segment["text"] = cleaned.strip()

    return segments_copy
|
|
|
|
|
def translate_batch(segments, target, chunk_size=2000, source=None):
    """
    Translate a batch of text segments into the specified language in chunks,
    respecting the character limit.

    Parameters:
    - segments (list): List of dictionaries with 'text' as a key for segment
        text.
    - target (str): Target language code.
    - chunk_size (int, optional): Maximum character limit for each translation
        chunk (default is 2000; max 5000).
    - source (str, optional): Source language code. Defaults to None, in
        which case "auto" is passed to the translator.

    Returns:
    - list: Translated text segments in the target language.

    Notes:
    - Joins the stripped segment texts with " ||||| " into chunks whose
        length, separators included, stays within `chunk_size`. A single
        line longer than `chunk_size` is sent as a chunk of its own.
    - Translates the chunks using Google Translate and splits each result
        on "|||||". When the number of pieces differs from the number of
        lines sent, that chunk is retranslated line by line.
    - If chunked translation raises, switches to iterative translation using
        `translate_iterative()`.

    Example:
    segments = [{'text': 'first segment.'}, {'text': 'second segment.'}]
    translated = translate_batch(segments, 'es', chunk_size=4000, source='en')
    """
    segments_copy = copy.deepcopy(segments)

    if not source:
        logger.debug("No source language")
        source = "auto"

    # An empty line is sent as a single space so that it still occupies a
    # slot between two separators.
    text_lines = [
        segment["text"].strip() or " " for segment in segments_copy
    ]

    separator = " ||||| "
    text_merge = []  # joined chunk strings sent to the translator
    global_text_list = []  # per chunk: the individual lines it contains
    actual_text_list = []
    actual_length = 0
    for one_line in text_lines:
        added_length = len(one_line)
        if actual_text_list:
            added_length += len(separator)
        # Flush only a non-empty chunk: an oversized first line must not
        # produce an empty chunk with an empty line list.
        if actual_text_list and actual_length + added_length > chunk_size:
            text_merge.append(separator.join(actual_text_list))
            global_text_list.append(actual_text_list)
            actual_text_list = []
            actual_length = 0
            added_length = len(one_line)
        actual_text_list.append(one_line)
        actual_length += added_length
    if actual_text_list:
        text_merge.append(separator.join(actual_text_list))
        global_text_list.append(actual_text_list)

    progress_bar = tqdm(total=len(segments), desc="Translating")
    translator = GoogleTranslator(source=source, target=target)
    split_list = []
    try:
        for text, text_iterable in zip(text_merge, global_text_list):
            translated_line = translator.translate(text.strip())
            # A non-string result is treated like a separator mismatch so
            # only this chunk is redone, not the whole batch.
            if isinstance(translated_line, str):
                split_text = translated_line.split("|||||")
            else:
                split_text = []
            if len(split_text) == len(text_iterable):
                progress_bar.update(len(split_text))
            else:
                logger.debug(
                    "Chunk fixing iteratively. Len chunk: "
                    f"{len(split_text)}, expected: {len(text_iterable)}"
                )
                split_text = []
                for txt_iter in text_iterable:
                    translated_txt = translator.translate(txt_iter.strip())
                    # NOTE(review): the translator appears able to return
                    # None for some inputs - confirm; keep the source line
                    # so verify_translate() always receives strings.
                    if not isinstance(translated_txt, str):
                        translated_txt = txt_iter
                    split_text.append(translated_txt)
                    progress_bar.update(1)
            split_list.append(split_text)
        progress_bar.close()
    except Exception as error:
        # Close before falling back: translate_iterative opens its own bar.
        progress_bar.close()
        logger.error(str(error))
        logger.warning(
            "The translation in chunks failed, switching to iterative."
            " Related: too many request"
        )
        return translate_iterative(segments, target, source)

    translated_lines = list(chain.from_iterable(split_list))

    return verify_translate(
        segments, segments_copy, translated_lines, target, source
    )
|
|
|
|
|
def _extract_json_object(raw):
    """Return the first complete JSON object embedded in `raw`, or None."""
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            # raw_decode ignores whatever follows the decoded document.
            parsed, _ = decoder.raw_decode(raw[start:])
        except json.JSONDecodeError:
            start = raw.find("{", start + 1)
        else:
            return parsed
    return None


def call_gpt_translate(
    client,
    model,
    system_prompt,
    user_prompt,
    original_text=None,
    batch_lines=None,
):
    """
    Send one chat-completion request and extract the translation from its
    JSON reply.

    Parameters:
    - client: Object exposing `chat.completions.create(...)`.
    - model (str): Model name passed to the request.
    - system_prompt (str): Content of the system message.
    - user_prompt (str): Content of the user message.
    - original_text (dict, optional): The batch that was sent, shaped as
        {"conversation": [{code: text}, ...]}. Read only when `batch_lines`
        is truthy.
    - batch_lines (int, optional): Number of lines expected back. A truthy
        value selects batch mode.

    Returns:
    - list: In batch mode, a list of single-key dictionaries
        {code: translated_text}.
    - str: Otherwise, the translated text.

    Raises:
    - Exception: The JSON parsing error, when the reply is not valid JSON
        and no JSON object can be recovered from it.
    - ValueError: When the reply holds no candidate conversation (batch
        mode) or does not reduce to a string (single mode).

    Notes:
    - Batch mode walks the top-level values of the reply. A dictionary
        value is unwrapped to its first value. A candidate whose first line
        equals the first original line (after stripping) is skipped as an
        untranslated echo, and the walk stops at the first candidate with
        exactly `batch_lines` entries. The last candidate examined is used
        even when it was skipped or has a different length; callers check
        the length of the returned list.
    """
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
    )
    result = response.choices[0].message.content
    logger.debug(f"Result: {str(result)}")

    try:
        translation = json.loads(result)
    except Exception as error:
        # The reply may wrap the JSON in extra text; recover the first
        # complete object, including nested and multi-line ones.
        translation = (
            _extract_json_object(result) if isinstance(result, str) else None
        )
        if translation is None:
            raise
        logger.error(str(error))

    if batch_lines:
        conversation = None
        for conversation in translation.values():
            if isinstance(conversation, dict):
                conversation = list(conversation.values())[0]
            if (
                list(
                    original_text["conversation"][0].values()
                )[0].strip() ==
                list(conversation[0].values())[0].strip()
            ):
                continue
            if len(conversation) == batch_lines:
                break

        if conversation is None:
            # Empty JSON object: nothing to pick a conversation from.
            raise ValueError(
                f"No valid response received: {str(translation)}"
            )

        # Flatten so that every returned entry holds exactly one key.
        fix_conversation_length = []
        for line in conversation:
            for speaker_code, text_tr in line.items():
                fix_conversation_length.append({speaker_code: text_tr})

        logger.debug(f"Data batch: {str(fix_conversation_length)}")
        logger.debug(
            f"Lines Received: {len(fix_conversation_length)},"
            f" expected: {batch_lines}"
        )

        return fix_conversation_length

    else:
        # Peel one level of dict, list and set wrapping, in that order.
        if isinstance(translation, dict):
            translation = list(translation.values())[0]
        if isinstance(translation, list):
            translation = translation[0]
        if isinstance(translation, set):
            translation = list(translation)[0]
        if not isinstance(translation, str):
            raise ValueError(f"No valid response received: {str(translation)}")

        return translation
|
|
|
|
|
def gpt_sequential(segments, model, target, source=None):
    """
    Translate segments one at a time with an OpenAI chat model, falling back
    to Google Translate for any segment whose request fails.

    Parameters:
    - segments (list): List of dictionaries; the 'text' and 'start' keys of
        each one are read.
    - model (str): Chat model name forwarded to `call_gpt_translate`.
    - target (str): Target language code; must be a key of
        INVERTED_LANGUAGES.
    - source (str, optional): Source language code. Defaults to None, in
        which case the prompt names no source language and the fallback
        translator uses "auto".

    Returns:
    - list: A deep copy of `segments` with translated, stripped 'text'
        values.

    Notes:
    - Waits 0.5 seconds before every request.
    - If the fallback translator does not return a string, the stripped
        original text is kept for that segment.
    """
    from openai import OpenAI

    translated_segments = copy.deepcopy(segments)

    client = OpenAI()

    # Language names for the prompt, with any parenthesised part removed.
    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
    lang_sc = ""
    if source:
        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()

    fixed_target = fix_code_language(target)
    fixed_source = fix_code_language(source) if source else "auto"
    fallback_translator = None  # built on the first failure, then reused

    system_prompt = "Machine translation designed to output the translated_text JSON."

    # The context manager closes the bar even if the fallback itself raises.
    with tqdm(total=len(segments), desc="Translating") as progress_bar:
        for line in translated_segments:
            text = line["text"].strip()
            start = line["start"]
            user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}"

            time.sleep(0.5)

            try:
                translated_text = call_gpt_translate(
                    client,
                    model,
                    system_prompt,
                    user_prompt,
                )

            except Exception as error:
                logger.error(
                    f"{str(error)} >> The text of segment {start} "
                    "is being corrected with Google Translate"
                )
                if fallback_translator is None:
                    fallback_translator = GoogleTranslator(
                        source=fixed_source, target=fixed_target
                    )
                translated_text = fallback_translator.translate(text)
                # NOTE(review): the translator appears able to return None
                # for some inputs - confirm; `.strip()` below needs a str.
                if not isinstance(translated_text, str):
                    translated_text = text

            line["text"] = translated_text.strip()
            progress_bar.update(1)

    return translated_segments
|
|
|
|
|
def gpt_batch(segments, model, target, token_batch_limit=900, source=None):
    """
    Translate segments in token-limited batches with an OpenAI chat model,
    falling back to Google Translate for any batch that fails.

    Parameters:
    - segments (list): List of dictionaries; the 'text', 'speaker' and
        'start' keys of each one are read. The last two characters of
        'speaker' must parse as an integer below 12.
    - model (str): Chat model name forwarded to `call_gpt_translate`.
    - target (str): Target language code; must be a key of
        INVERTED_LANGUAGES.
    - token_batch_limit (int, optional): Token budget per request. The
        effective per-batch limit is max(100, (token_batch_limit - 40) // 2).
    - source (str, optional): Source language code. Defaults to None, in
        which case the prompt names no source language and the fallback
        translator uses "auto".

    Returns:
    - list: The result of `verify_translate`, i.e. the translated segments,
        or a full iterative retranslation if the line count does not match.

    Notes:
    - Each line is sent as {code: text}, where code is a speaker letter
        (A-L) plus a per-batch running count for that speaker.
    - A batch is retried with Google Translate when the request raises,
        returns fewer lines than were sent, or returns a non-string value.
    """
    from openai import OpenAI
    import tiktoken

    # NOTE(review): presumably halved to leave room for the reply, which
    # shares the budget with the prompt - confirm.
    token_batch_limit = max(100, (token_batch_limit - 40) // 2)
    progress_bar = tqdm(total=len(segments), desc="Translating")
    segments_copy = copy.deepcopy(segments)
    encoding = tiktoken.get_encoding("cl100k_base")
    client = OpenAI()

    # Language names for the prompt, with any parenthesised part removed.
    lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip()
    lang_sc = ""
    if source:
        lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip()

    fixed_target = fix_code_language(target)
    fixed_source = fix_code_language(source) if source else "auto"

    name_speaker = "ABCDEFGHIJKL"

    translated_lines = []
    text_data_dict = []
    num_tokens = 0
    count_sk = {char: 0 for char in name_speaker}
    last_index = len(segments_copy) - 1

    try:
        for i, line in enumerate(segments_copy):
            text = line["text"]
            speaker = line["speaker"]
            last_start = line["start"]

            index_sk = int(speaker[-2:])
            character_sk = name_speaker[index_sk]
            count_sk[character_sk] += 1
            code_sk = character_sk + str(count_sk[character_sk])
            text_data_dict.append({code_sk: text})
            # NOTE(review): the +7 is presumably a per-line allowance for
            # the JSON wrapping around the text - confirm.
            num_tokens += len(encoding.encode(text)) + 7

            # Keep accumulating until the budget is reached or input ends.
            if num_tokens < token_batch_limit and i != last_index:
                continue

            batch_lines = len(text_data_dict)
            batch_conversation = {"conversation": text_data_dict}

            # Reset the accumulators for the next batch.
            num_tokens = 0
            text_data_dict = []
            count_sk = {char: 0 for char in name_speaker}

            try:
                system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items."
                user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}"
                logger.debug(f"Prompt: {str(user_prompt)}")

                conversation = call_gpt_translate(
                    client,
                    model,
                    system_prompt,
                    user_prompt,
                    original_text=batch_conversation,
                    batch_lines=batch_lines,
                )

                if len(conversation) < batch_lines:
                    raise ValueError(
                        "Incomplete result received. Batch lines: "
                        f"{len(conversation)}, expected: {batch_lines}"
                    )

                # Collect the whole batch first so that a failure never
                # leaves a partially appended batch behind; surplus lines
                # beyond `batch_lines` are dropped.
                batch_translations = [
                    list(translated_text.values())[0]
                    for translated_text in conversation[:batch_lines]
                ]
                if not all(
                    isinstance(item, str) for item in batch_translations
                ):
                    raise ValueError(
                        "Non-text translation received in batch"
                    )

                translated_lines.extend(batch_translations)
                progress_bar.update(batch_lines)

            except Exception as error:
                logger.error(str(error))

                first_start = segments_copy[
                    max(0, i - (batch_lines - 1))
                ]["start"]
                logger.warning(
                    f"The batch from {first_start} to {last_start} "
                    "failed, is being corrected with Google Translate"
                )

                translator = GoogleTranslator(
                    source=fixed_source,
                    target=fixed_target
                )

                for txt_source in batch_conversation["conversation"]:
                    source_txt = list(txt_source.values())[0].strip()
                    translated_txt = translator.translate(source_txt)
                    # NOTE(review): the translator appears able to return
                    # None for some inputs - confirm; keep the source text.
                    if not isinstance(translated_txt, str):
                        translated_txt = source_txt
                    translated_lines.append(translated_txt.strip())
                    progress_bar.update(1)
    finally:
        # Closed before verify_translate, whose fallback opens its own bar.
        progress_bar.close()

    return verify_translate(
        segments, segments_copy, translated_lines, fixed_target, fixed_source
    )
|
|
|
|
|
def translate_text(
    segments,
    target,
    translation_process="google_translator_batch",
    chunk_size=4500,
    source=None,
    token_batch_limit=1000,
):
    """
    Translate text segments with the backend named by `translation_process`.

    Parameters:
    - segments (list): Segment dictionaries handed to the chosen backend.
    - target (str): Target language code.
    - translation_process (str, optional): Backend name. Defaults to
        "google_translator_batch".
    - chunk_size (int, optional): Character limit per chunk; used only by
        "google_translator_batch".
    - source (str, optional): Source language code. Defaults to None.
    - token_batch_limit (int, optional): Token budget per request; used only
        by the GPT "_batch" backends.

    Returns:
    - list: The backend's translated segments, or `segments` itself for
        "disable_translation".

    Raises:
    - ValueError: If `translation_process` is not a known backend name.
    """
    if translation_process == "google_translator_batch":
        return translate_batch(
            segments,
            fix_code_language(target),
            chunk_size,
            fix_code_language(source)
        )

    if translation_process == "google_translator":
        return translate_iterative(
            segments,
            fix_code_language(target),
            fix_code_language(source)
        )

    # The GPT backends receive the raw language codes.
    if translation_process in ("gpt-3.5-turbo-0125", "gpt-4-turbo-preview"):
        return gpt_sequential(segments, translation_process, target, source)

    if translation_process in (
        "gpt-3.5-turbo-0125_batch",
        "gpt-4-turbo-preview_batch",
    ):
        # The model name is the process name without its "_batch" marker.
        model_name = translation_process.replace("_batch", "")
        return gpt_batch(
            segments,
            model_name,
            target,
            token_batch_limit,
            source
        )

    if translation_process == "disable_translation":
        return segments

    raise ValueError("No valid translation process")
|
|