File size: 1,747 Bytes
ff93898
 
0e9ff78
 
 
 
ff93898
 
 
 
0e9ff78
 
ff93898
0e9ff78
ff93898
 
7da22a4
0e9ff78
 
 
ff93898
0e9ff78
 
 
 
 
 
 
 
ff93898
0e9ff78
 
 
 
804add3
7da22a4
0e9ff78
 
ff93898
0e9ff78
 
 
 
 
ff93898
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import io
import pandas as pd
from translate.translator import translate_text_dict
import math
import chardet

def read_csv_with_auto_encoding_from_bytes(csv_bytes) -> pd.DataFrame:
    """Parse CSV content into a DataFrame, guessing its text encoding.

    The encoding is detected with ``chardet``. UTF-8 is used when detection
    returns no encoding, or returns one that Python has no codec for.
    Bytes that cannot be decoded are replaced instead of raising.

    Args:
        csv_bytes: Either a binary file-like object (its ``read()`` result is
            passed to ``chardet.detect`` and decoded), or the raw
            ``bytes`` / ``bytearray`` / ``memoryview`` content itself.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    if isinstance(csv_bytes, (bytes, bytearray, memoryview)):
        raw_data = bytes(csv_bytes)
    else:
        raw_data = csv_bytes.read()

    # chardet reports None as the encoding when it cannot make a guess.
    encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
    try:
        decoded_data = raw_data.decode(encoding, errors="replace")
    except LookupError:
        # The detected name is not a codec known to Python; degrade to UTF-8
        # instead of failing the whole read.
        decoded_data = raw_data.decode("utf-8", errors="replace")

    return pd.read_csv(io.StringIO(decoded_data))

def translate_csv(
    file_bytes,
    file_name,
    source_lang: str,
    target_lang: str = "vi",
    chunk_size: int = 50,
) -> "tuple[io.BytesIO, str]":
    """Translate the text columns of a CSV and return it re-encoded as UTF-8.

    The CSV is read with ``read_csv_with_auto_encoding_from_bytes``. Text
    columns are sent to ``translate_text_dict`` in chunks of ``chunk_size``
    rows, as ``{row_label: {column: text}}`` with missing cells sent as
    ``""``. Translated values are written into a copy of the frame; cells the
    translator does not return keep their original value.

    Args:
        file_bytes: CSV content, in whatever form
            ``read_csv_with_auto_encoding_from_bytes`` accepts.
        file_name: Name passed back unchanged as the second return item.
        source_lang: Source language, forwarded to ``translate_text_dict``.
        target_lang: Target language, forwarded to ``translate_text_dict``.
        chunk_size: Maximum number of rows per translator call; must be >= 1.

    Returns:
        A ``(buffer, file_name)`` tuple, where ``buffer`` is a ``BytesIO``
        rewound to position 0 holding the CSV encoded as ``utf-8-sig``
        without the index column.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    df = read_csv_with_auto_encoding_from_bytes(file_bytes)
    # "string" keeps columns selected when pandas stores text with its
    # dedicated string dtype instead of object.
    text_columns = df.select_dtypes(include=["object", "string"]).columns.tolist()
    translated_df = df.copy()

    # Nothing to translate: skip the translator calls entirely.
    if text_columns:
        translatable = set(text_columns)

        for start_idx in range(0, len(df), chunk_size):
            chunk_df = df.iloc[start_idx:start_idx + chunk_size]
            rows = chunk_df[text_columns].to_dict(orient="index")

            # Row labels are stringified for the translator; remember the
            # real label so results can be matched back without parsing.
            labels_by_key = {str(label): label for label in rows}
            chunk_dict = {
                str(label): {
                    col: "" if pd.isnull(value) else str(value)
                    for col, value in row.items()
                }
                for label, row in rows.items()
            }

            # NOTE(review): assumes the translator returns the same
            # {row_key: {column: text}} shape it was given; verify.
            translated_chunk = translate_text_dict(
                text_dict=chunk_dict,
                source_lang=source_lang,
                target_lang=target_lang,
            )

            for key, row_data in translated_chunk.items():
                label = labels_by_key.get(str(key))
                if label is None:
                    # Not a row from this chunk; writing it with .at would
                    # append a bogus row to the output.
                    continue
                for col, translated_val in row_data.items():
                    # Same concern for columns: only overwrite cells that
                    # were actually sent for translation.
                    if col in translatable:
                        translated_df.at[label, col] = translated_val

    output_buffer = io.BytesIO()
    translated_df.to_csv(output_buffer, index=False, encoding='utf-8-sig')
    output_buffer.seek(0)
    return output_buffer, file_name