Upload excel_translate_bis.py

#1
Files changed (1) hide show
  1. excel/excel_translate_bis.py +196 -0
excel/excel_translate_bis.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import xlwings as xw
2
+ from typing import Dict, List
3
+ from translate.translator import translate_text_dict
4
+ import math
5
+ import chardet
6
+ import io
7
+ import pandas as pd
8
+ import pymongo
9
+ import gridfs
10
+ from io import BytesIO
11
+ import tempfile
12
+ import os
13
+ import openpyxl
14
+ from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
15
+
16
+ def translate_xlsx(file_id: str, target_lang: str = ""):
17
+ # Kết nối MongoDB
18
+ client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
19
+ db = client["excel"]
20
+ fs_input = gridfs.GridFS(db, collection="root_file")
21
+ fs_output = gridfs.GridFS(db, collection="final_file")
22
+
23
+ # Tải file từ MongoDB
24
+ file_data = fs_input.get(file_id)
25
+
26
+ # Lưu file tạm thời
27
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
28
+ temp_file.write(file_data.read())
29
+ temp_file_path = temp_file.name
30
+
31
+ # Đọc file Excel bằng openpyxl
32
+ wb = openpyxl.load_workbook(temp_file_path)
33
+ sheets = wb.worksheets
34
+
35
+ # 🔸 Translate Sheet Names
36
+ sheet_name_dict = {ws.title: [ws.title] for ws in sheets}
37
+ translated_sheet_names = translate_text_dict(sheet_name_dict, target_lang=target_lang)
38
+ for ws in sheets:
39
+ new_name = translated_sheet_names.get(ws.title, [ws.title])[0]
40
+ ws.title = new_name[:31] # Excel sheet names must be ≤ 31 characters
41
+
42
+ for ws in sheets:
43
+ max_row = ws.max_row
44
+ max_col = ws.max_column
45
+
46
+ # Tạo dictionary lưu trữ nội dung cần dịch và mapping từ key đến cell
47
+ text_dict: Dict[str, List[str]] = {}
48
+ cell_map: Dict[str, any] = {}
49
+
50
+ for row in range(1, max_row + 1):
51
+ for col in range(1, max_col + 1):
52
+ cell = ws.cell(row=row, column=col)
53
+ if isinstance(cell.value, str):
54
+ key = f"R{row}C{col}"
55
+ text_dict[key] = [cell.value]
56
+ cell_map[key] = cell
57
+
58
+ # 🔸 Extract Text Box Content
59
+ textbox_dict = {}
60
+ textbox_map = {}
61
+ if hasattr(ws, "_drawing") and ws._drawing:
62
+ for rel in ws._drawing._rels.values():
63
+ if isinstance(rel.target, SpreadsheetDrawing):
64
+ for shape in rel.target.shape_tree:
65
+ if hasattr(shape, "text") and shape.text:
66
+ key = f"TX-{hash(shape.text)}"
67
+ textbox_dict[key] = [shape.text]
68
+ textbox_map[key] = shape
69
+
70
+ # 🔸 Merge all translatable text and translate
71
+ full_dict = {**text_dict, **textbox_dict}
72
+ translated_dict = translate_text_dict(full_dict, target_lang=target_lang)
73
+
74
+ # Cập nhật cell
75
+ for key, cell in cell_map.items():
76
+ if key in translated_dict:
77
+ translated_text = translated_dict[key][0]
78
+ cell.value = translated_text
79
+
80
+ # Cập nhật text box
81
+ for key, shape in textbox_map.items():
82
+ if key in translated_dict:
83
+ translated_text = translated_dict[key][0]
84
+ shape.text = translated_text
85
+
86
+ # Lưu workbook vào file tạm thời
87
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
88
+ wb.save(output_file.name)
89
+ output_file.seek(0)
90
+ translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
91
+
92
+ # Đóng workbook và xóa file tạm
93
+ wb.close()
94
+ os.remove(temp_file_path)
95
+
96
+ print(f"✅ Dịch thành công! File đã lưu vào MongoDB với file_id: {translated_file_id}")
97
+ return translated_file_id
98
+
99
+
100
+ def read_csv_with_auto_encoding(csv_path):
101
+ # Đọc file dưới dạng nhị phân
102
+ with open(csv_path, "rb") as f:
103
+ raw_data = f.read()
104
+ # Dò tìm encoding
105
+ detect_result = chardet.detect(raw_data)
106
+ encoding = detect_result["encoding"]
107
+ confidence = detect_result["confidence"]
108
+
109
+ print(f"Chardet dự đoán file '{csv_path}' có encoding = {encoding} (độ tin cậy = {confidence})")
110
+
111
+ # Nếu chardet không phát hiện được, ta đặt fallback = 'utf-8'
112
+ if encoding is None:
113
+ encoding = "utf-8"
114
+
115
+ decoded_data = raw_data.decode(encoding, errors='replace')
116
+
117
+ # Sử dụng io.StringIO để chuyển đổi chuỗi thành đối tượng file-like
118
+ csv_data = io.StringIO(decoded_data)
119
+ df = pd.read_csv(csv_data)
120
+ return df
121
+
122
+
123
+ def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
124
+ # Kết nối MongoDB
125
+ client = pymongo.MongoClient("mongodb://localhost:27017")
126
+ db = client[db_name]
127
+ fs_input = gridfs.GridFS(db, collection="root_file")
128
+ fs_output = gridfs.GridFS(db, collection="final_file")
129
+
130
+ # Tải file từ MongoDB
131
+ file_data = fs_input.get(file_id).read()
132
+
133
+ # Lưu file tạm thời
134
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
135
+ temp_file.write(file_data)
136
+ temp_file_path = temp_file.name
137
+
138
+ df = read_csv_with_auto_encoding(temp_file_path)
139
+
140
+ # If text_columns is not specified, we assume we want to translate everything that looks like text.
141
+ # Otherwise, only translate the given columns.
142
+ if text_columns is None:
143
+ # Example heuristic: choose all object/string columns
144
+ text_columns = df.select_dtypes(include=["object"]).columns.tolist()
145
+
146
+ num_rows = len(df)
147
+ num_chunks = math.ceil(num_rows / chunk_size)
148
+
149
+ translated_df = df.copy() # copy to store the final translations
150
+
151
+ for chunk_index in range(num_chunks):
152
+ start_idx = chunk_index * chunk_size
153
+ end_idx = min((chunk_index + 1) * chunk_size, num_rows)
154
+ chunk_df = df.iloc[start_idx:end_idx]
155
+
156
+ # Build a dictionary structure. For example, row-based:
157
+ # {
158
+ # "0": {"colA": "some text", "colB": "some text"},
159
+ # "1": {"colA": "some text", "colB": "some text"},
160
+ # ...
161
+ # }
162
+ chunk_dict = {}
163
+ for i, row in chunk_df.iterrows():
164
+ row_dict = {}
165
+ for col in text_columns:
166
+ row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
167
+ chunk_dict[str(i)] = row_dict
168
+
169
+ # Now call your LLM translator on this dictionary
170
+ translated_chunk = translate_text_dict(
171
+ text_dict=chunk_dict,
172
+ source_lang=source_lang,
173
+ target_lang=target_lang,
174
+ gemini_api=gemini_api
175
+ )
176
+
177
+ # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
178
+ for i_str, row_data in translated_chunk.items():
179
+ i = int(i_str)
180
+ for col, translated_val in row_data.items():
181
+ translated_df.at[i, col] = translated_val
182
+
183
+ # Lưu file dịch vào tệp tạm thời
184
+ translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
185
+ translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')
186
+
187
+ # Đọc lại file tạm để lưu vào MongoDB
188
+ with open(translated_file_path, "rb") as f:
189
+ translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")
190
+
191
+ # Xóa file tạm
192
+ os.remove(temp_file_path)
193
+ os.remove(translated_file_path)
194
+
195
+ print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
196
+ return translated_file_id