Spaces:
Running
Running
Upload excel_translate_bis.py
#1
by
Phong1
- opened
- excel/excel_translate_bis.py +196 -0
excel/excel_translate_bis.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import xlwings as xw
|
2 |
+
from typing import Dict, List
|
3 |
+
from translate.translator import translate_text_dict
|
4 |
+
import math
|
5 |
+
import chardet
|
6 |
+
import io
|
7 |
+
import pandas as pd
|
8 |
+
import pymongo
|
9 |
+
import gridfs
|
10 |
+
from io import BytesIO
|
11 |
+
import tempfile
|
12 |
+
import os
|
13 |
+
import openpyxl
|
14 |
+
from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
|
15 |
+
|
16 |
+
def translate_xlsx(file_id: str, target_lang: str = ""):
|
17 |
+
# Kết nối MongoDB
|
18 |
+
client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
|
19 |
+
db = client["excel"]
|
20 |
+
fs_input = gridfs.GridFS(db, collection="root_file")
|
21 |
+
fs_output = gridfs.GridFS(db, collection="final_file")
|
22 |
+
|
23 |
+
# Tải file từ MongoDB
|
24 |
+
file_data = fs_input.get(file_id)
|
25 |
+
|
26 |
+
# Lưu file tạm thời
|
27 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
|
28 |
+
temp_file.write(file_data.read())
|
29 |
+
temp_file_path = temp_file.name
|
30 |
+
|
31 |
+
# Đọc file Excel bằng openpyxl
|
32 |
+
wb = openpyxl.load_workbook(temp_file_path)
|
33 |
+
sheets = wb.worksheets
|
34 |
+
|
35 |
+
# 🔸 Translate Sheet Names
|
36 |
+
sheet_name_dict = {ws.title: [ws.title] for ws in sheets}
|
37 |
+
translated_sheet_names = translate_text_dict(sheet_name_dict, target_lang=target_lang)
|
38 |
+
for ws in sheets:
|
39 |
+
new_name = translated_sheet_names.get(ws.title, [ws.title])[0]
|
40 |
+
ws.title = new_name[:31] # Excel sheet names must be ≤ 31 characters
|
41 |
+
|
42 |
+
for ws in sheets:
|
43 |
+
max_row = ws.max_row
|
44 |
+
max_col = ws.max_column
|
45 |
+
|
46 |
+
# Tạo dictionary lưu trữ nội dung cần dịch và mapping từ key đến cell
|
47 |
+
text_dict: Dict[str, List[str]] = {}
|
48 |
+
cell_map: Dict[str, any] = {}
|
49 |
+
|
50 |
+
for row in range(1, max_row + 1):
|
51 |
+
for col in range(1, max_col + 1):
|
52 |
+
cell = ws.cell(row=row, column=col)
|
53 |
+
if isinstance(cell.value, str):
|
54 |
+
key = f"R{row}C{col}"
|
55 |
+
text_dict[key] = [cell.value]
|
56 |
+
cell_map[key] = cell
|
57 |
+
|
58 |
+
# 🔸 Extract Text Box Content
|
59 |
+
textbox_dict = {}
|
60 |
+
textbox_map = {}
|
61 |
+
if hasattr(ws, "_drawing") and ws._drawing:
|
62 |
+
for rel in ws._drawing._rels.values():
|
63 |
+
if isinstance(rel.target, SpreadsheetDrawing):
|
64 |
+
for shape in rel.target.shape_tree:
|
65 |
+
if hasattr(shape, "text") and shape.text:
|
66 |
+
key = f"TX-{hash(shape.text)}"
|
67 |
+
textbox_dict[key] = [shape.text]
|
68 |
+
textbox_map[key] = shape
|
69 |
+
|
70 |
+
# 🔸 Merge all translatable text and translate
|
71 |
+
full_dict = {**text_dict, **textbox_dict}
|
72 |
+
translated_dict = translate_text_dict(full_dict, target_lang=target_lang)
|
73 |
+
|
74 |
+
# Cập nhật cell
|
75 |
+
for key, cell in cell_map.items():
|
76 |
+
if key in translated_dict:
|
77 |
+
translated_text = translated_dict[key][0]
|
78 |
+
cell.value = translated_text
|
79 |
+
|
80 |
+
# Cập nhật text box
|
81 |
+
for key, shape in textbox_map.items():
|
82 |
+
if key in translated_dict:
|
83 |
+
translated_text = translated_dict[key][0]
|
84 |
+
shape.text = translated_text
|
85 |
+
|
86 |
+
# Lưu workbook vào file tạm thời
|
87 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
|
88 |
+
wb.save(output_file.name)
|
89 |
+
output_file.seek(0)
|
90 |
+
translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
|
91 |
+
|
92 |
+
# Đóng workbook và xóa file tạm
|
93 |
+
wb.close()
|
94 |
+
os.remove(temp_file_path)
|
95 |
+
|
96 |
+
print(f"✅ Dịch thành công! File đã lưu vào MongoDB với file_id: {translated_file_id}")
|
97 |
+
return translated_file_id
|
98 |
+
|
99 |
+
|
100 |
+
def read_csv_with_auto_encoding(csv_path):
|
101 |
+
# Đọc file dưới dạng nhị phân
|
102 |
+
with open(csv_path, "rb") as f:
|
103 |
+
raw_data = f.read()
|
104 |
+
# Dò tìm encoding
|
105 |
+
detect_result = chardet.detect(raw_data)
|
106 |
+
encoding = detect_result["encoding"]
|
107 |
+
confidence = detect_result["confidence"]
|
108 |
+
|
109 |
+
print(f"Chardet dự đoán file '{csv_path}' có encoding = {encoding} (độ tin cậy = {confidence})")
|
110 |
+
|
111 |
+
# Nếu chardet không phát hiện được, ta đặt fallback = 'utf-8'
|
112 |
+
if encoding is None:
|
113 |
+
encoding = "utf-8"
|
114 |
+
|
115 |
+
decoded_data = raw_data.decode(encoding, errors='replace')
|
116 |
+
|
117 |
+
# Sử dụng io.StringIO để chuyển đổi chuỗi thành đối tượng file-like
|
118 |
+
csv_data = io.StringIO(decoded_data)
|
119 |
+
df = pd.read_csv(csv_data)
|
120 |
+
return df
|
121 |
+
|
122 |
+
|
123 |
+
def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
|
124 |
+
# Kết nối MongoDB
|
125 |
+
client = pymongo.MongoClient("mongodb://localhost:27017")
|
126 |
+
db = client[db_name]
|
127 |
+
fs_input = gridfs.GridFS(db, collection="root_file")
|
128 |
+
fs_output = gridfs.GridFS(db, collection="final_file")
|
129 |
+
|
130 |
+
# Tải file từ MongoDB
|
131 |
+
file_data = fs_input.get(file_id).read()
|
132 |
+
|
133 |
+
# Lưu file tạm thời
|
134 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
|
135 |
+
temp_file.write(file_data)
|
136 |
+
temp_file_path = temp_file.name
|
137 |
+
|
138 |
+
df = read_csv_with_auto_encoding(temp_file_path)
|
139 |
+
|
140 |
+
# If text_columns is not specified, we assume we want to translate everything that looks like text.
|
141 |
+
# Otherwise, only translate the given columns.
|
142 |
+
if text_columns is None:
|
143 |
+
# Example heuristic: choose all object/string columns
|
144 |
+
text_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
145 |
+
|
146 |
+
num_rows = len(df)
|
147 |
+
num_chunks = math.ceil(num_rows / chunk_size)
|
148 |
+
|
149 |
+
translated_df = df.copy() # copy to store the final translations
|
150 |
+
|
151 |
+
for chunk_index in range(num_chunks):
|
152 |
+
start_idx = chunk_index * chunk_size
|
153 |
+
end_idx = min((chunk_index + 1) * chunk_size, num_rows)
|
154 |
+
chunk_df = df.iloc[start_idx:end_idx]
|
155 |
+
|
156 |
+
# Build a dictionary structure. For example, row-based:
|
157 |
+
# {
|
158 |
+
# "0": {"colA": "some text", "colB": "some text"},
|
159 |
+
# "1": {"colA": "some text", "colB": "some text"},
|
160 |
+
# ...
|
161 |
+
# }
|
162 |
+
chunk_dict = {}
|
163 |
+
for i, row in chunk_df.iterrows():
|
164 |
+
row_dict = {}
|
165 |
+
for col in text_columns:
|
166 |
+
row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
|
167 |
+
chunk_dict[str(i)] = row_dict
|
168 |
+
|
169 |
+
# Now call your LLM translator on this dictionary
|
170 |
+
translated_chunk = translate_text_dict(
|
171 |
+
text_dict=chunk_dict,
|
172 |
+
source_lang=source_lang,
|
173 |
+
target_lang=target_lang,
|
174 |
+
gemini_api=gemini_api
|
175 |
+
)
|
176 |
+
|
177 |
+
# 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
|
178 |
+
for i_str, row_data in translated_chunk.items():
|
179 |
+
i = int(i_str)
|
180 |
+
for col, translated_val in row_data.items():
|
181 |
+
translated_df.at[i, col] = translated_val
|
182 |
+
|
183 |
+
# Lưu file dịch vào tệp tạm thời
|
184 |
+
translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
|
185 |
+
translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')
|
186 |
+
|
187 |
+
# Đọc lại file tạm để lưu vào MongoDB
|
188 |
+
with open(translated_file_path, "rb") as f:
|
189 |
+
translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")
|
190 |
+
|
191 |
+
# Xóa file tạm
|
192 |
+
os.remove(temp_file_path)
|
193 |
+
os.remove(translated_file_path)
|
194 |
+
|
195 |
+
print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
|
196 |
+
return translated_file_id
|