Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

Upload excel_translate_bis.py

by Phong1 - opened May 28

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+196

-0

Files changed (1) hide show

excel/excel_translate_bis.py +196 -0

excel/excel_translate_bis.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import xlwings as xw
+from typing import Dict, List
+from translate.translator import translate_text_dict
+import math
+import chardet
+import io
+import pandas as pd
+import pymongo
+import gridfs
+from io import BytesIO
+import tempfile
+import os
+import openpyxl
+from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
+def translate_xlsx(file_id: str, target_lang: str = ""):
+    # Kết nối MongoDB
+    client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
+    db = client["excel"]
+    fs_input = gridfs.GridFS(db, collection="root_file")
+    fs_output = gridfs.GridFS(db, collection="final_file")
+    # Tải file từ MongoDB
+    file_data = fs_input.get(file_id)
+    # Lưu file tạm thời
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
+        temp_file.write(file_data.read())
+        temp_file_path = temp_file.name
+    # Đọc file Excel bằng openpyxl
+    wb = openpyxl.load_workbook(temp_file_path)
+    sheets = wb.worksheets
+    # 🔸 Translate Sheet Names
+    sheet_name_dict = {ws.title: [ws.title] for ws in sheets}
+    translated_sheet_names = translate_text_dict(sheet_name_dict, target_lang=target_lang)
+    for ws in sheets:
+        new_name = translated_sheet_names.get(ws.title, [ws.title])[0]
+        ws.title = new_name[:31]  # Excel sheet names must be ≤ 31 characters
+    for ws in sheets:
+        max_row = ws.max_row
+        max_col = ws.max_column
+        # Tạo dictionary lưu trữ nội dung cần dịch và mapping từ key đến cell
+        text_dict: Dict[str, List[str]] = {}
+        cell_map: Dict[str, any] = {}
+        for row in range(1, max_row + 1):
+            for col in range(1, max_col + 1):
+                cell = ws.cell(row=row, column=col)
+                if isinstance(cell.value, str):
+                    key = f"R{row}C{col}"
+                    text_dict[key] = [cell.value]
+                    cell_map[key] = cell
+        # 🔸 Extract Text Box Content
+        textbox_dict = {}
+        textbox_map = {}
+        if hasattr(ws, "_drawing") and ws._drawing:
+            for rel in ws._drawing._rels.values():
+                if isinstance(rel.target, SpreadsheetDrawing):
+                    for shape in rel.target.shape_tree:
+                        if hasattr(shape, "text") and shape.text:
+                            key = f"TX-{hash(shape.text)}"
+                            textbox_dict[key] = [shape.text]
+                            textbox_map[key] = shape
+        # 🔸 Merge all translatable text and translate
+        full_dict = {**text_dict, **textbox_dict}
+        translated_dict = translate_text_dict(full_dict, target_lang=target_lang)
+        # Cập nhật cell
+        for key, cell in cell_map.items():
+            if key in translated_dict:
+                translated_text = translated_dict[key][0]
+                cell.value = translated_text
+        # Cập nhật text box
+        for key, shape in textbox_map.items():
+            if key in translated_dict:
+                translated_text = translated_dict[key][0]
+                shape.text = translated_text
+    # Lưu workbook vào file tạm thời
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
+        wb.save(output_file.name)
+        output_file.seek(0)
+        translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
+    # Đóng workbook và xóa file tạm
+    wb.close()
+    os.remove(temp_file_path)
+    print(f"✅ Dịch thành công! File đã lưu vào MongoDB với file_id: {translated_file_id}")
+    return translated_file_id
+def read_csv_with_auto_encoding(csv_path):
+    # Đọc file dưới dạng nhị phân
+    with open(csv_path, "rb") as f:
+        raw_data = f.read()
+        # Dò tìm encoding
+        detect_result = chardet.detect(raw_data)
+        encoding = detect_result["encoding"]
+        confidence = detect_result["confidence"]
+        print(f"Chardet dự đoán file '{csv_path}' có encoding = {encoding} (độ tin cậy = {confidence})")
+        # Nếu chardet không phát hiện được, ta đặt fallback = 'utf-8'
+        if encoding is None:
+            encoding = "utf-8"
+    decoded_data = raw_data.decode(encoding, errors='replace')
+    # Sử dụng io.StringIO để chuyển đổi chuỗi thành đối tượng file-like
+    csv_data = io.StringIO(decoded_data)
+    df = pd.read_csv(csv_data)
+    return df
+def translate_csv(file_id, source_lang="en", target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
+    # Kết nối MongoDB
+    client = pymongo.MongoClient("mongodb://localhost:27017")
+    db = client[db_name]
+    fs_input = gridfs.GridFS(db, collection="root_file")
+    fs_output = gridfs.GridFS(db, collection="final_file")
+    # Tải file từ MongoDB
+    file_data = fs_input.get(file_id).read()
+    # Lưu file tạm thời
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
+        temp_file.write(file_data)
+        temp_file_path = temp_file.name
+    df = read_csv_with_auto_encoding(temp_file_path)
+    # If text_columns is not specified, we assume we want to translate everything that looks like text.
+    # Otherwise, only translate the given columns.
+    if text_columns is None:
+        # Example heuristic: choose all object/string columns
+        text_columns = df.select_dtypes(include=["object"]).columns.tolist()
+    num_rows = len(df)
+    num_chunks = math.ceil(num_rows / chunk_size)
+    translated_df = df.copy()  # copy to store the final translations
+    for chunk_index in range(num_chunks):
+        start_idx = chunk_index * chunk_size
+        end_idx = min((chunk_index + 1) * chunk_size, num_rows)
+        chunk_df = df.iloc[start_idx:end_idx]
+        # Build a dictionary structure. For example, row-based:
+        # {
+        #   "0": {"colA": "some text", "colB": "some text"},
+        #   "1": {"colA": "some text", "colB": "some text"},
+        #   ...
+        # }
+        chunk_dict = {}
+        for i, row in chunk_df.iterrows():
+            row_dict = {}
+            for col in text_columns:
+                row_dict[col] = str(row[col]) if pd.notnull(row[col]) else ""
+            chunk_dict[str(i)] = row_dict
+        # Now call your LLM translator on this dictionary
+        translated_chunk = translate_text_dict(
+            text_dict=chunk_dict,
+            source_lang=source_lang,
+            target_lang=target_lang,
+            gemini_api=gemini_api
+        )
+        # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
+        for i_str, row_data in translated_chunk.items():
+            i = int(i_str)
+            for col, translated_val in row_data.items():
+                translated_df.at[i, col] = translated_val
+    # Lưu file dịch vào tệp tạm thời
+    translated_file_path = temp_file_path.replace(".csv", f"_translated_{target_lang}.csv")
+    translated_df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')
+    # Đọc lại file tạm để lưu vào MongoDB
+    with open(translated_file_path, "rb") as f:
+        translated_file_id = fs_output.put(f, filename=f"translated_{file_id}.csv")
+    # Xóa file tạm
+    os.remove(temp_file_path)
+    os.remove(translated_file_path)
+    print(f"Translation complete! Saved to MongoDB with file_id: {translated_file_id}")
+    return translated_file_id