Spaces:

mintlee
/

MT_deploy

Running

App Files Files Community

mintlee committed on Apr 3

Commit

73196e5

1 Parent(s): 0666452

update pptx

Browse files

Files changed (11) hide show

db/__pycache__/mongodb.cpython-310.pyc +0 -0
db/mongodb.py +1 -67
home.py +2 -2
pages/upload.py +12 -14
powerpoint/__pycache__/pptx.cpython-310.pyc +0 -0
powerpoint/__pycache__/xml_handling.cpython-310.pyc +0 -0
powerpoint/pptx.py +139 -0
powerpoint/xml_handling.py +513 -352
test.ipynb +0 -0
utils/__pycache__/utils.cpython-310.pyc +0 -0
utils/utils.py +247 -0

db/__pycache__/mongodb.cpython-310.pyc CHANGED Viewed

Binary files a/db/__pycache__/mongodb.cpython-310.pyc and b/db/__pycache__/mongodb.cpython-310.pyc differ

db/mongodb.py CHANGED Viewed

@@ -39,75 +39,9 @@ def save_file_to_mongodb(uploaded_file, db_name="ppt", collection_name="root_fil
     print(f"✅ File '{file_name}' đã được lưu vào '{collection_name}' với ID: {file_id}")
     client.close()
-    return file_id
-def delete_pptx_from_mongodb(file_id, db_name="ppt", collection_name="root_file"):
-    """
-    Xóa file PowerPoint khỏi MongoDB theo ID.
-    :param file_id: ID của file cần xóa (chuỗi hoặc ObjectId)
-    :param db_name: Tên database trong MongoDB
-    :param collection_name: Tên collection GridFS
-    """
-    # Kết nối đến MongoDB
-    client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
-    fs = gridfs.GridFS(db, collection=collection_name)
-    try:
-        # Chuyển đổi ID nếu cần
-        if not isinstance(file_id, ObjectId):
-            file_id = ObjectId(file_id)
-        # Kiểm tra file có tồn tại không
-        if fs.exists(file_id):
-            fs.delete(file_id)
-            print(f"✅ Đã xóa file với ID: {file_id}")
-        else:
-            print(f"⚠️ Không tìm thấy file với ID: {file_id}")
-    except Exception as e:
-        print(f"❌ Lỗi khi xóa file: {e}")
-    client.close()
-def download_pptx_from_mongodb(file_id, save_path, save_name, db_name="ppt", collection_name="root_file"):
-    """
-    Tải file PowerPoint từ MongoDB GridFS và lưu về máy.
-    :param file_id:       ID của file cần tải (dạng chuỗi hoặc ObjectId)
-    :param save_path:     Đường dẫn đến thư mục sẽ lưu file (VD: 'D:/output')
-    :param save_name:     Tên file khi lưu (VD: 'my_presentation.pptx')
-    :param db_name:       Tên database trong MongoDB (mặc định: 'ppt')
-    :param collection_name: Tên collection GridFS (mặc định: 'root_file')
-    """
-    # Đảm bảo thư mục lưu file tồn tại
-    os.makedirs(save_path, exist_ok=True)
-    # Tạo đường dẫn đầy đủ cho file
-    full_file_path = os.path.join(save_path, save_name)
-    # Kết nối đến MongoDB
-    client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
-    fs = gridfs.GridFS(db, collection=collection_name)
-    try:
-        # Chuyển đổi ID nếu cần
-        if not isinstance(file_id, ObjectId):
-            file_id = ObjectId(file_id)
-        # Lấy dữ liệu file từ GridFS
-        file_data = fs.get(file_id)
-        # Ghi dữ liệu ra file
-        with open(full_file_path, "wb") as f:
-            f.write(file_data.read())
-        print(f"✅ File đã được tải về: {full_file_path}")
-    except Exception as e:
-        print(f"❌ Lỗi khi tải file: {e}")
-    finally:
-        client.close()
 def save_xml_to_gridfs(xml_content, file_name, db_name="ppt", collection_name="original_xml"):
     """

     print(f"✅ File '{file_name}' đã được lưu vào '{collection_name}' với ID: {file_id}")
     client.close()
+    return file_id, file_name
 def save_xml_to_gridfs(xml_content, file_name, db_name="ppt", collection_name="original_xml"):
     """

home.py CHANGED Viewed

@@ -11,7 +11,7 @@ st.write(
     "Ứng dụng này giúp bạn dịch tài liệu một cách nhanh chóng và chính xác."
 )
-st.header("📌 Hướng dẫn sử dụng")
 st.write("1. Truy cập [trang upload](#).")
 st.write("2. Chọn tệp tin cần dịch (hỗ trợ .docx, .txt, .pdf).")
 st.write("3. Chọn ngôn ngữ muốn dịch sang.")
@@ -25,7 +25,7 @@ if to_upload:
     st.switch_page("pages/upload.py")  # Điều hướng đến trang upload
-st.header("🛠️ Patch Note")
 st.subheader("25/03/2025")
 st.write("1. Đã hoàn thành file Word, Excel")
 st.write("2. Đang tiến hành file PPTX (tiến độ 90% đã có thể dùng thử)")

     "Ứng dụng này giúp bạn dịch tài liệu một cách nhanh chóng và chính xác."
 )
+st.header("📌 Hướng dẫn sử dụng (đọc Patch Notes)")
 st.write("1. Truy cập [trang upload](#).")
 st.write("2. Chọn tệp tin cần dịch (hỗ trợ .docx, .txt, .pdf).")
 st.write("3. Chọn ngôn ngữ muốn dịch sang.")
     st.switch_page("pages/upload.py")  # Điều hướng đến trang upload
+st.header("🛠️ Patch Notes")
 st.subheader("25/03/2025")
 st.write("1. Đã hoàn thành file Word, Excel")
 st.write("2. Đang tiến hành file PPTX (tiến độ 90% đã có thể dùng thử)")

pages/upload.py CHANGED Viewed

@@ -1,11 +1,7 @@
 import streamlit as st
 import google.generativeai as genai
 from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
-from powerpoint.xml_handling import (
-    extract_text_from_xml, update_xml_with_translated_text_mongodb, ppt_to_xml_mongodb
-)
-from translate.translator import translate_text_dict
-from powerpoint.pptx_object import create_translated_ppt
 from excel.excel_translate import translate_xlsx, translate_csv
 from word.word_translate import translate_docx_from_mongodb
 import dotenv
@@ -18,22 +14,24 @@ genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
 st.title("Translate Your File Easily! 🌍")
 uploaded_file = st.file_uploader("📂 Chọn file để dịch")
-target_lang = st.selectbox("🌐 Chọn ngôn ngữ", ["english", "vietnamese"])
 def process_file(file, file_type):
     progress_bar = st.progress(0)
-    file_id = save_file_to_mongodb(uploaded_file=file, db_name=file_type.lower(), collection_name="root_file")
     progress_bar.progress(20)
     st.write(f"📂 File ID: {file_id}")
     if file_type == "PPTX":
-        xml_file_id = ppt_to_xml_mongodb(file_id)
-        progress_bar.progress(40)
-        text_dict = extract_text_from_xml(file_id=xml_file_id)
-        translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
-        progress_bar.progress(60)
-        final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
-        final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
     elif file_type == "Excel":
         final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
     elif file_type == "CSV":

 import streamlit as st
 import google.generativeai as genai
 from db.mongodb import save_file_to_mongodb, fetch_file_from_mongodb, detect_file_type
+from powerpoint.pptx import translate_pptx
 from excel.excel_translate import translate_xlsx, translate_csv
 from word.word_translate import translate_docx_from_mongodb
 import dotenv
 st.title("Translate Your File Easily! 🌍")
 uploaded_file = st.file_uploader("📂 Chọn file để dịch")
+source_lang = st.selectbox("🌐 Chọn ngôn ngữ của tài liệu", ["english", "vietnamese"])
+target_lang = st.selectbox("🌐 Chọn ngôn ngữ muốn dịch sang", ["english", "vietnamese"])
 def process_file(file, file_type):
     progress_bar = st.progress(0)
+    file_id, file_name = save_file_to_mongodb(uploaded_file=file, db_name=file_type.lower(), collection_name="root_file")
     progress_bar.progress(20)
     st.write(f"📂 File ID: {file_id}")
     if file_type == "PPTX":
+        final_id = translate_pptx(file_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5)
+        # progress_bar.progress(40)
+        # text_dict = extract_text_from_xml(file_id=xml_file_id)
+        # translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
+        # progress_bar.progress(60)
+        # final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
+        # final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
     elif file_type == "Excel":
         final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
     elif file_type == "CSV":

powerpoint/__pycache__/pptx.cpython-310.pyc ADDED Viewed

Binary file (4.32 kB). View file

powerpoint/__pycache__/xml_handling.cpython-310.pyc CHANGED Viewed

Binary files a/powerpoint/__pycache__/xml_handling.cpython-310.pyc and b/powerpoint/__pycache__/xml_handling.cpython-310.pyc differ

powerpoint/pptx.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import os
+import zipfile
+import shutil
+from pptx import Presentation
+from utils.utils import unzip_office_file, translate_text, preprocess_text, postprocess_text
+from powerpoint.xml_handling import *
+from pymongo import MongoClient
+import gridfs
+from bson import ObjectId
+from io import BytesIO
+def create_pptx_and_store_in_mongodb(temp_dir, pptx_filename):
+    """
+    Re-zip an unpacked PPTX directory in memory and store it in MongoDB GridFS.
+    The archive is assembled in a BytesIO buffer, so no .pptx file is written to disk.
+    :param temp_dir: Directory holding the unzipped PPTX package contents.
+    :param pptx_filename: Filename recorded on the GridFS entry.
+    :return: GridFS id of the stored file (database 'pptx', collection 'final_file').
+    """
+    pptx_buffer = BytesIO()
+    with zipfile.ZipFile(pptx_buffer, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        for root_dir, _, files in os.walk(temp_dir):
+            for file in files:
+                file_path = os.path.join(root_dir, file)
+                # Entry names must be relative to the package root, not absolute paths.
+                arcname = os.path.relpath(file_path, temp_dir)
+                zipf.write(file_path, arcname)
+    # Rewind so GridFS reads the archive from its first byte.
+    pptx_buffer.seek(0)
+    # NOTE(review): the connection string embeds credentials in source control;
+    # rotate them and load the URI from configuration instead.
+    client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
+    try:
+        db = client['pptx']
+        fs = gridfs.GridFS(db, collection='final_file')
+        file_id = fs.put(pptx_buffer, filename=pptx_filename)
+        print(f"PPTX đã được lưu vào MongoDB với ID: {file_id}")
+    finally:
+        # Always release the connection, even when the upload fails.
+        client.close()
+    return file_id
+def _pair_with_translations(original_items, translated_texts):
+    """Pair each original (text, rPr) tuple with its translation, keeping rPr."""
+    paired = []
+    for position, (original_text, rPr) in enumerate(original_items):
+        if position < len(translated_texts):
+            paired.append((translated_texts[position], rPr))
+        else:
+            # The translator returned too few items: keep the source text
+            # rather than blanking the run and silently losing content.
+            paired.append((original_text, rPr))
+    return paired
+def _translate_items(items, source_lang, target_lang):
+    """Translate the texts of (text, rPr) tuples in one request; return (translated_text, rPr) tuples."""
+    if not items:
+        # Nothing to translate: avoid a pointless translation request.
+        return []
+    joined_text = preprocess_text([text for text, _ in items])
+    translated_string = translate_text(joined_text, source_lang, target_lang)
+    return _pair_with_translations(items, postprocess_text(translated_string))
+def translate_and_replace_pptx(xml_folder, file_name, source_lang='vn', target_lang='en', slides_per_batch=5):
+    """
+    Translate the text of every slide (and linked SmartArt) in an unpacked PPTX, then store the result.
+    Slides are processed in batches of `slides_per_batch`; each batch sends its slide text and its
+    SmartArt text to the translator as two combined requests, and the translated strings are written
+    back into the XML files in place. The modified folder is finally re-zipped and saved to MongoDB.
+    :param xml_folder: Root directory of the unzipped PPTX package.
+    :param file_name: Filename to record for the translated PPTX.
+    :param source_lang: Source language code passed to translate_text.
+    :param target_lang: Target language code passed to translate_text.
+    :param slides_per_batch: Number of slides translated per request.
+    :return: Id returned by create_pptx_and_store_in_mongodb.
+    """
+    slides_dir = os.path.join(xml_folder, "ppt/slides")
+    rels_dir = os.path.join(xml_folder, "ppt/slides/_rels")
+    base_path = os.path.join(xml_folder, "ppt")
+    # Sort numerically on the slide number (slide2.xml before slide10.xml).
+    all_slides = sorted([f for f in os.listdir(slides_dir)
+                         if f.startswith("slide") and f.endswith(".xml")],
+                        key=lambda x: int(x[5:-4]))
+    # Process the deck one batch of slides at a time.
+    for batch_start in range(0, len(all_slides), slides_per_batch):
+        batch_slides = all_slides[batch_start:batch_start + slides_per_batch]
+        slide_text_mapping = {}
+        smartart_text_mapping = {}
+        smartart_paths = {}
+        for slide_file in batch_slides:
+            slide_index = int(slide_file[5:-4])
+            slide_path = os.path.join(slides_dir, slide_file)
+            slide_text_mapping[slide_index] = extract_text_from_slide(slide_path)  # list of (text, rPr) tuples
+            # SmartArt data is located through the slide's .rels file.
+            rels_file = os.path.join(rels_dir, slide_file + ".rels")
+            smartart_data_path = get_smartart_data_file(rels_file, base_path)
+            if smartart_data_path:
+                # Remember the path so the write-back step does not resolve it again.
+                smartart_paths[slide_index] = smartart_data_path
+                smartart_text_mapping[slide_index] = extract_text_from_smartart(smartart_data_path)  # list of (text, rPr) tuples
+        # Flatten the batch in slide order so translations can be sliced back per slide.
+        slide_order = sorted(slide_text_mapping)
+        smartart_order = sorted(smartart_text_mapping)
+        combined_slide_text_list = []
+        for slide_index in slide_order:
+            combined_slide_text_list.extend(slide_text_mapping[slide_index])
+        combined_smartart_text_list = []
+        for slide_index in smartart_order:
+            combined_smartart_text_list.extend(smartart_text_mapping[slide_index])
+        translated_slide_data = _translate_items(combined_slide_text_list, source_lang, target_lang)
+        translated_smartart_data = _translate_items(combined_smartart_text_list, source_lang, target_lang)
+        # Write the translated text back into each slide.
+        offset = 0
+        for slide_index in slide_order:
+            slide_path = os.path.join(slides_dir, f"slide{slide_index}.xml")
+            num_texts = len(slide_text_mapping[slide_index])
+            replace_text_in_slide(slide_path, translated_slide_data[offset:offset + num_texts])
+            offset += num_texts
+        # Write the translated text back into each SmartArt data part.
+        offset = 0
+        for slide_index in smartart_order:
+            num_texts = len(smartart_text_mapping[slide_index])
+            replace_text_in_smartart(smartart_paths[slide_index], translated_smartart_data[offset:offset + num_texts], None)
+            offset += num_texts
+    file_id = create_pptx_and_store_in_mongodb(xml_folder, file_name)
+    return file_id
+def translate_pptx(pptx_id, file_name, source_lang='vn', target_lang='en', slides_per_batch=5):
+    """
+    Fetch a PPTX from GridFS, translate it, and return the id of the translated file.
+    :param pptx_id: GridFS id of the source PPTX in 'pptx.root_file' (ObjectId or its string form).
+    :param file_name: Filename to record for the translated PPTX.
+    :param source_lang: Source language code forwarded to the translator.
+    :param target_lang: Target language code forwarded to the translator.
+    :param slides_per_batch: Number of slides translated per request.
+    :return: Id returned by translate_and_replace_pptx for the stored translated file.
+    """
+    # Accept string ids too, as the other MongoDB helpers in this project do.
+    if not isinstance(pptx_id, ObjectId):
+        pptx_id = ObjectId(pptx_id)
+    # NOTE(review): the connection string embeds credentials in source control;
+    # rotate them and load the URI from configuration instead.
+    client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
+    try:
+        db = client['pptx']
+        fs = gridfs.GridFS(db, collection='root_file')
+        pptx_bytes = BytesIO(fs.get(pptx_id).read())
+    finally:
+        # The file is fully in memory now; release the connection before the long translation step.
+        client.close()
+    xml_folder = unzip_office_file(pptx_bytes)
+    try:
+        file_id = translate_and_replace_pptx(xml_folder, file_name, source_lang, target_lang, slides_per_batch=slides_per_batch)
+    finally:
+        # Remove the unpacked folder even when translation fails.
+        shutil.rmtree(xml_folder)
+    return file_id

powerpoint/xml_handling.py CHANGED Viewed

@@ -1,377 +1,538 @@
-import xml.etree.ElementTree as ET
-from xml.dom import minidom
-import json
-from typing import Dict, List
-from concurrent.futures import ThreadPoolExecutor
-from pptx import Presentation
-from pptx.enum.shapes import MSO_SHAPE_TYPE
-from powerpoint.pptx_object import get_table_properties, get_shape_properties
-from pymongo import MongoClient
-import gridfs
-from bson import ObjectId
-from io import BytesIO
-gemini_api = "AIzaSyDtBIjTSfbvuEsobNwjtdyi9gVpDrCaWPM"
-def extract_text_from_group(group_shape, slide_number, shape_index, slide_element):
-    """Extracts text from shapes within a group, only adding the group if it contains text."""
-    group_element = ET.SubElement(slide_element, "group_element")
-    group_element.set("shape_index", str(shape_index))
-    group_element.set("group_name", group_shape.name)  # Add group name
-    group_has_text = False  # Flag to track if the group contains any text
-    for i, shape in enumerate(group_shape.shapes):
-        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
-            # Recursively check nested groups, and update group_has_text
-            if extract_text_from_group(shape, slide_number, i, group_element):
-                group_has_text = True
-        elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
-            table_element = ET.SubElement(group_element, "table_element")
-            table_element.set("shape_index", str(i))
-            table_data = get_table_properties(shape.table)
-            props_element = ET.SubElement(table_element, "properties")
-            props_element.text = json.dumps(table_data, indent=2)
-            group_has_text = True
-        elif hasattr(shape, "text_frame") and shape.text_frame:
-            text_element = ET.SubElement(group_element, "text_element")
-            text_element.set("shape_index", str(i))
-            shape_data = get_shape_properties(shape)
-            props_element = ET.SubElement(text_element, "properties")
-            props_element.text = json.dumps(shape_data, indent=2)
-            if shape_data.get("text") or (
-                "paragraphs" in shape_data
-                and any(p.get("text") for p in shape_data["paragraphs"])
-            ):
-                group_has_text = True
-    # Only keep the group element if it contains text
-    if not group_has_text:
-        slide_element.remove(group_element)
-        return False
-    return True
-def extract_text_from_slide(slide, slide_number, translate=False):
-    """Extract all text elements from a slide."""
-    slide_element = ET.Element("slide")
-    slide_element.set("number", str(slide_number))
-    for shape_index, shape in enumerate(slide.shapes):
-        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
-            extract_text_from_group(shape, slide_number, shape_index, slide_element)
-        elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
-            table_element = ET.SubElement(slide_element, "table_element")
-            table_element.set("shape_index", str(shape_index))
-            table_data = get_table_properties(shape.table)
-            props_element = ET.SubElement(table_element, "properties")
-            props_element.text = json.dumps(table_data, indent=2)
-        elif hasattr(shape, "text"):
-            text_element = ET.SubElement(slide_element, "text_element")
-            text_element.set("shape_index", str(shape_index))
-            shape_data = get_shape_properties(shape)
-            props_element = ET.SubElement(text_element, "properties")
-            props_element.text = json.dumps(shape_data, indent=2)
-    return slide_element
-def ppt_to_xml_mongodb(ppt_file_id: str, db_name="pptx"):
     """
-    Chuyển PowerPoint từ MongoDB thành XML và lưu vào MongoDB.
-    :param ppt_file_id: ID của file PPT gốc trong MongoDB (original_pptx)
-    :param db_name: Tên database MongoDB
-    :return: ID của file XML trong MongoDB (original_xml)
     """
-    # Kết nối MongoDB
-    client = MongoClient(
-        "mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0",
-        connectTimeoutMS=60000,  # 60 giây thay vì 20 giây
-        serverSelectionTimeoutMS=60000,  # Chờ phản hồi lâu hơn
-        socketTimeoutMS=60000,  # Tăng thời gian chờ socket
-        tls=True,
-        tlsAllowInvalidCertificates=True  # Giữ kết nối lâu hơn
-    )
-    db = client[db_name]
-    fs_ppt = gridfs.GridFS(db, collection="root_file")  # PPT gốc
-    fs_xml = gridfs.GridFS(db, collection="original_xml")  # XML lưu trữ
-    try:
-        # Lấy file PPT từ MongoDB
-        if not isinstance(ppt_file_id, ObjectId):
-            ppt_file_id = ObjectId(ppt_file_id)
-        ppt_file = fs_ppt.get(ppt_file_id)
-        prs = Presentation(BytesIO(ppt_file.read()))
-        # Tạo XML
-        root = ET.Element("presentation")
-        root.set("file_name", ppt_file.filename)
-        with ThreadPoolExecutor(max_workers=4) as executor:
-            future_to_slide = {
-                executor.submit(extract_text_from_slide, slide, slide_number): slide_number
-                for slide_number, slide in enumerate(prs.slides, 1)
-            }
-            for future in future_to_slide:
-                slide_number = future_to_slide[future]
-                try:
-                    slide_element = future.result()
-                    root.append(slide_element)
-                except Exception as e:
-                    print(f"Error processing slide {slide_number}: {str(e)}")
-        xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
-        # Lưu XML vào MongoDB
-        xml_output = BytesIO(xml_str.encode("utf-8"))
-        file_name = ppt_file.filename.replace(".pptx", ".xml")
-        xml_file_id = fs_xml.put(xml_output, filename=file_name)
-        print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
-        client.close()
-        return xml_file_id
     except Exception as e:
-        print(f"❌ Lỗi khi chuyển PPT sang XML: {str(e)}")
-        return None
-    finally:
-        client.close()
-def extract_text_from_xml(file_id=None, filename=None, db_name="pptx", collection_name="original_xml") -> Dict[str, List[str]]:
-    """
-    Tải XML từ MongoDB và trích xuất văn bản từ các slide.
-    :param file_id: ID của file trong MongoDB (dạng ObjectId hoặc string)
-    :param filename: Tên file cần tìm trong MongoDB (VD: "file.xml")
-    :param db_name: Tên database MongoDB
-    :param collection_name: Tên collection GridFS
-    :return: Dictionary {slide_number: [text1, text2, ...]}
     """
-    # Kết nối MongoDB
-    client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
-    fs = gridfs.GridFS(db, collection=collection_name)
     try:
-        # Tìm file theo file_id hoặc filename
-        if file_id:
-            if not isinstance(file_id, ObjectId):
-                file_id = ObjectId(file_id)
-            file_data = fs.get(file_id)
-        elif filename:
-            file_data = fs.find_one({"filename": filename})
-            if not file_data:
-                print(f"❌ Không tìm thấy file '{filename}' trong MongoDB!")
-                return {}
         else:
-            print("❌ Cần cung cấp 'file_id' hoặc 'filename' để tải file.")
-            return {}
-        # Đọc nội dung XML từ MongoDB
-        xml_content = file_data.read().decode("utf-8")
-        # print(f"✅ xml_content: {xml_content}")
-        # Chuyển đổi thành cây XML
-        root = ET.fromstring(xml_content)
-        slide_texts = {}
-        # Duyệt qua từng slide
-        for slide in root.findall("slide"):
-            slide_number = slide.get("number")
-            texts = []
-            # Helper function to extract text recursively
-            def extract_text_recursive(element):
-                if element.tag == "text_element":
-                    props = element.find("properties")
-                    if props is not None and props.text:
-                        try:
-                            shape_data = json.loads(props.text)
-                            # Handle both direct 'text' and paragraph-based text
-                            if 'text' in shape_data:
-                                texts.append(shape_data['text'])
-                            elif 'paragraphs' in shape_data:
-                                for paragraph in shape_data['paragraphs']:
-                                    if 'text' in paragraph:
-                                        texts.append(paragraph['text'])
-                                    #Also extract run level text
-                                    elif 'runs' in paragraph:
-                                        for run in paragraph['runs']:
-                                            if 'text' in run:
-                                                texts.append(run['text'])
-                        except json.JSONDecodeError:
-                            pass  # Ignore if JSON is invalid
-                elif element.tag == "table_element":
-                    props = element.find("properties")
-                    if props is not None and props.text:
-                        try:
-                            table_data = json.loads(props.text)
-                            for row in table_data.get("cells", []):
-                                for cell in row:
-                                    texts.append(cell.get("text", ""))
-                        except json.JSONDecodeError:
-                            pass  # Ignore if JSON is invalid
-                # Recursively process children of group_element
-                elif element.tag == "group_element":
-                    for child in element:
-                        extract_text_recursive(child)
-            # Iterate through all direct children of the slide
-            for child in slide:
-                extract_text_recursive(child)
-            slide_texts[str(slide_number)] = texts  # Ensure slide number is a string
-        print(slide_texts)
-        return slide_texts
     except Exception as e:
-        print(f"❌ Lỗi khi xử lý XML: {e}")
-        return {}
-    finally:
-        client.close()
-def adjust_size(original_text, translated_text, data_container):
-    """Adjust font size if translated text is significantly longer."""
-    if not original_text or not translated_text:
-        return
-    original_len = len(original_text)
-    translated_len = len(translated_text)
-    length_ratio = translated_len / original_len if original_len >0 else 1 # Avoid division by 0
-    if length_ratio > 1.5:  # Adjust threshold as needed
-        if 'paragraphs' in data_container:
-            for paragraph in data_container['paragraphs']:
-                if 'runs' in paragraph:
-                    for run in paragraph['runs']:
-                        if run.get('font') and run['font'].get('size'):
-                            run['font']['size'] = max(6, int(run['font']['size'] * 0.8))
-        elif 'font' in data_container and data_container['font'].get('size'):
-            data_container['font']['size'] = max(6, int(data_container['font']['size'] * 0.8))
-def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[str, List[str]], db_name="pptx"):
     """
-    Tải XML từ MongoDB (collection original_xml), cập nhật nội dung dịch, và lưu lại vào collection final_xml.
-    :param file_id: ID của file trong MongoDB (original_xml)
-    :param translated_dict: Dictionary {slide_number: [translated_text1, translated_text2, ...]}
-    :param db_name: Tên database MongoDB
     """
-    # Kết nối MongoDB
-    client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
-    db = client[db_name]
-    fs_original = gridfs.GridFS(db, collection="original_xml")  # Lấy file từ original_xml
-    fs_final = gridfs.GridFS(db, collection="final_xml")  # Lưu file vào final_xml
     try:
-        # Tải file từ MongoDB (original_xml)
-        if not isinstance(file_id, ObjectId):
-            file_id = ObjectId(file_id)
-        file_data = fs_original.get(file_id)
-        xml_content = file_data.read().decode("utf-8")
-        # Chuyển đổi XML string thành cây XML
-        root = ET.fromstring(xml_content)
-        # Cập nhật nội dung dịch
-        for slide in root.findall("slide"):
-            slide_num = slide.get("number")
-            if slide_num in translated_dict:
-                translated_texts = translated_dict[slide_num]
-                text_index = 0  # Keep track of the current translated text
-                def update_element_recursive(element):
-                    nonlocal text_index  # Access and modify the outer scope's index
-                    if element.tag == "text_element":
-                        props = element.find("properties")
-                        if props is not None and props.text:
-                            try:
-                                shape_data = json.loads(props.text)
-                                original_text = ""
-                                # Handle direct text and paragraph-based text
-                                if 'text' in shape_data:
-                                    original_text = shape_data['text']
-                                    if text_index < len(translated_texts):
-                                         shape_data['text'] = translated_texts[text_index]
-                                         adjust_size(original_text, translated_texts[text_index], shape_data)
-                                         text_index += 1
-                                elif 'paragraphs' in shape_data:
-                                    for paragraph in shape_data['paragraphs']:
-                                        if 'text' in paragraph:
-                                            original_text = paragraph['text']
-                                            if text_index < len(translated_texts):
-                                                paragraph['text'] = translated_texts[text_index]
-                                                adjust_size(original_text, translated_texts[text_index], paragraph)
-                                                text_index += 1
-                                        elif 'runs' in paragraph:
-                                            for run in paragraph['runs']:
-                                                if 'text' in run:
-                                                    original_text = run['text']
-                                                    if text_index < len(translated_texts):
-                                                        run['text'] = translated_texts[text_index]
-                                                        adjust_size(original_text, translated_texts[text_index], run)
-                                                        text_index += 1
-                                props.text = json.dumps(shape_data, indent=2)
-                            except json.JSONDecodeError:
-                                print(f"JSONDecodeError in text_element on slide {slide_num}")
-                    elif element.tag == "table_element":
-                        props = element.find("properties")
-                        if props is not None and props.text:
-                            try:
-                                table_data = json.loads(props.text)
-                                for row in table_data.get("cells", []):
-                                    for cell in row:
-                                        original_text = cell.get('text', '')
-                                        if text_index < len(translated_texts):
-                                            cell['text'] = translated_texts[text_index]
-                                            adjust_size(original_text, translated_texts[text_index], cell)
-                                            text_index += 1
-                                props.text = json.dumps(table_data, indent=2)
-                            except json.JSONDecodeError:
-                                print(f"JSONDecodeError in table_element on slide {slide_num}")
-                    elif element.tag == "group_element":
-                        print("Group element found")
-                        for child in element:
-                            update_element_recursive(child)  # Recursively process children
-                # Start the recursive update from the slide's direct children
-                for child in slide:
-                    update_element_recursive(child)
-        # Chuyển XML thành chuỗi và làm đẹp định dạng
-        updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent="  ")
-        # Lưu file cập nhật vào MongoDB (final_xml)
-        new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}")
-        print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
-        return new_file_id
     except Exception as e:
-        print(f"❌ Lỗi khi cập nhật XML: {e}")
-        return None
-    finally:
-        client.close()

+from lxml import etree as ET
+import copy  # Để tạo bản sao sâu của rPr
+import os
+import traceback # Để in chi tiết lỗi
+# --- Namespaces (unchanged) ---
+# Prefix -> URI map; passed as the namespace argument to the find()/findall()
+# calls in this module so that XPath expressions can use short prefixes.
+ns = {
+    'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
+    'p': "http://schemas.openxmlformats.org/presentationml/2006/main",
+    'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
+    'dgm': 'http://schemas.openxmlformats.org/drawingml/2006/diagram',
+    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships'
+}
+# --- Register namespaces (unchanged) ---
+# Every prefix except 'pr' is registered with the XML library at import time.
+for prefix, uri in ns.items():
+    if prefix != 'pr':
+        ET.register_namespace(prefix, uri)
+def _get_paragraph_details(p_element):
+    """Collect the merged text and the first run properties of one <a:p>.
+
+    Only the direct children of the paragraph are inspected:
+
+    * ``<a:r>`` runs  - text and rPr are searched at any depth inside the run;
+    * ``<a:fld>`` fields - text and rPr must be direct children of the field.
+
+    Args:
+        p_element (ET.Element): The <a:p> element.
+
+    Returns:
+        tuple | None: ``(merged_text, first_rPr)`` where ``merged_text`` is the
+        concatenation of every child's text, stripped, and ``first_rPr`` is the
+        <a:rPr> of the first child that carried text (``None`` when that child
+        has no rPr). ``None`` is returned when the paragraph has no text or
+        only whitespace.
+    """
+    # Per tag: (XPath of the text node, XPath of the run properties).
+    lookup_paths = {
+        f"{{{ns['a']}}}r": ('.//a:t', './/a:rPr'),
+        f"{{{ns['a']}}}fld": ('./a:t', './a:rPr'),
+    }
+    # One (text, rPr) pair per child that actually holds text, in document order.
+    collected = []
+    for node in p_element:
+        paths = lookup_paths.get(node.tag)
+        if paths is None:
+            continue  # neither a run nor a field
+        text_path, rpr_path = paths
+        text_node = node.find(text_path, ns)
+        if text_node is None or text_node.text is None:
+            continue
+        collected.append((text_node.text, node.find(rpr_path, ns)))
+    if not collected:
+        return None
+    merged_text = "".join(text for text, _ in collected).strip()
+    if not merged_text:
+        return None
+    # The formatting of the first text-bearing child represents the paragraph.
+    return (merged_text, collected[0][1])
+# --- Main extraction function (returns a list of per-paragraph detail tuples) ---
+def extract_text_from_slide(slide_file):
+    """Extract the text of every <a:p> paragraph found in a slide XML file.
+
+    Paragraphs are looked up inside the text bodies of shapes that are direct
+    children of the shape tree, shapes nested in top-level groups, and table
+    cells, in that order.
+
+    Args:
+        slide_file (str): Path to the slide XML file.
+
+    Returns:
+        list: Tuples ``(paragraph_text, first_rPr_in_paragraph)`` as produced
+        by ``_get_paragraph_details``; paragraphs without text are omitted.
+        An empty list is returned when the file is missing or cannot be
+        parsed. If an error occurs while walking the tree, the entries
+        gathered so far are returned.
+    """
+    results = []
+    if not os.path.exists(slide_file):
+        print(f"Lỗi: File không tồn tại: {slide_file}")
+        print(f"--- Kết thúc trích xuất file: {slide_file} (Lỗi) ---")
+        return results
+    try:
+        root = ET.parse(slide_file).getroot()
+    except ET.ParseError as e:
+        print(f"Lỗi parse XML file {slide_file}: {e}")
+        print(f"--- Kết thúc trích xuất file: {slide_file} (Lỗi Parse) ---")
+        return results
+    except Exception as e:
+        print(f"Lỗi không xác định khi parse {slide_file}: {e}")
+        print(f"--- Kết thúc trích xuất file: {slide_file} (Lỗi Parse không xác định) ---")
+        return results
+    try:
+        # 1. Gather every element that may own a text body.
+        top_level_shapes = root.findall('.//p:spTree/p:sp', ns)
+        grouped_shapes = [
+            shape
+            for group in root.findall('.//p:spTree/p:grpSp', ns)
+            for shape in group.findall('.//p:sp', ns)
+        ]
+        table_cells = root.findall('.//a:tbl//a:tc', ns)
+        # 2. Visit each text body once and read its direct <a:p> children.
+        seen_bodies = set()
+        for container in top_level_shapes + grouped_shapes + table_cells:
+            body = container.find('./p:txBody', ns)
+            if body is None:
+                body = container.find('./a:txBody', ns)
+            if body is None or body in seen_bodies:
+                continue
+            for paragraph in body.findall('a:p', ns):
+                details = _get_paragraph_details(paragraph)
+                if details:
+                    results.append(details)
+            seen_bodies.add(body)
+    except Exception as e:
+        print(f"Lỗi khi tìm kiếm hoặc trích xuất chi tiết <a:p>: {e}")
+    return results
+def _slide_text_paragraphs(root):
+    """Return the text-bearing <a:p> elements of a slide, in extraction order."""
+    containers = list(root.findall('.//p:spTree/p:sp', ns))
+    for grp_sp in root.findall('.//p:spTree/p:grpSp', ns):
+        containers.extend(grp_sp.findall('.//p:sp', ns))
+    containers.extend(root.findall('.//a:tbl//a:tc', ns))
+    seen_bodies = set()
+    paragraphs = []
+    for container in containers:
+        tx_body = container.find('./p:txBody', ns)
+        if tx_body is None:
+            tx_body = container.find('./a:txBody', ns)
+        if tx_body is None or tx_body in seen_bodies:
+            continue
+        seen_bodies.add(tx_body)
+        # Use the extraction helper itself as the filter so that extraction and
+        # replacement can never disagree on which paragraphs "have text";
+        # a disagreement would shift every later translation by one paragraph.
+        paragraphs.extend(
+            p for p in tx_body.findall('a:p', ns)
+            if _get_paragraph_details(p) is not None
+        )
+    return paragraphs
+def replace_text_in_slide(xml_file_path, list_of_translated_paragraph_data,
+                          size_scale=0.85, bold_max_len=10):
+    """Replace the text of a slide XML file in place with translated text.
+
+    Each text-bearing <a:p> (selected exactly as ``extract_text_from_slide``
+    does) has its <a:r>/<a:fld> children removed and replaced by a single run
+    holding the translated text. The run properties are a copy of the supplied
+    original rPr, adjusted as follows:
+
+    - the font size (``sz``), when present, is multiplied by ``size_scale``
+      and never goes below 100;
+    - when the translated text is longer than ``bold_max_len`` characters the
+      bold attribute (``b``) is dropped; shorter text keeps the original bold.
+      Letter case is never changed.
+
+    An empty translation leaves the paragraph without any run.
+
+    Args:
+        xml_file_path (str): Path of the slide XML file (overwritten).
+        list_of_translated_paragraph_data (list): Tuples
+            ``(translated_paragraph_text, original_first_rPr_in_paragraph)``.
+        size_scale (float): Font-size multiplier. Defaults to 0.85.
+        bold_max_len (int): Longest text that keeps its bold attribute.
+            Defaults to 10, the value the code has always used (earlier
+            comments mentioned 20, which did not match the implementation).
+
+    Returns:
+        bool: True on success (also when there was nothing to replace, in
+        which case the file is not rewritten), False on error.
+    """
+    if not os.path.exists(xml_file_path):
+        print(f"Lỗi: Không tìm thấy file XML nguồn '{xml_file_path}'.")
+        return False
+    rpr_tag = f"{{{ns['a']}}}rPr"
+    try:
+        tree = ET.parse(xml_file_path)
+        paragraphs_to_modify = _slide_text_paragraphs(tree.getroot())
+        # --- Check that paragraphs and translations line up ---
+        num_paragraphs_found = len(paragraphs_to_modify)
+        num_data_items = len(list_of_translated_paragraph_data)
+        if num_paragraphs_found == 0:
+            if num_data_items > 0:
+                print(f"Cảnh báo: Đã cung cấp {num_data_items} mục dữ liệu nhưng không có <a:p> nào để áp dụng.")
+            return True
+        if num_paragraphs_found != num_data_items:
+            num_items_to_process = min(num_paragraphs_found, num_data_items)
+            print(f"CẢNH BÁO [...]: Số lượng <a:p> ({num_paragraphs_found}) KHÔNG KHỚP dữ liệu dịch ({num_data_items}).")
+            print(f"=> Sẽ chỉ xử lý {num_items_to_process} mục đầu tiên.")
+        # zip() stops at the shorter sequence, i.e. after min(found, data) items.
+        pairs = zip(paragraphs_to_modify, list_of_translated_paragraph_data)
+        for i, (p_elem, data_item) in enumerate(pairs):
+            p_id = hex(id(p_elem))
+            try:
+                translated_text, original_rpr = data_item
+                # --- 1. Clean the text (strip only) ---
+                cleaned_text = translated_text.strip() if isinstance(translated_text, str) else ""
+                # --- 2. Start from a copy of the original rPr, or an empty one ---
+                if original_rpr is not None and ET.iselement(original_rpr) and original_rpr.tag == rpr_tag:
+                    try:
+                        final_rpr = copy.deepcopy(original_rpr)
+                    except Exception as clone_e:
+                        print(f"Lỗi sao chép rPr gốc cho <a:p> index {i} (ID {p_id}): {clone_e}")
+                        final_rpr = ET.Element(rpr_tag)
+                else:
+                    final_rpr = ET.Element(rpr_tag)
+                # --- 3. Shrink the font size when one is set ---
+                sz_text = final_rpr.get('sz')
+                if sz_text:
+                    try:
+                        final_rpr.set('sz', str(max(100, int(int(sz_text) * size_scale))))
+                    except ValueError:
+                        print(f"Cảnh báo: Không thể chuyển đổi sz='{sz_text}' thành số nguyên cho p_id {p_id}.")
+                # --- 4. Long text loses bold; short text keeps whatever it had ---
+                if len(cleaned_text) > bold_max_len:
+                    final_rpr.attrib.pop('b', None)
+                # --- 5. Drop the old runs and fields (direct children only) ---
+                for old_child in p_elem.findall('a:r', ns) + p_elem.findall('a:fld', ns):
+                    p_elem.remove(old_child)
+                # --- 6. Insert the single replacement run ---
+                if cleaned_text:
+                    new_r = ET.Element(f"{{{ns['a']}}}r")
+                    new_r.insert(0, final_rpr)
+                    new_t = ET.SubElement(new_r, f"{{{ns['a']}}}t")
+                    new_t.text = cleaned_text
+                    # The new run is placed before <a:endParaRPr> when the
+                    # paragraph has one, otherwise at the end.
+                    end_para_rpr = p_elem.find('./a:endParaRPr', ns)
+                    if end_para_rpr is not None:
+                        p_elem.insert(list(p_elem).index(end_para_rpr), new_r)
+                    else:
+                        p_elem.append(new_r)
+            except (IndexError, ValueError, TypeError) as data_err:
+                print(f"Lỗi lấy dữ liệu tại index {i}: {data_err}. Bỏ qua mục này.")
+            except Exception as p_replace_err:
+                print(f"Lỗi khi xử lý thay thế cho <a:p> tại index {i} (ID {p_id}): {p_replace_err}")
+        # --- Save the tree ---
+        try:
+            tree.write(xml_file_path, encoding='utf-8', xml_declaration=True, pretty_print=True)
+        except TypeError:
+            # Fallback for a writer that does not accept pretty_print.
+            tree.write(xml_file_path, encoding='utf-8', xml_declaration=True)
+        return True
+    except ET.ParseError as pe:
+        print(f"Lỗi parse XML file '{xml_file_path}': {pe}")
+        return False
+    except IOError as ioe:
+        print(f"Lỗi I/O với file '{xml_file_path}': {ioe}")
+        return False
+    except Exception as e:
+        print(f"Lỗi nghiêm trọng: {e}")
+        traceback.print_exc()
+        return False
+# --------------------------
+# 2. SmartArt handling
+# --------------------------
+def get_smartart_data_file(rels_file, base_path):
+    """Locate the SmartArt data file referenced by a .rels file.
+
+    The first relationship whose Type is ``diagramData`` and whose target
+    exists on disk wins. Every ``../`` in the target is dropped before it is
+    joined onto ``base_path``.
+
+    Args:
+        rels_file (str): Path to the .rels file.
+        base_path (str): Directory the relationship targets are resolved in.
+
+    Returns:
+        str | None: Normalized path of the data file, or None when the .rels
+        file is missing, has no usable diagramData relationship, or cannot be
+        read.
+    """
+    diagram_data_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData'
+    try:
+        if not os.path.exists(rels_file):
+            return None
+        relationships = ET.parse(rels_file).getroot().findall('pr:Relationship', ns)
+        for relationship in relationships:
+            target = relationship.attrib.get('Target')
+            if relationship.attrib.get('Type') != diagram_data_type or not target:
+                continue
+            candidate = os.path.normpath(os.path.join(base_path, target.replace("../", "")))
+            if os.path.exists(candidate):
+                return candidate
+            # Keep scanning: a later relationship may point at an existing file.
+            print(f"Cảnh báo: Tìm thấy relationship SmartArt nhưng file target không tồn tại: {candidate}")
+        return None
+    except ET.ParseError as e:
+        print(f"Lỗi parse XML file rels {rels_file}: {e}")
+        return None
+    except Exception as e:
+        print(f"Lỗi khi xử lý file rels {rels_file}: {e}")
+        return None
+def extract_text_from_smartart(xml_file_path):
+    """Extract the combined text of every text-bearing <a:p> in a SmartArt XML file.
+
+    Paragraphs and the runs inside them are searched at any depth. Only text
+    held in <a:r> runs is collected.
+
+    Args:
+        xml_file_path (str): Path to the SmartArt XML file.
+
+    Returns:
+        list: Tuples ``(paragraph_text, first_rPr_in_paragraph)``.
+        ``paragraph_text`` is the stripped concatenation of the run texts;
+        ``first_rPr_in_paragraph`` is the <a:rPr> of the first run whose text
+        is not blank (None if that run has no rPr). An empty list is returned
+        on any error.
+    """
+    results = []
+    try:
+        root = ET.parse(xml_file_path).getroot()
+        for paragraph in root.findall('.//a:p', ns):
+            pieces = []
+            first_rpr = None
+            rpr_pending = True  # becomes False once the first non-blank run is seen
+            for run in paragraph.findall('.//a:r', ns):
+                text_node = run.find('.//a:t', ns)
+                if text_node is None or text_node.text is None:
+                    continue
+                pieces.append(text_node.text)
+                if rpr_pending and text_node.text.strip():
+                    first_rpr = run.find('.//a:rPr', ns)
+                    rpr_pending = False
+            paragraph_text = "".join(pieces).strip()
+            if paragraph_text:
+                results.append((paragraph_text, first_rpr))
+    except FileNotFoundError:
+        print(f"Lỗi: Không tìm thấy file XML '{xml_file_path}'.")
+        return []
+    except ET.ParseError as pe:
+        print(f"Lỗi phân tích cú pháp XML file '{xml_file_path}': {pe}")
+        return []
+    except Exception as e:
+        print(f"Lỗi không xác định khi trích xuất text theo đoạn từ file '{xml_file_path}': {e}")
+        traceback.print_exc()
+        return []
+    return results
+# --- Replacement, one <a:p> paragraph at a time ---
+def _smartart_paragraph_has_run_text(p_elem):
+    """Tell whether extract_text_from_smartart would emit an entry for this <a:p>."""
+    for r_elem in p_elem.findall('.//a:r', ns):
+        t_elem = r_elem.find('.//a:t', ns)
+        if t_elem is not None and t_elem.text and t_elem.text.strip():
+            return True
+    return False
+def replace_text_in_smartart(xml_file_path, list_of_translated_paragraph_data, output_xml_file_path):
+    """Replace the text of a SmartArt XML file with translated paragraphs.
+
+    Every <a:p> that extraction reported (i.e. that has non-blank text inside
+    an <a:r> run) gets all of its runs removed and replaced by one new run
+    holding the translated text and a copy of the supplied rPr. Paragraphs
+    whose only text sits outside runs are left untouched, because extraction
+    never produced an entry for them; counting them here would shift every
+    following translation onto the wrong paragraph.
+
+    An empty translation leaves the paragraph without any run.
+
+    Args:
+        xml_file_path (str): Path of the source XML file.
+        list_of_translated_paragraph_data (list): Tuples
+            ``(translated_paragraph_text, original_first_rPr_in_paragraph)``.
+        output_xml_file_path (str): Path of the output file; a falsy value
+            means the source file is overwritten.
+
+    Returns:
+        bool: True on success, False on error.
+    """
+    if not output_xml_file_path:
+        output_xml_file_path = xml_file_path
+    rpr_tag = f"{{{ns['a']}}}rPr"
+    try:
+        tree = ET.parse(xml_file_path)
+        root = tree.getroot()
+        # child -> parent map, needed because runs are found at any depth
+        # and must be removed from their own parent.
+        parent_map = {c: p for p in root.iter() for c in p}
+        # Same document order and same selection rule as the extraction pass.
+        paragraphs_to_modify = [
+            p for p in root.findall('.//a:p', ns)
+            if _smartart_paragraph_has_run_text(p)
+        ]
+        num_paragraphs = len(paragraphs_to_modify)
+        num_data_items = len(list_of_translated_paragraph_data)
+        if num_paragraphs != num_data_items:
+            print(f"Cảnh báo [File: {os.path.basename(xml_file_path)}]: Số lượng <a:p> có text ({num_paragraphs}) "
+                  f"không khớp số lượng dữ liệu dịch ({num_data_items}). Thay thế có thể sai lệch.")
+        num_items_to_process = min(num_paragraphs, num_data_items)
+        for i in range(num_items_to_process):
+            p_elem = paragraphs_to_modify[i]
+            translated_text, original_first_rPr = list_of_translated_paragraph_data[i]
+            cleaned_translated_text = translated_text.strip() if translated_text else ""
+            # --- Remove the old runs of this paragraph ---
+            for r_elem in p_elem.findall('.//a:r', ns):
+                parent = parent_map.get(r_elem)
+                if parent is not None:
+                    try:
+                        parent.remove(r_elem)
+                    except ValueError:
+                        # Best effort: the run is no longer attached to the
+                        # parent recorded in the map; nothing left to remove.
+                        pass
+            if cleaned_translated_text:
+                new_r = ET.Element(f"{{{ns['a']}}}r")
+                # Reuse a copy of the original formatting when it is a real <a:rPr>.
+                applied_rPr = False
+                if original_first_rPr is not None and ET.iselement(original_first_rPr):
+                    if original_first_rPr.tag == rpr_tag:
+                        try:
+                            new_r.insert(0, copy.deepcopy(original_first_rPr))
+                            applied_rPr = True
+                        except Exception as clone_e:
+                            print(f"Lỗi sao chép rPr cho <a:p> index {i} (data index {i}): {clone_e}")
+                    else:
+                        print(f"Cảnh báo: Thẻ rPr gốc không phải <a:rPr> cho p_elem index {i}. Thẻ: {original_first_rPr.tag}")
+                if not applied_rPr:
+                    ET.SubElement(new_r, rpr_tag)  # empty rPr as a fallback
+                new_t = ET.SubElement(new_r, f"{{{ns['a']}}}t")
+                new_t.text = cleaned_translated_text
+                # The new run goes right before <a:endParaRPr> when the
+                # paragraph has one, otherwise at the end.
+                end_para_rpr = p_elem.find('./a:endParaRPr', ns)
+                if end_para_rpr is not None:
+                    try:
+                        p_elem.insert(list(p_elem).index(end_para_rpr), new_r)
+                    except ValueError:
+                        print(f"Cảnh báo: Không tìm thấy index của endParaRPr dù đã find thấy. Appending new_r cho p_elem {i}.")
+                        p_elem.append(new_r)
+                else:
+                    p_elem.append(new_r)
+        if num_items_to_process < num_data_items:
+            print(f"Cảnh báo [File: {os.path.basename(xml_file_path)}]: Còn {num_data_items - num_items_to_process} "
+                  f"mục dữ liệu dịch chưa được sử dụng do số lượng <a:p> không đủ.")
+        # --- Save the modified tree ---
+        for prefix, uri in ns.items():
+            ET.register_namespace(prefix, uri)
+        tree.write(output_xml_file_path, encoding='utf-8', xml_declaration=True)
+        return True
+    except FileNotFoundError:
+        print(f"Lỗi: Không tìm thấy file XML nguồn '{xml_file_path}'.")
+        return False
+    except ET.ParseError as pe:
+        print(f"Lỗi phân tích cú pháp XML file '{xml_file_path}': {pe}")
+        return False
+    except IOError as ioe:
+        print(f"Lỗi I/O khi ghi file '{output_xml_file_path}': {ioe}")
+        return False
+    except Exception as e:
+        print(f"Lỗi nghiêm trọng trong quá trình thay thế text SmartArt (theo đoạn) file '{xml_file_path}': {e}")
+        traceback.print_exc()
+        return False

test.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

utils/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (8.07 kB). View file

utils/utils.py ADDED Viewed

	@@ -0,0 +1,247 @@

+import os
+import zipfile
+import google.generativeai as genai
+import tempfile
+import io
+import json
+# The API key must not live in source control: read it from the environment.
+# Set GOOGLE_API_KEY before starting the app.
+genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
+def unzip_office_file(pptx_file: io.BytesIO):
+    """Unpack a PPTX archive held in memory into a fresh temporary directory.
+
+    Args:
+        pptx_file: The PPTX (zip) content as a BytesIO object.
+
+    Returns:
+        str: Path of the newly created directory (prefix ``pptx_extract_``)
+        that contains the extracted files. The caller is responsible for
+        deleting it.
+    """
+    extract_root = tempfile.mkdtemp(prefix="pptx_extract_")
+    with zipfile.ZipFile(pptx_file) as archive:
+        archive.extractall(path=extract_root)
+    return extract_root
+def translate_single_text(text, source_lang='English', target_lang="Vietnamese"):
+    """Translate one string with the Gemini model.
+
+    Args:
+        text: The text to translate; a falsy value short-circuits to "".
+        source_lang: Name of the source language.
+        target_lang: Name of the target language.
+
+    Returns:
+        str: The stripped model output, or "" when the input is empty or any
+        error occurs (the error is printed, not raised).
+    """
+    if not text:
+        return ""  # nothing to translate
+    generation_settings = {
+        'temperature': 0.7,
+        'top_p': 1,
+        'top_k': 1,
+    }
+    try:
+        # --- Minimal prompt: translation only ---
+        instructions = f"""You are a translation engine.
+        Translate the following text accurately from {source_lang} to {target_lang}.
+        Provide *only* the translated text as a single string.
+        Do NOT add any extra formatting, delimiters like '#', introductory phrases, or explanations."""
+        request_line = f"Source language: {source_lang}. Target language: {target_lang}. Text to translate: {text}"
+        prompt = "\n\n".join((instructions.strip(), request_line.strip()))
+        reply = genai.GenerativeModel('gemini-2.0-flash').generate_content(
+            contents=prompt,
+            generation_config=generation_settings,
+        )
+        return reply.text.strip()
+    except Exception as e:
+        print(f"Lỗi trong quá trình dịch (translate_single_text): {e}")
+        return ""  # empty string signals failure to the caller
+def preprocess_text(text_list):
+    """Map each position of a list to its item.
+
+    Args:
+        text_list: A list of strings.
+
+    Returns:
+        dict: ``{index: item}`` for every element of the list, or an empty
+        dict when the argument is not a list or is empty.
+    """
+    if isinstance(text_list, list) and text_list:
+        return dict(enumerate(text_list))
+    return {}
+def translate_text(text_dict, source_lang='English', target_lang="Vietnamese"):
+    """
+    Translates the values of a dictionary {index: text} using an LLM.
+    It uses an intermediate JSON string format for reliable LLM interaction.
+    Returns a dictionary {index: translated_text} with the same keys.
+    """
+    if not isinstance(text_dict, dict):
+        print("Warning: translate_text_dict expected a dict, received:", type(text_dict))
+        return {}
+    if not text_dict:
+        return {}
+    # --- Internal Helper: Convert Dictionary to JSON String for LLM ---
+    def _dict_to_json_string(d):
+        json_compatible = {str(k): v for k, v in d.items()}
+        try:
+            return json.dumps(json_compatible, ensure_ascii=False, separators=(',',':'))
+        except Exception as e:
+            print(f"Internal Error (_dict_to_json_string): {e}")
+            return "{}"
+    # --- Internal Helper: Convert LLM's JSON String Response to Dictionary ---
+    def _json_string_to_dict(s):
+        res_dict = {}
+        if not s or not isinstance(s, str): return {}
+        try:
+            raw = json.loads(s)
+            if not isinstance(raw, dict):
+                 print(f"Internal Warning (_json_string_to_dict): LLM response is not a JSON object: {s}")
+                 return {}
+            for k_str, v in raw.items():
+                try:
+                    res_dict[int(k_str)] = v
+                except ValueError:
+                    print(f"Internal Warning (_json_string_to_dict): Non-integer key '{k_str}' in LLM response.")
+        except json.JSONDecodeError as e:
+            print(f"Internal Error (_json_string_to_dict): Failed decoding JSON '{s}'. Error: {e}")
+        except Exception as e:
+             print(f"Internal Error (_json_string_to_dict): {e}")
+        return res_dict
+    # --- End Internal Helpers ---
+    # 1. Convert input dictionary to JSON string
+    json_input_string = _dict_to_json_string(text_dict)
+    print(f"Input JSON String: {json_input_string}") # Debugging output
+    if json_input_string == "{}":
+        print("Skipping translation due to empty input dictionary or conversion error.")
+        return {key: "" for key in text_dict} # Return original structure with empty values
+    system_prompt = f"""Translate the string values within the following JSON object .
+        Follow these instructions carefully:
+        1.  Analyze the entire JSON object to understand the context.
+        2.  Translate *only* the string values.
+        3.  Keep the original keys *exactly* as they are.
+        4.  Do *not* translate non-string values (like hex color codes, numbers, or potentially proper nouns like 'CALISTOGA', 'DM SANS', 'Pexels', 'Pixabay' unless they have a common translation). Use your best judgment for proper nouns.
+        5.  Preserve the original JSON structure perfectly.
+        6.  Your output *must* be only the translated JSON object, without any introductory text, explanations, or markdown formatting like ```json ... ```.
+    """
+    # 3. Construct User Prompt
+    user_prompt = f"Source language: {source_lang}. Target language: {target_lang}. JSON String: {json_input_string} \n\n Translated JSON Output:"
+    # 4. Call the LLM API
+    raw_translated_json_string = "{}" # Default to empty JSON string
+    try:
+        model = genai.GenerativeModel('gemini-2.0-flash')
+        full_prompt = f"{system_prompt.strip()}\n\n{user_prompt.strip()}"
+        response = model.generate_content(
+            contents=full_prompt,
+            generation_config={
+                'temperature': 0.3, # Low temp for adherence
+                'top_p': 1,
+                'top_k': 1,
+            }
+            # safety_settings=[...]
+        )
+        # Extract text safely and clean
+        if response and response.parts:
+             if hasattr(response.parts[0], 'text'):
+                 raw_translated_json_string = response.parts[0].text.strip()
+             else:
+                 print(f"Warning: Received response part without text attribute: {response.parts[0]}")
+                 try: raw_translated_json_string = str(response.parts[0])
+                 except Exception as str_e: print(f"Could not convert response part to string: {str_e}")
+        elif response and hasattr(response, 'text'):
+             raw_translated_json_string = response.text.strip()
+        else:
+             print(f"Warning: Received unexpected or empty response format from API: {response}")
+        # Clean potential markdown backticks
+        if raw_translated_json_string.startswith("```json"): raw_translated_json_string = raw_translated_json_string[7:]
+        if raw_translated_json_string.startswith("```"): raw_translated_json_string = raw_translated_json_string[3:]
+        if raw_translated_json_string.endswith("```"): raw_translated_json_string = raw_translated_json_string[:-3]
+        raw_translated_json_string = raw_translated_json_string.strip()
+        # Ensure it's at least plausible JSON before parsing
+        if not raw_translated_json_string: raw_translated_json_string = "{}"
+    except Exception as e:
+        print(f"Lỗi trong quá trình gọi API dịch: {e}")
+        raw_translated_json_string = "{}" # Ensure empty JSON on error
+    print(raw_translated_json_string)
+    # 5. Convert the LLM's JSON string response back to a dictionary
+    translated_intermediate_dict = _json_string_to_dict(raw_translated_json_string)
+    # 6. Validation: Ensure output dict has same keys as input dict
+    final_translated_dict = {}
+    missing_keys = []
+    for key in text_dict.keys(): # Iterate using ORIGINAL keys
+        if key in translated_intermediate_dict:
+            final_translated_dict[key] = translated_intermediate_dict[key]
+        else:
+            final_translated_dict[key] = "" # Preserve key, use empty string if missing
+            missing_keys.append(key)
+    if missing_keys:
+        print(f"Warning: LLM response was missing keys: {sorted(missing_keys)}. Filled with empty strings.")
+    extra_keys = set(translated_intermediate_dict.keys()) - set(text_dict.keys())
+    if extra_keys:
+        print(f"Warning: LLM response contained unexpected extra keys: {sorted(list(extra_keys))}. These were ignored.")
+    return final_translated_dict
+# Function 3: Dictionary -> List
+def postprocess_text(translated_dict):
+    """
+    Converts a dictionary {index: translated_text} back into a list of
+    strings, ordered by the index (key).
+
+    Every key is coerced with int(). Keys that cannot be coerced and
+    negative indices are skipped with a warning; when two keys coerce to
+    the same index, the value seen last (in dict iteration order) is kept.
+    Each text is always placed at its own index, so the returned list has
+    length ``max_index + 1`` and indices missing from the input are filled
+    with empty strings.
+
+    Args:
+        translated_dict: Mapping of index (int or int-like, e.g. "3") to
+            the translated text for that position.
+
+    Returns:
+        list: The texts positioned by index. An empty list is returned when
+        the input is not a dict, is empty, contains no usable index, or the
+        result list cannot be allocated.
+    """
+    if not isinstance(translated_dict, dict):
+        print("Warning: postprocess_text expected a dict, received:", type(translated_dict))
+        return []
+    if not translated_dict:
+        return []
+    # Collapse the input into {int_index: text}. Keying by index (instead of
+    # sorting (index, text) tuples) means duplicate indices can never force a
+    # comparison between two values, which may not be orderable.
+    texts_by_index = {}
+    for k, v in translated_dict.items():
+        try:
+            index = int(k)
+        except (ValueError, TypeError, OverflowError):
+            # OverflowError covers int(float('inf')).
+            print(f"Warning: postprocess cannot sort non-integer key '{k}', skipping.")
+            continue
+        if index < 0:
+            # A negative index has no position in the output list.
+            print(f"Warning: postprocess ignoring negative index {index}.")
+            continue
+        if index in texts_by_index:
+            print(f"Warning: postprocess found duplicate index {index}, keeping the last value.")
+        texts_by_index[index] = v
+    if not texts_by_index:
+        print("Warning: No sortable items found in dictionary for postprocessing.")
+        return []
+    expected_length = max(texts_by_index) + 1
+    if len(texts_by_index) != expected_length:
+        print(f"Warning: Index gaps detected in postprocessing. Expected {expected_length} items based on max index, got {len(texts_by_index)}.")
+    try:
+        # Pre-fill with empty strings so gaps keep their slot.
+        result_list = [""] * expected_length
+    except (MemoryError, OverflowError) as e:
+        # An absurdly large index (e.g. from a malformed LLM reply) must not
+        # crash the caller; keep the original best-effort contract.
+        print(f"Error during postprocessing sorting/list creation: {e}")
+        return [] # Return empty list on error
+    for index, text in texts_by_index.items():
+        result_list[index] = text
+    return result_list