Spaces:
Running
Running
- .env +1 -1
- excel/__pycache__/excel_translate.cpython-310.pyc +0 -0
- excel/excel_translate.py +14 -15
- pages/upload.py +3 -3
- powerpoint/__pycache__/pptx_object.cpython-310.pyc +0 -0
- powerpoint/__pycache__/xml_handling.cpython-310.pyc +0 -0
- powerpoint/pptx_object.py +12 -15
- powerpoint/xml_handling.py +3 -2
- translate/__pycache__/translator.cpython-310.pyc +0 -0
- translate/translator.py +1 -1
.env
CHANGED
@@ -1 +1 @@
|
|
1 |
-
GEMINI_API_KEY =
|
|
|
1 |
+
GEMINI_API_KEY = AIzaSyAk1LTwWMZyTfPAKmsn6JzFtI1MpnI7FH8
|
excel/__pycache__/excel_translate.cpython-310.pyc
CHANGED
Binary files a/excel/__pycache__/excel_translate.cpython-310.pyc and b/excel/__pycache__/excel_translate.cpython-310.pyc differ
|
|
excel/excel_translate.py
CHANGED
@@ -10,26 +10,27 @@ import gridfs
|
|
10 |
import tempfile
|
11 |
import os
|
12 |
|
13 |
-
def translate_xlsx(file_id: str,
|
14 |
# Kết nối MongoDB
|
15 |
client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
|
16 |
-
db = client[
|
17 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
18 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
19 |
|
20 |
# Tải file từ MongoDB
|
21 |
-
file_data = fs_input.get(file_id)
|
|
|
22 |
|
23 |
# Lưu file tạm thời
|
24 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
|
25 |
-
temp_file.write(file_data)
|
26 |
temp_file_path = temp_file.name
|
27 |
|
28 |
# Đọc file Excel bằng openpyxl
|
29 |
wb = openpyxl.load_workbook(temp_file_path)
|
30 |
|
31 |
-
|
32 |
-
sheets =
|
33 |
|
34 |
for ws in sheets:
|
35 |
max_row = ws.max_row
|
@@ -48,7 +49,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
|
|
48 |
cell_map[key] = cell
|
49 |
|
50 |
# Gọi hàm dịch theo dạng bulk
|
51 |
-
translated_dict = translate_text_dict(text_dict, target_lang=target_lang
|
52 |
|
53 |
# Cập nhật lại các cell với nội dung đã dịch
|
54 |
for key, cell in cell_map.items():
|
@@ -61,7 +62,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
|
|
61 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
|
62 |
wb.save(output_file.name)
|
63 |
output_file.seek(0)
|
64 |
-
translated_file_id = fs_output.put(output_file.read(), filename=
|
65 |
|
66 |
# Đóng workbook và xóa file tạm
|
67 |
wb.close()
|
@@ -94,10 +95,10 @@ def read_csv_with_auto_encoding(csv_path):
|
|
94 |
return df
|
95 |
|
96 |
|
97 |
-
def translate_csv(file_id, target_lang="vi",
|
98 |
# Kết nối MongoDB
|
99 |
client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
|
100 |
-
db = client[
|
101 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
102 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
103 |
|
@@ -113,9 +114,8 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
|
|
113 |
|
114 |
# If text_columns is not specified, we assume we want to translate everything that looks like text.
|
115 |
# Otherwise, only translate the given columns.
|
116 |
-
|
117 |
-
|
118 |
-
text_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
119 |
|
120 |
num_rows = len(df)
|
121 |
num_chunks = math.ceil(num_rows / chunk_size)
|
@@ -143,8 +143,7 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
|
|
143 |
# Now call your LLM translator on this dictionary
|
144 |
translated_chunk = translate_text_dict(
|
145 |
text_dict=chunk_dict,
|
146 |
-
target_lang=target_lang
|
147 |
-
gemini_api=gemini_api
|
148 |
)
|
149 |
|
150 |
# 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
|
|
|
10 |
import tempfile
|
11 |
import os
|
12 |
|
13 |
+
def translate_xlsx(file_id: str, target_lang: str = ""):
|
14 |
# Kết nối MongoDB
|
15 |
client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
|
16 |
+
db = client["excel"]
|
17 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
18 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
19 |
|
20 |
# Tải file từ MongoDB
|
21 |
+
file_data = fs_input.get(file_id)
|
22 |
+
|
23 |
|
24 |
# Lưu file tạm thời
|
25 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
|
26 |
+
temp_file.write(file_data.read())
|
27 |
temp_file_path = temp_file.name
|
28 |
|
29 |
# Đọc file Excel bằng openpyxl
|
30 |
wb = openpyxl.load_workbook(temp_file_path)
|
31 |
|
32 |
+
|
33 |
+
sheets = wb.worksheets # Chọn tất cả sheets nếu sheet_name không hợp lệ
|
34 |
|
35 |
for ws in sheets:
|
36 |
max_row = ws.max_row
|
|
|
49 |
cell_map[key] = cell
|
50 |
|
51 |
# Gọi hàm dịch theo dạng bulk
|
52 |
+
translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
53 |
|
54 |
# Cập nhật lại các cell với nội dung đã dịch
|
55 |
for key, cell in cell_map.items():
|
|
|
62 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
|
63 |
wb.save(output_file.name)
|
64 |
output_file.seek(0)
|
65 |
+
translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
|
66 |
|
67 |
# Đóng workbook và xóa file tạm
|
68 |
wb.close()
|
|
|
95 |
return df
|
96 |
|
97 |
|
98 |
+
def translate_csv(file_id, target_lang="vi", chunk_size=50):
|
99 |
# Kết nối MongoDB
|
100 |
client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
|
101 |
+
db = client["csv"]
|
102 |
fs_input = gridfs.GridFS(db, collection="root_file")
|
103 |
fs_output = gridfs.GridFS(db, collection="final_file")
|
104 |
|
|
|
114 |
|
115 |
# If text_columns is not specified, we assume we want to translate everything that looks like text.
|
116 |
# Otherwise, only translate the given columns.
|
117 |
+
|
118 |
+
text_columns = df.select_dtypes(include=["object"]).columns.tolist()
|
|
|
119 |
|
120 |
num_rows = len(df)
|
121 |
num_chunks = math.ceil(num_rows / chunk_size)
|
|
|
143 |
# Now call your LLM translator on this dictionary
|
144 |
translated_chunk = translate_text_dict(
|
145 |
text_dict=chunk_dict,
|
146 |
+
target_lang=target_lang
|
|
|
147 |
)
|
148 |
|
149 |
# 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
|
pages/upload.py
CHANGED
@@ -33,11 +33,11 @@ def process_file(file, file_type):
|
|
33 |
translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
34 |
progress_bar.progress(60)
|
35 |
final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
|
36 |
-
final_id = create_translated_ppt("
|
37 |
elif file_type == "Excel":
|
38 |
-
final_id = translate_xlsx(file_id
|
39 |
elif file_type == "CSV":
|
40 |
-
final_id = translate_csv(file_id
|
41 |
elif file_type == "Word":
|
42 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
43 |
else:
|
|
|
33 |
translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
|
34 |
progress_bar.progress(60)
|
35 |
final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
|
36 |
+
final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
|
37 |
elif file_type == "Excel":
|
38 |
+
final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
|
39 |
elif file_type == "CSV":
|
40 |
+
final_id = translate_csv(file_id = file_id, target_lang = target_lang)
|
41 |
elif file_type == "Word":
|
42 |
final_id = translate_docx_from_mongodb(file_id, target_lang)
|
43 |
else:
|
powerpoint/__pycache__/pptx_object.cpython-310.pyc
CHANGED
Binary files a/powerpoint/__pycache__/pptx_object.cpython-310.pyc and b/powerpoint/__pycache__/pptx_object.cpython-310.pyc differ
|
|
powerpoint/__pycache__/xml_handling.cpython-310.pyc
CHANGED
Binary files a/powerpoint/__pycache__/xml_handling.cpython-310.pyc and b/powerpoint/__pycache__/xml_handling.cpython-310.pyc differ
|
|
powerpoint/pptx_object.py
CHANGED
@@ -283,7 +283,8 @@ def get_file_from_mongodb(db_name, collection_name, file_id):
|
|
283 |
db = client[db_name]
|
284 |
fs = GridFS(db, collection_name)
|
285 |
file_data = fs.get(file_id)
|
286 |
-
return
|
|
|
287 |
|
288 |
|
289 |
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
@@ -292,18 +293,19 @@ def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
|
292 |
db = client[db_name]
|
293 |
fs = GridFS(db, collection_name)
|
294 |
file_id = fs.put(file_data, filename=file_name)
|
|
|
295 |
return file_id
|
296 |
|
297 |
def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
|
298 |
"""Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
|
299 |
try:
|
300 |
# Kết nối MongoDB và tải file
|
301 |
-
|
302 |
-
|
303 |
|
304 |
# Load PowerPoint gốc và XML dịch
|
305 |
-
prs = Presentation(
|
306 |
-
tree = ET.parse(
|
307 |
root = tree.getroot()
|
308 |
|
309 |
# Áp dụng bản dịch
|
@@ -335,23 +337,18 @@ def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_co
|
|
335 |
except Exception as e:
|
336 |
print(f"Error applying shape properties: {str(e)}")
|
337 |
|
338 |
-
# Lưu PowerPoint vào MongoDB
|
339 |
output_io = BytesIO()
|
340 |
prs.save(output_io)
|
341 |
output_io.seek(0) # Reset vị trí đọc
|
342 |
|
343 |
-
|
|
|
|
|
|
|
344 |
print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
|
345 |
|
346 |
return file_id
|
347 |
except Exception as e:
|
348 |
print(f"Error creating translated PowerPoint: {str(e)}")
|
349 |
return None
|
350 |
-
|
351 |
-
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
352 |
-
"""Lưu tệp vào MongoDB GridFS"""
|
353 |
-
client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
|
354 |
-
db = client[db_name]
|
355 |
-
fs = GridFS(db, collection_name)
|
356 |
-
file_id = fs.put(file_data, filename=file_name)
|
357 |
-
return file_id
|
|
|
283 |
db = client[db_name]
|
284 |
fs = GridFS(db, collection_name)
|
285 |
file_data = fs.get(file_id)
|
286 |
+
return file_data
|
287 |
+
# return BytesIO(file_data.read())
|
288 |
|
289 |
|
290 |
def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
|
|
|
293 |
db = client[db_name]
|
294 |
fs = GridFS(db, collection_name)
|
295 |
file_id = fs.put(file_data, filename=file_name)
|
296 |
+
client.close()
|
297 |
return file_id
|
298 |
|
299 |
def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
|
300 |
"""Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
|
301 |
try:
|
302 |
# Kết nối MongoDB và tải file
|
303 |
+
original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
|
304 |
+
translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
|
305 |
|
306 |
# Load PowerPoint gốc và XML dịch
|
307 |
+
prs = Presentation(BytesIO(original_ppt.read()))
|
308 |
+
tree = ET.parse(BytesIO(translated_xml.read()))
|
309 |
root = tree.getroot()
|
310 |
|
311 |
# Áp dụng bản dịch
|
|
|
337 |
except Exception as e:
|
338 |
print(f"Error applying shape properties: {str(e)}")
|
339 |
|
340 |
+
# Lưu PowerPoint vào MongoDB với tên gốc
|
341 |
output_io = BytesIO()
|
342 |
prs.save(output_io)
|
343 |
output_io.seek(0) # Reset vị trí đọc
|
344 |
|
345 |
+
# Giữ nguyên tên file gốc, thêm hậu tố "_translated"
|
346 |
+
translated_filename = original_ppt.filename.replace(".xml", ".pptx")
|
347 |
+
|
348 |
+
file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
|
349 |
print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
|
350 |
|
351 |
return file_id
|
352 |
except Exception as e:
|
353 |
print(f"Error creating translated PowerPoint: {str(e)}")
|
354 |
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
powerpoint/xml_handling.py
CHANGED
@@ -124,7 +124,8 @@ def ppt_to_xml_mongodb(ppt_file_id: str, db_name="pptx"):
|
|
124 |
|
125 |
# Lưu XML vào MongoDB
|
126 |
xml_output = BytesIO(xml_str.encode("utf-8"))
|
127 |
-
|
|
|
128 |
|
129 |
print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
|
130 |
client.close()
|
@@ -363,7 +364,7 @@ def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[
|
|
363 |
updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
|
364 |
|
365 |
# Lưu file cập nhật vào MongoDB (final_xml)
|
366 |
-
new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}
|
367 |
print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
|
368 |
|
369 |
return new_file_id
|
|
|
124 |
|
125 |
# Lưu XML vào MongoDB
|
126 |
xml_output = BytesIO(xml_str.encode("utf-8"))
|
127 |
+
file_name = ppt_file.filename.replace(".pptx", ".xml")
|
128 |
+
xml_file_id = fs_xml.put(xml_output, filename=file_name)
|
129 |
|
130 |
print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
|
131 |
client.close()
|
|
|
364 |
updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
|
365 |
|
366 |
# Lưu file cập nhật vào MongoDB (final_xml)
|
367 |
+
new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}")
|
368 |
print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
|
369 |
|
370 |
return new_file_id
|
translate/__pycache__/translator.cpython-310.pyc
CHANGED
Binary files a/translate/__pycache__/translator.cpython-310.pyc and b/translate/__pycache__/translator.cpython-310.pyc differ
|
|
translate/translator.py
CHANGED
@@ -21,7 +21,7 @@ def translate_text_dict(text_dict: Dict[str, List[str]], target_lang: str = "vi"
|
|
21 |
Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
|
22 |
|
23 |
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
24 |
-
model = genai.GenerativeModel("gemini-
|
25 |
|
26 |
response = model.generate_content(prompt) # Use a model appropriate for your needs and API key. gemini-2.0-flash doesn't exist. 1.5-pro is a good general-purpose model.
|
27 |
|
|
|
21 |
Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
|
22 |
|
23 |
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
|
24 |
+
model = genai.GenerativeModel("gemini-2.0-flash")
|
25 |
|
26 |
response = model.generate_content(prompt) # Use a model appropriate for your needs and API key. gemini-2.0-flash doesn't exist. 1.5-pro is a good general-purpose model.
|
27 |
|