mintlee committed on
Commit
7da22a4
·
1 Parent(s): f543be8
.env CHANGED
@@ -1 +1 @@
1
- GEMINI_API_KEY = AIzaSyAzKQgJcAufbpMFV8SVhhB_z057f8UgFWg
 
1
+ GEMINI_API_KEY = AIzaSyAk1LTwWMZyTfPAKmsn6JzFtI1MpnI7FH8
excel/__pycache__/excel_translate.cpython-310.pyc CHANGED
Binary files a/excel/__pycache__/excel_translate.cpython-310.pyc and b/excel/__pycache__/excel_translate.cpython-310.pyc differ
 
excel/excel_translate.py CHANGED
@@ -10,26 +10,27 @@ import gridfs
10
  import tempfile
11
  import os
12
 
13
- def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en', target_lang: str = "fr", gemini_api: str = "", db_name: str = "excel"):
14
  # Kết nối MongoDB
15
  client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
16
- db = client[db_name]
17
  fs_input = gridfs.GridFS(db, collection="root_file")
18
  fs_output = gridfs.GridFS(db, collection="final_file")
19
 
20
  # Tải file từ MongoDB
21
- file_data = fs_input.get(file_id).read()
 
22
 
23
  # Lưu file tạm thời
24
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
25
- temp_file.write(file_data)
26
  temp_file_path = temp_file.name
27
 
28
  # Đọc file Excel bằng openpyxl
29
  wb = openpyxl.load_workbook(temp_file_path)
30
 
31
- # Chọn sheet được chỉ định hoặc tất cả các sheet
32
- sheets = [wb[sheet_name]] if sheet_name else wb.worksheets
33
 
34
  for ws in sheets:
35
  max_row = ws.max_row
@@ -48,7 +49,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
48
  cell_map[key] = cell
49
 
50
  # Gọi hàm dịch theo dạng bulk
51
- translated_dict = translate_text_dict(text_dict, target_lang=target_lang, gemini_api=gemini_api)
52
 
53
  # Cập nhật lại các cell với nội dung đã dịch
54
  for key, cell in cell_map.items():
@@ -61,7 +62,7 @@ def translate_xlsx(file_id: str, sheet_name: str = None, from_lang: str = 'en',
61
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
62
  wb.save(output_file.name)
63
  output_file.seek(0)
64
- translated_file_id = fs_output.put(output_file.read(), filename=f"translated_{file_id}.xlsx")
65
 
66
  # Đóng workbook và xóa file tạm
67
  wb.close()
@@ -94,10 +95,10 @@ def read_csv_with_auto_encoding(csv_path):
94
  return df
95
 
96
 
97
- def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_columns=None, db_name="csv"):
98
  # Kết nối MongoDB
99
  client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
100
- db = client[db_name]
101
  fs_input = gridfs.GridFS(db, collection="root_file")
102
  fs_output = gridfs.GridFS(db, collection="final_file")
103
 
@@ -113,9 +114,8 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
113
 
114
  # If text_columns is not specified, we assume we want to translate everything that looks like text.
115
  # Otherwise, only translate the given columns.
116
- if text_columns is None:
117
- # Example heuristic: choose all object/string columns
118
- text_columns = df.select_dtypes(include=["object"]).columns.tolist()
119
 
120
  num_rows = len(df)
121
  num_chunks = math.ceil(num_rows / chunk_size)
@@ -143,8 +143,7 @@ def translate_csv(file_id, target_lang="vi", gemini_api="", chunk_size=50, text_
143
  # Now call your LLM translator on this dictionary
144
  translated_chunk = translate_text_dict(
145
  text_dict=chunk_dict,
146
- target_lang=target_lang,
147
- gemini_api=gemini_api
148
  )
149
 
150
  # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
 
10
  import tempfile
11
  import os
12
 
13
+ def translate_xlsx(file_id: str, target_lang: str = ""):
14
  # Kết nối MongoDB
15
  client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
16
+ db = client["excel"]
17
  fs_input = gridfs.GridFS(db, collection="root_file")
18
  fs_output = gridfs.GridFS(db, collection="final_file")
19
 
20
  # Tải file từ MongoDB
21
+ file_data = fs_input.get(file_id)
22
+
23
 
24
  # Lưu file tạm thời
25
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
26
+ temp_file.write(file_data.read())
27
  temp_file_path = temp_file.name
28
 
29
  # Đọc file Excel bằng openpyxl
30
  wb = openpyxl.load_workbook(temp_file_path)
31
 
32
+
33
+ sheets = wb.worksheets # Process every worksheet in the workbook (this function takes no sheet_name argument)
34
 
35
  for ws in sheets:
36
  max_row = ws.max_row
 
49
  cell_map[key] = cell
50
 
51
  # Gọi hàm dịch theo dạng bulk
52
+ translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
53
 
54
  # Cập nhật lại các cell với nội dung đã dịch
55
  for key, cell in cell_map.items():
 
62
  with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as output_file:
63
  wb.save(output_file.name)
64
  output_file.seek(0)
65
+ translated_file_id = fs_output.put(output_file.read(), filename=file_data.filename)
66
 
67
  # Đóng workbook và xóa file tạm
68
  wb.close()
 
95
  return df
96
 
97
 
98
+ def translate_csv(file_id, target_lang="vi", chunk_size=50):
99
  # Kết nối MongoDB
100
  client = pymongo.MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
101
+ db = client["csv"]
102
  fs_input = gridfs.GridFS(db, collection="root_file")
103
  fs_output = gridfs.GridFS(db, collection="final_file")
104
 
 
114
 
115
  # Translate every column that looks like text: the columns are picked automatically
116
  # below by selecting the object (string) dtype columns of the DataFrame.
117
+
118
+ text_columns = df.select_dtypes(include=["object"]).columns.tolist()
 
119
 
120
  num_rows = len(df)
121
  num_chunks = math.ceil(num_rows / chunk_size)
 
143
  # Now call your LLM translator on this dictionary
144
  translated_chunk = translate_text_dict(
145
  text_dict=chunk_dict,
146
+ target_lang=target_lang
 
147
  )
148
 
149
  # 'translated_chunk' should be the same structure, so let's re-inject into the DataFrame
pages/upload.py CHANGED
@@ -33,11 +33,11 @@ def process_file(file, file_type):
33
  translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
34
  progress_bar.progress(60)
35
  final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
36
- final_id = create_translated_ppt("ppt", file_id, final_xml_id, "final_pptx")
37
  elif file_type == "Excel":
38
- final_id = translate_xlsx(file_id, "en", target_lang, os.getenv("GEMINI_API_KEY"))
39
  elif file_type == "CSV":
40
- final_id = translate_csv(file_id, "en", target_lang, os.getenv("GEMINI_API_KEY"))
41
  elif file_type == "Word":
42
  final_id = translate_docx_from_mongodb(file_id, target_lang)
43
  else:
 
33
  translated_dict = translate_text_dict(text_dict, target_lang=target_lang)
34
  progress_bar.progress(60)
35
  final_xml_id = update_xml_with_translated_text_mongodb(xml_file_id, translated_dict)
36
+ final_id = create_translated_ppt("pptx", file_id, final_xml_id, "final_file")
37
  elif file_type == "Excel":
38
+ final_id = translate_xlsx(file_id = file_id, target_lang = target_lang)
39
  elif file_type == "CSV":
40
+ final_id = translate_csv(file_id = file_id, target_lang = target_lang)
41
  elif file_type == "Word":
42
  final_id = translate_docx_from_mongodb(file_id, target_lang)
43
  else:
powerpoint/__pycache__/pptx_object.cpython-310.pyc CHANGED
Binary files a/powerpoint/__pycache__/pptx_object.cpython-310.pyc and b/powerpoint/__pycache__/pptx_object.cpython-310.pyc differ
 
powerpoint/__pycache__/xml_handling.cpython-310.pyc CHANGED
Binary files a/powerpoint/__pycache__/xml_handling.cpython-310.pyc and b/powerpoint/__pycache__/xml_handling.cpython-310.pyc differ
 
powerpoint/pptx_object.py CHANGED
@@ -283,7 +283,8 @@ def get_file_from_mongodb(db_name, collection_name, file_id):
283
  db = client[db_name]
284
  fs = GridFS(db, collection_name)
285
  file_data = fs.get(file_id)
286
- return BytesIO(file_data.read())
 
287
 
288
 
289
  def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
@@ -292,18 +293,19 @@ def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
292
  db = client[db_name]
293
  fs = GridFS(db, collection_name)
294
  file_id = fs.put(file_data, filename=file_name)
 
295
  return file_id
296
 
297
  def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
298
  """Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
299
  try:
300
  # Kết nối MongoDB và tải file
301
- original_ppt_io = get_file_from_mongodb(db_name, "root_file", original_ppt_id)
302
- translated_xml_io = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
303
 
304
  # Load PowerPoint gốc và XML dịch
305
- prs = Presentation(original_ppt_io)
306
- tree = ET.parse(translated_xml_io)
307
  root = tree.getroot()
308
 
309
  # Áp dụng bản dịch
@@ -335,23 +337,18 @@ def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_co
335
  except Exception as e:
336
  print(f"Error applying shape properties: {str(e)}")
337
 
338
- # Lưu PowerPoint vào MongoDB
339
  output_io = BytesIO()
340
  prs.save(output_io)
341
  output_io.seek(0) # Reset vị trí đọc
342
 
343
- file_id = save_file_to_mongodb(db_name, output_collection, "translated_presentation.pptx", output_io)
 
 
 
344
  print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
345
 
346
  return file_id
347
  except Exception as e:
348
  print(f"Error creating translated PowerPoint: {str(e)}")
349
  return None
350
-
351
- def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
352
- """Lưu tệp vào MongoDB GridFS"""
353
- client = MongoClient("mongodb+srv://admin:1highbar456@cluster0.equkm.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
354
- db = client[db_name]
355
- fs = GridFS(db, collection_name)
356
- file_id = fs.put(file_data, filename=file_name)
357
- return file_id
 
283
  db = client[db_name]
284
  fs = GridFS(db, collection_name)
285
  file_data = fs.get(file_id)
286
+ return file_data
287
+ # return BytesIO(file_data.read())
288
 
289
 
290
  def save_file_to_mongodb(db_name, collection_name, file_name, file_data):
 
293
  db = client[db_name]
294
  fs = GridFS(db, collection_name)
295
  file_id = fs.put(file_data, filename=file_name)
296
+ client.close()
297
  return file_id
298
 
299
  def create_translated_ppt(db_name, original_ppt_id, translated_xml_id, output_collection):
300
  """Tạo PowerPoint dịch từ MongoDB và lưu vào MongoDB"""
301
  try:
302
  # Kết nối MongoDB và tải file
303
+ original_ppt= get_file_from_mongodb(db_name, "root_file", original_ppt_id)
304
+ translated_xml = get_file_from_mongodb(db_name, "final_xml", translated_xml_id)
305
 
306
  # Load PowerPoint gốc và XML dịch
307
+ prs = Presentation(BytesIO(original_ppt.read()))
308
+ tree = ET.parse(BytesIO(translated_xml.read()))
309
  root = tree.getroot()
310
 
311
  # Áp dụng bản dịch
 
337
  except Exception as e:
338
  print(f"Error applying shape properties: {str(e)}")
339
 
340
+ # Lưu PowerPoint vào MongoDB với tên gốc
341
  output_io = BytesIO()
342
  prs.save(output_io)
343
  output_io.seek(0) # Reset vị trí đọc
344
 
345
+ # Reuse the original file's name, replacing ".xml" with ".pptx" if present (no "_translated" suffix is added)
346
+ translated_filename = original_ppt.filename.replace(".xml", ".pptx")
347
+
348
+ file_id = save_file_to_mongodb(db_name, output_collection, translated_filename, output_io)
349
  print(f"Translated PowerPoint saved to MongoDB with ID: {file_id}")
350
 
351
  return file_id
352
  except Exception as e:
353
  print(f"Error creating translated PowerPoint: {str(e)}")
354
  return None
 
 
 
 
 
 
 
 
powerpoint/xml_handling.py CHANGED
@@ -124,7 +124,8 @@ def ppt_to_xml_mongodb(ppt_file_id: str, db_name="pptx"):
124
 
125
  # Lưu XML vào MongoDB
126
  xml_output = BytesIO(xml_str.encode("utf-8"))
127
- xml_file_id = fs_xml.put(xml_output, filename=f"{ppt_file.filename}.xml")
 
128
 
129
  print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
130
  client.close()
@@ -363,7 +364,7 @@ def update_xml_with_translated_text_mongodb(file_id: str, translated_dict: Dict[
363
  updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
364
 
365
  # Lưu file cập nhật vào MongoDB (final_xml)
366
- new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}_translated.xml")
367
  print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
368
 
369
  return new_file_id
 
124
 
125
  # Lưu XML vào MongoDB
126
  xml_output = BytesIO(xml_str.encode("utf-8"))
127
+ file_name = ppt_file.filename.replace(".pptx", ".xml")
128
+ xml_file_id = fs_xml.put(xml_output, filename=file_name)
129
 
130
  print(f"✅ XML đã được lưu vào MongoDB (original_xml) với file_id: {xml_file_id}")
131
  client.close()
 
364
  updated_xml_str = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ")
365
 
366
  # Lưu file cập nhật vào MongoDB (final_xml)
367
+ new_file_id = fs_final.put(updated_xml_str.encode("utf-8"), filename=f"{file_data.filename}")
368
  print(f"✅ XML đã cập nhật được lưu vào MongoDB (final_xml) với file_id: {new_file_id}")
369
 
370
  return new_file_id
translate/__pycache__/translator.cpython-310.pyc CHANGED
Binary files a/translate/__pycache__/translator.cpython-310.pyc and b/translate/__pycache__/translator.cpython-310.pyc differ
 
translate/translator.py CHANGED
@@ -21,7 +21,7 @@ def translate_text_dict(text_dict: Dict[str, List[str]], target_lang: str = "vi"
21
  Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
22
 
23
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
24
- model = genai.GenerativeModel("gemini-1.5-flash")
25
 
26
  response = model.generate_content(prompt) # Use a model appropriate for your needs and API key. gemini-2.0-flash doesn't exist. 1.5-pro is a good general-purpose model.
27
 
 
21
  Return the translated texts formatted like the original dictionary. Do NOT say anthing else. Return it as a JSON block."""
22
 
23
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
24
+ model = genai.GenerativeModel("gemini-2.0-flash")
25
 
26
  response = model.generate_content(prompt) # Use a model appropriate for your needs and API key; the model name is chosen above when GenerativeModel is constructed.
27