KevanSoon committed on
Commit
c156dc2
·
1 Parent(s): 4df3d15

major app.py changes

Browse files
Files changed (2) hide show
  1. app.py +284 -980
  2. requirements.txt +3 -1
app.py CHANGED
@@ -12,6 +12,10 @@ import uuid
12
  import tempfile
13
  import io
14
  import traceback
 
 
 
 
15
 
16
  # beautifulsoup
17
  from bs4 import BeautifulSoup
@@ -49,6 +53,12 @@ from auth.clerk import verify_clerk_jwt
49
  # --- MODIFIED: Replaced old tool imports with the new one ---
50
  from tools.tools import analyze_contract
51
 
 
 
 
 
 
 
52
 
53
  app = FastAPI(
54
  title="Document Translator (Final Architecture)",
@@ -112,608 +122,7 @@ async def analyze_contract_endpoint(file: UploadFile = File(...)):
112
  raise HTTPException(
113
  status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
114
  )
115
-
116
-
117
- # --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
118
-
119
-
120
def wrap_words_with_spans(html: str) -> str:
    """Wrap each word inside <p>, <h1>, <h2> and <td> elements in a clickable span.

    Every wrapped word becomes
    ``<span data-clickable="true" data-id="word-N">word</span>`` where ``N`` is a
    counter that starts at 1 and increases in document order.

    The markup is processed in a single pass that separates tags from text, so:

    * tag names and attributes of nested elements (``<b class="x">``) are never
      touched -- only text nodes are rewritten;
    * HTML entities such as ``&amp;`` are left intact;
    * text inside nested target elements (``<td><p>...</p></td>``) is wrapped
      exactly once;
    * only the exact tag names are targeted (``<pre>`` is not mistaken for ``<p>``).

    Args:
        html: The HTML markup to process.

    Returns:
        The markup with words in the targeted elements wrapped in spans.
    """
    target_tags = {"p", "h1", "h2", "td"}
    tag_splitter = re.compile(r"(<[^>]*>)")
    tag_name = re.compile(r"<\s*(/?)\s*([A-Za-z][A-Za-z0-9]*)")
    # Entities are matched first so that "amp" in "&amp;" is not seen as a word.
    word_or_entity = re.compile(r"&#?\w+;|\w+")

    counter = 0
    depth = 0  # number of currently open target elements

    def wrap(match):
        nonlocal counter
        token = match.group(0)
        if token.startswith("&"):
            return token
        counter += 1
        return f'<span data-clickable="true" data-id="word-{counter}">{token}</span>'

    pieces = []
    # re.split with a capturing group alternates text (even index) and tags (odd).
    for index, piece in enumerate(tag_splitter.split(html)):
        if index % 2:
            found = tag_name.match(piece)
            is_target = (
                found is not None
                and found.group(2).lower() in target_tags
                and not piece.endswith("/>")
            )
            if is_target:
                if found.group(1):
                    depth = max(depth - 1, 0)
                else:
                    depth += 1
            pieces.append(piece)
        elif depth:
            pieces.append(word_or_entity.sub(wrap, piece))
        else:
            pieces.append(piece)

    return "".join(pieces)
143
-
144
-
145
def inject_dropdown_script(
    html: str,
    api_url: str = "http://localhost:8080/api/translate_frontend",
) -> str:
    """Append the click-to-translate ``<script>`` to an HTML document.

    The script attaches a click handler to every
    ``span[data-clickable="true"]``. Clicking a word opens a language
    ``<select>``; choosing a language POSTs ``{text, target_language}`` to
    ``api_url`` and replaces the word with the ``translated_text`` field of the
    JSON reply.

    The script is inserted once, immediately before the last closing
    ``</body>`` tag (matched case-insensitively). If the document has no
    ``</body>`` tag the script is appended to the end.

    Args:
        html: The HTML document to extend.
        api_url: URL of the word-translation endpoint called by the browser.
            Defaults to the previously hard-coded local development address.

    Returns:
        The HTML document with the script injected.
    """
    script_template = """
<script>
window.addEventListener('DOMContentLoaded', () => {

  function createDropdown(x, y, wordEl, word) {
    // Remove any existing dropdown
    const oldDropdown = document.getElementById('translation-dropdown');
    if (oldDropdown) oldDropdown.remove();

    // Create dropdown select element
    const dropdown = document.createElement('select');
    dropdown.id = 'translation-dropdown';
    dropdown.style.position = 'absolute';
    dropdown.style.left = x + 'px';
    dropdown.style.top = y + 'px';
    dropdown.style.zIndex = 9999;

    // Languages options
    const languages = ['English', 'Chinese', 'Tamil', 'Hindi'];
    languages.forEach(lang => {
      const option = document.createElement('option');
      option.value = lang.toLowerCase();
      option.innerText = lang;
      dropdown.appendChild(option);
    });

    // Placeholder option
    const defaultOption = document.createElement('option');
    defaultOption.value = '';
    defaultOption.innerText = 'Select language';
    defaultOption.selected = true;
    defaultOption.disabled = true;
    dropdown.insertBefore(defaultOption, dropdown.firstChild);

    document.body.appendChild(dropdown);
    dropdown.focus();

    dropdown.addEventListener('change', () => {
      const selectedLang = dropdown.value;
      if (!selectedLang) return;

      // Call backend to translate word
      fetch(__TRANSLATE_API_URL__, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: word, target_language: selectedLang }),
      })
      .then(res => {
        if (!res.ok) throw new Error('Translation API error');
        return res.json();
      })
      .then(data => {
        const translated = data.translated_text || word;
        wordEl.innerText = translated;

        // Add or update language label
        let label = wordEl.nextSibling;
        if (!label || !label.classList || !label.classList.contains('language-label')) {
          label = document.createElement('span');
          label.className = 'language-label';
          label.style.marginLeft = '6px';
          label.style.fontSize = '0.8em';
          label.style.color = '#555';
          wordEl.after(label);
        }
        label.textContent = `(${dropdown.options[dropdown.selectedIndex].text})`;
      })
      .catch(err => {
        console.error('Translation error:', err);
        alert('Translation failed, please try again.');
      });

      dropdown.remove();
    });

    // Clicking outside closes dropdown
    document.addEventListener('click', function onDocClick(e) {
      if (!dropdown.contains(e.target)) {
        dropdown.remove();
        document.removeEventListener('click', onDocClick);
      }
    });
  }

  // Add click handlers to all words wrapped in spans with data-clickable="true"
  document.querySelectorAll('span[data-clickable="true"]').forEach(el => {
    el.style.cursor = 'pointer';
    el.addEventListener('click', event => {
      event.stopPropagation();
      const word = el.innerText;
      const rect = el.getBoundingClientRect();
      const x = rect.left + window.scrollX;
      const y = rect.bottom + window.scrollY;
      createDropdown(x, y, el, word);
    });
  });

});
</script>
"""
    # json.dumps yields a valid JS string literal; "</" is escaped so the URL
    # can never terminate the surrounding <script> element early.
    url_literal = json.dumps(api_url).replace("</", "<\\/")
    script = script_template.replace("__TRANSLATE_API_URL__", url_literal)

    # Inject before the LAST closing body tag only: str.replace() would insert
    # the script once per occurrence and would miss an upper-case </BODY>.
    closing_tags = list(re.finditer(r"</body\s*>", html, re.IGNORECASE))
    if not closing_tags:
        return html + script
    cut = closing_tags[-1].start()
    return html[:cut] + script + "\n" + html[cut:]
250
-
251
-
252
@app.post("/api/translate_frontend")
async def translate_text(request: Request):
    """Translate a single word or phrase for the in-page dropdown.

    Expects a JSON body ``{"text": ..., "target_language": ...}`` and returns
    ``{"translated_text": ...}`` using the SEA-LION chat-completions API.

    NOTE(review): a later endpoint in this file is also named
    ``translate_text``; the route registered here keeps working, but the
    module-level name is rebound by the later definition.

    Raises:
        HTTPException: 400 for a malformed or incomplete request body,
            502 when the SEA-LION request fails, 500 for a missing API key,
            an unparsable reply or an empty translation.
    """
    # Validate the request outside the broad error handling below so that
    # client errors are reported as 400 instead of being re-wrapped as 500.
    try:
        data = await request.json()
    except ValueError as e:
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON"
        ) from e

    text = data.get("text") if isinstance(data, dict) else None
    target_language = data.get("target_language") if isinstance(data, dict) else None
    if not text or not target_language:
        raise HTTPException(
            status_code=400,
            detail="Missing 'text' or 'target_language' in request body",
        )

    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        raise HTTPException(
            status_code=500, detail="SEALION_API_KEY is not configured."
        )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    prompt = (
        f"Please translate the following text to {target_language} and return "
        "ONLY the translated text without any explanations or extra formatting:\n\n"
        f'"{text}"'
    )
    payload = {
        "max_completion_tokens": 1024,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
    }

    try:
        # requests is synchronous: run it in a worker thread so the event loop
        # is not blocked, and bound the wait with a timeout.
        response = await asyncio.to_thread(
            requests.post,
            url,
            headers=headers,
            data=json.dumps(payload),
            timeout=30,
        )
        response.raise_for_status()
        translated_text = response.json()["choices"][0]["message"]["content"].strip()
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502, detail=f"Translation API request failed: {e}"
        ) from e
    except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
        # The reply did not have the expected chat-completions shape.
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e

    if not translated_text:
        raise HTTPException(
            status_code=500, detail="Empty response from translation model."
        )

    return {"translated_text": translated_text}
308
-
309
-
310
- # --- Model 2: Sea-Lion (The JSON Translator) ---
311
@app.post("/api/translate")
async def translate_text(text: str, target_language: str):
    """Translate ``text`` into ``target_language`` with the SEA-LION model.

    Args:
        text: The content to translate. The document pipeline in this file
            also passes a parsed JSON object here; non-string values are
            serialised to JSON before being placed in the prompt.
        target_language: Name of the language to translate into.

    Returns:
        ``{"translated_text": <raw response body>}``.

    Raises:
        HTTPException: 502 when the SEA-LION request fails, 500 when the
            response body is empty.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"

    # The API key is read from the environment rather than hard-coded.
    api_key = os.getenv("SEALION_API_KEY")

    headers = {
        "accept": "text/plain",
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    source_text = (
        text if isinstance(text, str) else json.dumps(text, ensure_ascii=False)
    )
    # The language goes after "to" and the content inside the quotes; the two
    # placeholders were previously swapped.
    prompt = f'Translate the following text to {target_language}: "{source_text}"'

    data = {
        "max_completion_tokens": 4096,  # Room for longer translations
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
    }

    try:
        # requests is synchronous: run it in a worker thread so the event loop
        # stays responsive, and bound the wait with a timeout.
        response = await asyncio.to_thread(
            requests.post,
            url,
            headers=headers,
            data=json.dumps(data),
            timeout=120,
        )
        response.raise_for_status()  # Raise for 4xx / 5xx replies
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail=f"Failed to communicate with the translation AI model: {e}",
        ) from e

    # NOTE(review): the body is returned verbatim on the assumption (stated in
    # the original code) that this request yields plain text -- confirm.
    translated_text = response.text
    if not translated_text:
        raise HTTPException(
            status_code=500,
            detail="Received an empty response from the translation model.",
        )

    return {"translated_text": translated_text}
370
-
371
-
372
- # --- Model 3: Gemini (The HTML Generator) ---
373
async def generate_html_from_translated_json(translated_json: dict) -> str:
    """Ask Gemini to turn an already-translated JSON object into HTML.

    The model reply is stripped of a surrounding ```html fence when present,
    then passed through ``wrap_words_with_spans`` and
    ``inject_dropdown_script``.

    Any failure is reported as a small HTML error page rather than raised, so
    the caller always receives a string.
    """
    try:
        gemini_key = os.getenv("GEMINI_API_KEY")
        if not gemini_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=gemini_key)
        gemini = genai.GenerativeModel(model_name="gemini-2.0-flash")
        json_string_for_prompt = json.dumps(translated_json, indent=2)

        prompt = f"""
    You are an expert system that converts a JSON object containing PRE-TRANSLATED text into a clean, semantic HTML document.

    **Your Task:**
    1. Analyze the following JSON object. Its text content has already been translated.
    2. The core document data is located at the path: `choices[0]['message']['tool_calls'][0]['function']['arguments']`.
    3. The value of 'arguments' is a JSON STRING. You must parse this inner string to access the list of document chunks.
    4. Using the translated data from the 'text' fields, generate a single, complete HTML5 document. Use appropriate tags like <h1>, <h2>, <p>, and <table>.
    5. if json contains "tabular" means mmake a table for that with some grey border and styling
    6. Your final output must ONLY be the raw HTML code. Do not add comments or markdown.

    **Translated JSON object to process:**
    ```json
    {json_string_for_prompt}
    ```
    """

        def _ask_gemini() -> str:
            """Blocking Gemini call plus post-processing; runs in a worker thread."""
            reply = gemini.generate_content(prompt)

            # Prefer the contents of a ```html fence; otherwise use the whole reply.
            fenced = re.search(r"```html\n(.*?)\n```", reply.text, re.DOTALL)
            if fenced:
                raw_html = fenced.group(1).strip()
            else:
                raw_html = reply.text.strip()

            # Make the words clickable, then add the script that handles clicks.
            return inject_dropdown_script(wrap_words_with_spans(raw_html))

        return await asyncio.to_thread(_ask_gemini)
    except google_exceptions.ResourceExhausted:
        error_message = "The request to the document processor (Gemini) was rejected due to API quota limits. Please wait or upgrade your API plan."
        return f"<html><body><h1>API Quota Error</h1><p>{html.escape(error_message)}</p></body></html>"
    except Exception as exc:
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(exc)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
434
-
435
-
436
- # --- API Endpoint Orchestrating the Pipeline ---
437
@app.post("/api/translate_file", response_class=HTMLResponse)
async def translate_document_to_raw_html(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """Translate an uploaded PDF/PNG/JPEG and return the result as HTML.

    Pipeline:
        1. The Nemo parser (local service on port 8000) extracts content to JSON.
        2. ``translate_text`` sends that JSON to SEA-LION for translation.
        3. ``generate_html_from_translated_json`` asks Gemini for the final HTML.

    Raises:
        HTTPException: 400 for an unsupported content type, 502 when a
            downstream HTTP request fails, 500 for any other failure.
            HTTPExceptions raised by the pipeline steps are passed through
            with their original status code.
    """
    content_type = file.content_type
    if content_type not in ["application/pdf", "image/png", "image/jpeg"]:
        raise HTTPException(status_code=400, detail="Unsupported file type.")

    try:
        # === STEP 1: Get raw JSON from Nemo (The Parser) ===
        file_content = await file.read()
        file_b64 = base64.b64encode(file_content).decode("utf-8")
        nemo_data = {
            "model": "nvidia/nemoretriever-parse",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{content_type};base64,{file_b64}"
                            },
                        }
                    ],
                }
            ],
            "max_tokens": 2048,
        }
        headers = {"accept": "application/json", "Content-Type": "application/json"}
        # requests is synchronous: run it in a worker thread so the event loop
        # is not blocked, and bound the wait with a timeout.
        model_response = await asyncio.to_thread(
            requests.post,
            "http://localhost:8000/v1/chat/completions",
            headers=headers,
            data=json.dumps(nemo_data),
            timeout=120,
        )
        model_response.raise_for_status()
        nemo_response_json = model_response.json()
        # Only progress markers are printed; the payloads hold the user's
        # document content and are deliberately not dumped to stdout.
        print("*********** Step 1 Done ***********")

        print("*********** Step 2 in Progress ***********")
        # === STEP 2: Get translated JSON from Sea-Lion (The Translator) ===
        translated_json = await translate_text(nemo_response_json, target_language)
        print("*********** Step 2 Done ***********")

        print("*********** Step 3 in Progress ***********")
        # === STEP 3: Generate final HTML from Gemini (The HTML Generator) ===
        final_html = await generate_html_from_translated_json(translated_json)
        print("*********** Step 3 Done ***********")

        # === STEP 4: Return the final result to the frontend ===
        # Gemini failures arrive here as an HTML error page and are returned
        # unchanged, exactly like a successful document.
        return HTMLResponse(content=final_html)

    except HTTPException:
        # Keep the status code and detail chosen by the failing step.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail=f"Failed to communicate with a downstream AI model: {e}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during processing: {e}",
        ) from e
512
-
513
-
514
- # <<< --- START OF MVP PIPELINE ADDITIONS (Layout-Aware Version) --- >>>
515
-
516
-
517
async def extract_text_and_boxes_with_paddle(file_content: bytes) -> list[dict]:
    """Run the remote PaddleOCR Space on an image and return its raw result.

    The bytes are written to a temporary ``.png`` file, which is handed to the
    ``kevansoon/PaddleOCR`` client. The temporary file is deleted once the OCR
    call has finished, whether it succeeded or not.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as image_file:
        image_file.write(file_content)
    image_path = image_file.name

    def _run_remote_ocr() -> list[dict]:
        """Blocking client call; executed in the default executor."""
        space = Client("kevansoon/PaddleOCR")
        # Result is a list of dictionaries, e.g. [{'text': '...', 'box': [...]}]
        return space.predict(
            img=handle_file(image_path),
            lang="en",
            api_name="/predict",
        )

    try:
        event_loop = asyncio.get_running_loop()
        return await event_loop.run_in_executor(None, _run_remote_ocr)
    finally:
        os.unlink(image_path)
544
-
545
-
546
async def translate_paddle_data_concurrently(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """Translate the ``text`` of every OCR item concurrently.

    Args:
        paddle_data: OCR items; each is read for its ``text`` and ``box`` keys.
            Missing keys are tolerated (``text`` defaults to empty, ``box`` to
            ``None``).
        target_language: Name of the language to translate into.

    Returns:
        A new list, in the same order, of ``{"text": <translation>, "box": <box>}``.
        Items whose text is empty or whitespace are returned with ``""`` and
        are not sent to the translation API.

    Raises:
        httpx.HTTPStatusError: if the translation API answers with 4xx/5xx.
    """
    texts = [item.get("text", "") or "" for item in paddle_data]
    # Only non-blank texts are worth a network round trip.
    pending = [index for index, text in enumerate(texts) if text.strip()]
    translated = [""] * len(texts)

    if pending:
        url = "https://api.sea-lion.ai/v1/chat/completions"
        api_key = os.getenv("SEALION_API_KEY")
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        # One client for the whole batch so connections are pooled instead of
        # being re-established for every phrase.
        async with httpx.AsyncClient(timeout=30.0) as client:

            async def call_sealion_for_translation(text_to_translate: str) -> str:
                """Translate a single phrase and return the model's reply."""
                prompt = f'Translate the following phrase to {target_language} and return ONLY the translated text without explanations or extra formatting:\n\n"{text_to_translate}"'
                payload = {
                    "max_completion_tokens": 256,  # A single phrase, not a whole doc
                    "messages": [{"role": "user", "content": prompt}],
                    "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
                }
                response = await client.post(url, headers=headers, json=payload)
                response.raise_for_status()
                return response.json()["choices"][0]["message"]["content"].strip()

            results = await asyncio.gather(
                *(call_sealion_for_translation(texts[index]) for index in pending)
            )

        for index, result in zip(pending, results):
            translated[index] = result

    # Pair each translation with the original bounding box.
    return [
        {"text": text, "box": item.get("box")}
        for text, item in zip(translated, paddle_data)
    ]
590
-
591
-
592
async def generate_html_from_paddle_data(translated_data: list[dict]) -> str:
    """Ask Gemini to rebuild a layout-aware HTML page from translated OCR data.

    ``translated_data`` is serialised to JSON and embedded in the prompt. The
    reply is stripped of a surrounding ```html fence when present, then passed
    through ``wrap_words_with_spans`` and ``inject_dropdown_script``.

    Any failure is reported as a small HTML error page rather than raised.
    """
    try:
        gemini_key = os.getenv("GEMINI_API_KEY")
        if not gemini_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=gemini_key)
        gemini = genai.GenerativeModel(model_name="gemini-2.5-flash")

        # ensure_ascii=False keeps non-Latin translations readable in the prompt.
        json_data_for_prompt = json.dumps(translated_data, indent=2, ensure_ascii=False)

        prompt = f"""
    You are an expert system specializing in converting structured OCR data into a well-formatted HTML document that preserves the original layout.

    **Your Task:**
    1. Analyze the following JSON array. Each object contains a `text` field (pre-translated) and a `box` field (four [x, y] coordinates of its bounding box).
    2. Use the `box` coordinates to understand the document's spatial structure.
    - Elements with similar y-coordinates are likely on the same row.
    - Elements aligned vertically form columns.
    3. Reconstruct the visual layout using semantic HTML.
    - Use `<table>` for grid-like data (rows and columns). This is critical for payslips.
    - Use `<h1>`, `<h2>`, `<p>` for headings and paragraphs.
    - Do NOT use absolute positioning (e.g., `style="position: absolute; left: ..."`). Create a clean, flowing HTML structure.
    4. Your final output must ONLY be the raw HTML code. Do not add comments, markdown backticks, or any other explanatory text.

    **OCR Data to process:**
    ```json
    {json_data_for_prompt}
    ```
    """

        def _ask_gemini() -> str:
            """Blocking Gemini call plus post-processing; runs in a worker thread."""
            reply = gemini.generate_content(prompt)
            fenced = re.search(r"```html\n(.*?)\n```", reply.text, re.DOTALL)
            if fenced:
                raw_html = fenced.group(1).strip()
            else:
                raw_html = reply.text.strip()
            # Make the generated page interactive.
            return inject_dropdown_script(wrap_words_with_spans(raw_html))

        return await asyncio.to_thread(_ask_gemini)
    except Exception as exc:
        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(exc)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
643
-
644
-
645
@app.post("/api/translate_file_mvp", response_class=HTMLResponse)
async def translate_document_mvp(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """Translate an uploaded PNG/JPEG with the layout-aware MVP pipeline.

    Pipeline:
        1. PaddleOCR extracts text and coordinates.
        2. Sea-Lion translates each text block concurrently.
        3. Gemini uses the translated text and original coordinates to
           generate layout-aware HTML.

    Raises:
        HTTPException: 400 for an unsupported content type or when OCR finds
            no text; the downstream status code when a translation request
            fails with an HTTP error; 500 for any other failure.
    """
    content_type = file.content_type
    if content_type not in ["image/png", "image/jpeg"]:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type for MVP pipeline. Please use PNG or JPG.",
        )

    try:
        file_content = await file.read()

        # === MVP STEP 1: Extract text and coordinates with PaddleOCR ===
        paddle_data = await extract_text_and_boxes_with_paddle(file_content)
        if not paddle_data:
            raise HTTPException(
                status_code=400,
                detail="PaddleOCR could not extract any text from the image.",
            )
        print("***** Step 1 Done ******")

        # === MVP STEP 2: Translate each text block concurrently ===
        translated_data = await translate_paddle_data_concurrently(
            paddle_data, target_language
        )
        print("***** Step 2 Done ******")

        # === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
        final_html = await generate_html_from_paddle_data(translated_data)
        print("***** Step 3 Done ******")
        return HTMLResponse(content=final_html)

    except HTTPException:
        # Without this clause the 400 raised above would be caught by the
        # generic handler below and reported to the client as a 500.
        raise
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"Error from a downstream AI service: {e.response.text}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during MVP processing: {str(e)}",
        ) from e
694
-
695
-
696
- # <<< --- END OF MVP PIPELINE ADDITIONS (Layout-Aware Version) --- >>>
697
-
698
-
699
- # testing clerk backend authentication
700
- # @app.post("/upload")
701
- # async def upload_file(
702
- # authorization: str = Header(...),
703
- # file: UploadFile = File(...)
704
- # ):
705
- # if not authorization.startswith("Bearer "):
706
- # raise HTTPException(status_code=401, detail="Missing Bearer token")
707
-
708
- # token = authorization.split(" ")[1]
709
- # claims = await verify_clerk_jwt(token)
710
-
711
- # user_id = claims.get("sub") # Clerk user ID
712
-
713
- # # ✅ Now the Clerk user is verified
714
- # # You can securely store this file, e.g., to Supabase or local
715
- # return {"message": f"File uploaded by Clerk user {user_id}"}
716
-
717
 
718
  @app.post("/upload")
719
  async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
@@ -828,11 +237,31 @@ async def get_user_documents(
828
  return documents
829
 
830
 
831
- # ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
 
 
 
 
 
 
 
 
 
 
832
 
833
- # --- Helper Functions (Sealion, Gemini Configuration) ---
 
834
 
 
835
 
 
 
 
 
 
 
 
 
836
  async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
837
  """Helper function to call the translation API for a single piece of text."""
838
  if not text_to_translate.strip():
@@ -874,217 +303,68 @@ async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str
874
  return f"Translation Parsing Error: {text_to_translate}"
875
 
876
 
877
- # --- PIPELINE FUNCTIONS (Corrected and Verified) ---
878
-
879
async def get_hocr_from_image(file: UploadFile) -> str:
    """Run Tesseract on an uploaded image and return the hOCR markup.

    The upload is read fully into memory, opened with Pillow through an
    in-memory stream, and converted by pytesseract in the default executor.

    Raises:
        HTTPException: 400 when the upload is empty or Pillow cannot open it.
    """
    # Pillow cannot consume an UploadFile directly, so read the bytes first.
    raw = await file.read()
    await file.close()

    if not raw:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        picture = Image.open(io.BytesIO(raw))
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Cannot open image. It may be corrupted or unsupported. Error: {e}",
        )

    def _run_tesseract():
        """Blocking OCR call, kept off the event loop."""
        return pytesseract.image_to_pdf_or_hocr(picture, extension="hocr")

    hocr_payload = await asyncio.get_running_loop().run_in_executor(
        None, _run_tesseract
    )
    return hocr_payload.decode("utf-8")
912
-
913
 
914
- async def translate_hocr_html_concurrently(hocr_html: str, target_language: str) -> str:
915
  """
916
- Parses hOCR to find text, translates it concurrently, and injects the
917
- translations back into the HTML structure.
918
  """
919
- soup = BeautifulSoup(hocr_html, "html.parser")
920
- elements_to_translate = soup.find_all(class_="ocrx_word")
921
- if not elements_to_translate:
922
- elements_to_translate = soup.find_all(class_="ocr_line")
923
-
924
- original_texts = [el.get_text(strip=True) for el in elements_to_translate]
925
-
926
- # Translate all texts concurrently
927
- translation_tasks = [
928
- call_sealion_for_translation(text, target_language) for text in original_texts
929
- ]
930
- translated_texts = await asyncio.gather(*translation_tasks)
931
-
932
- # Replace the text in the soup object with the translations
933
- for i, element in enumerate(elements_to_translate):
934
- if element.string:
935
- element.string.replace_with(translated_texts[i])
936
-
937
- return str(soup)
938
-
939
 
940
- async def generate_html_from_hocr(translated_hocr_html: str) -> str:
941
- """
942
- Receives translated hOCR HTML and uses Gemini to generate a final,
943
- layout-aware HTML document.
944
- """
945
  try:
946
  api_key = os.getenv("GEMINI_API_KEY")
947
  if not api_key:
948
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
949
 
950
  genai.configure(api_key=api_key)
951
- model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Updated model name
952
 
 
953
  prompt = f"""
954
- Given the following hOCR HTML, which contains translated text and positional information, convert it into a clean, well-styled HTML document suitable for display in an iframe.
 
 
955
 
956
- - Reconstruct the layout based on the bounding box (`bbox`) information in the element titles.
957
- - The final output should be a single HTML file with embedded CSS.
958
- - Ensure text does not overlap.
959
- - Use tables or other appropriate structures if they help maintain the layout.
960
- - Do not include any explanations, just the raw HTML code.
961
 
962
- hOCR Input:
963
- {translated_hocr_html}
964
  """
965
 
966
  def do_request():
967
- """Synchronous function to be run in a separate thread."""
968
- response = model.generate_content(prompt)
969
- # Clean up the response to ensure it's just raw HTML
970
- cleaned_html = response.text.strip()
971
- if cleaned_html.startswith("```html"):
972
- cleaned_html = cleaned_html[7:]
973
- if cleaned_html.endswith("```"):
974
- cleaned_html = cleaned_html[:-3]
975
- return cleaned_html.strip()
976
-
977
- # Run the synchronous Gemini API call in a thread pool executor
978
- return await asyncio.to_thread(do_request)
979
-
980
- except Exception as e:
981
- error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
982
- traceback.print_exc()
983
- return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
984
-
985
-
986
- # --- API ENDPOINT ---
987
-
988
@app.post("/api/translate_file_pytesseract", response_class=HTMLResponse)
async def translate_document_with_hocr(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """Translate an uploaded image through the hOCR-based pipeline.

    Steps:
        1. Tesseract produces hOCR markup for the image.
        2. Sea-Lion translates the text inside that markup.
        3. Gemini turns the translated hOCR into the final HTML page.

    Raises:
        HTTPException: 400 for an unsupported content type or when no hOCR
            data could be extracted; 500 for any unexpected failure.
            HTTPExceptions from the individual steps are passed through.
    """
    accepted_types = ["image/png", "image/jpeg", "image/bmp", "image/tiff"]
    if file.content_type not in accepted_types:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type. Please use PNG, JPG, BMP or TIFF.",
        )

    try:
        # === STEP 1: Extract text and coordinates with Tesseract hOCR ===
        hocr_markup = await get_hocr_from_image(file)
        has_page = bool(hocr_markup) and "ocr_page" in hocr_markup
        if not has_page:
            raise HTTPException(
                status_code=400,
                detail="Tesseract could not extract any hOCR data from the image.",
            )
        print("***** Step 1 Done: Generated hOCR from image ******")

        # === STEP 2: Translate text directly within the hOCR structure ===
        translated_markup = await translate_hocr_html_concurrently(
            hocr_markup, target_language
        )
        print("***** Step 2 Done: Translated hOCR in-place ******")

        # === STEP 3: Generate final, layout-aware HTML from Gemini ===
        page = await generate_html_from_hocr(translated_markup)
        print("***** Step 3 Done: Generated final HTML from translated hOCR ******")

        return HTMLResponse(content=page)

    except HTTPException:
        # Preserve the status code and detail chosen where the error was raised.
        raise
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during processing: {str(e)}",
        )
1037
-
1038
-
1039
- # ----------------------------------END OF PYTESSERACT workflow-----------------------------------
1040
-
1041
-
1042
- # ----------------------------------Start OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
1043
-
1044
- # --- Helper Functions (Sealion, Gemini Configuration) ---
1045
 
 
 
1046
 
1047
- # This helper function for calling the Sea-Lion API is used by both translation functions.
1048
async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
    """Translate one piece of text with the SEA-LION API, best effort.

    This helper never raises for translation failures, so one bad item cannot
    abort an ``asyncio.gather`` over many items. Instead it returns:

    * ``""`` for empty / whitespace-only input (no request is sent);
    * ``"<text> (Translation Skipped)"`` when ``SEALION_API_KEY`` is unset;
    * ``"Translation Error: <text>"`` when the request fails, including
      4xx/5xx replies;
    * ``"Translation Parsing Error: <text>"`` when the reply cannot be parsed.

    Args:
        text_to_translate: The source text.
        lang: Name of the language to translate into.

    Returns:
        The translation with one surrounding pair of double quotes removed,
        or one of the fallback strings above.
    """
    if not text_to_translate.strip():
        return ""  # Don't send empty strings for translation

    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        print("Warning: SEALION_API_KEY not set. Skipping translation.")
        return f"{text_to_translate} (Translation Skipped)"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    # Precise prompt for clean output
    prompt = f'Translate the following text to {lang}. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text_to_translate}"'
    payload = {
        "max_completion_tokens": 2048,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
    }

    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=45.0
            )
            response.raise_for_status()
            response_json = response.json()
            translated_text = response_json["choices"][0]["message"]["content"].strip()
            # Clean up potential extra quotes that the model might add
            return re.sub(r'^"|"$', "", translated_text)
        except httpx.HTTPError as e:
            # HTTPError covers transport failures (RequestError) and the
            # HTTPStatusError raised by raise_for_status(); RequestError alone
            # let 4xx/5xx replies escape.
            print(f"Translation request failed: {e}")
            return f"Translation Error: {text_to_translate}"
        except (KeyError, IndexError, TypeError, ValueError, AttributeError) as e:
            # Invalid JSON, an unexpected reply shape or a null content field.
            print(f"Could not parse translation response: {e}")
            return f"Translation Parsing Error: {text_to_translate}"
1087
 
 
 
 
 
1088
 
1089
  # --- OCR EXTRACTION FUNCTIONS ---
1090
 
@@ -1145,12 +425,13 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
1145
  os.unlink(temp_filepath)
1146
 
1147
 
1148
- # --- TRANSLATION FUNCTIONS ---
1149
 
1150
 
1151
- async def translate_hocr_html_concurrently(hocr_html: str, target_language: str) -> str:
1152
  """
1153
- Parses hOCR, translates text concurrently, and injects translations back into the HTML.
 
1154
  """
1155
  soup = BeautifulSoup(hocr_html, "html.parser")
1156
  elements_to_translate = soup.find_all(class_="ocrx_word")
@@ -1158,33 +439,37 @@ async def translate_hocr_html_concurrently(hocr_html: str, target_language: str)
1158
  elements_to_translate = soup.find_all(class_="ocr_line")
1159
 
1160
  original_texts = [el.get_text(strip=True) for el in elements_to_translate]
1161
- translation_tasks = [
1162
- call_sealion_for_translation(text, target_language) for text in original_texts
1163
- ]
1164
- translated_texts = await asyncio.gather(*translation_tasks)
1165
 
 
 
 
 
1166
  for i, element in enumerate(elements_to_translate):
1167
  if element.string:
1168
- element.string.replace_with(translated_texts[i])
 
 
1169
 
1170
  return str(soup)
1171
 
1172
 
1173
- async def translate_paddle_data_concurrently(
1174
  paddle_data: list[dict], target_language: str
1175
  ) -> list[dict]:
1176
  """
1177
- Translates the 'text' field of each item in the paddle_data list concurrently.
 
1178
  """
1179
  original_texts = [item.get("text", "") for item in paddle_data]
1180
- translation_tasks = [
1181
- call_sealion_for_translation(text, target_language) for text in original_texts
1182
- ]
1183
- translated_texts = await asyncio.gather(*translation_tasks)
1184
 
1185
  translated_data = []
1186
  for i, item in enumerate(paddle_data):
1187
- translated_data.append({"text": translated_texts[i], "box": item.get("box")})
 
 
1188
 
1189
  return translated_data
1190
 
@@ -1205,7 +490,7 @@ async def generate_html_from_dual_ocr(
1205
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
1206
 
1207
  genai.configure(api_key=api_key)
1208
- model = genai.GenerativeModel(model_name="gemini-2.5-flash")
1209
 
1210
  prompt = f"""
1211
  You are provided with two different translated OCR outputs for the same document.
@@ -1222,10 +507,10 @@ async def generate_html_from_dual_ocr(
1222
  --- PADDLEOCR END ---
1223
 
1224
  STRICT RULES:
1225
- 1. You MUST output ONLY the FINAL RAW HTML code.
1226
- - No ```html, no triple quotes, no markdown, no explanations.
1227
  - Output must begin with <!DOCTYPE html> and end with </html>.
1228
- 2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
1229
  - Every PaddleOCR text must appear exactly once in the correct order and location.
1230
  3. The HTML must be fully self-contained:
1231
  - Include <html>, <head>, <style>, and <body>.
@@ -1260,14 +545,14 @@ async def generate_html_from_dual_ocr(
1260
  return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
1261
 
1262
 
1263
- @app.post("/api/translate_file_dual_ocr", response_class=HTMLResponse)
1264
  async def translate_document_dual_ocr(
1265
  target_language: str = Form(...), file: UploadFile = File(...)
1266
  ):
1267
  """
1268
  Processes a document using a dual OCR pipeline:
1269
  1. Tesseract and PaddleOCR extract text and coordinates concurrently.
1270
- 2. Sea-Lion translates the text from both outputs concurrently.
1271
  3. Gemini uses both translated outputs to generate the final layout-aware HTML.
1272
  """
1273
  content_type = file.content_type
@@ -1296,22 +581,19 @@ async def translate_document_dual_ocr(
1296
  status_code=400,
1297
  detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
1298
  )
1299
- print(paddle_task)
1300
- print(hocr_task)
1301
  print("***** Step 1 Done: Finished OCR extraction ******")
1302
 
1303
- # === STEP 2: Translate both OCR outputs concurrently ===
1304
- print("***** Step 2: Starting concurrent translation ******")
1305
- translated_hocr_task = translate_hocr_html_concurrently(
1306
  hocr_html, target_language
1307
  )
1308
- translated_paddle_task = translate_paddle_data_concurrently(
1309
  paddle_data, target_language
1310
  )
1311
  translated_hocr, translated_paddle = await asyncio.gather(
1312
  translated_hocr_task, translated_paddle_task
1313
  )
1314
- print(translated_paddle_task)
1315
  print("***** Step 2 Done: Finished translation ******")
1316
 
1317
  # === STEP 3: Generate final HTML from both translated outputs ===
@@ -1321,7 +603,7 @@ async def translate_document_dual_ocr(
1321
  final_html = await generate_html_from_dual_ocr(
1322
  translated_hocr, translated_paddle
1323
  )
1324
-
1325
  print("***** Step 3 Done: Generated final HTML ******")
1326
 
1327
  return HTMLResponse(content=final_html)
@@ -1334,74 +616,146 @@ async def translate_document_dual_ocr(
1334
  status_code=500,
1335
  detail=f"An unexpected error occurred during processing: {str(e)}",
1336
  )
1337
-
1338
-
1339
- # ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
1340
-
1341
-
1342
- #------------------------ start of gemini workflow ---------------------------------
1343
-
1344
- # This helper function for calling the Sea-Lion API is now UNUSED in the pipeline,
1345
- # but is kept here as requested.
1346
- async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
1347
- """Helper function to call the translation API for a single piece of text."""
1348
- if not text_to_translate.strip():
1349
- return "" # Don't send empty strings for translation
1350
-
1351
- url = "https://api.sea-lion.ai/v1/chat/completions"
1352
- api_key = os.getenv("SEALION_API_KEY")
1353
- if not api_key:
1354
- print("Warning: SEALION_API_KEY not set. Skipping translation.")
1355
- return f"{text_to_translate} (Translation Skipped)"
1356
-
1357
- headers = {
1358
- "Authorization": f"Bearer {api_key}",
1359
- "Content-Type": "application/json",
1360
- }
1361
- # Precise prompt for clean output
1362
- prompt = f'Translate the following text to {lang}. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text_to_translate}"'
1363
- payload = {
1364
- "max_completion_tokens": 2048,
1365
- "messages": [{"role": "user", "content": prompt}],
1366
- "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
1367
- }
1368
-
1369
- async with httpx.AsyncClient() as client:
1370
- try:
1371
- response = await client.post(
1372
- url, headers=headers, json=payload, timeout=45.0
1373
- )
1374
- response.raise_for_status()
1375
- response_json = response.json()
1376
- translated_text = response_json["choices"][0]["message"]["content"].strip()
1377
- # Clean up potential extra quotes that the model might add
1378
- return re.sub(r'^"|"$', "", translated_text)
1379
- except httpx.RequestError as e:
1380
- print(f"Translation request failed: {e}")
1381
- return f"Translation Error: {text_to_translate}"
1382
- except (KeyError, IndexError) as e:
1383
- print(f"Could not parse translation response: {e}")
1384
- return f"Translation Parsing Error: {text_to_translate}"
1385
-
1386
-
1387
- # --- NEW GEMINI TRANSLATION FUNCTION ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1388
 
1389
  async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
1390
  """
1391
  Translates a list of texts using Gemini in a single batch API call.
1392
  """
1393
- if not texts:
1394
- return []
1395
 
1396
  try:
1397
  api_key = os.getenv("GEMINI_API_KEY")
1398
  if not api_key:
1399
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
 
 
1400
 
1401
  genai.configure(api_key=api_key)
1402
- model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
1403
 
1404
- # Create a single prompt asking for a JSON array response
1405
  prompt = f"""
1406
  Translate each string in the following JSON array of strings to {target_language}.
1407
  Return a single JSON array where each element is the translated string corresponding
@@ -1422,14 +776,10 @@ async def translate_texts_with_gemini(texts: list[str], target_language: str) ->
1422
  response = model.generate_content(prompt)
1423
  return response.text.strip()
1424
 
1425
- # Run the synchronous SDK call in a thread to avoid blocking asyncio
1426
  response_text = await asyncio.to_thread(do_request)
1427
-
1428
- # Clean the response to ensure it's valid JSON
1429
  json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
1430
  if not json_response_match:
1431
  print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
1432
- # Fallback: return original texts if parsing fails
1433
  return texts
1434
 
1435
  cleaned_json = json_response_match.group(0)
@@ -1437,78 +787,55 @@ async def translate_texts_with_gemini(texts: list[str], target_language: str) ->
1437
 
1438
  if len(translated_texts) != len(texts):
1439
  print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
1440
- # Fallback in case of length mismatch
1441
  return texts
1442
 
1443
  return translated_texts
1444
 
1445
  except Exception as e:
1446
  print(f"An error occurred during Gemini translation: {e}")
1447
- # Return original texts as a fallback
1448
  return texts
1449
 
1450
- # --- OCR EXTRACTION FUNCTIONS ---
1451
 
 
1452
 
1453
  async def get_hocr_from_image(image_bytes: bytes) -> str:
1454
  """
1455
  Performs OCR using Tesseract to get raw hOCR HTML output.
1456
- This function accepts image bytes.
1457
  """
1458
  if not image_bytes:
1459
  raise ValueError("Image bytes cannot be empty.")
1460
-
1461
  try:
1462
  image = Image.open(io.BytesIO(image_bytes))
 
 
 
 
1463
  except Exception as e:
1464
  raise HTTPException(
1465
  status_code=400,
1466
- detail=f"Cannot open image for Tesseract. It may be corrupted or unsupported. Error: {e}",
1467
  )
1468
 
1469
- # Run Tesseract OCR in a thread to avoid blocking the asyncio event loop
1470
- loop = asyncio.get_running_loop()
1471
- hocr_bytes = await loop.run_in_executor(
1472
- None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
1473
- )
1474
- return hocr_bytes.decode("utf-8")
1475
-
1476
 
1477
- async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
1478
  """
1479
- Extracts text and their bounding boxes from an image using PaddleOCR.
1480
- Returns the full list of dictionary objects from the OCR tool.
1481
  """
1482
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
1483
- temp_file.write(image_bytes)
1484
- temp_filepath = temp_file.name
1485
-
1486
  try:
1487
-
1488
- def do_ocr() -> list[dict]:
1489
- """Synchronous function to be run in a separate thread."""
1490
- client = Client("kevansoon/PaddleOCR")
1491
- # Returns a list of dictionaries, e.g., [{'text': '...', 'box': [...]}]
1492
- result = client.predict(
1493
- img=handle_file(temp_filepath),
1494
- lang="en",
1495
- api_name="/predict",
1496
- )
1497
- return result
1498
-
1499
- loop = asyncio.get_running_loop()
1500
- extracted_data = await loop.run_in_executor(None, do_ocr)
1501
  if not extracted_data:
1502
- print("Warning: PaddleOCR returned no data.")
1503
- return []
1504
  return extracted_data
1505
- finally:
1506
- os.unlink(temp_filepath)
 
 
 
1507
 
1508
 
1509
  # --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
1510
 
1511
-
1512
  async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
1513
  """
1514
  Parses hOCR, translates all text in a single batch call to Gemini,
@@ -1520,17 +847,14 @@ async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str)
1520
  elements_to_translate = soup.find_all(class_="ocr_line")
1521
 
1522
  original_texts = [el.get_text(strip=True) for el in elements_to_translate]
 
 
1523
 
1524
- # Translate all texts in one go
1525
  translated_texts = await translate_texts_with_gemini(original_texts, target_language)
1526
 
1527
- # Inject translations back
1528
  for i, element in enumerate(elements_to_translate):
1529
- if element.string:
1530
- # Ensure we don't go out of bounds if translation failed
1531
- if i < len(translated_texts):
1532
- element.string.replace_with(translated_texts[i])
1533
-
1534
  return str(soup)
1535
 
1536
 
@@ -1542,22 +866,20 @@ async def translate_paddle_data_with_gemini(
1542
  using a single batch call to Gemini.
1543
  """
1544
  original_texts = [item.get("text", "") for item in paddle_data]
 
 
1545
 
1546
- # Translate all texts in one go
1547
  translated_texts = await translate_texts_with_gemini(original_texts, target_language)
1548
 
1549
  translated_data = []
1550
  for i, item in enumerate(paddle_data):
1551
- # Ensure we don't go out of bounds if translation failed
1552
  translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
1553
  translated_data.append({"text": translated_text, "box": item.get("box")})
1554
-
1555
  return translated_data
1556
 
1557
 
1558
  # --- FINAL HTML GENERATION ---
1559
 
1560
-
1561
  async def generate_html_from_dual_ocr(
1562
  translated_hocr_html: str, translated_paddle_data: list[dict]
1563
  ) -> str:
@@ -1569,54 +891,43 @@ async def generate_html_from_dual_ocr(
1569
  api_key = os.getenv("GEMINI_API_KEY")
1570
  if not api_key:
1571
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
 
 
1572
 
1573
  genai.configure(api_key=api_key)
1574
- model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
1575
 
1576
  prompt = f"""
1577
- You are provided with two different translated OCR outputs for the same document.
1578
- Your task is to MERGE them into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.
1579
 
1580
- Input 1: Translated hOCR HTML
1581
  --- HOCR START ---
1582
  {translated_hocr_html}
1583
  --- HOCR END ---
1584
 
1585
- Input 2: Translated PaddleOCR data (Python list of dicts with 'text' and 'box'):
1586
  --- PADDLEOCR START ---
1587
  {str(translated_paddle_data)}
1588
  --- PADDLEOCR END ---
1589
 
1590
- STRICT RULES:
1591
- 1. You MUST output ONLY the FINAL RAW HTML code.
1592
- - No ```html, no triple quotes, no markdown, no explanations.
1593
- - Output must begin with <!DOCTYPE html> and end with </html>.
1594
- 2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
1595
- - Every PaddleOCR text must appear exactly once in the correct order and location.
1596
- 3. The HTML must be fully self-contained:
1597
- - Include <html>, <head>, <style>, and <body>.
1598
- - Include CSS in a <style> block so it renders exactly in an iframe.
1599
- 4. Table structure requirement:
1600
- - Use <table>, <tbody>, <tr>, and <td> to organize words into rows and columns.
1601
- - Each PaddleOCR word must be placed in a separate <td> within the correct row based on vertical alignment.
1602
- - Apply CSS for borders, padding, and cell alignment to ensure readability.
1603
- - Use colspan/rowspan where necessary to match the original layout.
1604
- 5. Positioning:
1605
- - Use bounding box data to size and place each cell proportionally.
1606
- - Avoid text overlap — if bounding boxes would overlap, adjust table cell spans or widths.
1607
- 6. Before outputting:
1608
- - Validate internally that the HTML is valid.
1609
- - Confirm every PaddleOCR text appears in the table.
1610
- - Confirm the table renders correctly in an iframe.
1611
 
1612
  FINAL OUTPUT REQUIREMENT:
1613
- - Output ONLY the complete, valid HTML — no commentary, no extra text.
1614
  """
1615
 
1616
  def do_request():
1617
- """Synchronous function to be run in a separate thread."""
1618
  response = model.generate_content(prompt)
1619
- return response.text.strip()
 
 
 
1620
 
1621
  return await asyncio.to_thread(do_request)
1622
 
@@ -1626,9 +937,13 @@ async def generate_html_from_dual_ocr(
1626
  return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
1627
 
1628
 
1629
- @app.post("/api/translate_file_gemini", response_class=HTMLResponse)
 
 
1630
  async def translate_document_dual_ocr(
1631
- target_language: str = Form(...), file: UploadFile = File(...)
 
 
1632
  ):
1633
  """
1634
  Processes a document using a dual OCR pipeline:
@@ -1644,17 +959,14 @@ async def translate_document_dual_ocr(
1644
  )
1645
 
1646
  try:
1647
- await file.seek(0)
1648
  image_bytes = await file.read()
1649
  if not image_bytes:
1650
  raise HTTPException(status_code=400, detail="Uploaded file is empty.")
1651
 
1652
  # === STEP 1: Run both OCR extractions concurrently ===
1653
- print(
1654
- "***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******"
1655
- )
1656
  hocr_task = get_hocr_from_image(image_bytes)
1657
- paddle_task = extract_text_and_boxes_with_paddle(image_bytes)
1658
  hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)
1659
 
1660
  if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
@@ -1666,25 +978,16 @@ async def translate_document_dual_ocr(
1666
 
1667
  # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
1668
  print("***** Step 2: Starting concurrent translation with Gemini ******")
1669
- translated_hocr_task = translate_hocr_html_with_gemini(
1670
- hocr_html, target_language
1671
- )
1672
- translated_paddle_task = translate_paddle_data_with_gemini(
1673
- paddle_data, target_language
1674
- )
1675
  translated_hocr, translated_paddle = await asyncio.gather(
1676
  translated_hocr_task, translated_paddle_task
1677
  )
1678
  print("***** Step 2 Done: Finished translation ******")
1679
 
1680
  # === STEP 3: Generate final HTML from both translated outputs ===
1681
- print(
1682
- "***** Step 3: Generating final HTML from dual OCR data via Gemini ******"
1683
- )
1684
- final_html = await generate_html_from_dual_ocr(
1685
- translated_hocr, translated_paddle
1686
- )
1687
-
1688
  print("***** Step 3 Done: Generated final HTML ******")
1689
 
1690
  return HTMLResponse(content=final_html)
@@ -1697,4 +1000,5 @@ async def translate_document_dual_ocr(
1697
  status_code=500,
1698
  detail=f"An unexpected error occurred during processing: {str(e)}",
1699
  )
1700
- #-------------------------- end of gemini workflow ----------------------------------
 
 
12
  import tempfile
13
  import io
14
  import traceback
15
+ import atexit
16
+ import functools
17
+ from queue import Queue
18
+ from threading import Event, Thread
19
 
20
  # beautifulsoup
21
  from bs4 import BeautifulSoup
 
53
  # --- MODIFIED: Replaced old tool imports with the new one ---
54
  from tools.tools import analyze_contract
55
 
56
+ #numpy and paddleocr
57
+ import numpy as np
58
+ from paddleocr import PaddleOCR
59
+
60
+
61
+
62
 
63
  app = FastAPI(
64
  title="Document Translator (Final Architecture)",
 
122
  raise HTTPException(
123
  status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
124
  )
125
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  @app.post("/upload")
128
  async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
 
237
  return documents
238
 
239
 
240
+ # --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
241
+
242
+
243
+ # testing clerk backend authentication
244
+ # @app.post("/upload")
245
+ # async def upload_file(
246
+ # authorization: str = Header(...),
247
+ # file: UploadFile = File(...)
248
+ # ):
249
+ # if not authorization.startswith("Bearer "):
250
+ # raise HTTPException(status_code=401, detail="Missing Bearer token")
251
 
252
+ # token = authorization.split(" ")[1]
253
+ # claims = await verify_clerk_jwt(token)
254
 
255
+ # user_id = claims.get("sub") # Clerk user ID
256
 
257
+ # # ✅ Now the Clerk user is verified
258
+ # # You can securely store this file, e.g., to Supabase or local
259
+ # return {"message": f"File uploaded by Clerk user {user_id}"}
260
+
261
+ #------------------------ start of gemini workflow ---------------------------------
262
+
263
+ # This helper function for calling the Sea-Lion API is now UNUSED in the pipeline,
264
+ # but is kept here as requested.
265
  async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
266
  """Helper function to call the translation API for a single piece of text."""
267
  if not text_to_translate.strip():
 
303
  return f"Translation Parsing Error: {text_to_translate}"
304
 
305
 
306
+ # --- NEW GEMINI TRANSLATION FUNCTION ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
+ async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
309
  """
310
+ Translates a list of texts using Gemini in a single batch API call.
 
311
  """
312
+ if not texts:
313
+ return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
 
 
 
 
 
315
  try:
316
  api_key = os.getenv("GEMINI_API_KEY")
317
  if not api_key:
318
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
319
 
320
  genai.configure(api_key=api_key)
321
+ model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
322
 
323
+ # Create a single prompt asking for a JSON array response
324
  prompt = f"""
325
+ Translate each string in the following JSON array of strings to {target_language}.
326
+ Return a single JSON array where each element is the translated string corresponding
327
+ to the original at the same index. Your output MUST be only the JSON array and nothing else.
328
 
329
+ Example Input:
330
+ ["Hello world", "How are you?"]
331
+
332
+ Example Output for target language 'Spanish':
333
+ ["Hola mundo", "¿Cómo estás?"]
334
 
335
+ Input for this task:
336
+ {json.dumps(texts)}
337
  """
338
 
339
  def do_request():
340
+ """Synchronous function to be run in a separate thread."""
341
+ response = model.generate_content(prompt)
342
+ return response.text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
+ # Run the synchronous SDK call in a thread to avoid blocking asyncio
345
+ response_text = await asyncio.to_thread(do_request)
346
 
347
+ # Clean the response to ensure it's valid JSON
348
+ json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
349
+ if not json_response_match:
350
+ print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
351
+ # Fallback: return original texts if parsing fails
352
+ return texts
353
 
354
+ cleaned_json = json_response_match.group(0)
355
+ translated_texts = json.loads(cleaned_json)
 
 
 
356
 
357
+ if len(translated_texts) != len(texts):
358
+ print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
359
+ # Fallback in case of length mismatch
360
+ return texts
 
 
 
 
 
 
 
361
 
362
+ return translated_texts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
+ except Exception as e:
365
+ print(f"An error occurred during Gemini translation: {e}")
366
+ # Return original texts as a fallback
367
+ return texts
368
 
369
  # --- OCR EXTRACTION FUNCTIONS ---
370
 
 
425
  os.unlink(temp_filepath)
426
 
427
 
428
+ # --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
429
 
430
 
431
+ async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
432
  """
433
+ Parses hOCR, translates all text in a single batch call to Gemini,
434
+ and injects translations back into the HTML.
435
  """
436
  soup = BeautifulSoup(hocr_html, "html.parser")
437
  elements_to_translate = soup.find_all(class_="ocrx_word")
 
439
  elements_to_translate = soup.find_all(class_="ocr_line")
440
 
441
  original_texts = [el.get_text(strip=True) for el in elements_to_translate]
 
 
 
 
442
 
443
+ # Translate all texts in one go
444
+ translated_texts = await translate_texts_with_gemini(original_texts, target_language)
445
+
446
+ # Inject translations back
447
  for i, element in enumerate(elements_to_translate):
448
  if element.string:
449
+ # Ensure we don't go out of bounds if translation failed
450
+ if i < len(translated_texts):
451
+ element.string.replace_with(translated_texts[i])
452
 
453
  return str(soup)
454
 
455
 
456
+ async def translate_paddle_data_with_gemini(
457
  paddle_data: list[dict], target_language: str
458
  ) -> list[dict]:
459
  """
460
+ Translates the 'text' field of each item in the paddle_data list
461
+ using a single batch call to Gemini.
462
  """
463
  original_texts = [item.get("text", "") for item in paddle_data]
464
+
465
+ # Translate all texts in one go
466
+ translated_texts = await translate_texts_with_gemini(original_texts, target_language)
 
467
 
468
  translated_data = []
469
  for i, item in enumerate(paddle_data):
470
+ # Ensure we don't go out of bounds if translation failed
471
+ translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
472
+ translated_data.append({"text": translated_text, "box": item.get("box")})
473
 
474
  return translated_data
475
 
 
490
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
491
 
492
  genai.configure(api_key=api_key)
493
+ model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
494
 
495
  prompt = f"""
496
  You are provided with two different translated OCR outputs for the same document.
 
507
  --- PADDLEOCR END ---
508
 
509
  STRICT RULES:
510
+ 1. You MUST output ONLY the FINAL RAW HTML code.
511
+ - No ```html, no triple quotes, no markdown, no explanations.
512
  - Output must begin with <!DOCTYPE html> and end with </html>.
513
+ 2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
514
  - Every PaddleOCR text must appear exactly once in the correct order and location.
515
  3. The HTML must be fully self-contained:
516
  - Include <html>, <head>, <style>, and <body>.
 
545
  return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
546
 
547
 
548
+ @app.post("/api/translate_file_gemini", response_class=HTMLResponse)
549
  async def translate_document_dual_ocr(
550
  target_language: str = Form(...), file: UploadFile = File(...)
551
  ):
552
  """
553
  Processes a document using a dual OCR pipeline:
554
  1. Tesseract and PaddleOCR extract text and coordinates concurrently.
555
+ 2. Gemini translates the text from both outputs concurrently using a batch method.
556
  3. Gemini uses both translated outputs to generate the final layout-aware HTML.
557
  """
558
  content_type = file.content_type
 
581
  status_code=400,
582
  detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
583
  )
 
 
584
  print("***** Step 1 Done: Finished OCR extraction ******")
585
 
586
+ # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
587
+ print("***** Step 2: Starting concurrent translation with Gemini ******")
588
+ translated_hocr_task = translate_hocr_html_with_gemini(
589
  hocr_html, target_language
590
  )
591
+ translated_paddle_task = translate_paddle_data_with_gemini(
592
  paddle_data, target_language
593
  )
594
  translated_hocr, translated_paddle = await asyncio.gather(
595
  translated_hocr_task, translated_paddle_task
596
  )
 
597
  print("***** Step 2 Done: Finished translation ******")
598
 
599
  # === STEP 3: Generate final HTML from both translated outputs ===
 
603
  final_html = await generate_html_from_dual_ocr(
604
  translated_hocr, translated_paddle
605
  )
606
+
607
  print("***** Step 3 Done: Generated final HTML ******")
608
 
609
  return HTMLResponse(content=final_html)
 
616
  status_code=500,
617
  detail=f"An unexpected error occurred during processing: {str(e)}",
618
  )
619
+ #-------------------------- end of gemini workflow ----------------------------------
620
+
621
+ #-------------------------- start of updated gemini workflow ----------------------------------
622
+
623
+ # --- PADDLEOCR LOCAL MODEL MANAGER SETUP ---
624
+
625
+ LANG_CONFIG = {
626
+ "ch": {"num_workers": 2},
627
+ "en": {"num_workers": 2},
628
+ "fr": {"num_workers": 1},
629
+ "german": {"num_workers": 1},
630
+ "korean": {"num_workers": 1},
631
+ "japan": {"num_workers": 1},
632
+ }
633
+ CONCURRENCY_LIMIT = 8
634
+
635
+
636
+ class PaddleOCRModelManager(object):
637
+ def __init__(self,
638
+ num_workers,
639
+ model_factory):
640
+ super().__init__()
641
+ self._model_factory = model_factory
642
+ self._queue = Queue()
643
+ self._workers = []
644
+ self._model_initialized_event = Event()
645
+ for _ in range(num_workers):
646
+ worker = Thread(target=self._worker, daemon=True) # Use daemon threads
647
+ worker.start()
648
+ self._model_initialized_event.wait()
649
+ self._model_initialized_event.clear()
650
+ self._workers.append(worker)
651
+
652
+ def infer(self, *args, **kwargs):
653
+ result_queue = Queue(maxsize=1)
654
+ self._queue.put((args, kwargs, result_queue))
655
+ success, payload = result_queue.get()
656
+ if success:
657
+ return payload
658
+ else:
659
+ raise payload
660
+
661
+ def close(self):
662
+ for _ in self._workers:
663
+ self._queue.put(None)
664
+ for worker in self._workers:
665
+ worker.join()
666
+
667
+ def _worker(self):
668
+ print(f"Initializing PaddleOCR model in worker thread...")
669
+ model = self._model_factory()
670
+ self._model_initialized_event.set()
671
+ print(f"PaddleOCR model initialized.")
672
+ while True:
673
+ item = self._queue.get()
674
+ if item is None:
675
+ break
676
+ args, kwargs, result_queue = item
677
+ try:
678
+ result = model.ocr(*args, **kwargs)
679
+ result_queue.put((True, result))
680
+ except Exception as e:
681
+ result_queue.put((False, e))
682
+ finally:
683
+ self._queue.task_done()
684
+
685
+
686
+ def create_model(lang):
687
+ print(f"Creating PaddleOCR model for language: {lang}")
688
+ return PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False)
689
+
690
+
691
+ model_managers = {}
692
+ for lang, config in LANG_CONFIG.items():
693
+ print(f"Setting up model manager for language: {lang}")
694
+ model_manager = PaddleOCRModelManager(config["num_workers"], functools.partial(create_model, lang=lang))
695
+ model_managers[lang] = model_manager
696
+
697
+
698
+ def close_model_managers():
699
+ print("Closing all PaddleOCR model managers...")
700
+ for manager in model_managers.values():
701
+ manager.close()
702
+
703
+
704
+ atexit.register(close_model_managers)
705
+
706
+
707
def local_inference(img_bytes: bytes, lang: str) -> list[dict]:
    """
    Performs OCR using the local PaddleOCRModelManager.

    Args:
        img_bytes: Encoded image data, opened with PIL.
        lang: Key into ``model_managers``. An unconfigured language falls
            back to the ``'en'`` manager after printing a warning.

    Returns:
        One ``{"text": ..., "box": ...}`` dict per recognized line, or an
        empty list when nothing was recognized.
    """
    ocr_manager = model_managers.get(lang)
    if not ocr_manager:
        print(f"Warning: Language '{lang}' not configured. Falling back to 'en'.")
        ocr_manager = model_managers['en']

    # Convert image bytes to a numpy array that PaddleOCR can process
    image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
    img_array = np.array(image)

    # PaddleOCR 2.x returns one entry per input image:
    # [[[box, (text, score)], ...]] when text was found, [None] otherwise.
    result = ocr_manager.infer(img_array, cls=True)

    # Unwrap the per-image level. The ``None`` case must be unwrapped too;
    # otherwise the loop below would try to index ``None`` and raise
    # TypeError for every image that contains no text.
    if result and isinstance(result, list):
        first = result[0]
        if first is None or isinstance(first, list):
            result = first

    if not result:
        return []

    # line[0] is the bounding box, line[1][0] is the recognized text.
    # Empty or ``None`` entries are skipped.
    return [{"text": line[1][0], "box": line[0]} for line in result if line]
738
+
739
+
740
+ # --- GEMINI TRANSLATION FUNCTION ---
741
 
742
  async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
743
  """
744
  Translates a list of texts using Gemini in a single batch API call.
745
  """
746
+ if not texts or all(not s.strip() for s in texts):
747
+ return [""] * len(texts)
748
 
749
  try:
750
  api_key = os.getenv("GEMINI_API_KEY")
751
  if not api_key:
752
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
753
+ if not genai:
754
+ raise ImportError("google.generativeai library is not available.")
755
 
756
  genai.configure(api_key=api_key)
757
+ model = genai.GenerativeModel(model_name="gemini-1.5-flash")
758
 
 
759
  prompt = f"""
760
  Translate each string in the following JSON array of strings to {target_language}.
761
  Return a single JSON array where each element is the translated string corresponding
 
776
  response = model.generate_content(prompt)
777
  return response.text.strip()
778
 
 
779
  response_text = await asyncio.to_thread(do_request)
 
 
780
  json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
781
  if not json_response_match:
782
  print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
 
783
  return texts
784
 
785
  cleaned_json = json_response_match.group(0)
 
787
 
788
  if len(translated_texts) != len(texts):
789
  print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
 
790
  return texts
791
 
792
  return translated_texts
793
 
794
  except Exception as e:
795
  print(f"An error occurred during Gemini translation: {e}")
 
796
  return texts
797
 
 
798
 
799
+ # --- OCR EXTRACTION FUNCTIONS ---
800
 
801
async def get_hocr_from_image(image_bytes: bytes) -> str:
    """
    Run Tesseract on an in-memory image and return its hOCR output as text.

    The Tesseract call runs in a worker thread via ``asyncio.to_thread`` and
    the resulting bytes are decoded as UTF-8.

    Raises:
        ValueError: If ``image_bytes`` is empty.
        HTTPException: Status 400 if opening the image, running Tesseract or
            decoding the output fails.
    """
    if not image_bytes:
        raise ValueError("Image bytes cannot be empty.")

    try:
        buffer = io.BytesIO(image_bytes)
        picture = Image.open(buffer)
        raw_hocr = await asyncio.to_thread(
            pytesseract.image_to_pdf_or_hocr, picture, extension="hocr"
        )
        markup = raw_hocr.decode("utf-8")
        return markup
    except Exception as e:
        failure = HTTPException(
            status_code=400,
            detail=f"Tesseract OCR failed. Error: {e}",
        )
        raise failure
818
 
 
 
 
 
 
 
 
819
 
820
async def extract_text_and_boxes_with_paddle(image_bytes: bytes, lang: str = "en") -> list[dict]:
    """
    Run ``local_inference`` in a worker thread and return its list of
    text/bounding-box records.

    Any exception raised during OCR is printed with its traceback and
    swallowed; an empty list is returned in that case so the rest of the
    pipeline can continue.
    """
    try:
        ocr_items = await asyncio.to_thread(local_inference, image_bytes, lang)
    except Exception as e:
        print(f"An error occurred during local PaddleOCR processing: {e}")
        traceback.print_exc()
        # Return empty list on failure to avoid breaking the pipeline
        return []

    if not ocr_items:
        print("Warning: Local PaddleOCR returned no data.")
    return ocr_items
835
 
836
 
837
  # --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
838
 
 
839
  async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
840
  """
841
  Parses hOCR, translates all text in a single batch call to Gemini,
 
847
  elements_to_translate = soup.find_all(class_="ocr_line")
848
 
849
  original_texts = [el.get_text(strip=True) for el in elements_to_translate]
850
+ if not original_texts:
851
+ return str(soup)
852
 
 
853
  translated_texts = await translate_texts_with_gemini(original_texts, target_language)
854
 
 
855
  for i, element in enumerate(elements_to_translate):
856
+ if element.string and i < len(translated_texts):
857
+ element.string.replace_with(translated_texts[i])
 
 
 
858
  return str(soup)
859
 
860
 
 
866
  using a single batch call to Gemini.
867
  """
868
  original_texts = [item.get("text", "") for item in paddle_data]
869
+ if not original_texts:
870
+ return []
871
 
 
872
  translated_texts = await translate_texts_with_gemini(original_texts, target_language)
873
 
874
  translated_data = []
875
  for i, item in enumerate(paddle_data):
 
876
  translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
877
  translated_data.append({"text": translated_text, "box": item.get("box")})
 
878
  return translated_data
879
 
880
 
881
  # --- FINAL HTML GENERATION ---
882
 
 
883
  async def generate_html_from_dual_ocr(
884
  translated_hocr_html: str, translated_paddle_data: list[dict]
885
  ) -> str:
 
891
  api_key = os.getenv("GEMINI_API_KEY")
892
  if not api_key:
893
  raise ValueError("GEMINI_API_KEY not found in environment variables.")
894
+ if not genai:
895
+ raise ImportError("google.generativeai library is not available.")
896
 
897
  genai.configure(api_key=api_key)
898
+ model = genai.GenerativeModel(model_name="gemini-1.5-flash")
899
 
900
  prompt = f"""
901
+ You are an expert web developer. Your task is to merge two different translated OCR outputs for the same document into a single, clean, and well-styled HTML document that can be rendered directly in an iframe.
 
902
 
903
+ Input 1: Translated hOCR HTML. This provides a basic structural layout.
904
  --- HOCR START ---
905
  {translated_hocr_html}
906
  --- HOCR END ---
907
 
908
+ Input 2: Translated PaddleOCR data. This is a precise list of words and their bounding boxes.
909
  --- PADDLEOCR START ---
910
  {str(translated_paddle_data)}
911
  --- PADDLEOCR END ---
912
 
913
+ STRICT INSTRUCTIONS:
914
+ 1. **Output Raw HTML Only**: Your entire output must be only the final HTML code. It must start with `<!DOCTYPE html>` and end with `</html>`. Do NOT include markdown fences like ```html or any explanations.
915
+ 2. **Prioritize PaddleOCR Data**: ALL text from the PaddleOCR input MUST be included in the final HTML. Use the hOCR as a structural guide, but the PaddleOCR data is the source of truth for the content and positioning.
916
+ 3. **Self-Contained HTML**: The HTML must be fully self-contained with embedded CSS in a `<style>` block within the `<head>`.
917
+ 4. **Layout Reconstruction**: Use absolute positioning (`position: absolute;`) for `<span>` or `<div>` elements containing the text. Use the bounding box coordinates from the PaddleOCR data to set the `top`, `left`, `width`, and `height` CSS properties for each element to reconstruct the original document layout precisely.
918
+ 5. **Coordinate System**: The bounding box format is [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]. You can approximate the position using `left: x1`, `top: y1`, `width: x2 - x1`, and `height: y3 - y1`.
919
+ 6. **Validation**: Before outputting, mentally confirm that every single text element from the PaddleOCR data is present in the final HTML and positioned correctly.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
920
 
921
  FINAL OUTPUT REQUIREMENT:
922
+ - Output ONLY the complete, valid, and self-contained HTML.
923
  """
924
 
925
  def do_request():
 
926
  response = model.generate_content(prompt)
927
+ # Clean up potential markdown fences
928
+ clean_text = re.sub(r'^```html\s*', '', response.text.strip(), flags=re.IGNORECASE)
929
+ clean_text = re.sub(r'\s*```$', '', clean_text)
930
+ return clean_text
931
 
932
  return await asyncio.to_thread(do_request)
933
 
 
937
  return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
938
 
939
 
940
+ # --- FASTAPI ENDPOINT ---
941
+
942
+ @app.post("/api/translate_file_gemini_local", response_class=HTMLResponse)
943
  async def translate_document_dual_ocr(
944
+ target_language: str = Form(...),
945
+ source_language: str = Form("en"), # Add source language for OCR
946
+ file: UploadFile = File(...)
947
  ):
948
  """
949
  Processes a document using a dual OCR pipeline:
 
959
  )
960
 
961
  try:
 
962
  image_bytes = await file.read()
963
  if not image_bytes:
964
  raise HTTPException(status_code=400, detail="Uploaded file is empty.")
965
 
966
  # === STEP 1: Run both OCR extractions concurrently ===
967
+ print("***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******")
 
 
968
  hocr_task = get_hocr_from_image(image_bytes)
969
+ paddle_task = extract_text_and_boxes_with_paddle(image_bytes, lang=source_language)
970
  hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)
971
 
972
  if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
 
978
 
979
  # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
980
  print("***** Step 2: Starting concurrent translation with Gemini ******")
981
+ translated_hocr_task = translate_hocr_html_with_gemini(hocr_html, target_language)
982
+ translated_paddle_task = translate_paddle_data_with_gemini(paddle_data, target_language)
 
 
 
 
983
  translated_hocr, translated_paddle = await asyncio.gather(
984
  translated_hocr_task, translated_paddle_task
985
  )
986
  print("***** Step 2 Done: Finished translation ******")
987
 
988
  # === STEP 3: Generate final HTML from both translated outputs ===
989
+ print("***** Step 3: Generating final HTML from dual OCR data via Gemini ******")
990
+ final_html = await generate_html_from_dual_ocr(translated_hocr, translated_paddle)
 
 
 
 
 
991
  print("***** Step 3 Done: Generated final HTML ******")
992
 
993
  return HTMLResponse(content=final_html)
 
1000
  status_code=500,
1001
  detail=f"An unexpected error occurred during processing: {str(e)}",
1002
  )
1003
+
1004
+ #-------------------------- end of updated gemini workflow ----------------------------------
requirements.txt CHANGED
@@ -95,4 +95,6 @@ watchfiles==1.1.0
95
  websockets==15.0.1
96
  langextract
97
  gradio_client
98
- pytesseract
 
 
 
95
  websockets==15.0.1
96
  langextract
97
  gradio_client
98
+ pytesseract
99
+ paddlepaddle
100
+ paddleocr==2.10.0