KevanSoon committed · Commit c156dc2 · 1 parent: 4df3d15
major app.py changes

Files changed:
- app.py +284 -980
- requirements.txt +3 -1

app.py CHANGED
@@ -12,6 +12,10 @@ import uuid
import tempfile
import io
import traceback

# beautifulsoup
from bs4 import BeautifulSoup
@@ -49,6 +53,12 @@ from auth.clerk import verify_clerk_jwt
# --- MODIFIED: Replaced old tool imports with the new one ---
from tools.tools import analyze_contract


app = FastAPI(
    title="Document Translator (Final Architecture)",
@@ -112,608 +122,7 @@ async def analyze_contract_endpoint(file: UploadFile = File(...)):
        raise HTTPException(
            status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
        )
-
-
-# --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
-
-
-def wrap_words_with_spans(html: str) -> str:
-    # Wrap each word in target tags with a span having data attributes
-    def replacer(match):
-        replacer.counter += 1
-        word = match.group(0)
-        return f'<span data-clickable="true" data-id="word-{replacer.counter}">{word}</span>'
-
-    replacer.counter = 0
-
-    pattern = r"\b\w+[.,?!]?\b"  # matches words with optional trailing punctuation
-
-    for tag in ["p", "h1", "h2", "td"]:
-        # regex to capture content inside these tags
-        regex = re.compile(rf"(<{tag}[^>]*>)(.*?)(</{tag}>)", re.DOTALL)
-
-        def replacer_func(m):
-            open_tag, inner_text, close_tag = m.groups()
-            wrapped_text = re.sub(pattern, replacer, inner_text)
-            return open_tag + wrapped_text + close_tag
-
-        html = regex.sub(replacer_func, html)
-
-    return html
-
-
-def inject_dropdown_script(html: str) -> str:
-    script = """
-    <script>
-    window.addEventListener('DOMContentLoaded', () => {
-
-      function createDropdown(x, y, wordEl, word) {
-        // Remove any existing dropdown
-        const oldDropdown = document.getElementById('translation-dropdown');
-        if (oldDropdown) oldDropdown.remove();
-
-        // Create dropdown select element
-        const dropdown = document.createElement('select');
-        dropdown.id = 'translation-dropdown';
-        dropdown.style.position = 'absolute';
-        dropdown.style.left = x + 'px';
-        dropdown.style.top = y + 'px';
-        dropdown.style.zIndex = 9999;
-
-        // Languages options
-        const languages = ['English', 'Chinese', 'Tamil', 'Hindi'];
-        languages.forEach(lang => {
-          const option = document.createElement('option');
-          option.value = lang.toLowerCase();
-          option.innerText = lang;
-          dropdown.appendChild(option);
-        });
-
-        // Placeholder option
-        const defaultOption = document.createElement('option');
-        defaultOption.value = '';
-        defaultOption.innerText = 'Select language';
-        defaultOption.selected = true;
-        defaultOption.disabled = true;
-        dropdown.insertBefore(defaultOption, dropdown.firstChild);
-
-        document.body.appendChild(dropdown);
-        dropdown.focus();
-
-        dropdown.addEventListener('change', () => {
-          const selectedLang = dropdown.value;
-          if (!selectedLang) return;
-
-          // Call backend to translate word
-          fetch('http://localhost:8080/api/translate_frontend', {
-            method: 'POST',
-            headers: { 'Content-Type': 'application/json' },
-            body: JSON.stringify({ text: word, target_language: selectedLang }),
-          })
-          .then(res => {
-            if (!res.ok) throw new Error('Translation API error');
-            return res.json();
-          })
-          .then(data => {
-            const translated = data.translated_text || word;
-            wordEl.innerText = translated;
-
-            // Add or update language label
-            let label = wordEl.nextSibling;
-            if (!label || !label.classList || !label.classList.contains('language-label')) {
-              label = document.createElement('span');
-              label.className = 'language-label';
-              label.style.marginLeft = '6px';
-              label.style.fontSize = '0.8em';
-              label.style.color = '#555';
-              wordEl.after(label);
-            }
-            label.textContent = `(${dropdown.options[dropdown.selectedIndex].text})`;
-          })
-          .catch(err => {
-            console.error('Translation error:', err);
-            alert('Translation failed, please try again.');
-          });
-
-          dropdown.remove();
-        });
-
-        // Clicking outside closes dropdown
-        document.addEventListener('click', function onDocClick(e) {
-          if (!dropdown.contains(e.target)) {
-            dropdown.remove();
-            document.removeEventListener('click', onDocClick);
-          }
-        });
-      }
-
-      // Add click handlers to all words wrapped in spans with data-clickable="true"
-      document.querySelectorAll('span[data-clickable="true"]').forEach(el => {
-        el.style.cursor = 'pointer';
-        el.addEventListener('click', event => {
-          event.stopPropagation();
-          const word = el.innerText;
-          const rect = el.getBoundingClientRect();
-          const x = rect.left + window.scrollX;
-          const y = rect.bottom + window.scrollY;
-          createDropdown(x, y, el, word);
-        });
-      });
-
-    });
-    </script>
-    """
-    if "</body>" in html:
-        return html.replace("</body>", script + "\n</body>")
-    else:
-        return html + script
-
-
-@app.post("/api/translate_frontend")
-async def translate_text(request: Request):
-    try:
-        data = await request.json()
-        text = data.get("text")
-        target_language = data.get("target_language")
-
-        if not text or not target_language:
-            raise HTTPException(
-                status_code=400,
-                detail="Missing 'text' or 'target_language' in request body",
-            )
-
-        url = "https://api.sea-lion.ai/v1/chat/completions"
-        api_key = os.getenv("SEALION_API_KEY")
-
-        headers = {
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": "application/json",
-            # No "accept" header or set to "application/json"
-        }
-
-        prompt = (
-            f"Please translate the following text to {target_language} and return "
-            "ONLY the translated text without any explanations or extra formatting:\n\n"
-            f'"{text}"'
-        )
-
-        payload = {
-            "max_completion_tokens": 1024,
-            "messages": [{"role": "user", "content": prompt}],
-            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
-        }
-
-        response = requests.post(url, headers=headers, data=json.dumps(payload))
-        response.raise_for_status()
-
-        # Parse JSON response
-        response_json = response.json()
-
-        # Extract translated text from response JSON
-        translated_text = response_json["choices"][0]["message"]["content"].strip()
-
-        if not translated_text:
-            raise HTTPException(
-                status_code=500, detail="Empty response from translation model."
-            )
-
-        return {"translated_text": translated_text}
-
-    except requests.exceptions.RequestException as e:
-        raise HTTPException(
-            status_code=502, detail=f"Translation API request failed: {e}"
-        )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}")
-
-
-# --- Model 2: Sea-Lion (The JSON Translator) ---
-@app.post("/api/translate")
-async def translate_text(text: str, target_language: str):
-    """
-    Receives text and a target language, and returns the translated text
-    using the SEA-LION model.
-    """
-    # The API endpoint URL for translation
-    url = "https://api.sea-lion.ai/v1/chat/completions"
-
-    # It's recommended to store API keys securely, e.g., in environment variables
-    api_key = os.getenv("SEALION_API_KEY")
-
-    # The headers for the request
-    headers = {
-        "accept": "text/plain",
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-    }
-
-    # Create a dynamic prompt for the translation task
-    prompt = f'Translate the following text to {text}: "{target_language}"'
-
-    # The JSON data payload for the request
-    data = {
-        "max_completion_tokens": 4096,  # Increased token limit for longer translations
-        "messages": [{"role": "user", "content": prompt}],
-        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
-    }
-
-    try:
-        # Make the POST request to the SEA-LION API
-        response = requests.post(url, headers=headers, data=json.dumps(data))
-        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
-
-        # The response from this specific API is plain text, not JSON.
-        # We will wrap it in a JSON structure for consistency in our API.
-        translated_text = response.text
-
-        # It's good practice to check if the response is empty
-        if not translated_text:
-            raise HTTPException(
-                status_code=500,
-                detail="Received an empty response from the translation model.",
-            )
-
-        return {"translated_text": translated_text}
-
-    except requests.exceptions.RequestException as e:
-        # Handle network-related errors
-        raise HTTPException(
-            status_code=502,
-            detail=f"Failed to communicate with the translation AI model: {e}",
-        )
-    except Exception as e:
-        # Handle other potential errors
-        raise HTTPException(
-            status_code=500,
-            detail=f"An unexpected error occurred during translation: {e}",
-        )
-
-
-# --- Model 3: Gemini (The HTML Generator) ---
-async def generate_html_from_translated_json(translated_json: dict) -> str:
-    """
-    Receives a translated JSON object and uses Gemini to generate the final
-    structured HTML document.
-    """
-    try:
-        api_key = os.getenv("GEMINI_API_KEY")
-        if not api_key:
-            raise ValueError("GEMINI_API_KEY not found in environment variables.")
-
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.0-flash")
-        json_string_for_prompt = json.dumps(translated_json, indent=2)
-
-        prompt = f"""
-        You are an expert system that converts a JSON object containing PRE-TRANSLATED text into a clean, semantic HTML document.
-
-        **Your Task:**
-        1. Analyze the following JSON object. Its text content has already been translated.
-        2. The core document data is located at the path: `choices[0]['message']['tool_calls'][0]['function']['arguments']`.
-        3. The value of 'arguments' is a JSON STRING. You must parse this inner string to access the list of document chunks.
-        4. Using the translated data from the 'text' fields, generate a single, complete HTML5 document. Use appropriate tags like <h1>, <h2>, <p>, and <table>.
-        5. if json contains "tabular" means mmake a table for that with some grey border and styling
-        6. Your final output must ONLY be the raw HTML code. Do not add comments or markdown.
-
-        **Translated JSON object to process:**
-        ```json
-        {json_string_for_prompt}
-        ```
-        """
-
-        # def do_request():
-        #     response = model.generate_content(prompt)
-        #     match = re.search(r'```html\n(.*?)\n```', response.text, re.DOTALL)
-        #     if match:
-        #         return match.group(1).strip()
-        #     return response.text.strip()
-
-        # return await asyncio.to_thread(do_request)
-        def do_request():
-            response = model.generate_content(prompt)
-
-            # Extract raw HTML from Gemini markdown code block
-            match = re.search(r"```html\n(.*?)\n```", response.text, re.DOTALL)
-            raw_html = match.group(1).strip() if match else response.text.strip()
-
-            # Wrap each word in clickable spans
-            wrapped_html = wrap_words_with_spans(raw_html)
-
-            # Inject dropdown script
-            final_html = inject_dropdown_script(wrapped_html)
-
-            return final_html
-
-        return await asyncio.to_thread(do_request)
-    except google_exceptions.ResourceExhausted as e:
-        error_message = "The request to the document processor (Gemini) was rejected due to API quota limits. Please wait or upgrade your API plan."
-        return f"<html><body><h1>API Quota Error</h1><p>{html.escape(error_message)}</p></body></html>"
-    except Exception as e:
-        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
-        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-
-
-# --- API Endpoint Orchestrating the Pipeline ---
-@app.post("/api/translate_file", response_class=HTMLResponse)
-async def translate_document_to_raw_html(
-    target_language: str = Form(...), file: UploadFile = File(...)
-):
-    """
-    Processes a document using the final, robust pipeline:
-    1. Nemo extracts content to JSON.
-    2. Sea-Lion translates the text within the JSON.
-    3. Gemini generates the final HTML from the translated JSON.
-    """
-    content_type = file.content_type
-    if content_type not in ["application/pdf", "image/png", "image/jpeg"]:
-        raise HTTPException(status_code=400, detail="Unsupported file type.")
-
-    try:
-        # === STEP 1: Get raw JSON from Nemo (The Parser) ===
-        file_content = await file.read()
-        file_b64 = base64.b64encode(file_content).decode("utf-8")
-        nemo_data = {
-            "model": "nvidia/nemoretriever-parse",
-            "messages": [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:{content_type};base64,{file_b64}"
-                            },
-                        }
-                    ],
-                }
-            ],
-            "max_tokens": 2048,
-        }
-        headers = {"accept": "application/json", "Content-Type": "application/json"}
-        model_response = requests.post(
-            "http://localhost:8000/v1/chat/completions",
-            headers=headers,
-            data=json.dumps(nemo_data),
-        )
-        model_response.raise_for_status()
-        nemo_response_json = model_response.json()
-        print(nemo_response_json)
-        print("*********** Step 1 Done ***********")
-
-        print("*********** Step 2 in Progress ***********")
-        # === STEP 2: Get translated JSON from Sea-Lion (The Translator) ===
-        translated_json = await translate_text(nemo_response_json, target_language)
-        print(translated_json)
-        print("*********** Step 2 Done ***********")
-
-        print("*********** Step 3 in Progress ***********")
-        # === STEP 3: Generate final HTML from Gemini (The HTML Generator) ===
-        final_html = await generate_html_from_translated_json(translated_json)
-        print(final_html)
-        print("*********** Step 3 Done ***********")
-        # Check if Gemini itself returned an error message
-        if final_html.strip().startswith("<html><body><h1>"):
-            return HTMLResponse(content=final_html)
-
-        # === STEP 4: Return the final result to the frontend ===
-        return HTMLResponse(content=final_html)
-
-    except requests.exceptions.RequestException as e:
-        raise HTTPException(
-            status_code=502,
-            detail=f"Failed to communicate with a downstream AI model: {e}",
-        )
-    except Exception as e:
-        # This will catch any errors, including the ValueError from the Sea-Lion function
-        raise HTTPException(
-            status_code=500,
-            detail=f"An unexpected error occurred during processing: {e}",
-        )
-
-
-# <<< --- START OF MVP PIPELINE ADDITIONS (Layout-Aware Version) --- >>>
-
-
-async def extract_text_and_boxes_with_paddle(file_content: bytes) -> list[dict]:
-    """
-    Extracts text and their bounding boxes from an image using PaddleOCR.
-    Returns the full list of dictionary objects from the OCR tool.
-    """
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-        temp_file.write(file_content)
-        temp_filepath = temp_file.name
-
-    try:
-
-        def do_ocr() -> list[dict]:
-            """Synchronous function to be run in a separate thread."""
-            client = Client("kevansoon/PaddleOCR")
-            # Returns a list of dictionaries, e.g., [{'text': '...', 'box': [...]}]
-            result = client.predict(
-                img=handle_file(temp_filepath),
-                lang="en",
-                api_name="/predict",
-            )
-            return result
-
-        loop = asyncio.get_running_loop()
-        extracted_data = await loop.run_in_executor(None, do_ocr)
-        return extracted_data
-    finally:
-        os.unlink(temp_filepath)
-
-
-async def translate_paddle_data_concurrently(
-    paddle_data: list[dict], target_language: str
-) -> list[dict]:
-    """
-    Translates the 'text' field of each item in the paddle_data list concurrently.
-    """
-
-    async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
-        """Helper function to call the translation API for a single piece of text."""
-        url = "https://api.sea-lion.ai/v1/chat/completions"
-        api_key = os.getenv("SEALION_API_KEY")
-        headers = {
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": "application/json",
-        }
-        prompt = f'Translate the following phrase to {lang} and return ONLY the translated text without explanations or extra formatting:\n\n"{text_to_translate}"'
-        payload = {
-            "max_completion_tokens": 256,  # Tokens for a single phrase, not a whole doc
-            "messages": [{"role": "user", "content": prompt}],
-            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
-        }
-        async with httpx.AsyncClient() as client:
-            response = await client.post(
-                url, headers=headers, json=payload, timeout=30.0
-            )
-            response.raise_for_status()
-            response_json = response.json()
-            return response_json["choices"][0]["message"]["content"].strip()
-
-    # Create a list of translation tasks to run concurrently
-    translation_tasks = [
-        call_sealion_for_translation(item["text"], target_language)
-        for item in paddle_data
-    ]
-
-    # Execute all translation tasks in parallel
-    translated_texts = await asyncio.gather(*translation_tasks)
-
-    # Reconstruct the data structure with translated text and original boxes
-    translated_data = []
-    for i, item in enumerate(paddle_data):
-        translated_data.append({"text": translated_texts[i], "box": item["box"]})
-
-    return translated_data
-
-
-async def generate_html_from_paddle_data(translated_data: list[dict]) -> str:
-    """
-    Receives translated OCR data (text with coordinates) and uses Gemini
-    to generate a layout-aware HTML document.
-    """
-    try:
-        api_key = os.getenv("GEMINI_API_KEY")
-        if not api_key:
-            raise ValueError("GEMINI_API_KEY not found in environment variables.")
-
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash")
-
-        # Convert the list of data to a JSON string for the prompt
-        # THE FIX IS HERE: Added ensure_ascii=False
-        json_data_for_prompt = json.dumps(translated_data, indent=2, ensure_ascii=False)
-
-        prompt = f"""
-        You are an expert system specializing in converting structured OCR data into a well-formatted HTML document that preserves the original layout.
-
-        **Your Task:**
-        1. Analyze the following JSON array. Each object contains a `text` field (pre-translated) and a `box` field (four [x, y] coordinates of its bounding box).
-        2. Use the `box` coordinates to understand the document's spatial structure.
-           - Elements with similar y-coordinates are likely on the same row.
-           - Elements aligned vertically form columns.
-        3. Reconstruct the visual layout using semantic HTML.
-           - Use `<table>` for grid-like data (rows and columns). This is critical for payslips.
-           - Use `<h1>`, `<h2>`, `<p>` for headings and paragraphs.
-           - Do NOT use absolute positioning (e.g., `style="position: absolute; left: ..."`). Create a clean, flowing HTML structure.
-        4. Your final output must ONLY be the raw HTML code. Do not add comments, markdown backticks, or any other explanatory text.
-
-        **OCR Data to process:**
-        ```json
-        {json_data_for_prompt}
-        ```
-        """
-
-        def do_request():
-            """Synchronous function to be run in a separate thread."""
-            response = model.generate_content(prompt)
-            match = re.search(r"```html\n(.*?)\n```", response.text, re.DOTALL)
-            raw_html = match.group(1).strip() if match else response.text.strip()
-            # Reuse existing functions to make the HTML interactive
-            wrapped_html = wrap_words_with_spans(raw_html)
-            final_html = inject_dropdown_script(wrapped_html)
-            return final_html
-
-        return await asyncio.to_thread(do_request)
-    except Exception as e:
-        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
-        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-
-
-@app.post("/api/translate_file_mvp", response_class=HTMLResponse)
-async def translate_document_mvp(
-    target_language: str = Form(...), file: UploadFile = File(...)
-):
-    """
-    Processes a document using the Layout-Aware MVP pipeline:
-    1. PaddleOCR extracts text and coordinates.
-    2. Sea-Lion translates each text block concurrently.
-    3. Gemini uses the translated text and original coordinates to generate layout-aware HTML.
-    """
-    content_type = file.content_type
-    if content_type not in ["image/png", "image/jpeg"]:
-        raise HTTPException(
-            status_code=400,
-            detail="Unsupported file type for MVP pipeline. Please use PNG or JPG.",
-        )
-
-    try:
-        file_content = await file.read()
-
-        # === MVP STEP 1: Extract text and coordinates with PaddleOCR ===
-        paddle_data = await extract_text_and_boxes_with_paddle(file_content)
-        if not paddle_data:
-            raise HTTPException(
-                status_code=400,
-                detail="PaddleOCR could not extract any text from the image.",
-            )
-        print("***** Step 1 Done ******")
-
-        # === MVP STEP 2: Translate each text block concurrently ===
-        translated_data = await translate_paddle_data_concurrently(
-            paddle_data, target_language
-        )
-        print("***** Step 2 Done ******")
-        # === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
-        final_html = await generate_html_from_paddle_data(translated_data)
-        print("***** Step 3 Done ******")
-        return HTMLResponse(content=final_html)
-
-    except httpx.HTTPStatusError as e:
-        raise HTTPException(
-            status_code=e.response.status_code,
-            detail=f"Error from a downstream AI service: {e.response.text}",
-        )
-    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=f"An unexpected error occurred during MVP processing: {str(e)}",
-        )
-
-
-# <<< --- END OF MVP PIPELINE ADDITIONS (Layout-Aware Version) --- >>>
-
-
-# testing clerk backend authentication
-# @app.post("/upload")
-# async def upload_file(
-#     authorization: str = Header(...),
-#     file: UploadFile = File(...)
-# ):
-#     if not authorization.startswith("Bearer "):
-#         raise HTTPException(status_code=401, detail="Missing Bearer token")
-
-#     token = authorization.split(" ")[1]
-#     claims = await verify_clerk_jwt(token)
-
-#     user_id = claims.get("sub")  # Clerk user ID
-
-#     # ✅ Now the Clerk user is verified
-#     # You can securely store this file, e.g., to Supabase or local
-#     return {"message": f"File uploaded by Clerk user {user_id}"}
-

@app.post("/upload")
async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
@@ -828,11 +237,31 @@ async def get_user_documents(
    return documents


-#

-#


async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
    """Helper function to call the translation API for a single piece of text."""
    if not text_to_translate.strip():
@@ -874,217 +303,68 @@ async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str
        return f"Translation Parsing Error: {text_to_translate}"


-# ---
-
-async def get_hocr_from_image(file: UploadFile) -> str:
-    """
-    Performs OCR using Tesseract to get raw hOCR HTML output.
-    This function accepts an UploadFile object, reads its byte content,
-    and passes those bytes to Pillow and Tesseract.
-    """
-    # ** THE FIX IS HERE **
-    # We must first read the file's content into memory as bytes.
-    # The UploadFile object itself cannot be processed by Pillow.
-    image_bytes = await file.read()
-    await file.close()  # It's good practice to close the file.
-
-    if not image_bytes:
-        raise HTTPException(status_code=400, detail="Uploaded file is empty.")
-
-    try:
-        # We then open the bytes using Pillow through an in-memory stream (io.BytesIO).
-        # This correctly provides the image data to the image processing library.
-        image = Image.open(io.BytesIO(image_bytes))
-    except Exception as e:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Cannot open image. It may be corrupted or unsupported. Error: {e}",
-        )
-
-    # Run Tesseract OCR in a separate thread to avoid blocking the asyncio event loop.
-    # Pytesseract works with the Pillow 'Image' object directly.
-    loop = asyncio.get_running_loop()
-    hocr_bytes = await loop.run_in_executor(
-        None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
-    )
-
-    return hocr_bytes.decode("utf-8")
-

-async def
    """
-
-    translations back into the HTML structure.
    """
-
-
-    if not elements_to_translate:
-        elements_to_translate = soup.find_all(class_="ocr_line")
-
-    original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-
-    # Translate all texts concurrently
-    translation_tasks = [
-        call_sealion_for_translation(text, target_language) for text in original_texts
-    ]
-    translated_texts = await asyncio.gather(*translation_tasks)
-
-    # Replace the text in the soup object with the translations
-    for i, element in enumerate(elements_to_translate):
-        if element.string:
-            element.string.replace_with(translated_texts[i])
-
-    return str(soup)
-

-async def generate_html_from_hocr(translated_hocr_html: str) -> str:
-    """
-    Receives translated hOCR HTML and uses Gemini to generate a final,
-    layout-aware HTML document.
-    """
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash")  #

        prompt = f"""
-

-
-
-
-
-

-
-        {
        """

        def do_request():
-            """Synchronous function to be run in a separate thread."""
-            response = model.generate_content(prompt)
-
-            cleaned_html = response.text.strip()
-            if cleaned_html.startswith("```html"):
-                cleaned_html = cleaned_html[7:]
-            if cleaned_html.endswith("```"):
-                cleaned_html = cleaned_html[:-3]
-            return cleaned_html.strip()
-
-        # Run the synchronous Gemini API call in a thread pool executor
-        return await asyncio.to_thread(do_request)
-
-    except Exception as e:
-        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
-        traceback.print_exc()
-        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-
-
-# --- API ENDPOINT ---
-
-@app.post("/api/translate_file_pytesseract", response_class=HTMLResponse)
-async def translate_document_with_hocr(
-    target_language: str = Form(...), file: UploadFile = File(...)
-):
-    """
-    Processes a document using the simplified hOCR-based pipeline:
-    1. Tesseract extracts text and coordinates into an hOCR file.
-    2. Sea-Lion translates the text directly within the hOCR HTML structure.
-    3. Gemini uses the translated hOCR to generate the final layout-aware HTML.
-    """
-    content_type = file.content_type
-    if content_type not in ["image/png", "image/jpeg", "image/bmp", "image/tiff"]:
-        raise HTTPException(
-            status_code=400,
-            detail="Unsupported file type. Please use PNG, JPG, BMP or TIFF.",
-        )
-
-    try:
-        # === STEP 1: Extract text and coordinates with Tesseract hOCR ===
-        hocr_html = await get_hocr_from_image(file)
-        if not hocr_html or "ocr_page" not in hocr_html:
-            raise HTTPException(
-                status_code=400,
-                detail="Tesseract could not extract any hOCR data from the image.",
-            )
-        print("***** Step 1 Done: Generated hOCR from image ******")
-
-        # === STEP 2: Translate text directly within the hOCR structure ===
-        translated_hocr = await translate_hocr_html_concurrently(
-            hocr_html, target_language
-        )
-        print("***** Step 2 Done: Translated hOCR in-place ******")
-
-        # === STEP 3: Generate final, layout-aware HTML from Gemini ===
-        final_html = await generate_html_from_hocr(translated_hocr)
-        print("***** Step 3 Done: Generated final HTML from translated hOCR ******")
-
-        # Return the final HTML generated by Gemini
-        return HTMLResponse(content=final_html)
-
-    except HTTPException:
-        # Re-raise HTTPExceptions directly to preserve status code and detail
-        raise
-    except Exception as e:
-        traceback.print_exc()
-        raise HTTPException(
-            status_code=500,
-            detail=f"An unexpected error occurred during processing: {str(e)}",
-        )
-
-
-# ----------------------------------END OF PYTESSERACT workflow-----------------------------------
-
-
-# ----------------------------------Start OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
-
-# --- Helper Functions (Sealion, Gemini Configuration) ---


-#
-
-
-
-

-
-
-    if not api_key:
-        print("Warning: SEALION_API_KEY not set. Skipping translation.")
-        return f"{text_to_translate} (Translation Skipped)"

-
-
-
-
-    # Precise prompt for clean output
-    prompt = f'Translate the following text to {lang}. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text_to_translate}"'
-    payload = {
-        "max_completion_tokens": 2048,
-        "messages": [{"role": "user", "content": prompt}],
-        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
-    }

-
-    try:
-        response = await client.post(
-            url, headers=headers, json=payload, timeout=45.0
-        )
-        response.raise_for_status()
-        response_json = response.json()
-        translated_text = response_json["choices"][0]["message"]["content"].strip()
-        # Clean up potential extra quotes that the model might add
-        return re.sub(r'^"|"$', "", translated_text)
-    except httpx.RequestError as e:
-        print(f"Translation request failed: {e}")
-        return f"Translation Error: {text_to_translate}"
-    except (KeyError, IndexError) as e:
-        print(f"Could not parse translation response: {e}")
-        return f"Translation Parsing Error: {text_to_translate}"


# --- OCR EXTRACTION FUNCTIONS ---

@@ -1145,12 +425,13 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
        os.unlink(temp_filepath)


-# --- TRANSLATION FUNCTIONS ---


-async def
    """
-    Parses hOCR, translates text
    """
    soup = BeautifulSoup(hocr_html, "html.parser")
    elements_to_translate = soup.find_all(class_="ocrx_word")
@@ -1158,33 +439,37 @@ async def translate_hocr_html_concurrently(hocr_html: str, target_language: str)
        elements_to_translate = soup.find_all(class_="ocr_line")

    original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-    translation_tasks = [
-        call_sealion_for_translation(text, target_language) for text in original_texts
-    ]
-    translated_texts = await asyncio.gather(*translation_tasks)

    for i, element in enumerate(elements_to_translate):
        if element.string:
-

    return str(soup)


-async def
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
-    Translates the 'text' field of each item in the paddle_data list
    """
    original_texts = [item.get("text", "") for item in paddle_data]
-
-
-
-    translated_texts = await asyncio.gather(*translation_tasks)

    translated_data = []
    for i, item in enumerate(paddle_data):
-

    return translated_data

@@ -1205,7 +490,7 @@ async def generate_html_from_dual_ocr(
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash")

        prompt = f"""
        You are provided with two different translated OCR outputs for the same document.
@@ -1222,10 +507,10 @@ async def generate_html_from_dual_ocr(
        --- PADDLEOCR END ---

        STRICT RULES:
-        1. You MUST output ONLY the FINAL RAW HTML code.
-           - No ```html, no triple quotes, no markdown, no explanations.
           - Output must begin with <!DOCTYPE html> and end with </html>.
-        2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
           - Every PaddleOCR text must appear exactly once in the correct order and location.
        3. The HTML must be fully self-contained:
           - Include <html>, <head>, <style>, and <body>.
@@ -1260,14 +545,14 @@ async def generate_html_from_dual_ocr(
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"


-@app.post("/api/
async def translate_document_dual_ocr(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """
    Processes a document using a dual OCR pipeline:
    1. Tesseract and PaddleOCR extract text and coordinates concurrently.
-    2.
    3. Gemini uses both translated outputs to generate the final layout-aware HTML.
    """
    content_type = file.content_type
@@ -1296,22 +581,19 @@ async def translate_document_dual_ocr(
            status_code=400,
            detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
        )
-        print(paddle_task)
-        print(hocr_task)
        print("***** Step 1 Done: Finished OCR extraction ******")

-        # === STEP 2: Translate both OCR outputs concurrently ===
-        print("***** Step 2: Starting concurrent translation ******")
-        translated_hocr_task =
            hocr_html, target_language
        )
-        translated_paddle_task =
            paddle_data, target_language
        )
        translated_hocr, translated_paddle = await asyncio.gather(
            translated_hocr_task, translated_paddle_task
        )
-        print(translated_paddle_task)
        print("***** Step 2 Done: Finished translation ******")

        # === STEP 3: Generate final HTML from both translated outputs ===
@@ -1321,7 +603,7 @@ async def translate_document_dual_ocr(
        final_html = await generate_html_from_dual_ocr(
            translated_hocr, translated_paddle
        )
-
        print("***** Step 3 Done: Generated final HTML ******")

        return HTMLResponse(content=final_html)
@@ -1334,74 +616,146 @@ async def translate_document_dual_ocr(
            status_code=500,
            detail=f"An unexpected error occurred during processing: {str(e)}",
        )
-
-
-
-
-
-
-
-
-
-
-""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translates a list of texts using Gemini in a single batch API call.
    """
-    if not texts:
-        return []

    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
@@ -1422,14 +776,10 @@ async def translate_texts_with_gemini(texts: list[str], target_language: str) ->
            response = model.generate_content(prompt)
            return response.text.strip()

-        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        response_text = await asyncio.to_thread(do_request)
-
-        # Clean the response to ensure it's valid JSON
        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
-            # Fallback: return original texts if parsing fails
            return texts

        cleaned_json = json_response_match.group(0)
@@ -1437,78 +787,55 @@ async def translate_texts_with_gemini(texts: list[str], target_language: str) ->

        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
-            # Fallback in case of length mismatch
            return texts

        return translated_texts

    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
-        # Return original texts as a fallback
        return texts

-# --- OCR EXTRACTION FUNCTIONS ---


async def get_hocr_from_image(image_bytes: bytes) -> str:
    """
    Performs OCR using Tesseract to get raw hOCR HTML output.
-    This function accepts image bytes.
    """
    if not image_bytes:
        raise ValueError("Image bytes cannot be empty.")
-
    try:
        image = Image.open(io.BytesIO(image_bytes))
    except Exception as e:
        raise HTTPException(
            status_code=400,
-            detail=f"
        )

-    # Run Tesseract OCR in a thread to avoid blocking the asyncio event loop
-    loop = asyncio.get_running_loop()
-    hocr_bytes = await loop.run_in_executor(
-        None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
-    )
-    return hocr_bytes.decode("utf-8")
-

-async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
    """
-    Extracts text and their bounding boxes from an image using
-
    """
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
-        temp_file.write(image_bytes)
-        temp_filepath = temp_file.name
-
    try:
-
-        def do_ocr() -> list[dict]:
-            """Synchronous function to be run in a separate thread."""
-            client = Client("kevansoon/PaddleOCR")
-            # Returns a list of dictionaries, e.g., [{'text': '...', 'box': [...]}]
-            result = client.predict(
-                img=handle_file(temp_filepath),
-                lang="en",
-                api_name="/predict",
-            )
-            return result
-
-        loop = asyncio.get_running_loop()
-        extracted_data = await loop.run_in_executor(None, do_ocr)
        if not extracted_data:
-            print("Warning: PaddleOCR returned no data.")
-            return []
        return extracted_data
-
-


# --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---

-
async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
    """
    Parses hOCR, translates all text in a single batch call to Gemini,
@@ -1520,17 +847,14 @@ async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str)
        elements_to_translate = soup.find_all(class_="ocr_line")

    original_texts = [el.get_text(strip=True) for el in elements_to_translate]

-    # Translate all texts in one go
    translated_texts = await translate_texts_with_gemini(original_texts, target_language)

-    # Inject translations back
    for i, element in enumerate(elements_to_translate):
-        if element.string:
-
-        if i < len(translated_texts):
-            element.string.replace_with(translated_texts[i])
-
    return str(soup)


@@ -1542,22 +866,20 @@ async def translate_paddle_data_with_gemini(
    using a single batch call to Gemini.
    """
    original_texts = [item.get("text", "") for item in paddle_data]

-    # Translate all texts in one go
    translated_texts = await translate_texts_with_gemini(original_texts, target_language)

    translated_data = []
    for i, item in enumerate(paddle_data):
-        # Ensure we don't go out of bounds if translation failed
        translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
        translated_data.append({"text": translated_text, "box": item.get("box")})
-
    return translated_data


# --- FINAL HTML GENERATION ---

-
async def generate_html_from_dual_ocr(
    translated_hocr_html: str, translated_paddle_data: list[dict]
) -> str:
@@ -1569,54 +891,43 @@ async def generate_html_from_dual_ocr(
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-

        prompt = f"""
-        You are
-        Your task is to MERGE them into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.

-        Input 1: Translated hOCR HTML
        --- HOCR START ---
        {translated_hocr_html}
        --- HOCR END ---

-        Input 2: Translated PaddleOCR data
        --- PADDLEOCR START ---
        {str(translated_paddle_data)}
        --- PADDLEOCR END ---

-        STRICT
-        1.
-
-           -
-
-
-
-           - Include <html>, <head>, <style>, and <body>.
-           - Include CSS in a <style> block so it renders exactly in an iframe.
-        4. Table structure requirement:
-           - Use <table>, <tbody>, <tr>, and <td> to organize words into rows and columns.
-           - Each PaddleOCR word must be placed in a separate <td> within the correct row based on vertical alignment.
-           - Apply CSS for borders, padding, and cell alignment to ensure readability.
-           - Use colspan/rowspan where necessary to match the original layout.
-        5. Positioning:
-           - Use bounding box data to size and place each cell proportionally.
-           - Avoid text overlap — if bounding boxes would overlap, adjust table cell spans or widths.
-        6. Before outputting:
-           - Validate internally that the HTML is valid.
-           - Confirm every PaddleOCR text appears in the table.
-           - Confirm the table renders correctly in an iframe.

        FINAL OUTPUT REQUIREMENT:
-        - Output ONLY the complete, valid
        """

        def do_request():
-            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
-

        return await asyncio.to_thread(do_request)
@@ -1626,9 +937,13 @@ async def generate_html_from_dual_ocr(
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"


-
async def translate_document_dual_ocr(
-    target_language: str = Form(...),
):
    """
    Processes a document using a dual OCR pipeline:
@@ -1644,17 +959,14 @@ async def translate_document_dual_ocr(
        )

    try:
-        await file.seek(0)
        image_bytes = await file.read()
        if not image_bytes:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # === STEP 1: Run both OCR extractions concurrently ===
-        print(
-            "***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******"
-        )
        hocr_task = get_hocr_from_image(image_bytes)
-        paddle_task = extract_text_and_boxes_with_paddle(image_bytes)
        hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)

        if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
@@ -1666,25 +978,16 @@ async def translate_document_dual_ocr(

        # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
        print("***** Step 2: Starting concurrent translation with Gemini ******")
-        translated_hocr_task = translate_hocr_html_with_gemini(
-
-        )
-        translated_paddle_task = translate_paddle_data_with_gemini(
-            paddle_data, target_language
-        )
        translated_hocr, translated_paddle = await asyncio.gather(
            translated_hocr_task, translated_paddle_task
        )
        print("***** Step 2 Done: Finished translation ******")

        # === STEP 3: Generate final HTML from both translated outputs ===
-        print(
-
-        )
-        final_html = await generate_html_from_dual_ocr(
-            translated_hocr, translated_paddle
-        )
-
        print("***** Step 3 Done: Generated final HTML ******")

        return HTMLResponse(content=final_html)
@@ -1697,4 +1000,5 @@ async def translate_document_dual_ocr(
            status_code=500,
            detail=f"An unexpected error occurred during processing: {str(e)}",
        )
-
@@ -12,6 +12,10 @@ import uuid
import tempfile
import io
import traceback
+import atexit
+import functools
+from queue import Queue
+from threading import Event, Thread

# beautifulsoup
from bs4 import BeautifulSoup
@@ -49,6 +53,12 @@ from auth.clerk import verify_clerk_jwt
# --- MODIFIED: Replaced old tool imports with the new one ---
from tools.tools import analyze_contract

+#numpy and paddleocr
+import numpy as np
+from paddleocr import PaddleOCR
+
+
+

app = FastAPI(
    title="Document Translator (Final Architecture)",
raise HTTPException(
|
123 |
status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
|
124 |
)
|
125 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
@app.post("/upload")
|
128 |
async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
|
|
|
@@ -828,11 +237,31 @@ async def get_user_documents(
    return documents


+# --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
+
+
+# testing clerk backend authentication
+# @app.post("/upload")
+# async def upload_file(
+#     authorization: str = Header(...),
+#     file: UploadFile = File(...)
+# ):
+#     if not authorization.startswith("Bearer "):
+#         raise HTTPException(status_code=401, detail="Missing Bearer token")

+#     token = authorization.split(" ")[1]
+#     claims = await verify_clerk_jwt(token)

+#     user_id = claims.get("sub")  # Clerk user ID

+#     # ✅ Now the Clerk user is verified
+#     # You can securely store this file, e.g., to Supabase or local
+#     return {"message": f"File uploaded by Clerk user {user_id}"}
+
+#------------------------ start of gemini workflow ---------------------------------
+
+# This helper function for calling the Sea-Lion API is now UNUSED in the pipeline,
+# but is kept here as requested.
async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
    """Helper function to call the translation API for a single piece of text."""
    if not text_to_translate.strip():
303 |
return f"Translation Parsing Error: {text_to_translate}"
|
304 |
|
305 |
|
306 |
+
# --- NEW GEMINI TRANSLATION FUNCTION ---

async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translates a list of texts using Gemini in a single batch API call.
    """
    if not texts:
        return []

    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")  # Using Flash for speed

        # Create a single prompt asking for a JSON array response
        prompt = f"""
        Translate each string in the following JSON array of strings to {target_language}.
        Return a single JSON array where each element is the translated string corresponding
        to the original at the same index. Your output MUST be only the JSON array and nothing else.

        Example Input:
        ["Hello world", "How are you?"]

        Example Output for target language 'Spanish':
        ["Hola mundo", "¿Cómo estás?"]

        Input for this task:
        {json.dumps(texts)}
        """

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            return response.text.strip()

        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        response_text = await asyncio.to_thread(do_request)

        # Clean the response to ensure it's valid JSON
        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
            # Fallback: return original texts if parsing fails
            return texts

        cleaned_json = json_response_match.group(0)
        translated_texts = json.loads(cleaned_json)

        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
            # Fallback in case of length mismatch
            return texts

        return translated_texts

    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
        # Return original texts as a fallback
        return texts
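Outside of FastAPI, the batch translator can be exercised directly. A minimal sketch, assuming GEMINI_API_KEY is set and this module's imports are available:

import asyncio

# Minimal sketch: calling the batch translator directly.
# Assumes GEMINI_API_KEY is set in the environment.
async def demo():
    texts = ["Employment Contract", "The notice period is one month."]
    translated = await translate_texts_with_gemini(texts, "Chinese")
    for original, result in zip(texts, translated):
        print(f"{original!r} -> {result!r}")

if __name__ == "__main__":
    asyncio.run(demo())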


# --- OCR EXTRACTION FUNCTIONS ---
# ...
        os.unlink(temp_filepath)


# --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---


async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
    """
    Parses hOCR, translates all text in a single batch call to Gemini,
    and injects translations back into the HTML.
    """
    soup = BeautifulSoup(hocr_html, "html.parser")
    elements_to_translate = soup.find_all(class_="ocrx_word")
    # ...
    elements_to_translate = soup.find_all(class_="ocr_line")

    original_texts = [el.get_text(strip=True) for el in elements_to_translate]

    # Translate all texts in one go
    translated_texts = await translate_texts_with_gemini(original_texts, target_language)

    # Inject translations back
    for i, element in enumerate(elements_to_translate):
        if element.string:
            # Ensure we don't go out of bounds if translation failed
            if i < len(translated_texts):
                element.string.replace_with(translated_texts[i])

    return str(soup)
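To see the injection step in isolation, here is a hedged offline sketch that stands in an uppercasing "translator" for the Gemini call, using a made-up hOCR fragment:

from bs4 import BeautifulSoup

# Offline sketch of the hOCR injection pattern; the hOCR fragment and the
# uppercasing stand-in for translation are illustrative only.
hocr = (
    '<div class="ocr_page">'
    '<span class="ocr_line">'
    '<span class="ocrx_word">Hello</span> <span class="ocrx_word">world</span>'
    '</span></div>'
)

soup = BeautifulSoup(hocr, "html.parser")
words = soup.find_all(class_="ocrx_word")
fake_translations = [w.get_text(strip=True).upper() for w in words]

for i, el in enumerate(words):
    if el.string and i < len(fake_translations):
        el.string.replace_with(fake_translations[i])

print(soup)  # the word spans now contain "HELLO" and "WORLD"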


async def translate_paddle_data_with_gemini(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translates the 'text' field of each item in the paddle_data list
    using a single batch call to Gemini.
    """
    original_texts = [item.get("text", "") for item in paddle_data]

    # Translate all texts in one go
    translated_texts = await translate_texts_with_gemini(original_texts, target_language)

    translated_data = []
    for i, item in enumerate(paddle_data):
        # Ensure we don't go out of bounds if translation failed
        translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
        translated_data.append({"text": translated_text, "box": item.get("box")})

    return translated_data
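The paddle_data items this function consumes are plain dicts with a "text" string and a "box" of four [x, y] corner points. An illustrative shape (the coordinates are invented):

# Illustrative input/output shape only; the coordinates are invented.
paddle_data = [
    {"text": "Employment Contract", "box": [[10, 12], [220, 12], [220, 40], [10, 40]]},
    {"text": "Clause 1: Working Hours", "box": [[10, 60], [250, 60], [250, 84], [10, 84]]},
]
# translate_paddle_data_with_gemini(paddle_data, "Tamil") returns a list of the
# same length and order, with the same "box" values and only "text" replaced.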

# ...
        raise ValueError("GEMINI_API_KEY not found in environment variables.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")  # Using Flash for speed

        prompt = f"""
        You are provided with two different translated OCR outputs for the same document.
        # ...
        --- PADDLEOCR END ---

        STRICT RULES:
        1. You MUST output ONLY the FINAL RAW HTML code.
           - No ```html, no triple quotes, no markdown, no explanations.
           - Output must begin with <!DOCTYPE html> and end with </html>.
        2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
           - Every PaddleOCR text must appear exactly once in the correct order and location.
        3. The HTML must be fully self-contained:
           - Include <html>, <head>, <style>, and <body>.
    # ...
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"


@app.post("/api/translate_file_gemini", response_class=HTMLResponse)
async def translate_document_dual_ocr(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """
    Processes a document using a dual OCR pipeline:
    1. Tesseract and PaddleOCR extract text and coordinates concurrently.
    2. Gemini translates the text from both outputs concurrently using a batch method.
    3. Gemini uses both translated outputs to generate the final layout-aware HTML.
    """
    content_type = file.content_type
    # ...
                status_code=400,
                detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
            )
        print("***** Step 1 Done: Finished OCR extraction ******")

        # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
        print("***** Step 2: Starting concurrent translation with Gemini ******")
        translated_hocr_task = translate_hocr_html_with_gemini(
            hocr_html, target_language
        )
        translated_paddle_task = translate_paddle_data_with_gemini(
            paddle_data, target_language
        )
        translated_hocr, translated_paddle = await asyncio.gather(
            translated_hocr_task, translated_paddle_task
        )
        print("***** Step 2 Done: Finished translation ******")

        # === STEP 3: Generate final HTML from both translated outputs ===
        # ...
        final_html = await generate_html_from_dual_ocr(
            translated_hocr, translated_paddle
        )

        print("***** Step 3 Done: Generated final HTML ******")

        return HTMLResponse(content=final_html)
    # ...
            status_code=500,
            detail=f"An unexpected error occurred during processing: {str(e)}",
        )
#-------------------------- end of gemini workflow ----------------------------------


#-------------------------- start of updated gemini workflow ----------------------------------

# --- PADDLEOCR LOCAL MODEL MANAGER SETUP ---

LANG_CONFIG = {
    "ch": {"num_workers": 2},
    "en": {"num_workers": 2},
    "fr": {"num_workers": 1},
    "german": {"num_workers": 1},
    "korean": {"num_workers": 1},
    "japan": {"num_workers": 1},
}
CONCURRENCY_LIMIT = 8


class PaddleOCRModelManager(object):
    def __init__(self,
                 num_workers,
                 model_factory):
        super().__init__()
        self._model_factory = model_factory
        self._queue = Queue()
        self._workers = []
        self._model_initialized_event = Event()
        for _ in range(num_workers):
            worker = Thread(target=self._worker, daemon=True)  # Use daemon threads
            worker.start()
            self._model_initialized_event.wait()
            self._model_initialized_event.clear()
            self._workers.append(worker)

    def infer(self, *args, **kwargs):
        result_queue = Queue(maxsize=1)
        self._queue.put((args, kwargs, result_queue))
        success, payload = result_queue.get()
        if success:
            return payload
        else:
            raise payload

    def close(self):
        for _ in self._workers:
            self._queue.put(None)
        for worker in self._workers:
            worker.join()

    def _worker(self):
        print(f"Initializing PaddleOCR model in worker thread...")
        model = self._model_factory()
        self._model_initialized_event.set()
        print(f"PaddleOCR model initialized.")
        while True:
            item = self._queue.get()
            if item is None:
                break
            args, kwargs, result_queue = item
            try:
                result = model.ocr(*args, **kwargs)
                result_queue.put((True, result))
            except Exception as e:
                result_queue.put((False, e))
            finally:
                self._queue.task_done()


def create_model(lang):
    print(f"Creating PaddleOCR model for language: {lang}")
    return PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False)


model_managers = {}
for lang, config in LANG_CONFIG.items():
    print(f"Setting up model manager for language: {lang}")
    model_manager = PaddleOCRModelManager(config["num_workers"], functools.partial(create_model, lang=lang))
    model_managers[lang] = model_manager


def close_model_managers():
    print("Closing all PaddleOCR model managers...")
    for manager in model_managers.values():
        manager.close()


atexit.register(close_model_managers)
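The manager serializes requests to a pool of worker threads through a queue, so a blocking model can be shared safely across async request handlers. A minimal sketch of the same pattern with a dummy factory instead of PaddleOCR (purely illustrative, no OCR involved):

# Dummy stand-in for PaddleOCR to illustrate the queue/worker pattern only.
class EchoModel:
    def ocr(self, value, **kwargs):
        return f"processed: {value}"

echo_manager = PaddleOCRModelManager(num_workers=2, model_factory=EchoModel)
print(echo_manager.infer("hello"))   # -> "processed: hello"
print(echo_manager.infer("world"))   # -> "processed: world"
echo_manager.close()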

def local_inference(img_bytes: bytes, lang: str) -> list[dict]:
    """
    Performs OCR using the local PaddleOCRModelManager.
    Accepts image bytes and returns the structured output.
    """
    ocr_manager = model_managers.get(lang)
    if not ocr_manager:
        print(f"Warning: Language '{lang}' not configured. Falling back to 'en'.")
        ocr_manager = model_managers['en']

    # Convert image bytes to a numpy array that PaddleOCR can process
    image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
    img_array = np.array(image)

    # The result from paddleocr is a list of lists, e.g., [[box, [text, score]], ...]
    result = ocr_manager.infer(img_array, cls=True)

    # PaddleOCR can sometimes return a list of results (one per page/batch item)
    if result and isinstance(result, list) and isinstance(result[0], list):
        result = result[0]

    output = []
    if result:
        for line in result:
            box = line[0]  # Bounding box coordinates
            text = line[1][0]  # Extracted text
            output.append({
                "text": text,
                "box": box
            })
    return output
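A quick hedged check of the local OCR path from a script, assuming the PaddleOCR managers above have finished initializing and that a sample.png exists next to app.py (both assumptions):

# Quick local check of local_inference; "sample.png" is a hypothetical file.
with open("sample.png", "rb") as f:
    img_bytes = f.read()

for item in local_inference(img_bytes, lang="en"):
    print(item["text"], item["box"])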

# --- GEMINI TRANSLATION FUNCTION ---

async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translates a list of texts using Gemini in a single batch API call.
    """
    if not texts or all(not s.strip() for s in texts):
        return [""] * len(texts)

    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        if not genai:
            raise ImportError("google.generativeai library is not available.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        prompt = f"""
        Translate each string in the following JSON array of strings to {target_language}.
        Return a single JSON array where each element is the translated string corresponding
        # ...

            response = model.generate_content(prompt)
            return response.text.strip()

        response_text = await asyncio.to_thread(do_request)

        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
            return texts

        cleaned_json = json_response_match.group(0)
        # ...

        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
            return texts

        return translated_texts

    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
        return texts
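Unlike the earlier version, this one short-circuits when every input string is blank and returns placeholders of the same length, so no Gemini call (and no API key) is needed for that case. A small check:

import asyncio

# The all-blank guard returns early, so this runs without a GEMINI_API_KEY.
result = asyncio.run(translate_texts_with_gemini(["", "   "], "Tamil"))
print(result)  # -> ["", ""]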

# --- OCR EXTRACTION FUNCTIONS ---

async def get_hocr_from_image(image_bytes: bytes) -> str:
    """
    Performs OCR using Tesseract to get raw hOCR HTML output.
    """
    if not image_bytes:
        raise ValueError("Image bytes cannot be empty.")
    try:
        image = Image.open(io.BytesIO(image_bytes))
        hocr_bytes = await asyncio.to_thread(
            pytesseract.image_to_pdf_or_hocr, image, extension="hocr"
        )
        return hocr_bytes.decode("utf-8")
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Tesseract OCR failed. Error: {e}",
        )

async def extract_text_and_boxes_with_paddle(image_bytes: bytes, lang: str = "en") -> list[dict]:
    """
    Extracts text and their bounding boxes from an image using the local
    PaddleOCRModelManager, running it in a thread to keep the API async.
    """
    try:
        extracted_data = await asyncio.to_thread(local_inference, image_bytes, lang)
        if not extracted_data:
            print("Warning: Local PaddleOCR returned no data.")
        return extracted_data
    except Exception as e:
        print(f"An error occurred during local PaddleOCR processing: {e}")
        traceback.print_exc()
        # Return empty list on failure to avoid breaking the pipeline
        return []


# --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---

async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
    """
    Parses hOCR, translates all text in a single batch call to Gemini,
    # ...
    elements_to_translate = soup.find_all(class_="ocr_line")

    original_texts = [el.get_text(strip=True) for el in elements_to_translate]
    if not original_texts:
        return str(soup)

    translated_texts = await translate_texts_with_gemini(original_texts, target_language)

    for i, element in enumerate(elements_to_translate):
        if element.string and i < len(translated_texts):
            element.string.replace_with(translated_texts[i])

    return str(soup)
# ...
    using a single batch call to Gemini.
    """
    original_texts = [item.get("text", "") for item in paddle_data]
    if not original_texts:
        return []

    translated_texts = await translate_texts_with_gemini(original_texts, target_language)

    translated_data = []
    for i, item in enumerate(paddle_data):
        translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
        translated_data.append({"text": translated_text, "box": item.get("box")})

    return translated_data

# --- FINAL HTML GENERATION ---

async def generate_html_from_dual_ocr(
    translated_hocr_html: str, translated_paddle_data: list[dict]
) -> str:
    # ...
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        if not genai:
            raise ImportError("google.generativeai library is not available.")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-1.5-flash")

        prompt = f"""
        You are an expert web developer. Your task is to merge two different translated OCR outputs for the same document into a single, clean, and well-styled HTML document that can be rendered directly in an iframe.

        Input 1: Translated hOCR HTML. This provides a basic structural layout.
        --- HOCR START ---
        {translated_hocr_html}
        --- HOCR END ---

        Input 2: Translated PaddleOCR data. This is a precise list of words and their bounding boxes.
        --- PADDLEOCR START ---
        {str(translated_paddle_data)}
        --- PADDLEOCR END ---

        STRICT INSTRUCTIONS:
        1. **Output Raw HTML Only**: Your entire output must be only the final HTML code. It must start with `<!DOCTYPE html>` and end with `</html>`. Do NOT include markdown fences like ```html or any explanations.
        2. **Prioritize PaddleOCR Data**: ALL text from the PaddleOCR input MUST be included in the final HTML. Use the hOCR as a structural guide, but the PaddleOCR data is the source of truth for the content and positioning.
        3. **Self-Contained HTML**: The HTML must be fully self-contained with embedded CSS in a `<style>` block within the `<head>`.
        4. **Layout Reconstruction**: Use absolute positioning (`position: absolute;`) for `<span>` or `<div>` elements containing the text. Use the bounding box coordinates from the PaddleOCR data to set the `top`, `left`, `width`, and `height` CSS properties for each element to reconstruct the original document layout precisely.
        5. **Coordinate System**: The bounding box format is [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]. You can approximate the position using `left: x1`, `top: y1`, `width: x2 - x1`, and `height: y3 - y1`.
        6. **Validation**: Before outputting, mentally confirm that every single text element from the PaddleOCR data is present in the final HTML and positioned correctly.

        FINAL OUTPUT REQUIREMENT:
        - Output ONLY the complete, valid, and self-contained HTML.
        """

        def do_request():
            response = model.generate_content(prompt)
            # Clean up potential markdown fences
            clean_text = re.sub(r'^```html\s*', '', response.text.strip(), flags=re.IGNORECASE)
            clean_text = re.sub(r'\s*```$', '', clean_text)
            return clean_text

        return await asyncio.to_thread(do_request)
    # ...
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
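Rule 5 of the prompt maps a PaddleOCR quadrilateral onto CSS absolute positioning. The arithmetic it asks the model to perform looks like this (sample coordinates invented):

# Sample PaddleOCR-style box: four [x, y] corners, clockwise from top-left.
box = [[40, 100], [260, 100], [260, 132], [40, 132]]

left = box[0][0]                 # x1
top = box[0][1]                  # y1
width = box[1][0] - box[0][0]    # x2 - x1
height = box[2][1] - box[0][1]   # y3 - y1

style = f"position: absolute; left: {left}px; top: {top}px; width: {width}px; height: {height}px;"
print(f'<span style="{style}">translated text</span>')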


# --- FASTAPI ENDPOINT ---

@app.post("/api/translate_file_gemini_local", response_class=HTMLResponse)
async def translate_document_dual_ocr(
    target_language: str = Form(...),
    source_language: str = Form("en"),  # Add source language for OCR
    file: UploadFile = File(...)
):
    """
    Processes a document using a dual OCR pipeline:
    # ...
        )

    try:
        image_bytes = await file.read()
        if not image_bytes:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # === STEP 1: Run both OCR extractions concurrently ===
        print("***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******")
        hocr_task = get_hocr_from_image(image_bytes)
        paddle_task = extract_text_and_boxes_with_paddle(image_bytes, lang=source_language)
        hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)

        if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
            # ...

        # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
        print("***** Step 2: Starting concurrent translation with Gemini ******")
        translated_hocr_task = translate_hocr_html_with_gemini(hocr_html, target_language)
        translated_paddle_task = translate_paddle_data_with_gemini(paddle_data, target_language)
        translated_hocr, translated_paddle = await asyncio.gather(
            translated_hocr_task, translated_paddle_task
        )
        print("***** Step 2 Done: Finished translation ******")

        # === STEP 3: Generate final HTML from both translated outputs ===
        print("***** Step 3: Generating final HTML from dual OCR data via Gemini ******")
        final_html = await generate_html_from_dual_ocr(translated_hocr, translated_paddle)
        print("***** Step 3 Done: Generated final HTML ******")

        return HTMLResponse(content=final_html)
    # ...
            status_code=500,
            detail=f"An unexpected error occurred during processing: {str(e)}",
        )

#-------------------------- end of updated gemini workflow ----------------------------------
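Compared with /api/translate_file_gemini, this route adds a source_language form field that picks the PaddleOCR model. A hedged client sketch, assuming the app is served locally on port 8000 and notice.jpg is an image in the working directory:

import httpx

# Hypothetical client call; host, port, and file name are assumptions.
with open("notice.jpg", "rb") as f:
    resp = httpx.post(
        "http://localhost:8000/api/translate_file_gemini_local",
        data={"target_language": "English", "source_language": "ch"},
        files={"file": ("notice.jpg", f, "image/jpeg")},
        timeout=300.0,
    )

with open("translated.html", "w", encoding="utf-8") as out:
    out.write(resp.text)  # the response body is the generated HTML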
requirements.txt
CHANGED
@@ -95,4 +95,6 @@ watchfiles==1.1.0
 websockets==15.0.1
 langextract
 gradio_client
-pytesseract
+pytesseract
+paddlepaddle
+paddleocr==2.10.0