Spaces:

kevansoon
/

backend

Sleeping

App Files Files Community

KevanSoon committed 25 days ago

Commit

c156dc2

1 Parent(s): 4df3d15

major app.py changes

Browse files

Files changed (2) hide show

app.py +284 -980
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -12,6 +12,10 @@ import uuid
 import tempfile
 import io
 import traceback
 # beautifulsoup
 from bs4 import BeautifulSoup
@@ -49,6 +53,12 @@ from auth.clerk import verify_clerk_jwt
 # --- MODIFIED: Replaced old tool imports with the new one ---
 from tools.tools import analyze_contract
 app = FastAPI(
     title="Document Translator (Final Architecture)",
@@ -112,608 +122,7 @@ async def analyze_contract_endpoint(file: UploadFile = File(...)):
         raise HTTPException(
             status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
         )
-# --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
def wrap_words_with_spans(html: str) -> str:
    """Wrap every word inside <p>, <h1>, <h2> and <td> elements in a clickable span.

    Each wrapped word becomes
    ``<span data-clickable="true" data-id="word-N">word</span>``, where ``N``
    counts up from 1 within a single call.

    Only text is wrapped: nested markup (tags and their attributes) and
    character references such as ``&amp;`` pass through untouched, and a word
    that an earlier pass already wrapped (for example inside a ``<p>`` nested
    in a ``<td>``) is not wrapped a second time.

    Args:
        html: HTML markup to process.

    Returns:
        The markup with the words of the target elements wrapped in spans.
    """
    # A word with optional trailing punctuation.
    word_pattern = re.compile(r"\b\w+[.,?!]?\b")
    # Splits a fragment into text and "opaque" pieces (tags and character
    # references). Because of the capture group, re.split keeps the opaque
    # pieces at the odd indices of the result.
    opaque_pattern = re.compile(r"(<[^>]+>|&#?\w+;)")
    wrapped_open = '<span data-clickable="true"'
    counter = 0

    def wrap_word(match):
        nonlocal counter
        counter += 1
        return (
            f'<span data-clickable="true" data-id="word-{counter}">'
            f"{match.group(0)}</span>"
        )

    def wrap_text_nodes(fragment: str) -> str:
        pieces = opaque_pattern.split(fragment)
        inside_wrapped = False
        for index, piece in enumerate(pieces):
            if index % 2:
                # Opaque piece: a tag decides whether the text that follows
                # sits directly inside a span produced by an earlier pass.
                if piece.startswith("<"):
                    inside_wrapped = piece.startswith(wrapped_open)
                continue
            if not inside_wrapped:
                pieces[index] = word_pattern.sub(wrap_word, piece)
        return "".join(pieces)

    def wrap_element(match):
        open_tag, inner, close_tag = match.groups()
        return open_tag + wrap_text_nodes(inner) + close_tag

    for tag in ("p", "h1", "h2", "td"):
        # (?:\s[^>]*)? stops "<p" from also matching "<pre>", "<param>", ...
        element = re.compile(rf"(<{tag}(?:\s[^>]*)?>)(.*?)(</{tag}>)", re.DOTALL)
        html = element.sub(wrap_element, html)
    return html
def inject_dropdown_script(
    html: str,
    *,
    api_url: str = "http://localhost:8080/api/translate_frontend",
) -> str:
    """Append the per-word translation dropdown script to an HTML document.

    The script attaches a click handler to every
    ``span[data-clickable="true"]``. Clicking a word opens a language
    ``<select>``; choosing a language POSTs ``{text, target_language}`` to
    ``api_url`` and replaces the word with the ``translated_text`` field of
    the JSON reply.

    Args:
        html: The HTML document to extend.
        api_url: URL the browser calls to translate a word. The default keeps
            the previously hard-coded local development address; pass the
            deployed backend URL when the page is served elsewhere.

    Returns:
        ``html`` with the script inserted immediately before the last
        ``</body>`` tag (matched case-insensitively), or appended at the end
        when the document has no ``</body>`` tag.
    """
    script_template = """
<script>
window.addEventListener('DOMContentLoaded', () => {
  function createDropdown(x, y, wordEl, word) {
    // Remove any existing dropdown
    const oldDropdown = document.getElementById('translation-dropdown');
    if (oldDropdown) oldDropdown.remove();
    // Create dropdown select element
    const dropdown = document.createElement('select');
    dropdown.id = 'translation-dropdown';
    dropdown.style.position = 'absolute';
    dropdown.style.left = x + 'px';
    dropdown.style.top = y + 'px';
    dropdown.style.zIndex = 9999;
    // Languages options
    const languages = ['English', 'Chinese', 'Tamil', 'Hindi'];
    languages.forEach(lang => {
      const option = document.createElement('option');
      option.value = lang.toLowerCase();
      option.innerText = lang;
      dropdown.appendChild(option);
    });
    // Placeholder option
    const defaultOption = document.createElement('option');
    defaultOption.value = '';
    defaultOption.innerText = 'Select language';
    defaultOption.selected = true;
    defaultOption.disabled = true;
    dropdown.insertBefore(defaultOption, dropdown.firstChild);
    document.body.appendChild(dropdown);
    dropdown.focus();
    dropdown.addEventListener('change', () => {
      const selectedLang = dropdown.value;
      if (!selectedLang) return;
      // Call backend to translate word
      fetch(__TRANSLATE_API_URL__, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text: word, target_language: selectedLang }),
      })
      .then(res => {
        if (!res.ok) throw new Error('Translation API error');
        return res.json();
      })
      .then(data => {
        const translated = data.translated_text || word;
        wordEl.innerText = translated;
        // Add or update language label
        let label = wordEl.nextSibling;
        if (!label || !label.classList || !label.classList.contains('language-label')) {
          label = document.createElement('span');
          label.className = 'language-label';
          label.style.marginLeft = '6px';
          label.style.fontSize = '0.8em';
          label.style.color = '#555';
          wordEl.after(label);
        }
        label.textContent = `(${dropdown.options[dropdown.selectedIndex].text})`;
      })
      .catch(err => {
        console.error('Translation error:', err);
        alert('Translation failed, please try again.');
      });
      dropdown.remove();
    });
    // Clicking outside closes dropdown
    document.addEventListener('click', function onDocClick(e) {
      if (!dropdown.contains(e.target)) {
        dropdown.remove();
        document.removeEventListener('click', onDocClick);
      }
    });
  }
  // Add click handlers to all words wrapped in spans with data-clickable="true"
  document.querySelectorAll('span[data-clickable="true"]').forEach(el => {
    el.style.cursor = 'pointer';
    el.addEventListener('click', event => {
      event.stopPropagation();
      const word = el.innerText;
      const rect = el.getBoundingClientRect();
      const x = rect.left + window.scrollX;
      const y = rect.bottom + window.scrollY;
      createDropdown(x, y, el, word);
    });
  });
});
</script>
"""
    # json.dumps yields a correctly quoted JavaScript string literal; "</" is
    # escaped so the URL can never terminate the surrounding <script> element.
    url_literal = json.dumps(api_url).replace("</", "<\\/")
    script = script_template.replace("__TRANSLATE_API_URL__", url_literal)

    # Insert once, before the LAST closing body tag, whatever its letter case.
    closing_body = None
    for closing_body in re.finditer(r"</body\s*>", html, re.IGNORECASE):
        pass
    if closing_body is None:
        return html + script
    cut = closing_body.start()
    return html[:cut] + script + "\n" + html[cut:]
@app.post("/api/translate_frontend")
async def translate_text(request: Request):
    """Translate a short piece of text with the SEA-LION chat completions API.

    Expects a JSON request body with ``text`` and ``target_language`` and
    returns ``{"translated_text": ...}``.

    Raises:
        HTTPException: 400 when ``text`` or ``target_language`` is missing,
            502 when the request to the translation API fails (including a
            timeout or an error status), and 500 for an empty model reply or
            any other unexpected error.
    """
    try:
        data = await request.json()
        text = data.get("text")
        target_language = data.get("target_language")
        if not text or not target_language:
            raise HTTPException(
                status_code=400,
                detail="Missing 'text' or 'target_language' in request body",
            )
        url = "https://api.sea-lion.ai/v1/chat/completions"
        api_key = os.getenv("SEALION_API_KEY")
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        prompt = (
            f"Please translate the following text to {target_language} and return "
            "ONLY the translated text without any explanations or extra formatting:\n\n"
            f'"{text}"'
        )
        payload = {
            "max_completion_tokens": 1024,
            "messages": [{"role": "user", "content": prompt}],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
        }

        def do_request():
            # requests is synchronous; the timeout keeps a stalled upstream
            # from holding the worker thread forever.
            return requests.post(
                url, headers=headers, data=json.dumps(payload), timeout=30.0
            )

        # Run the blocking call in a worker thread so the event loop stays free.
        response = await asyncio.to_thread(do_request)
        response.raise_for_status()
        response_json = response.json()
        translated_text = response_json["choices"][0]["message"]["content"].strip()
        if not translated_text:
            raise HTTPException(
                status_code=500, detail="Empty response from translation model."
            )
        return {"translated_text": translated_text}
    except HTTPException:
        # Deliberate HTTP errors raised above (e.g. the 400) must reach the
        # client unchanged instead of being re-wrapped as a 500 below.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502, detail=f"Translation API request failed: {e}"
        ) from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Unexpected error: {e}") from e
-# --- Model 2: Sea-Lion (The JSON Translator) ---
@app.post("/api/translate")
async def translate_text(text: str, target_language: str):
    """Translate ``text`` into ``target_language`` using the SEA-LION model.

    Args:
        text: The text to translate.
        target_language: The language to translate into.

    Returns:
        ``{"translated_text": <raw response body>}``.

    Raises:
        HTTPException: 502 when the request to the translation API fails
            (including a timeout or an error status), 500 when the reply is
            empty or another unexpected error occurs.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    headers = {
        "accept": "text/plain",
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    # The target language follows "to"; the text to translate goes in quotes.
    prompt = f'Translate the following text to {target_language}: "{text}"'
    data = {
        "max_completion_tokens": 4096,  # Room for longer translations
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
    }

    def do_request():
        # requests is synchronous; the timeout bounds a stalled upstream call.
        return requests.post(
            url, headers=headers, data=json.dumps(data), timeout=120.0
        )

    try:
        # Run the blocking call in a worker thread so the event loop stays free.
        response = await asyncio.to_thread(do_request)
        response.raise_for_status()  # Raises for 4xx / 5xx responses
        # NOTE(review): the body is returned verbatim on the assumption that
        # this API answers with plain text; the other SEA-LION callers in this
        # file parse a JSON body instead — confirm which one is right.
        translated_text = response.text
        if not translated_text:
            raise HTTPException(
                status_code=500,
                detail="Received an empty response from the translation model.",
            )
        return {"translated_text": translated_text}
    except HTTPException:
        # Keep the status/detail chosen above instead of re-wrapping it below.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail=f"Failed to communicate with the translation AI model: {e}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during translation: {e}",
        ) from e
-# --- Model 3: Gemini (The HTML Generator) ---
async def generate_html_from_translated_json(translated_json: dict) -> str:
    """Ask Gemini to turn an already-translated JSON object into interactive HTML.

    The model reply is unwrapped from its ```html fence when one is present,
    every word is wrapped in a clickable span and the dropdown script is
    injected.

    Args:
        translated_json: JSON-serialisable object embedded in the prompt.

    Returns:
        The final HTML document. On failure (missing API key, quota
        exhaustion, any other error) a small HTML error page is returned
        instead of raising.
    """
    try:
        gemini_key = os.getenv("GEMINI_API_KEY")
        if not gemini_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        genai.configure(api_key=gemini_key)
        model = genai.GenerativeModel(model_name="gemini-2.0-flash")
        json_string_for_prompt = json.dumps(translated_json, indent=2)
        prompt = f"""
        You are an expert system that converts a JSON object containing PRE-TRANSLATED text into a clean, semantic HTML document.
        **Your Task:**
        1.  Analyze the following JSON object. Its text content has already been translated.
        2.  The core document data is located at the path: `choices[0]['message']['tool_calls'][0]['function']['arguments']`.
        3.  The value of 'arguments' is a JSON STRING. You must parse this inner string to access the list of document chunks.
        4.  Using the translated data from the 'text' fields, generate a single, complete HTML5 document. Use appropriate tags like <h1>, <h2>, <p>, and <table>.
        5.  if json contains "tabular" means mmake a table for that with some grey border and styling
        6.  Your final output must ONLY be the raw HTML code. Do not add comments or markdown.
        **Translated JSON object to process:**
        ```json
        {json_string_for_prompt}
        ```
        """

        def render_interactive_html() -> str:
            """Blocking Gemini call plus post-processing; runs in a worker thread."""
            reply = model.generate_content(prompt)
            fenced = re.search(r"```html\n(.*?)\n```", reply.text, re.DOTALL)
            if fenced:
                raw_html = fenced.group(1).strip()
            else:
                raw_html = reply.text.strip()
            # Make each word clickable, then add the script that handles clicks.
            return inject_dropdown_script(wrap_words_with_spans(raw_html))

        return await asyncio.to_thread(render_interactive_html)
    except google_exceptions.ResourceExhausted as e:
        quota_message = "The request to the document processor (Gemini) was rejected due to API quota limits. Please wait or upgrade your API plan."
        return f"<html><body><h1>API Quota Error</h1><p>{html.escape(quota_message)}</p></body></html>"
    except Exception as e:
        failure_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(failure_message)}</p></body></html>"
-# --- API Endpoint Orchestrating the Pipeline ---
@app.post("/api/translate_file", response_class=HTMLResponse)
async def translate_document_to_raw_html(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """Translate an uploaded PDF/PNG/JPEG and return the result as HTML.

    Pipeline:
    1. The file is sent base64-encoded to the ``nvidia/nemoretriever-parse``
       model, which answers with JSON.
    2. That JSON is passed to ``translate_text`` together with the target
       language.
    3. ``generate_html_from_translated_json`` produces the final HTML.

    Raises:
        HTTPException: 400 for an unsupported content type, 502 when the
            parser request fails, the status raised by a downstream step when
            that step raises an ``HTTPException``, and 500 for any other
            error.
    """
    content_type = file.content_type
    if content_type not in ["application/pdf", "image/png", "image/jpeg"]:
        raise HTTPException(status_code=400, detail="Unsupported file type.")
    try:
        # === STEP 1: Get raw JSON from Nemo (The Parser) ===
        file_content = await file.read()
        file_b64 = base64.b64encode(file_content).decode("utf-8")
        nemo_data = {
            "model": "nvidia/nemoretriever-parse",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{content_type};base64,{file_b64}"
                            },
                        }
                    ],
                }
            ],
            "max_tokens": 2048,
        }
        headers = {"accept": "application/json", "Content-Type": "application/json"}

        def call_parser():
            # requests is synchronous; the timeout bounds a stalled parser.
            return requests.post(
                "http://localhost:8000/v1/chat/completions",
                headers=headers,
                data=json.dumps(nemo_data),
                timeout=120.0,
            )

        # Run the blocking call in a worker thread so the event loop stays free.
        model_response = await asyncio.to_thread(call_parser)
        model_response.raise_for_status()
        nemo_response_json = model_response.json()
        # Only progress markers are printed; the document content itself is
        # user data and is kept out of the logs.
        print("*********** Step 1 Done ***********")
        print("*********** Step 2 in Progress ***********")
        # === STEP 2: Get translated JSON from Sea-Lion (The Translator) ===
        # translate_text expects a string, so hand it real JSON text rather
        # than letting a dict be formatted with its Python repr.
        translated_json = await translate_text(
            json.dumps(nemo_response_json, ensure_ascii=False), target_language
        )
        print("*********** Step 2 Done ***********")
        print("*********** Step 3 in Progress ***********")
        # === STEP 3: Generate final HTML from Gemini (The HTML Generator) ===
        final_html = await generate_html_from_translated_json(translated_json)
        print("*********** Step 3 Done ***********")
        # === STEP 4: Return the result to the frontend ===
        # The generator reports its own failures as an HTML error page, so
        # whatever it returned is passed straight through.
        return HTMLResponse(content=final_html)
    except HTTPException:
        # Preserve the status code chosen by a downstream step (e.g. a 502
        # from the translator) instead of collapsing it into a 500 below.
        raise
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail=f"Failed to communicate with a downstream AI model: {e}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during processing: {e}",
        ) from e
-# <<< --- START OF MVP PIPELINE ADDITIONS (Layout-Aware Version) --- >>>
async def extract_text_and_boxes_with_paddle(file_content: bytes) -> list[dict]:
    """Run the ``kevansoon/PaddleOCR`` Gradio app on an image.

    The bytes are written to a temporary ``.png`` file, which is handed to the
    remote ``/predict`` endpoint and deleted afterwards, whether or not the
    call succeeds.

    Args:
        file_content: Raw image bytes.

    Returns:
        Whatever the OCR endpoint returns, unmodified.
    """
    # delete=False: the file has to outlive this "with" so the OCR client can
    # read it by path; it is removed in the "finally" below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as image_file:
        image_file.write(file_content)
        temp_filepath = image_file.name

    def run_paddle_ocr() -> list[dict]:
        """Blocking client call; executed in the default thread pool."""
        ocr_client = Client("kevansoon/PaddleOCR")
        return ocr_client.predict(
            img=handle_file(temp_filepath),
            lang="en",
            api_name="/predict",
        )

    try:
        event_loop = asyncio.get_running_loop()
        return await event_loop.run_in_executor(None, run_paddle_ocr)
    finally:
        os.unlink(temp_filepath)
async def translate_paddle_data_concurrently(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """Translate the ``text`` of every OCR item concurrently.

    Args:
        paddle_data: Items that each carry a ``text`` and a ``box`` key.
        target_language: Language to translate every ``text`` into.

    Returns:
        A new list, in the same order, of ``{"text": <translation>,
        "box": <original box>}`` dictionaries.

    Raises:
        httpx.HTTPStatusError: When the translation API answers any request
            with an error status.
    """
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    async def translate_one(client, text_to_translate: str) -> str:
        """Translate a single phrase over the shared client."""
        prompt = f'Translate the following phrase to {target_language} and return ONLY the translated text without explanations or extra formatting:\n\n"{text_to_translate}"'
        payload = {
            "max_completion_tokens": 256,  # Tokens for a single phrase, not a whole doc
            "messages": [{"role": "user", "content": prompt}],
            "model": "aisingapore/Gemma-SEA-LION-v3-9B-IT",
        }
        response = await client.post(url, headers=headers, json=payload, timeout=30.0)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()

    # One client for the whole batch: its connection pool is reused by every
    # request instead of opening a fresh client (and connection) per OCR item.
    async with httpx.AsyncClient() as client:
        translated_texts = await asyncio.gather(
            *(translate_one(client, item["text"]) for item in paddle_data)
        )

    # Pair each translation with the bounding box of the item it came from.
    return [
        {"text": translated, "box": item["box"]}
        for item, translated in zip(paddle_data, translated_texts)
    ]
async def generate_html_from_paddle_data(translated_data: list[dict]) -> str:
    """Ask Gemini to rebuild a layout-aware HTML page from translated OCR items.

    The reply is unwrapped from its ```html fence when one is present, every
    word is wrapped in a clickable span and the dropdown script is injected.

    Args:
        translated_data: OCR items embedded in the prompt as JSON.

    Returns:
        The final HTML document, or a small HTML error page when anything
        fails (including a missing ``GEMINI_API_KEY``).
    """
    try:
        gemini_key = os.getenv("GEMINI_API_KEY")
        if not gemini_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        genai.configure(api_key=gemini_key)
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")
        # ensure_ascii=False keeps non-ASCII characters readable in the
        # prompt instead of turning them into \uXXXX escapes.
        json_data_for_prompt = json.dumps(
            translated_data, indent=2, ensure_ascii=False
        )
        prompt = f"""
        You are an expert system specializing in converting structured OCR data into a well-formatted HTML document that preserves the original layout.
        **Your Task:**
        1.  Analyze the following JSON array. Each object contains a `text` field (pre-translated) and a `box` field (four [x, y] coordinates of its bounding box).
        2.  Use the `box` coordinates to understand the document's spatial structure.
            -   Elements with similar y-coordinates are likely on the same row.
            -   Elements aligned vertically form columns.
        3.  Reconstruct the visual layout using semantic HTML.
            -   Use `<table>` for grid-like data (rows and columns). This is critical for payslips.
            -   Use `<h1>`, `<h2>`, `<p>` for headings and paragraphs.
            -   Do NOT use absolute positioning (e.g., `style="position: absolute; left: ..."`). Create a clean, flowing HTML structure.
        4.  Your final output must ONLY be the raw HTML code. Do not add comments, markdown backticks, or any other explanatory text.
        **OCR Data to process:**
        ```json
        {json_data_for_prompt}
        ```
        """

        def render_interactive_html() -> str:
            """Blocking Gemini call plus post-processing; runs in a worker thread."""
            reply = model.generate_content(prompt)
            fenced = re.search(r"```html\n(.*?)\n```", reply.text, re.DOTALL)
            if fenced:
                raw_html = fenced.group(1).strip()
            else:
                raw_html = reply.text.strip()
            # Make each word clickable, then add the script that handles clicks.
            return inject_dropdown_script(wrap_words_with_spans(raw_html))

        return await asyncio.to_thread(render_interactive_html)
    except Exception as e:
        failure_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(failure_message)}</p></body></html>"
@app.post("/api/translate_file_mvp", response_class=HTMLResponse)
async def translate_document_mvp(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """Translate an uploaded PNG/JPEG with the layout-aware MVP pipeline.

    Pipeline:
    1. PaddleOCR extracts text and coordinates.
    2. Each text block is translated concurrently.
    3. Gemini turns the translated text and original coordinates into HTML.

    Raises:
        HTTPException: 400 for an unsupported content type or when OCR finds
            no text, the upstream status code when a translation request
            fails with an HTTP error status, and 500 for any other error.
    """
    content_type = file.content_type
    if content_type not in ["image/png", "image/jpeg"]:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type for MVP pipeline. Please use PNG or JPG.",
        )
    try:
        file_content = await file.read()
        # === MVP STEP 1: Extract text and coordinates with PaddleOCR ===
        paddle_data = await extract_text_and_boxes_with_paddle(file_content)
        if not paddle_data:
            raise HTTPException(
                status_code=400,
                detail="PaddleOCR could not extract any text from the image.",
            )
        print("***** Step 1 Done ******")
        # === MVP STEP 2: Translate each text block concurrently ===
        translated_data = await translate_paddle_data_concurrently(
            paddle_data, target_language
        )
        print("***** Step 2 Done ******")
        # === MVP STEP 3: Generate final, layout-aware HTML from Gemini ===
        final_html = await generate_html_from_paddle_data(translated_data)
        print("***** Step 3 Done ******")
        return HTMLResponse(content=final_html)
    except HTTPException:
        # Let deliberate HTTP errors (such as the 400 above) through with
        # their own status instead of re-wrapping them as a 500 below.
        raise
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=e.response.status_code,
            detail=f"Error from a downstream AI service: {e.response.text}",
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during MVP processing: {str(e)}",
        ) from e
-# <<< --- END OF MVP PIPELINE ADDITIONS (Layout-Aware Version) --- >>>
-# testing clerk backend authentication
-# @app.post("/upload")
-# async def upload_file(
-#     authorization: str = Header(...),
-#     file: UploadFile = File(...)
-# ):
-#     if not authorization.startswith("Bearer "):
-#         raise HTTPException(status_code=401, detail="Missing Bearer token")
-#     token = authorization.split(" ")[1]
-#     claims = await verify_clerk_jwt(token)
-#     user_id = claims.get("sub")  # Clerk user ID
-#     # ✅ Now the Clerk user is verified
-#     # You can securely store this file, e.g., to Supabase or local
-#     return {"message": f"File uploaded by Clerk user {user_id}"}
 @app.post("/upload")
 async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
@@ -828,11 +237,31 @@ async def get_user_documents(
     return documents
-# ----------------------------------Start OF PYTESSERACT workflow-----------------------------------
-# --- Helper Functions (Sealion, Gemini Configuration) ---
 async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
     """Helper function to call the translation API for a single piece of text."""
     if not text_to_translate.strip():
@@ -874,217 +303,68 @@ async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str
             return f"Translation Parsing Error: {text_to_translate}"
-# --- PIPELINE FUNCTIONS (Corrected and Verified) ---
async def get_hocr_from_image(file: UploadFile) -> str:
    """Run Tesseract on an uploaded image and return its hOCR output.

    Args:
        file: The uploaded image. It is read fully into memory and then closed.

    Returns:
        The hOCR document decoded as UTF-8.

    Raises:
        HTTPException: 400 when the upload is empty or Pillow cannot open it.
    """
    # Pillow cannot consume the UploadFile itself, so pull the bytes out first.
    image_bytes = await file.read()
    await file.close()
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        # Wrap the bytes in an in-memory stream that Pillow can read from.
        image = Image.open(io.BytesIO(image_bytes))
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Cannot open image. It may be corrupted or unsupported. Error: {e}",
        )

    def run_tesseract() -> bytes:
        """Blocking OCR call; executed in the default thread pool."""
        return pytesseract.image_to_pdf_or_hocr(image, extension="hocr")

    # Keep the event loop responsive while Tesseract works.
    event_loop = asyncio.get_running_loop()
    hocr_bytes = await event_loop.run_in_executor(None, run_tesseract)
    return hocr_bytes.decode("utf-8")
-async def translate_hocr_html_concurrently(hocr_html: str, target_language: str) -> str:
     """
-    Parses hOCR to find text, translates it concurrently, and injects the
-    translations back into the HTML structure.
     """
-    soup = BeautifulSoup(hocr_html, "html.parser")
-    elements_to_translate = soup.find_all(class_="ocrx_word")
-    if not elements_to_translate:
-        elements_to_translate = soup.find_all(class_="ocr_line")
-    original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-    # Translate all texts concurrently
-    translation_tasks = [
-        call_sealion_for_translation(text, target_language) for text in original_texts
-    ]
-    translated_texts = await asyncio.gather(*translation_tasks)
-    # Replace the text in the soup object with the translations
-    for i, element in enumerate(elements_to_translate):
-        if element.string:
-            element.string.replace_with(translated_texts[i])
-    return str(soup)
-async def generate_html_from_hocr(translated_hocr_html: str) -> str:
-    """
-    Receives translated hOCR HTML and uses Gemini to generate a final,
-    layout-aware HTML document.
-    """
     try:
         api_key = os.getenv("GEMINI_API_KEY")
         if not api_key:
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
         genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Updated model name
         prompt = f"""
-        Given the following hOCR HTML, which contains translated text and positional information, convert it into a clean, well-styled HTML document suitable for display in an iframe.
-        - Reconstruct the layout based on the bounding box (`bbox`) information in the element titles.
-        - The final output should be a single HTML file with embedded CSS.
-        - Ensure text does not overlap.
-        - Use tables or other appropriate structures if they help maintain the layout.
-        - Do not include any explanations, just the raw HTML code.
-        hOCR Input:
-        {translated_hocr_html}
         """
         def do_request():
-            """Synchronous function to be run in a separate thread."""
-            response = model.generate_content(prompt)
-            # Clean up the response to ensure it's just raw HTML
-            cleaned_html = response.text.strip()
-            if cleaned_html.startswith("```html"):
-                cleaned_html = cleaned_html[7:]
-            if cleaned_html.endswith("```"):
-                cleaned_html = cleaned_html[:-3]
-            return cleaned_html.strip()
-        # Run the synchronous Gemini API call in a thread pool executor
-        return await asyncio.to_thread(do_request)
-    except Exception as e:
-        error_message = f"An error occurred while generating the HTML structure with Gemini: {str(e)}"
-        traceback.print_exc()
-        return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-# --- API ENDPOINT ---
@app.post("/api/translate_file_pytesseract", response_class=HTMLResponse)
async def translate_document_with_hocr(
    target_language: str = Form(...), file: UploadFile = File(...)
):
    """Translate an uploaded image through the hOCR-based pipeline.

    Pipeline:
    1. Tesseract turns the image into hOCR (text plus coordinates).
    2. The text inside the hOCR structure is translated.
    3. Gemini converts the translated hOCR into the final HTML.

    Raises:
        HTTPException: 400 for an unsupported content type or when no hOCR
            page data is produced; an ``HTTPException`` raised by a pipeline
            step is passed through unchanged; 500 for any other error.
    """
    supported_types = ["image/png", "image/jpeg", "image/bmp", "image/tiff"]
    if file.content_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail="Unsupported file type. Please use PNG, JPG, BMP or TIFF.",
        )

    try:
        # === STEP 1: image -> hOCR ===
        hocr_html = await get_hocr_from_image(file)
        has_page = bool(hocr_html) and "ocr_page" in hocr_html
        if not has_page:
            raise HTTPException(
                status_code=400,
                detail="Tesseract could not extract any hOCR data from the image.",
            )
        print("***** Step 1 Done: Generated hOCR from image ******")

        # === STEP 2: translate the text in place ===
        translated_hocr = await translate_hocr_html_concurrently(
            hocr_html, target_language
        )
        print("***** Step 2 Done: Translated hOCR in-place ******")

        # === STEP 3: translated hOCR -> final HTML ===
        final_html = await generate_html_from_hocr(translated_hocr)
        print("***** Step 3 Done: Generated final HTML from translated hOCR ******")

        return HTMLResponse(content=final_html)
    except HTTPException:
        # Keep the status code and detail chosen where the error was raised.
        raise
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"An unexpected error occurred during processing: {str(e)}",
        )
-# ----------------------------------END OF PYTESSERACT workflow-----------------------------------
-# ----------------------------------Start OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
-# --- Helper Functions (Sealion, Gemini Configuration) ---
-# This helper function for calling the Sea-Lion API is used by both translation functions.
async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
    """Translate one piece of text with the SEA-LION chat completions API.

    This helper never raises for API problems, so that one failed item does
    not abort a batch of concurrent translations; it returns a marker string
    containing the original text instead.

    Args:
        text_to_translate: Text to translate.
        lang: Target language.

    Returns:
        The translation with one leading/trailing double quote removed;
        ``""`` for blank input; ``"<text> (Translation Skipped)"`` when
        ``SEALION_API_KEY`` is not set; ``"Translation Error: <text>"`` when
        the request fails or the API answers with an error status;
        ``"Translation Parsing Error: <text>"`` when the reply cannot be
        parsed.
    """
    if not text_to_translate.strip():
        return ""  # Don't send empty strings for translation
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        print("Warning: SEALION_API_KEY not set. Skipping translation.")
        return f"{text_to_translate} (Translation Skipped)"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    # Precise prompt for clean output
    prompt = f'Translate the following text to {lang}. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text_to_translate}"'
    payload = {
        "max_completion_tokens": 2048,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
    }
    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=45.0
            )
            response.raise_for_status()
            response_json = response.json()
            translated_text = response_json["choices"][0]["message"]["content"].strip()
            # Clean up potential extra quotes that the model might add
            return re.sub(r'^"|"$', "", translated_text)
        except httpx.HTTPError as e:
            # httpx.HTTPError covers transport failures (RequestError) AND the
            # HTTPStatusError raised by raise_for_status(), so a 4xx/5xx reply
            # degrades to a marker string like any other request failure.
            print(f"Translation request failed: {e}")
            return f"Translation Error: {text_to_translate}"
        except (KeyError, IndexError, TypeError, ValueError) as e:
            # ValueError: body is not valid JSON; the others: unexpected shape.
            print(f"Could not parse translation response: {e}")
            return f"Translation Parsing Error: {text_to_translate}"
 # --- OCR EXTRACTION FUNCTIONS ---
@@ -1145,12 +425,13 @@ async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
         os.unlink(temp_filepath)
-# --- TRANSLATION FUNCTIONS ---
-async def translate_hocr_html_concurrently(hocr_html: str, target_language: str) -> str:
     """
-    Parses hOCR, translates text concurrently, and injects translations back into the HTML.
     """
     soup = BeautifulSoup(hocr_html, "html.parser")
     elements_to_translate = soup.find_all(class_="ocrx_word")
@@ -1158,33 +439,37 @@ async def translate_hocr_html_concurrently(hocr_html: str, target_language: str)
         elements_to_translate = soup.find_all(class_="ocr_line")
     original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-    translation_tasks = [
-        call_sealion_for_translation(text, target_language) for text in original_texts
-    ]
-    translated_texts = await asyncio.gather(*translation_tasks)
     for i, element in enumerate(elements_to_translate):
         if element.string:
-            element.string.replace_with(translated_texts[i])
     return str(soup)
async def translate_paddle_data_concurrently(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translate the 'text' value of every PaddleOCR item concurrently.

    One translation coroutine is created per item and all of them are awaited
    together. The result is a new list of ``{"text", "box"}`` dicts in the
    same order as ``paddle_data``; a missing 'text' is treated as ``""`` and
    a missing 'box' becomes ``None``.
    """
    source_texts = [entry.get("text", "") for entry in paddle_data]
    pending = [
        call_sealion_for_translation(source, target_language)
        for source in source_texts
    ]
    results = await asyncio.gather(*pending)
    return [
        {"text": translated, "box": entry.get("box")}
        for entry, translated in zip(paddle_data, results)
    ]
@@ -1205,7 +490,7 @@ async def generate_html_from_dual_ocr(
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
         genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash")
         prompt = f"""
                 You are provided with two different translated OCR outputs for the same document.
@@ -1222,10 +507,10 @@ async def generate_html_from_dual_ocr(
                 --- PADDLEOCR END ---
                 STRICT RULES:
-                1. You MUST output ONLY the FINAL RAW HTML code.
-                - No ```html, no triple quotes, no markdown, no explanations.
                 - Output must begin with <!DOCTYPE html> and end with </html>.
-                2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
                 - Every PaddleOCR text must appear exactly once in the correct order and location.
                 3. The HTML must be fully self-contained:
                 - Include <html>, <head>, <style>, and <body>.
@@ -1260,14 +545,14 @@ async def generate_html_from_dual_ocr(
         return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-@app.post("/api/translate_file_dual_ocr", response_class=HTMLResponse)
 async def translate_document_dual_ocr(
     target_language: str = Form(...), file: UploadFile = File(...)
 ):
     """
     Processes a document using a dual OCR pipeline:
     1. Tesseract and PaddleOCR extract text and coordinates concurrently.
-    2. Sea-Lion translates the text from both outputs concurrently.
     3. Gemini uses both translated outputs to generate the final layout-aware HTML.
     """
     content_type = file.content_type
@@ -1296,22 +581,19 @@ async def translate_document_dual_ocr(
                 status_code=400,
                 detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
             )
-        print(paddle_task)
-        print(hocr_task)
         print("***** Step 1 Done: Finished OCR extraction ******")
-        # === STEP 2: Translate both OCR outputs concurrently ===
-        print("***** Step 2: Starting concurrent translation ******")
-        translated_hocr_task = translate_hocr_html_concurrently(
             hocr_html, target_language
         )
-        translated_paddle_task = translate_paddle_data_concurrently(
             paddle_data, target_language
         )
         translated_hocr, translated_paddle = await asyncio.gather(
             translated_hocr_task, translated_paddle_task
         )
-        print(translated_paddle_task)
         print("***** Step 2 Done: Finished translation ******")
         # === STEP 3: Generate final HTML from both translated outputs ===
@@ -1321,7 +603,7 @@ async def translate_document_dual_ocr(
         final_html = await generate_html_from_dual_ocr(
             translated_hocr, translated_paddle
         )
         print("***** Step 3 Done: Generated final HTML ******")
         return HTMLResponse(content=final_html)
@@ -1334,74 +616,146 @@ async def translate_document_dual_ocr(
             status_code=500,
             detail=f"An unexpected error occurred during processing: {str(e)}",
         )
-# ----------------------------------End OF PYTESSERACT + PADDLEOCR workflow-----------------------------------
-#------------------------ start of gemini workflow ---------------------------------
-# This helper function for calling the Sea-Lion API is now UNUSED in the pipeline,
-# but is kept here as requested.
async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
    """Translate one piece of text to ``lang`` via the Sea-Lion chat API.

    Transport failures, HTTP error statuses and malformed response bodies are
    logged and turned into a marker string that embeds the original text, so
    a single failed call does not raise into the caller.

    Args:
        text_to_translate: Source text. Blank or whitespace-only input
            returns ``""`` without contacting the API.
        lang: Target language name, interpolated into the prompt.

    Returns:
        The translated text with one leading and/or trailing double quote
        removed; ``"<text> (Translation Skipped)"`` when ``SEALION_API_KEY``
        is not set; ``"Translation Error: <text>"`` when the request fails or
        the server replies with an error status; or
        ``"Translation Parsing Error: <text>"`` when the response does not
        have the expected shape.
    """
    if not text_to_translate.strip():
        return ""  # Don't send empty strings for translation
    url = "https://api.sea-lion.ai/v1/chat/completions"
    api_key = os.getenv("SEALION_API_KEY")
    if not api_key:
        print("Warning: SEALION_API_KEY not set. Skipping translation.")
        return f"{text_to_translate} (Translation Skipped)"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    # Precise prompt for clean output
    prompt = f'Translate the following text to {lang}. Return ONLY the translated text, without any additional explanations, formatting, or quotation marks:\n\n"{text_to_translate}"'
    payload = {
        "max_completion_tokens": 2048,
        "messages": [{"role": "user", "content": prompt}],
        "model": "aisingapore/Llama-SEA-LION-v3-70B-IT",
    }
    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                url, headers=headers, json=payload, timeout=45.0
            )
            response.raise_for_status()
            response_json = response.json()
            translated_text = response_json["choices"][0]["message"]["content"].strip()
            # Clean up potential extra quotes that the model might add
            return re.sub(r'^"|"$', "", translated_text)
        except httpx.HTTPError as e:
            # HTTPError is the common base of RequestError (transport) and
            # HTTPStatusError (raised by raise_for_status on 4xx/5xx), so
            # error statuses get the same fallback as network failures.
            print(f"Translation request failed: {e}")
            return f"Translation Error: {text_to_translate}"
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # ValueError covers a non-JSON body; TypeError/AttributeError
            # cover an unexpected container type or a null "content" field.
            print(f"Could not parse translation response: {e}")
            return f"Translation Parsing Error: {text_to_translate}"
-# --- NEW GEMINI TRANSLATION FUNCTION ---
 async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
     """
     Translates a list of texts using Gemini in a single batch API call.
     """
-    if not texts:
-        return []
     try:
         api_key = os.getenv("GEMINI_API_KEY")
         if not api_key:
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
         genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
-        # Create a single prompt asking for a JSON array response
         prompt = f"""
         Translate each string in the following JSON array of strings to {target_language}.
         Return a single JSON array where each element is the translated string corresponding
@@ -1422,14 +776,10 @@ async def translate_texts_with_gemini(texts: list[str], target_language: str) ->
             response = model.generate_content(prompt)
             return response.text.strip()
-        # Run the synchronous SDK call in a thread to avoid blocking asyncio
         response_text = await asyncio.to_thread(do_request)
-        # Clean the response to ensure it's valid JSON
         json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
         if not json_response_match:
             print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
-            # Fallback: return original texts if parsing fails
             return texts
         cleaned_json = json_response_match.group(0)
@@ -1437,78 +787,55 @@ async def translate_texts_with_gemini(texts: list[str], target_language: str) ->
         if len(translated_texts) != len(texts):
             print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
-            # Fallback in case of length mismatch
             return texts
         return translated_texts
     except Exception as e:
         print(f"An error occurred during Gemini translation: {e}")
-        # Return original texts as a fallback
         return texts
-# --- OCR EXTRACTION FUNCTIONS ---
 async def get_hocr_from_image(image_bytes: bytes) -> str:
     """
     Performs OCR using Tesseract to get raw hOCR HTML output.

     Args:
         image_bytes: Encoded image data.

     Returns:
         The hOCR document produced by pytesseract, decoded as UTF-8.

     Raises:
         ValueError: If ``image_bytes`` is empty.
         HTTPException: With status 400 if the bytes cannot be opened as an
             image; the underlying error is chained as the cause.
     """
     if not image_bytes:
         raise ValueError("Image bytes cannot be empty.")
     try:
         image = Image.open(io.BytesIO(image_bytes))
     except Exception as e:
         raise HTTPException(
             status_code=400,
             detail=f"Cannot open image for Tesseract. It may be corrupted or unsupported. Error: {e}",
         ) from e
     # Run Tesseract OCR in a thread to avoid blocking the asyncio event loop.
     # The image is used as a context manager so it is closed once OCR is
     # done, even if pytesseract raises.
     with image:
         loop = asyncio.get_running_loop()
         hocr_bytes = await loop.run_in_executor(
             None, lambda: pytesseract.image_to_pdf_or_hocr(image, extension="hocr")
         )
     return hocr_bytes.decode("utf-8")
async def extract_text_and_boxes_with_paddle(image_bytes: bytes) -> list[dict]:
    """
    Extracts text and their bounding boxes from an image using PaddleOCR.

    The bytes are written to a temporary ``.png`` file which is handed to the
    ``kevansoon/PaddleOCR`` Gradio client; the file is removed afterwards,
    including when writing it or the OCR call fails.

    Args:
        image_bytes: Encoded image data to run OCR on.

    Returns:
        Whatever the client's ``/predict`` endpoint returned, or ``[]`` when
        that result is empty/falsy.
    """
    # delete=False because the file must outlive this handle: it is reopened
    # by path inside the worker thread.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_filepath = temp_file.name
    try:
        # The write happens inside the try so a failed write still reaches
        # the cleanup in `finally` instead of leaking the file.
        with temp_file:
            temp_file.write(image_bytes)

        def do_ocr() -> list[dict]:
            """Synchronous function to be run in a separate thread."""
            client = Client("kevansoon/PaddleOCR")
            # Returns a list of dictionaries, e.g., [{'text': '...', 'box': [...]}]
            return client.predict(
                img=handle_file(temp_filepath),
                lang="en",
                api_name="/predict",
            )

        loop = asyncio.get_running_loop()
        extracted_data = await loop.run_in_executor(None, do_ocr)
        if not extracted_data:
            print("Warning: PaddleOCR returned no data.")
            return []
        return extracted_data
    finally:
        os.unlink(temp_filepath)
 # --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
 async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
     """
     Parses hOCR, translates all text in a single batch call to Gemini,
@@ -1520,17 +847,14 @@ async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str)
         elements_to_translate = soup.find_all(class_="ocr_line")
     original_texts = [el.get_text(strip=True) for el in elements_to_translate]
-    # Translate all texts in one go
     translated_texts = await translate_texts_with_gemini(original_texts, target_language)
-    # Inject translations back
     for i, element in enumerate(elements_to_translate):
-        if element.string:
-            # Ensure we don't go out of bounds if translation failed
-            if i < len(translated_texts):
-                element.string.replace_with(translated_texts[i])
     return str(soup)
@@ -1542,22 +866,20 @@ async def translate_paddle_data_with_gemini(
     using a single batch call to Gemini.
     """
     original_texts = [item.get("text", "") for item in paddle_data]
-    # Translate all texts in one go
     translated_texts = await translate_texts_with_gemini(original_texts, target_language)
     translated_data = []
     for i, item in enumerate(paddle_data):
-         # Ensure we don't go out of bounds if translation failed
         translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
         translated_data.append({"text": translated_text, "box": item.get("box")})
     return translated_data
 # --- FINAL HTML GENERATION ---
 async def generate_html_from_dual_ocr(
     translated_hocr_html: str, translated_paddle_data: list[dict]
 ) -> str:
@@ -1569,54 +891,43 @@ async def generate_html_from_dual_ocr(
         api_key = os.getenv("GEMINI_API_KEY")
         if not api_key:
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
         genai.configure(api_key=api_key)
-        model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
         prompt = f"""
-                You are provided with two different translated OCR outputs for the same document.
-                Your task is to MERGE them into a SINGLE, CLEAN, and WELL-STYLED HTML document that can be rendered directly in an iframe.
-                Input 1: Translated hOCR HTML
                 --- HOCR START ---
                 {translated_hocr_html}
                 --- HOCR END ---
-                Input 2: Translated PaddleOCR data (Python list of dicts with 'text' and 'box'):
                 --- PADDLEOCR START ---
                 {str(translated_paddle_data)}
                 --- PADDLEOCR END ---
-                STRICT RULES:
-                1. You MUST output ONLY the FINAL RAW HTML code.
-                - No ```html, no triple quotes, no markdown, no explanations.
-                - Output must begin with <!DOCTYPE html> and end with </html>.
-                2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
-                - Every PaddleOCR text must appear exactly once in the correct order and location.
-                3. The HTML must be fully self-contained:
-                - Include <html>, <head>, <style>, and <body>.
-                - Include CSS in a <style> block so it renders exactly in an iframe.
-                4. Table structure requirement:
-                - Use <table>, <tbody>, <tr>, and <td> to organize words into rows and columns.
-                - Each PaddleOCR word must be placed in a separate <td> within the correct row based on vertical alignment.
-                - Apply CSS for borders, padding, and cell alignment to ensure readability.
-                - Use colspan/rowspan where necessary to match the original layout.
-                5. Positioning:
-                - Use bounding box data to size and place each cell proportionally.
-                - Avoid text overlap — if bounding boxes would overlap, adjust table cell spans or widths.
-                6. Before outputting:
-                - Validate internally that the HTML is valid.
-                - Confirm every PaddleOCR text appears in the table.
-                - Confirm the table renders correctly in an iframe.
                 FINAL OUTPUT REQUIREMENT:
-                - Output ONLY the complete, valid HTML — no commentary, no extra text.
                 """
         def do_request():
-            """Synchronous function to be run in a separate thread."""
             response = model.generate_content(prompt)
-            return response.text.strip()
         return await asyncio.to_thread(do_request)
@@ -1626,9 +937,13 @@ async def generate_html_from_dual_ocr(
         return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
-@app.post("/api/translate_file_gemini", response_class=HTMLResponse)
 async def translate_document_dual_ocr(
-    target_language: str = Form(...), file: UploadFile = File(...)
 ):
     """
     Processes a document using a dual OCR pipeline:
@@ -1644,17 +959,14 @@ async def translate_document_dual_ocr(
         )
     try:
-        await file.seek(0)
         image_bytes = await file.read()
         if not image_bytes:
             raise HTTPException(status_code=400, detail="Uploaded file is empty.")
         # === STEP 1: Run both OCR extractions concurrently ===
-        print(
-            "***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******"
-        )
         hocr_task = get_hocr_from_image(image_bytes)
-        paddle_task = extract_text_and_boxes_with_paddle(image_bytes)
         hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)
         if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
@@ -1666,25 +978,16 @@ async def translate_document_dual_ocr(
         # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
         print("***** Step 2: Starting concurrent translation with Gemini ******")
-        translated_hocr_task = translate_hocr_html_with_gemini(
-            hocr_html, target_language
-        )
-        translated_paddle_task = translate_paddle_data_with_gemini(
-            paddle_data, target_language
-        )
         translated_hocr, translated_paddle = await asyncio.gather(
             translated_hocr_task, translated_paddle_task
         )
         print("***** Step 2 Done: Finished translation ******")
         # === STEP 3: Generate final HTML from both translated outputs ===
-        print(
-            "***** Step 3: Generating final HTML from dual OCR data via Gemini ******"
-        )
-        final_html = await generate_html_from_dual_ocr(
-            translated_hocr, translated_paddle
-        )
         print("***** Step 3 Done: Generated final HTML ******")
         return HTMLResponse(content=final_html)
@@ -1697,4 +1000,5 @@ async def translate_document_dual_ocr(
             status_code=500,
             detail=f"An unexpected error occurred during processing: {str(e)}",
         )
-#-------------------------- end of gemini workflow ----------------------------------

 import tempfile
 import io
 import traceback
+import atexit
+import functools
+from queue import Queue
+from threading import Event, Thread
 # beautifulsoup
 from bs4 import BeautifulSoup
 # --- MODIFIED: Replaced old tool imports with the new one ---
 from tools.tools import analyze_contract
+#numpy and paddleocr
+import numpy as np
+from paddleocr import PaddleOCR
 app = FastAPI(
     title="Document Translator (Final Architecture)",
         raise HTTPException(
             status_code=500, detail=f"An unexpected server error occurred: {str(e)}"
         )
 @app.post("/upload")
 async def upload_file(authorization: str = Header(...), file: UploadFile = File(...)):
     return documents
+# --- END: NEW ENDPOINT FOR THE REFACTORED TOOL ---
+# testing clerk backend authentication
+# @app.post("/upload")
+# async def upload_file(
+#     authorization: str = Header(...),
+#     file: UploadFile = File(...)
+# ):
+#     if not authorization.startswith("Bearer "):
+#         raise HTTPException(status_code=401, detail="Missing Bearer token")
+#     token = authorization.split(" ")[1]
+#     claims = await verify_clerk_jwt(token)
+#     user_id = claims.get("sub")  # Clerk user ID
+#     # ✅ Now the Clerk user is verified
+#     # You can securely store this file, e.g., to Supabase or local
+#     return {"message": f"File uploaded by Clerk user {user_id}"}
+#------------------------ start of gemini workflow ---------------------------------
+# This helper function for calling the Sea-Lion API is now UNUSED in the pipeline,
+# but is kept here as requested.
 async def call_sealion_for_translation(text_to_translate: str, lang: str) -> str:
     """Helper function to call the translation API for a single piece of text."""
     if not text_to_translate.strip():
             return f"Translation Parsing Error: {text_to_translate}"
+# --- NEW GEMINI TRANSLATION FUNCTION ---
async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
    """
    Translates a list of texts using Gemini in a single batch API call.

    The texts are sent as one JSON array and the reply is expected to contain
    a JSON array of strings of the same length. This function does not raise:
    on any failure (missing API key, SDK error, unparsable reply, wrong
    element count, non-string elements) a message is printed and the original
    ``texts`` list is returned unchanged.

    Args:
        texts: Strings to translate.
        target_language: Language name interpolated into the prompt.

    Returns:
        ``[]`` for empty input; otherwise the translated strings in input
        order, or ``texts`` itself as a fallback.
    """
    if not texts:
        return []
    try:
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables.")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")  # Using Flash for speed
        # Create a single prompt asking for a JSON array response
        prompt = f"""
        Translate each string in the following JSON array of strings to {target_language}.
        Return a single JSON array where each element is the translated string corresponding
        to the original at the same index. Your output MUST be only the JSON array and nothing else.
        Example Input:
        ["Hello world", "How are you?"]
        Example Output for target language 'Spanish':
        ["Hola mundo", "¿Cómo estás?"]
        Input for this task:
        {json.dumps(texts)}
        """

        def do_request():
            """Synchronous function to be run in a separate thread."""
            response = model.generate_content(prompt)
            return response.text.strip()

        # Run the synchronous SDK call in a thread to avoid blocking asyncio
        response_text = await asyncio.to_thread(do_request)
        # Keep only the outermost [...] span so surrounding text (for
        # example a markdown fence) does not break JSON parsing.
        json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
        if not json_response_match:
            print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
            # Fallback: return original texts if parsing fails
            return texts
        cleaned_json = json_response_match.group(0)
        translated_texts = json.loads(cleaned_json)
        if len(translated_texts) != len(texts):
            print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
            # Fallback in case of length mismatch
            return texts
        if not all(isinstance(item, str) for item in translated_texts):
            # Callers insert these values as text; a null, number or nested
            # array here would break them, so treat it like a parse failure.
            print("Warning: Gemini returned non-string elements. Using original texts.")
            return texts
        return translated_texts
    except Exception as e:
        print(f"An error occurred during Gemini translation: {e}")
        # Return original texts as a fallback
        return texts
 # --- OCR EXTRACTION FUNCTIONS ---
         os.unlink(temp_filepath)
+# --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
+async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
     """
+    Parses hOCR, translates all text in a single batch call to Gemini,
+    and injects translations back into the HTML.
     """
     soup = BeautifulSoup(hocr_html, "html.parser")
     elements_to_translate = soup.find_all(class_="ocrx_word")
         elements_to_translate = soup.find_all(class_="ocr_line")
     original_texts = [el.get_text(strip=True) for el in elements_to_translate]
+    # Translate all texts in one go
+    translated_texts = await translate_texts_with_gemini(original_texts, target_language)
+    # Inject translations back
     for i, element in enumerate(elements_to_translate):
         if element.string:
+            # Ensure we don't go out of bounds if translation failed
+            if i < len(translated_texts):
+                element.string.replace_with(translated_texts[i])
     return str(soup)
async def translate_paddle_data_with_gemini(
    paddle_data: list[dict], target_language: str
) -> list[dict]:
    """
    Translate the 'text' value of every PaddleOCR item with one Gemini batch call.

    Returns a new list of ``{"text", "box"}`` dicts in the same order as
    ``paddle_data``. A missing 'text' is treated as ``""`` and a missing
    'box' becomes ``None``. If fewer translations than items come back, the
    remaining items keep their original text.
    """
    source_texts = [entry.get("text", "") for entry in paddle_data]
    # Translate all texts in one go
    results = await translate_texts_with_gemini(source_texts, target_language)
    available = len(results)
    return [
        {
            # Fall back to the source text when no translation exists at this index
            "text": results[pos] if pos < available else source_texts[pos],
            "box": entry.get("box"),
        }
        for pos, entry in enumerate(paddle_data)
    ]
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
         genai.configure(api_key=api_key)
+        model = genai.GenerativeModel(model_name="gemini-2.5-flash") # Using Flash for speed
         prompt = f"""
                 You are provided with two different translated OCR outputs for the same document.
                 --- PADDLEOCR END ---
                 STRICT RULES:
+                1. You MUST output ONLY the FINAL RAW HTML code.
+                - No ```html, no triple quotes, no markdown, no explanations.
                 - Output must begin with <!DOCTYPE html> and end with </html>.
+                2. ALL text from the second input (PaddleOCR) MUST be included in the final HTML without omission.
                 - Every PaddleOCR text must appear exactly once in the correct order and location.
                 3. The HTML must be fully self-contained:
                 - Include <html>, <head>, <style>, and <body>.
         return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
+@app.post("/api/translate_file_gemini", response_class=HTMLResponse)
 async def translate_document_dual_ocr(
     target_language: str = Form(...), file: UploadFile = File(...)
 ):
     """
     Processes a document using a dual OCR pipeline:
     1. Tesseract and PaddleOCR extract text and coordinates concurrently.
+    2. Gemini translates the text from both outputs concurrently using a batch method.
     3. Gemini uses both translated outputs to generate the final layout-aware HTML.
     """
     content_type = file.content_type
                 status_code=400,
                 detail="Neither Tesseract nor PaddleOCR could extract any data from the image.",
             )
         print("***** Step 1 Done: Finished OCR extraction ******")
+        # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
+        print("***** Step 2: Starting concurrent translation with Gemini ******")
+        translated_hocr_task = translate_hocr_html_with_gemini(
             hocr_html, target_language
         )
+        translated_paddle_task = translate_paddle_data_with_gemini(
             paddle_data, target_language
         )
         translated_hocr, translated_paddle = await asyncio.gather(
             translated_hocr_task, translated_paddle_task
         )
         print("***** Step 2 Done: Finished translation ******")
         # === STEP 3: Generate final HTML from both translated outputs ===
         final_html = await generate_html_from_dual_ocr(
             translated_hocr, translated_paddle
         )
         print("***** Step 3 Done: Generated final HTML ******")
         return HTMLResponse(content=final_html)
             status_code=500,
             detail=f"An unexpected error occurred during processing: {str(e)}",
         )
+#-------------------------- end of gemini workflow ----------------------------------
+#-------------------------- start of updated gemini workflow ----------------------------------
+# --- PADDLEOCR LOCAL MODEL MANAGER SETUP ---
# Languages for which a local PaddleOCR worker pool is created at import time.
# Each key is passed as `lang` to create_model(); "num_workers" is the number
# of worker threads (one model instance per thread) started for that language.
LANG_CONFIG = {
    "ch": {"num_workers": 2},
    "en": {"num_workers": 2},
    "fr": {"num_workers": 1},
    "german": {"num_workers": 1},
    "korean": {"num_workers": 1},
    "japan": {"num_workers": 1},
}
# NOTE(review): equals the sum of num_workers above, but no use of this
# constant is visible in this part of the file — confirm it is still needed.
CONCURRENCY_LIMIT = 8
class PaddleOCRModelManager:
    """Pool of worker threads that each own one model and serve OCR requests.

    Every worker builds its own model via ``model_factory`` and then pulls
    requests from a shared queue, calling ``model.ocr(*args, **kwargs)`` for
    each one. ``infer`` blocks the calling thread until a worker has
    produced the result (or the exception) for that request.

    Args:
        num_workers: Number of worker threads (and model instances) to start.
        model_factory: Zero-argument callable returning an object with an
            ``ocr`` method.

    Raises:
        BaseException: Whatever ``model_factory`` raised in a worker thread is
            re-raised from the constructor after the workers that did start
            have been shut down.
    """

    def __init__(self,
                 num_workers,
                 model_factory):
        super().__init__()
        self._model_factory = model_factory
        self._queue = Queue()
        self._workers = []
        self._model_initialized_event = Event()
        self._init_error = None
        self._closed = False
        # Workers are started one at a time; the constructor waits for each
        # model to finish building before starting the next worker.
        for _ in range(num_workers):
            worker = Thread(target=self._worker, daemon=True) # Use daemon threads
            worker.start()
            self._model_initialized_event.wait()
            self._model_initialized_event.clear()
            if self._init_error is not None:
                # The factory failed in the worker thread. Surface the error
                # here instead of waiting forever on an event that would
                # otherwise never be set.
                error = self._init_error
                self.close()
                raise error
            self._workers.append(worker)

    def infer(self, *args, **kwargs):
        """Run ``model.ocr(*args, **kwargs)`` on a worker and return its result.

        Blocks until a worker has handled the request. An exception raised by
        the model is re-raised in the calling thread.

        Raises:
            RuntimeError: If the manager has already been closed (no worker
                is left to answer the request).
        """
        if self._closed:
            raise RuntimeError("PaddleOCRModelManager is closed.")
        # One-slot queue used as a per-request reply channel.
        result_queue = Queue(maxsize=1)
        self._queue.put((args, kwargs, result_queue))
        success, payload = result_queue.get()
        if success:
            return payload
        raise payload

    def close(self):
        """Stop all workers and wait for them to exit. Safe to call repeatedly."""
        if self._closed:
            return
        self._closed = True
        # One `None` sentinel per worker; each worker exits on the first one it sees.
        for _ in self._workers:
            self._queue.put(None)
        for worker in self._workers:
            worker.join()

    def _worker(self):
        """Thread body: build one model, then serve queued requests until a sentinel arrives."""
        print("Initializing PaddleOCR model in worker thread...")
        try:
            model = self._model_factory()
        except BaseException as exc:
            # Hand the failure to the constructor, which re-raises it.
            self._init_error = exc
            return
        finally:
            # Always wake the constructor, whether or not the model was built.
            self._model_initialized_event.set()
        print("PaddleOCR model initialized.")
        while True:
            item = self._queue.get()
            if item is None:
                break
            args, kwargs, result_queue = item
            try:
                result = model.ocr(*args, **kwargs)
                result_queue.put((True, result))
            except Exception as e:
                result_queue.put((False, e))
            finally:
                self._queue.task_done()
def create_model(lang):
    """Create a PaddleOCR instance for ``lang`` with ``use_angle_cls=True`` and ``use_gpu=False``."""
    print(f"Creating PaddleOCR model for language: {lang}")
    return PaddleOCR(lang=lang, use_angle_cls=True, use_gpu=False)
# One PaddleOCRModelManager per language in LANG_CONFIG, keyed by language code.
# This loop runs at import time, and each manager's constructor waits until
# every one of its workers has built its model.
model_managers = {}
for lang, config in LANG_CONFIG.items():
    print(f"Setting up model manager for language: {lang}")
    # functools.partial binds the language so the manager can call the factory with no arguments.
    model_manager = PaddleOCRModelManager(config["num_workers"], functools.partial(create_model, lang=lang))
    model_managers[lang] = model_manager
def close_model_managers():
    """Call ``close()`` on every manager stored in ``model_managers``."""
    print("Closing all PaddleOCR model managers...")
    for manager in model_managers.values():
        manager.close()
# Run the shutdown hook when the interpreter exits.
atexit.register(close_model_managers)
def _is_ocr_line(item) -> bool:
    """Return True if ``item`` has the ``[box, (text, score)]`` shape of one detection."""
    return (
        isinstance(item, (list, tuple))
        and len(item) == 2
        and isinstance(item[1], (list, tuple))
        and len(item[1]) >= 1
        and isinstance(item[1][0], str)
    )


def local_inference(img_bytes: bytes, lang: str) -> list[dict]:
    """
    Performs OCR using the local PaddleOCRModelManager.

    Args:
        img_bytes: Encoded image data; it is decoded, converted to RGB and
            passed to the model as a numpy array.
        lang: Key into ``model_managers``. An unknown language prints a
            warning and falls back to the ``'en'`` manager.

    Returns:
        One ``{"text": ..., "box": ...}`` dict per detected line, in the order
        the model returned them; ``[]`` when nothing was detected.
    """
    ocr_manager = model_managers.get(lang)
    if not ocr_manager:
        print(f"Warning: Language '{lang}' not configured. Falling back to 'en'.")
        ocr_manager = model_managers['en']
    # Convert image bytes to a numpy array that PaddleOCR can process; the
    # source image is closed as soon as the array has been built.
    with Image.open(io.BytesIO(img_bytes)) as raw_image:
        img_array = np.array(raw_image.convert("RGB"))
    # Each detection is expected as [box, [text, score]].
    result = ocr_manager.infer(img_array, cls=True)
    # PaddleOCR can return a list of results (one per page/batch item).
    # Unwrap the first page only when the first element is a page rather
    # than a single detection, so a flat result is not mistaken for pages.
    if (
        result
        and isinstance(result, list)
        and isinstance(result[0], list)
        and not _is_ocr_line(result[0])
    ):
        result = result[0]
    # A page with no detected text can come back as None (i.e. [None]);
    # skip such entries instead of indexing into None.
    return [
        {"text": line[1][0], "box": line[0]}
        for line in result or []
        if line is not None
    ]
+# --- GEMINI TRANSLATION FUNCTION ---
 async def translate_texts_with_gemini(texts: list[str], target_language: str) -> list[str]:
     """
     Translates a list of texts using Gemini in a single batch API call.
     """
+    if not texts or all(not s.strip() for s in texts):
+        return [""] * len(texts)
     try:
         api_key = os.getenv("GEMINI_API_KEY")
         if not api_key:
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
+        if not genai:
+             raise ImportError("google.generativeai library is not available.")
         genai.configure(api_key=api_key)
+        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
         prompt = f"""
         Translate each string in the following JSON array of strings to {target_language}.
         Return a single JSON array where each element is the translated string corresponding
             response = model.generate_content(prompt)
             return response.text.strip()
         response_text = await asyncio.to_thread(do_request)
         json_response_match = re.search(r'\[.*\]', response_text, re.DOTALL)
         if not json_response_match:
             print(f"Warning: Gemini did not return a valid JSON array. Response: {response_text}")
             return texts
         cleaned_json = json_response_match.group(0)
         if len(translated_texts) != len(texts):
             print(f"Warning: Mismatch in translation count. Expected {len(texts)}, got {len(translated_texts)}.")
             return texts
         return translated_texts
     except Exception as e:
         print(f"An error occurred during Gemini translation: {e}")
         return texts
+# --- OCR EXTRACTION FUNCTIONS ---
 async def get_hocr_from_image(image_bytes: bytes) -> str:
+    """
+    Performs OCR using Tesseract and returns the raw hOCR HTML as a string.
+
+    The pytesseract call is run through ``asyncio.to_thread`` so it does
+    not execute on the event loop thread.
+
+    Args:
+        image_bytes: Image data; opened with PIL.
+
+    Returns:
+        The hOCR output decoded as UTF-8.
+
+    Raises:
+        ValueError: If ``image_bytes`` is empty.
+        HTTPException: Status 400 when opening the image, running
+            Tesseract, or decoding its output raises; the original
+            exception is chained as the cause.
+    """
+    if not image_bytes:
+        raise ValueError("Image bytes cannot be empty.")
+    try:
+        # Keep the image open for the duration of the OCR call, then
+        # release it deterministically.
+        with Image.open(io.BytesIO(image_bytes)) as image:
+            hocr_bytes = await asyncio.to_thread(
+                pytesseract.image_to_pdf_or_hocr, image, extension="hocr"
+            )
+        return hocr_bytes.decode("utf-8")
+    except Exception as e:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Tesseract OCR failed. Error: {e}",
+        ) from e
+async def extract_text_and_boxes_with_paddle(image_bytes: bytes, lang: str = "en") -> list[dict]:
+    """
+    Runs ``local_inference`` in a worker thread and returns its result.
+
+    Any exception raised while doing so is printed together with its
+    traceback, and an empty list is returned instead, so a failed OCR pass
+    does not break the rest of the pipeline.
+    """
+    try:
+        ocr_entries = await asyncio.to_thread(local_inference, image_bytes, lang)
+        if ocr_entries:
+            return ocr_entries
+        # Nothing recognized: warn, but still hand back whatever came out.
+        print("Warning: Local PaddleOCR returned no data.")
+        return ocr_entries
+    except Exception as e:
+        print(f"An error occurred during local PaddleOCR processing: {e}")
+        traceback.print_exc()
+        return []
 # --- TRANSLATION FUNCTIONS (UPDATED TO USE GEMINI) ---
 async def translate_hocr_html_with_gemini(hocr_html: str, target_language: str) -> str:
     """
     Parses hOCR, translates all text in a single batch call to Gemini,
         elements_to_translate = soup.find_all(class_="ocr_line")
     original_texts = [el.get_text(strip=True) for el in elements_to_translate]
+    if not original_texts:
+        return str(soup)
     translated_texts = await translate_texts_with_gemini(original_texts, target_language)
     for i, element in enumerate(elements_to_translate):
+        if element.string and i < len(translated_texts):
+            element.string.replace_with(translated_texts[i])
     return str(soup)
     using a single batch call to Gemini.
     """
     original_texts = [item.get("text", "") for item in paddle_data]
+    if not original_texts:
+        return []
     translated_texts = await translate_texts_with_gemini(original_texts, target_language)
     translated_data = []
     for i, item in enumerate(paddle_data):
         translated_text = translated_texts[i] if i < len(translated_texts) else original_texts[i]
         translated_data.append({"text": translated_text, "box": item.get("box")})
     return translated_data
 # --- FINAL HTML GENERATION ---
 async def generate_html_from_dual_ocr(
     translated_hocr_html: str, translated_paddle_data: list[dict]
 ) -> str:
         api_key = os.getenv("GEMINI_API_KEY")
         if not api_key:
             raise ValueError("GEMINI_API_KEY not found in environment variables.")
+        if not genai:
+             raise ImportError("google.generativeai library is not available.")
         genai.configure(api_key=api_key)
+        model = genai.GenerativeModel(model_name="gemini-1.5-flash")
         prompt = f"""
+                You are an expert web developer. Your task is to merge two different translated OCR outputs for the same document into a single, clean, and well-styled HTML document that can be rendered directly in an iframe.
+                Input 1: Translated hOCR HTML. This provides a basic structural layout.
                 --- HOCR START ---
                 {translated_hocr_html}
                 --- HOCR END ---
+                Input 2: Translated PaddleOCR data. This is a precise list of words and their bounding boxes.
                 --- PADDLEOCR START ---
                 {str(translated_paddle_data)}
                 --- PADDLEOCR END ---
+                STRICT INSTRUCTIONS:
+                1.  **Output Raw HTML Only**: Your entire output must be only the final HTML code. It must start with `<!DOCTYPE html>` and end with `</html>`. Do NOT include markdown fences like ```html or any explanations.
+                2.  **Prioritize PaddleOCR Data**: ALL text from the PaddleOCR input MUST be included in the final HTML. Use the hOCR as a structural guide, but the PaddleOCR data is the source of truth for the content and positioning.
+                3.  **Self-Contained HTML**: The HTML must be fully self-contained with embedded CSS in a `<style>` block within the `<head>`.
+                4.  **Layout Reconstruction**: Use absolute positioning (`position: absolute;`) for `<span>` or `<div>` elements containing the text. Use the bounding box coordinates from the PaddleOCR data to set the `top`, `left`, `width`, and `height` CSS properties for each element to reconstruct the original document layout precisely.
+                5.  **Coordinate System**: The bounding box format is [[x1, y1], [x2, y2], [x3, y3], [x4, y4]]. You can approximate the position using `left: x1`, `top: y1`, `width: x2 - x1`, and `height: y3 - y1`.
+                6.  **Validation**: Before outputting, mentally confirm that every single text element from the PaddleOCR data is present in the final HTML and positioned correctly.
                 FINAL OUTPUT REQUIREMENT:
+                - Output ONLY the complete, valid, and self-contained HTML.
                 """
         def do_request():
             response = model.generate_content(prompt)
+            # Clean up potential markdown fences
+            clean_text = re.sub(r'^```html\s*', '', response.text.strip(), flags=re.IGNORECASE)
+            clean_text = re.sub(r'\s*```$', '', clean_text)
+            return clean_text
         return await asyncio.to_thread(do_request)
         return f"<html><body><h1>HTML Generation Error</h1><p>{html.escape(error_message)}</p></body></html>"
+# --- FASTAPI ENDPOINT ---
+@app.post("/api/translate_file_gemini_local", response_class=HTMLResponse)
 async def translate_document_dual_ocr(
+    target_language: str = Form(...),
+    source_language: str = Form("en"), # Add source language for OCR
+    file: UploadFile = File(...)
 ):
     """
     Processes a document using a dual OCR pipeline:
         )
     try:
         image_bytes = await file.read()
         if not image_bytes:
             raise HTTPException(status_code=400, detail="Uploaded file is empty.")
         # === STEP 1: Run both OCR extractions concurrently ===
+        print("***** Step 1: Starting concurrent OCR extraction (Tesseract & PaddleOCR) ******")
         hocr_task = get_hocr_from_image(image_bytes)
+        paddle_task = extract_text_and_boxes_with_paddle(image_bytes, lang=source_language)
         hocr_html, paddle_data = await asyncio.gather(hocr_task, paddle_task)
         if (not hocr_html or "ocr_page" not in hocr_html) and not paddle_data:
         # === STEP 2: Translate both OCR outputs concurrently using Gemini ===
         print("***** Step 2: Starting concurrent translation with Gemini ******")
+        translated_hocr_task = translate_hocr_html_with_gemini(hocr_html, target_language)
+        translated_paddle_task = translate_paddle_data_with_gemini(paddle_data, target_language)
         translated_hocr, translated_paddle = await asyncio.gather(
             translated_hocr_task, translated_paddle_task
         )
         print("***** Step 2 Done: Finished translation ******")
         # === STEP 3: Generate final HTML from both translated outputs ===
+        print("***** Step 3: Generating final HTML from dual OCR data via Gemini ******")
+        final_html = await generate_html_from_dual_ocr(translated_hocr, translated_paddle)
         print("***** Step 3 Done: Generated final HTML ******")
         return HTMLResponse(content=final_html)
             status_code=500,
             detail=f"An unexpected error occurred during processing: {str(e)}",
         )
+#-------------------------- end of updated gemini workflow ----------------------------------

requirements.txt CHANGED Viewed

@@ -95,4 +95,6 @@ watchfiles==1.1.0
 websockets==15.0.1
 langextract
 gradio_client
-pytesseract

 websockets==15.0.1
 langextract
 gradio_client
+pytesseract
+paddlepaddle
+paddleocr==2.10.0