Lucas ARRIESSE committed
Commit a83bff5 · 1 Parent(s): f6da275

Fix TDoc download

- api/docs.py +67 -38
- dependencies.py +1 -0
api/docs.py
CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Literal
+from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
 import string
@@ -19,7 +19,7 @@ from bs4 import BeautifulSoup
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
-from dependencies import get_llm_router
+from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
 
@@ -253,52 +253,81 @@ def get_change_request_dataframe(req: DataRequest):
 @router.post("/download_tdocs")
 def download_tdocs(req: DownloadRequest):
     """Download the specified TDocs and zips them in a single archive"""
-    documents = req.documents
-
-    logging.info(f"Downloading TDocs: {documents}")
-
-    def process_document(doc: str):
-        doc_id = doc
-        url = requests.post(
-            'https://organizedprogrammers-3gppdocfinder.hf.space/find',
-            headers={"Content-Type": "application/json"},
-            data=json.dumps({"doc_id": doc_id}),
-            verify=False
-        )
-        logging.info(
-            f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
-        url = url.json()['url']
-        logging.debug(f"Doc URL for {doc_id} is {url}")
 
+    # Document IDs to download
+    document_ids = req.documents
+
+    logging.info(f"Downloading TDocs: {document_ids}")
+
+    # Retrieve all doc URLs to download
+    doc_urls_req = requests.post(DOC_FINDER_BASE_URL + "find/batch",
+                                 headers={
+                                     "Content-Type": "application/json"
+                                 },
+                                 data=json.dumps({
+                                     "doc_ids": document_ids
+                                 }),
+                                 verify=False)
+
+    doc_urls_req.raise_for_status()
+    doc_urls = doc_urls_req.json()
+
+    # early check to bail out if no doc is available.
+    if len(doc_urls["results"]) == 0:
+        logging.warning(
+            f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
+
+        raise HTTPException(
+            status_code=501, detail=f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
+
+    documents_content: Dict[str, bytes] = {}
+    failed_documents: List[str] = []
+
+    def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
+        """Attempts to convert a document to text and returns success status and content."""
         try:
-
+            text_lines = docx_to_txt(doc_id, doc_url)
+            content_bytes = "\n".join(text_lines).encode("utf-8")
+            return True, content_bytes
         except Exception as e:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logging.warning(
+                f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
+            error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
+                "utf-8")
+            return False, error_message
+
+    for doc_id, doc_url in doc_urls["results"].items():
+        success, content = _process_single_document(doc_id, doc_url)
+        documents_content[doc_id] = content
+        if not success:
+            failed_documents.append(doc_id)
+
+    # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
+    for requested_doc_id in document_ids:
+        if requested_doc_id not in documents_content:
+            error_msg = (
+                f"Failed to retrieve or process document '{requested_doc_id}'. "
+                "The 3GPP index may not be up to date, or the document might be unavailable."
+            ).encode("utf-8")
+
+            documents_content[requested_doc_id] = error_msg
+            logging.warning(
+                f"Document '{requested_doc_id}' was requested but not found or processed.")
+            if requested_doc_id not in failed_documents:
+                failed_documents.append(requested_doc_id)
 
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
-        for doc_id,
-
+        for doc_id, content_bytes in documents_content.items():
+            safe_filename = f"{doc_id}.txt"
+            zip_file.writestr(safe_filename, content_bytes)
 
     zip_buffer.seek(0)
+
    return StreamingResponse(
         zip_buffer,
-        media_type="application/zip"
+        media_type="application/zip",
+        headers={"Content-Disposition": "attachment; filename=tdocs.zip"}
     )
 
 
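The rewritten handler replaces the per-document lookup against the old 3gppdocfinder /find route with a single batch call to the doc-finder service. Below is a minimal sketch, not part of the commit, of the response contract the handler assumes from find/batch: a JSON object whose "results" field maps each document ID to a download URL. The example TDoc IDs are purely illustrative, and verify=False is kept only because the handler itself disables TLS verification.

# Hypothetical sketch of the doc-finder contract assumed by download_tdocs.
import json
import requests

DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"  # value added in dependencies.py

resp = requests.post(DOC_FINDER_BASE_URL + "find/batch",
                     headers={"Content-Type": "application/json"},
                     data=json.dumps({"doc_ids": ["S2-2401234", "S2-2405678"]}),  # example IDs, illustrative only
                     verify=False)
resp.raise_for_status()

# Expected shape (inferred from the handler): {"results": {"<doc_id>": "<download URL>", ...}}
for doc_id, doc_url in resp.json()["results"].items():
    print(f"{doc_id} -> {doc_url}")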
dependencies.py
CHANGED
@@ -9,6 +9,7 @@ from jinja2 import Environment, StrictUndefined, FileSystemLoader
 
 
 INSIGHT_FINDER_BASE_URL = "https://organizedprogrammers-insight-finder.hf.space/"
+DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"
 
 def init_dependencies():
     """Initialize the application global dependencies"""