Lucas ARRIESSE committed
Commit a83bff5 · 1 Parent(s): f6da275

Fix TDoc download

- api/docs.py +67 -38
- dependencies.py +1 -0
api/docs.py
CHANGED
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Literal
+from typing import Dict, List, Literal, Tuple
 from fastapi.routing import APIRouter
 import logging
 import string
@@ -19,7 +19,7 @@ from bs4 import BeautifulSoup
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from fastapi import Depends, BackgroundTasks, HTTPException, Request
-from dependencies import get_llm_router
+from dependencies import DOC_FINDER_BASE_URL, get_http_client, get_llm_router
 from fastapi.responses import StreamingResponse
 from litellm.router import Router
 
@@ -253,52 +253,81 @@ def get_change_request_dataframe(req: DataRequest):
 @router.post("/download_tdocs")
 def download_tdocs(req: DownloadRequest):
     """Download the specified TDocs and zips them in a single archive"""
-    documents = req.documents
-
-    logging.info(f"Downloading TDocs: {documents}")
-
-    def process_document(doc: str):
-        doc_id = doc
-        url = requests.post(
-            'https://organizedprogrammers-3gppdocfinder.hf.space/find',
-            headers={"Content-Type": "application/json"},
-            data=json.dumps({"doc_id": doc_id}),
-            verify=False
-        )
-        logging.info(
-            f"Retrieving URL for doc {doc_id} returned http status {url.status_code}")
-        url = url.json()['url']
-        logging.debug(f"Doc URL for {doc_id} is {url}")
 
+    # Document IDs to download
+    document_ids = req.documents
+
+    logging.info(f"Downloading TDocs: {document_ids}")
+
+    # Retrieve all doc URLs to download
+    doc_urls_req = requests.post(DOC_FINDER_BASE_URL + "find/batch",
+                                 headers={
+                                     "Content-Type": "application/json"
+                                 },
+                                 data=json.dumps({
+                                     "doc_ids": document_ids
+                                 }),
+                                 verify=False)
+
+    doc_urls_req.raise_for_status()
+    doc_urls = doc_urls_req.json()
+
+    # early check to bail out if no doc is available.
+    if len(doc_urls["results"]) == 0:
+        logging.warning(
+            f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
+
+        raise HTTPException(
+            status_code=501, detail=f"Got no URL results for docs {document_ids}. 3GPP index may not be up to date")
+
+    documents_content: Dict[str, bytes] = {}
+    failed_documents: List[str] = []
+
+    def _process_single_document(doc_id: str, doc_url: str) -> Tuple[bool, bytes]:
+        """Attempts to convert a document to text and returns success status and content."""
         try:
-
+            text_lines = docx_to_txt(doc_id, doc_url)
+            content_bytes = "\n".join(text_lines).encode("utf-8")
+            return True, content_bytes
         except Exception as e:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            logging.warning(
+                f"Failed to process document '{doc_id}' from URL '{doc_url}': {e}")
+            error_message = f"Document '{doc_id}' text extraction failed: {e}".encode(
+                "utf-8")
+            return False, error_message
+
+    for doc_id, doc_url in doc_urls["results"].items():
+        success, content = _process_single_document(doc_id, doc_url)
+        documents_content[doc_id] = content
+        if not success:
+            failed_documents.append(doc_id)
+
+    # sanity check to ensure all requested documents are accounted for, adding error messages for any missing ones
+    for requested_doc_id in document_ids:
+        if requested_doc_id not in documents_content:
+            error_msg = (
+                f"Failed to retrieve or process document '{requested_doc_id}'. "
+                "The 3GPP index may not be up to date, or the document might be unavailable."
+            ).encode("utf-8")
+
+            documents_content[requested_doc_id] = error_msg
+            logging.warning(
+                f"Document '{requested_doc_id}' was requested but not found or processed.")
+            if requested_doc_id not in failed_documents:
+                failed_documents.append(requested_doc_id)
 
     zip_buffer = io.BytesIO()
     with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zip_file:
-        for doc_id,
-
+        for doc_id, content_bytes in documents_content.items():
+            safe_filename = f"{doc_id}.txt"
+            zip_file.writestr(safe_filename, content_bytes)
 
     zip_buffer.seek(0)
+
    return StreamingResponse(
         zip_buffer,
-        media_type="application/zip"
+        media_type="application/zip",
+        headers={"Content-Disposition": "attachment; filename=tdocs.zip"}
     )
 
 
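The rewritten handler replaces the per-document lookup against the old 3gppdocfinder /find route with a single batch call to the doc-finder service. Below is a minimal sketch, not part of the commit, of the response contract the handler assumes from find/batch: a JSON object whose "results" field maps each document ID to a download URL. The example TDoc IDs are purely illustrative, and verify=False is kept only because the handler itself disables TLS verification.

# Hypothetical sketch of the doc-finder contract assumed by download_tdocs.
import json
import requests

DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"  # value added in dependencies.py

resp = requests.post(DOC_FINDER_BASE_URL + "find/batch",
                     headers={"Content-Type": "application/json"},
                     data=json.dumps({"doc_ids": ["S2-2401234", "S2-2405678"]}),  # example IDs, illustrative only
                     verify=False)
resp.raise_for_status()

# Expected shape (inferred from the handler): {"results": {"<doc_id>": "<download URL>", ...}}
for doc_id, doc_url in resp.json()["results"].items():
    print(f"{doc_id} -> {doc_url}")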
dependencies.py
CHANGED
@@ -9,6 +9,7 @@ from jinja2 import Environment, StrictUndefined, FileSystemLoader
 
 
 INSIGHT_FINDER_BASE_URL = "https://organizedprogrammers-insight-finder.hf.space/"
+DOC_FINDER_BASE_URL = "https://organizedprogrammers-docfinder.hf.space/"
 
 def init_dependencies():
     """Initialize the application global dependencies"""