# Uploaded by om4r932 — commit 7f4e86d: "Add already indexed doc handling"
import requests, os, zipfile, subprocess, re, warnings
warnings.filterwarnings("ignore")
from io import BytesIO
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
load_dotenv()
# Origins accepted by the CORS middleware; "*" matches every origin.
# NOTE(review): the Starlette CORS documentation states that wildcard
# origins/methods/headers should not be combined with allow_credentials=True
# — confirm this configuration is intended.
origins = ["*"]

# The application object; the interactive API documentation is served at "/".
app = FastAPI(
    title="3GPP Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
    docs_url="/",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Load the pre-indexed specification sections once, at import time, and keep
# the "train" split in memory as a list of records. Requires the HF_TOKEN
# environment variable (a missing variable raises KeyError here).
spec_contents = load_dataset(
    "OrganizedProgrammers/3GPPSpecContent",
    token=os.environ["HF_TOKEN"],
)["train"].to_list()
def is_doc_indexed(spec_id: str) -> bool:
    """Return True when at least one record in ``spec_contents`` has this doc_id.

    Args:
        spec_id: Document identifier to look for (exact, case-sensitive match
            against each record's ``"doc_id"`` value).

    Returns:
        True if any indexed record belongs to ``spec_id``, otherwise False.
    """
    # A generator lets any() stop at the first match instead of first building
    # a full list of booleans over the whole dataset.
    return any(entry["doc_id"] == spec_id for entry in spec_contents)
def get_full_doc(spec_id: str):
    """Return every indexed section of ``spec_id`` as one text blob.

    Each matching record contributes its section title, a newline, then its
    content; the sections are separated by a blank line and kept in dataset
    order. An unknown ``spec_id`` yields an empty string.
    """
    rendered_sections = (
        f"{entry['section']}\n{entry['content']}"
        for entry in spec_contents
        if entry["doc_id"] == spec_id
    )
    return "\n\n".join(rendered_sections)
def get_structured_doc(spec_id: str):
    """Return the indexed sections of ``spec_id`` as a title -> content mapping.

    Records are visited in dataset order; if two records share a section
    title, the later one wins. An unknown ``spec_id`` yields an empty dict.
    """
    return {
        entry["section"]: entry["content"]
        for entry in spec_contents
        if entry["doc_id"] == spec_id
    }
# Request body accepted by the POST endpoints defined in this file.
class SpecRequest(BaseModel):
    # Document identifier: matched against the dataset's "doc_id" values and,
    # for non-indexed documents, sent as "doc_id" to the document-finder service.
    spec_id: str
def _open_innermost_zip(payload: bytes) -> zipfile.ZipFile:
    """Open ``payload`` as a ZIP, unwrapping one level of single-file nesting.

    When the archive contains exactly one member and that member is itself a
    ``.zip`` file, the nested archive is returned instead of the outer one.

    Raises:
        zipfile.BadZipFile: if ``payload`` (or the nested member) is not a ZIP.
    """
    archive = zipfile.ZipFile(BytesIO(payload))
    members = archive.namelist()
    if len(members) == 1 and members[0].endswith(".zip"):
        archive = zipfile.ZipFile(BytesIO(archive.read(members[0])))
    return archive


def _convert_doc_to_lines(doc_bytes: bytes, ext: str) -> list:
    """Convert a Word document to text with LibreOffice; return non-blank lines.

    The conversion runs inside a private temporary directory, so file names
    never depend on user input, concurrent requests cannot overwrite each
    other's files, and everything is removed even when the conversion fails.

    Args:
        doc_bytes: Raw content of the ``.doc`` / ``.docx`` file.
        ext: File extension without the dot (``"doc"`` or ``"docx"``).

    Returns:
        The stripped, non-empty lines of the converted text.

    Raises:
        subprocess.CalledProcessError: if LibreOffice exits with an error.
    """
    import tempfile  # local import: keeps this block independent of the file-level imports

    with tempfile.TemporaryDirectory() as workdir:
        input_path = os.path.join(workdir, f"document.{ext}")
        # LibreOffice names its output after the input's base name.
        output_path = os.path.join(workdir, "document.txt")
        with open(input_path, "wb") as f:
            f.write(doc_bytes)
        subprocess.run([
            "libreoffice",
            "--headless",
            "--convert-to", "txt",
            "--outdir", workdir,
            input_path
        ], check=True)
        with open(output_path, "r") as f:
            return [line.strip() for line in f if line.strip()]


# Return the full text of a document.
#
# Indexed documents are answered from the in-memory dataset as a single string
# (see get_full_doc). Otherwise the document archive is located through the
# document-finder service, downloaded, and every .doc/.docx member whose name
# contains neither "cover" nor "annex" is converted to text; the result is
# then a list of non-blank lines. Raises HTTPException(404) when the document
# cannot be located, downloaded, opened as a ZIP, or yields no text.
#
# Documented with comments rather than a docstring so the endpoint description
# published in the OpenAPI schema stays unchanged.
@app.post("/get_full_text")
def get_text(request: SpecRequest):
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_full_doc(specification)
    print(f"[WARNING] Document no. {specification} not indexed or is a TDoc, if it's a specification, try to reindex")

    # NOTE(review): verify=False disables TLS certificate verification for
    # both requests below — kept as in the original; confirm it is required.
    lookup = requests.post(
        "https://organizedprogrammers-3gppdocfinder.hf.space/find",
        verify=False,
        headers={"Content-Type": "application/json"},
        json={"doc_id": specification}
    )
    if lookup.status_code != 200:
        raise HTTPException(404, detail="Not found")
    url = lookup.json()['url']

    response = requests.get(
        url,
        verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
    )
    # A failed download (error page) must not be handed to the ZIP parser.
    if response.status_code != 200:
        raise HTTPException(status_code=404, detail="Not found")
    try:
        archive = _open_innermost_zip(response.content)
    except zipfile.BadZipFile:
        # The URL did not deliver a usable archive; report it like a missing
        # document instead of surfacing an unhandled server error.
        raise HTTPException(status_code=404, detail="Not found") from None

    total_file = []
    with archive:
        for file_info in archive.infolist():
            filename = file_info.filename
            if not filename.endswith((".doc", ".docx")):
                continue
            lowered = filename.lower()
            if "cover" in lowered or "annex" in lowered:
                continue
            ext = filename.split(".")[-1]
            total_file.extend(_convert_doc_to_lines(archive.read(filename), ext))

    if not total_file:
        raise HTTPException(status_code=404, detail="Not found !")
    return total_file
# Return a document as a mapping of chapter heading -> chapter text.
#
# Indexed documents are served straight from the dataset. Otherwise the text
# lines from get_text are scanned for heading lines (a chapter number, a tab,
# then a title starting with an upper-case letter or digit); each heading owns
# the lines up to the next heading, or to the end of the document. Tabs in the
# heading are replaced by spaces to form the key.
#
# Documented with comments rather than a docstring so the endpoint description
# published in the OpenAPI schema stays unchanged.
@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
    spec_id = request.spec_id
    if is_doc_indexed(spec_id):
        return get_structured_doc(spec_id)

    lines = get_text(request)
    heading_pattern = re.compile(r"^(\d+[a-z]?(?:\.\d+)*)\t[A-Z0-9][\ \S]+$")
    headings = [
        (position, line)
        for position, line in enumerate(lines)
        if heading_pattern.fullmatch(line)
    ]

    # Each chapter ends where the next heading starts; the last one runs to
    # the end of the text.
    chapter_ends = [position for position, _ in headings[1:]]
    chapter_ends.append(len(lines))

    sections = {}
    for (heading_pos, heading), end_pos in zip(headings, chapter_ends):
        body = lines[heading_pos + 1 : end_pos]
        sections[heading.replace('\t', " ")] = "\n".join(body)
    return sections