# Spaces: Sleeping / Sleeping
# (status-banner text captured together with this file; it is not program code)
import xml.etree.ElementTree as ET | |
from elasticsearch import Elasticsearch, helpers | |
import torch | |
from transformers import CLIPProcessor, CLIPModel | |
import numpy as np | |
from server.utils.database import get_db | |
from server.utils.model import get_clip_model | |
from server.models.database import DocumentModel | |
from server.crud.patent_data import insert_data,search_data | |
from server.controllers.document import document_to_dict | |
# Obtain the CLIP model and its processor once, at import time, so the
# functions in this module share a single model/processor pair.
clip_model, clip_processor = get_clip_model()
# Width of the "embedding" dense_vector field declared in the index mapping.
_EMBEDDING_DIMS = 512


def _get_text(element):
    """Return the stripped text of *element*, or "" if it is missing or has no text."""
    if element is None or not element.text:
        return ""
    return element.text.strip()


def _join_party_names(assignment, party_tag):
    """Comma-join the non-empty <name> texts of every *party_tag* element under *assignment*."""
    names = (
        _get_text(party.find('name'))
        for party in assignment.findall(f'.//{party_tag}')
    )
    return ", ".join(name for name in names if name)


def _ensure_patent_index(db, index_name):
    """Create *index_name* with the patent mapping unless it already exists."""
    if db.indices.exists(index=index_name):
        return
    db.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "reel_no": {"type": "keyword"},
                "frame_no": {"type": "keyword"},
                "assignors": {"type": "text"},
                "assignees": {"type": "text"},
                "invention_title": {"type": "text"},
                "conveyance_text": {"type": "text"},
                "doc_numbers": {"type": "keyword"},
                "raw_text": {"type": "text"},
                "embedding": {
                    "type": "dense_vector",
                    "dims": _EMBEDDING_DIMS,
                    "index": True,
                    "similarity": "cosine",
                },
            }
        }
    })


def _embed_title(title):
    """Return the CLIP text features of *title* as a plain list of floats."""
    inputs = clip_processor(text=[title], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)[0]
        return features.cpu().numpy().astype(np.float32).tolist()


def insert_data_from_xml(xml_file: str, db=None, index_name="patents"):
    """Parse a patent-assignment XML file and bulk-index its records.

    Every ``patent-assignment`` element that has an ``assignment-record``
    child becomes one document holding reel/frame numbers, conveyance text,
    comma-joined assignor and assignee names, the invention title, the
    collected document numbers and a CLIP text embedding of the title.

    Args:
        xml_file: Path of the XML file to parse.
        db: Elasticsearch client to use. When ``None`` (the default) a client
            is obtained from ``get_db()`` at call time, rather than once at
            import time.
        index_name: Target index; created with the patent mapping if missing.

    Returns:
        ``None`` after a successful bulk insert (a summary line is printed),
        or the string ``"No patent records found to insert."`` when the file
        yields no records.

    Raises:
        Whatever ``ET.parse`` raises for an unreadable or malformed file, and
        whatever ``helpers.bulk`` raises when indexing fails.
    """
    if db is None:
        # Resolve the client per call; a ``db=get_db()`` default would be
        # evaluated a single time when the module is imported.
        db = get_db()

    root = ET.parse(xml_file).getroot()
    _ensure_patent_index(db, index_name)

    bulk_data = []
    for pa in root.findall('.//patent-assignment'):
        record = pa.find('assignment-record')
        if record is None:
            continue

        # If several properties carry a title element, the last one wins.
        invention_title = ""
        doc_numbers = []
        for prop in pa.findall('.//patent-property'):
            title = prop.find('invention-title')
            if title is not None:
                invention_title = _get_text(title)
            numbers = (_get_text(doc.find('doc-number')) for doc in prop.findall('document-id'))
            doc_numbers.extend(number for number in numbers if number)

        has_title = bool(invention_title)
        if has_title:
            embedding = _embed_title(invention_title)
        else:
            # Placeholder so the model object is populated as before; it is
            # removed from the indexed source below.
            embedding = [0.0] * _EMBEDDING_DIMS

        document = DocumentModel()
        document.reel_no = _get_text(record.find('reel-no'))
        document.frame_no = _get_text(record.find('frame-no'))
        document.assignors = _join_party_names(pa, 'patent-assignor')
        document.assignees = _join_party_names(pa, 'patent-assignee')
        document.invention_title = invention_title
        document.conveyance_text = _get_text(record.find('conveyance-text'))
        document.doc_numbers = doc_numbers
        document.raw_text = invention_title
        document.embedding = embedding

        doc = document_to_dict(document)
        if not has_title:
            # A cosine-similarity dense_vector cannot hold a zero-magnitude
            # vector, so an all-zero placeholder would make the bulk request
            # fail; leave the field out for untitled records instead.
            # NOTE(review): assumes document_to_dict returns a dict keyed by
            # the mapping's field names - confirm.
            doc.pop("embedding", None)
        bulk_data.append({"_index": index_name, "_source": doc})

    if not bulk_data:
        return "No patent records found to insert."

    helpers.bulk(db, bulk_data)
    print(f"Inserted {len(bulk_data)} records into index '{index_name}'")