|
|
import json |
|
|
import dataclasses |
|
|
from uuid import UUID |
|
|
from typing import Any |
|
|
from datetime import datetime, date |
|
|
|
|
|
|
|
|
import configparser |
|
|
from torch import cuda |
|
|
from qdrant_client.http import models as rest |
|
|
from langchain_community.embeddings import HuggingFaceEmbeddings |
|
|
from langchain_community.cross_encoders import HuggingFaceCrossEncoder |
|
|
|
|
|
|
|
|
def get_config(fp):
    """Load an INI-style configuration file into a ``ConfigParser``.

    Args:
        fp: Path of the configuration file to read.

    Returns:
        configparser.ConfigParser: Parser populated with the file's contents.

    Raises:
        OSError: If the file cannot be opened. The file is opened explicitly
            and fed to ``read_file`` so a missing file is an error instead of
            being skipped silently.
    """
    config = configparser.ConfigParser()
    # Context manager guarantees the handle is closed even if parsing fails;
    # the previous bare ``open(fp)`` left the file object to the garbage
    # collector.
    with open(fp) as handle:
        config.read_file(handle)
    return config
|
|
|
|
|
|
|
|
def get_embeddings_model(config):
    """Create the HuggingFace embeddings object described by the config.

    Args:
        config: Configuration object exposing ``get(section, option)``. The
            ``retriever`` section must define ``MODEL`` (model name) and
            ``NORMALIZE`` (an integer-like value, converted via
            ``bool(int(...))``).

    Returns:
        HuggingFaceEmbeddings: Embeddings wrapper configured to run on CUDA
        when ``torch.cuda`` reports it as available, otherwise on CPU, with
        an encoding batch size of 100 and progress display enabled.
    """
    # Fall back to CPU unless torch reports a usable CUDA device.
    target_device = "cpu"
    if cuda.is_available():
        target_device = "cuda"

    retriever_model = config.get("retriever", "MODEL")
    # NORMALIZE goes through int() first, then bool(), so "0" is False.
    should_normalize = bool(int(config.get("retriever", "NORMALIZE")))

    return HuggingFaceEmbeddings(
        show_progress=True,
        model_name=retriever_model,
        model_kwargs=dict(device=target_device),
        encode_kwargs=dict(
            normalize_embeddings=should_normalize,
            batch_size=100,
        ),
    )
|
|
|
|
|
|
|
|
def create_filter(
    reports: list = None, sources: str = None, subtype: str = None, year: str = None
):
    """Build a Qdrant filter restricting a search by document metadata.

    Two mutually exclusive modes:

    * ``reports`` non-empty: match points whose ``metadata.filename`` is any
      of ``reports``; ``sources`` and ``subtype`` are ignored.
    * ``reports`` empty or ``None``: require ``metadata.source`` to equal
      ``sources`` AND ``metadata.filename`` to match any of ``subtype``.

    Args:
        reports: Filenames to restrict the search to. Defaults to ``None``,
            which is treated the same as an empty list.
        sources: Value matched exactly against ``metadata.source``.
        subtype: Value handed to ``MatchAny`` for ``metadata.filename``.
            NOTE(review): annotated as ``str`` but used with ``MatchAny``,
            which presumably expects a list of values - confirm with callers.
        year: Not used by this function; kept for interface compatibility.

    Returns:
        rest.Filter: Filter whose ``must`` clause holds the conditions above.
    """
    # ``not reports`` covers both None and an empty list, so an explicit
    # ``reports=None`` no longer crashes on ``len(None)`` and no mutable
    # default object is shared between calls.
    if not reports:
        print(f"defining filter for sources:{sources}, subtype:{subtype}")
        conditions = [
            rest.FieldCondition(
                key="metadata.source", match=rest.MatchValue(value=sources)
            ),
            rest.FieldCondition(
                key="metadata.filename", match=rest.MatchAny(any=subtype)
            ),
        ]
    else:
        print(f"defining filter for allreports:{reports}")
        conditions = [
            rest.FieldCondition(
                key="metadata.filename", match=rest.MatchAny(any=reports)
            )
        ]

    # Named ``conditions``/returned directly to avoid shadowing builtin ``filter``.
    return rest.Filter(must=conditions)
|
|
|
|
|
|
|
|
def load_json(fp):
    """Read a JSON document from disk and return the decoded object.

    Args:
        fp: Path of the JSON file.

    Returns:
        The Python object produced by ``json.load`` (dict, list, str, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # JSON text is UTF-8; decoding explicitly avoids depending on the
    # platform's locale encoding (e.g. cp1252 on Windows).
    with open(fp, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
def get_timestamp():
    """Return the current local time as a compact ``YYYYMMDDHHMMSS`` string.

    Returns:
        str: 14-digit timestamp, e.g. ``"20240131235959"``.
    """
    # The module imports the ``datetime`` *class* (``from datetime import
    # datetime``), so the call is ``datetime.now()``; the former
    # ``datetime.datetime.now()`` raised AttributeError.
    return datetime.now().strftime("%Y%m%d%H%M%S")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class _RecursiveSerializer(json.JSONEncoder):
    """JSON encoder with fallbacks for objects the stock encoder rejects.

    The encoder calls ``default`` again on whatever ``default`` returns, so
    nested unsupported values are converted level by level.
    """

    def default(self, obj):
        """Convert ``obj`` into something ``json`` can encode.

        The checks run in a fixed order and the first match wins:

        1. objects exposing ``model_dump`` (e.g. Pydantic models) -> its result
        2. dataclasses -> ``dataclasses.asdict``
        3. ``datetime`` / ``date`` / ``UUID`` -> ``str(obj)``
        4. anything with a ``__dict__`` -> that dict
        5. otherwise defer to ``JSONEncoder.default``, which raises
           ``TypeError``.
        """
        if hasattr(obj, 'model_dump'):
            converted = obj.model_dump()
        elif dataclasses.is_dataclass(obj):
            converted = dataclasses.asdict(obj)
        elif isinstance(obj, (datetime, date, UUID)):
            converted = str(obj)
        elif hasattr(obj, '__dict__'):
            converted = obj.__dict__
        else:
            converted = super().default(obj)
        return converted
|
|
|
|
|
def to_json_string(obj: Any, **kwargs) -> str:
    """
    Return ``obj`` encoded as a JSON string.

    Encoding is delegated to ``json.dumps`` with ``_RecursiveSerializer`` as
    the encoder class, so in addition to the standard JSON-compatible types
    (lists, dicts, strings, numbers, bools, None) the following are accepted:

    - Pydantic models (via ``model_dump()``).
    - Dataclasses (via ``dataclasses.asdict()``).
    - ``datetime``, ``date`` and ``UUID`` values (via ``str()``).
    - Custom classes exposing a ``__dict__``.

    Args:
        obj (Any): The Python object to serialize.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``json.dumps`` (for example ``indent=2``).

    Returns:
        str: The JSON text.

    Example:
        >>> from datetime import datetime
        >>> from pydantic import BaseModel
        >>> from dataclasses import dataclass

        >>> class Address(BaseModel):
        ...     street: str
        ...     city: str

        >>> @dataclass
        ... class Product:
        ...     id: int
        ...     name: str

        >>> class Order(BaseModel):
        ...     user_address: Address
        ...     item: Product

        >>> order_obj = Order(
        ...     user_address=Address(street="123 Main St", city="Example City"),
        ...     item=Product(id=1, name="Laptop")
        ... )

        >>> print(to_json_string(order_obj, indent=2))
        {
          "user_address": {
            "street": "123 Main St",
            "city": "Example City"
          },
          "item": {
            "id": 1,
            "name": "Laptop"
          }
        }
    """
    encoded = json.dumps(obj, cls=_RecursiveSerializer, **kwargs)
    return encoded
|
|
|