Chatacter / chatacter /db /vector_database.py
MH0386's picture
Upload folder using huggingface_hub
68051dd verified
from typing import Any, List
from pydantic import StrictStr
from qdrant_client import QdrantClient
from qdrant_client.fastembed_common import QueryResponse
# Module-level Qdrant client configured for localhost:6333. It is constructed
# when this module is imported and is used by add_data and query_db.
client = QdrantClient(host="localhost", port=6333)
def get_chunks(url: StrictStr) -> List[Element]:
    """Partition the document at ``url``, clean each element's text, chunk it.

    Args:
        url: Location of the document, forwarded to ``partition``.

    Returns:
        The result of ``chunk_by_title`` applied to the cleaned elements.
    """
    elements: List[Element] = partition(url=url)

    # Cleaners are applied in this exact order; each one receives the text
    # produced by the previous one.
    # NOTE(review): non-ASCII characters are removed first, so the later
    # unicode-quote replacement may have nothing left to act on -- confirm
    # that this ordering is intended.
    cleaners = (
        clean_non_ascii_chars,
        replace_unicode_quotes,
        clean_extra_whitespace,
        bytes_string_to_string,
    )
    for element in elements:
        for clean in cleaners:
            element.text = clean(text=element.text)

    return chunk_by_title(elements=elements)
def add_data(chunks: List[Element]) -> None:
    """Add the text and metadata of ``chunks`` to the vector database.

    Each chunk contributes its ``text`` as the document and
    ``metadata.to_dict()`` as the metadata. Entries are written to the
    collection named by ``settings.vector_database_name`` through the
    module-level ``client``.

    Args:
        chunks: Elements whose text and metadata are stored.
    """
    docs: List[StrictStr] = [chunk.text for chunk in chunks]
    metadata: List[dict[str, Any]] = [
        chunk.metadata.to_dict() for chunk in chunks
    ]
    # IDs are sequential and 1-based within this call.
    # NOTE(review): because numbering restarts at 1 on every call, a second
    # call reuses IDs from the first; presumably that replaces the earlier
    # points in the collection -- confirm this is acceptable.
    ids = list(range(1, len(chunks) + 1))
    client.add(
        collection_name=settings.vector_database_name,
        documents=docs,
        metadata=metadata,
        ids=ids,
    )
def query_db(query: StrictStr) -> List[QueryResponse]:
    """Query the configured collection with free text.

    Args:
        query: Text passed to ``client.query`` as ``query_text``.

    Returns:
        Whatever ``client.query`` returns for the collection named by
        ``settings.vector_database_name``.
    """
    responses = client.query(
        query_text=query,
        collection_name=settings.vector_database_name,
    )
    return responses
if __name__ == "__main__":
    # Manual smoke run: ingest one page, then query it and print the number
    # of responses followed by the responses themselves.
    source_url = "https://en.wikipedia.org/wiki/Napoleon"
    add_data(chunks=get_chunks(url=source_url))
    results: List[QueryResponse] = query_db(query="Napoleon Bonaparte")
    print(len(results))
    print(results)