Spaces:
Running
Running
from typing import Any, List | |
from pydantic import StrictStr | |
from qdrant_client import QdrantClient | |
from qdrant_client.fastembed_common import QueryResponse | |
client = QdrantClient(host="localhost", port=6333) | |
def get_chunks(url: StrictStr) -> List[Element]: | |
elements: List[Element] = partition(url=url) | |
for i in range(len(elements)): | |
elements[i].text = clean_non_ascii_chars(text=elements[i].text) | |
elements[i].text = replace_unicode_quotes(text=elements[i].text) | |
elements[i].text = clean_extra_whitespace(text=elements[i].text) | |
elements[i].text = bytes_string_to_string(text=elements[i].text) | |
return chunk_by_title(elements=elements) | |
def add_data(chunks: List[Element]) -> None: | |
docs: List[StrictStr] = [chunks[i].text for i in range(len(chunks))] | |
metadata: List[dict[str, Any]] = [ | |
chunks[i].metadata.to_dict() for i in range(len(chunks)) | |
] | |
ids = list(range(1, len(chunks) + 1)) | |
client.add( | |
collection_name=settings.vector_database_name, | |
documents=docs, | |
metadata=metadata, | |
ids=ids, | |
) | |
def query_db(query: StrictStr) -> List[QueryResponse]: | |
return client.query( | |
collection_name=settings.vector_database_name, | |
query_text=query, | |
) | |
if __name__ == "__main__": | |
url = "https://en.wikipedia.org/wiki/Napoleon" | |
chunks: List[Element] = get_chunks(url=url) | |
add_data(chunks=chunks) | |
r: List[QueryResponse] = query_db(query="Napoleon Bonaparte") | |
print(len(r)) | |
print(r) | |