import io
import os
import posixpath

from azure.storage.blob import ContainerClient
from PIL import Image
def get_file_from_azure_blob_storage(path):
    """Download a blob and return its content as an in-memory binary stream.

    The container is addressed through the SAS URL stored in the
    ``AZURE_SAS_URL_TRD`` environment variable.

    Args:
        path: Name of the blob inside the container.

    Returns:
        An ``io.BytesIO`` holding the full blob content, positioned at the
        start.

    Raises:
        KeyError: If ``AZURE_SAS_URL_TRD`` is not set in the environment.
    """
    sas_url = os.environ["AZURE_SAS_URL_TRD"]
    # The client owns an HTTP transport; the context manager closes it once
    # the download is complete instead of leaking it on every call.
    with ContainerClient.from_container_url(sas_url) as container_client:
        blob_client = container_client.get_blob_client(path)
        # readall() pulls the whole blob into memory before the client closes.
        content = blob_client.download_blob().readall()
    return io.BytesIO(content)
def get_image_from_azure_blob_storage(path):
    """Fetch an image stored under ``climateqa/documents/`` and open it.

    Args:
        path: Blob path relative to ``climateqa/documents/``.

    Returns:
        The object returned by ``Image.open`` for the downloaded bytes.
    """
    base_path = "climateqa/documents/"
    # Blob names always use forward slashes. os.path.join would insert
    # backslashes on Windows and yield a blob name that does not exist;
    # posixpath.join behaves the same as os.path.join on POSIX systems.
    blob_path = posixpath.join(base_path, path)
    file_object = get_file_from_azure_blob_storage(blob_path)
    return Image.open(file_object)
def remove_duplicates_keep_highest_score(documents):
    """Collapse documents sharing a ``doc_id`` to the best-scoring one.

    For each distinct ``metadata['doc_id']`` the document with the strictly
    greatest ``metadata['reranking_score']`` is kept; on a tie the earliest
    document wins. The result follows the order in which each ``doc_id`` was
    first seen.

    Documents whose metadata has no ``doc_id`` all share the key ``None`` and
    are therefore treated as duplicates of one another.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping.

    Returns:
        A list with one document per distinct ``doc_id``.
    """
    best_by_id = {}
    for candidate in documents:
        key = candidate.metadata.get('doc_id')
        if key not in best_by_id:
            # First sighting: stored without looking at its score.
            best_by_id[key] = candidate
            continue
        incumbent = best_by_id[key]
        if candidate.metadata['reranking_score'] > incumbent.metadata['reranking_score']:
            best_by_id[key] = candidate
    return list(best_by_id.values())