File size: 1,517 Bytes
c2e6cab
 
 
 
 
 
 
 
 
 
35fb63f
c2e6cab
35fb63f
c2e6cab
35fb63f
c2e6cab
 
 
 
 
 
 
35fb63f
c2e6cab
 
 
 
 
 
35fb63f
c2e6cab
35fb63f
c2e6cab
 
35fb63f
 
 
c2e6cab
 
 
 
 
 
35fb63f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from typing import List, Optional

from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain_core.documents.base import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.vectorstores import VectorStore

class ClimateQARetriever(BaseRetriever):
    """Retriever that runs a scored similarity search and drops low-score hits.

    The query is sent to ``vectorstore.similarity_search_with_score`` with
    ``k=k_total`` and ``filter=filter``. Hits whose score is below
    ``threshold`` are discarded; the remaining documents are returned with a
    few extra metadata keys attached.
    """

    # Vector store queried for candidate documents.
    vectorstore: VectorStore
    # Checked to be a list at query time; NOT applied to the search itself
    # (only `filter` is forwarded to the vector store).
    sources: list = []
    # Not read by `_get_relevant_documents`.
    reports: list = []
    # Hits with a score strictly below this value are dropped.
    threshold: float = 0.01
    # Not read by `_get_relevant_documents`.
    k_summary: int = 3
    # Number of candidates requested from the vector store.
    k_total: int = 7
    # Not read by `_get_relevant_documents`.
    min_size: int = 200
    # Forwarded verbatim as the `filter` argument of the similarity search;
    # None means no filter is supplied.
    filter: Optional[dict] = None

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Return the documents matching ``query`` whose score passes the threshold.

        Args:
            query: Text to search for.
            run_manager: Callback manager supplied by the retriever framework;
                not used here.

        Returns:
            The retrieved documents, in the order returned by the vector
            store, each with ``similarity_score``, ``content``,
            ``chunk_type`` and ``page_number`` set in its metadata.

        Raises:
            AssertionError: If ``sources`` is not a list.
        """
        # Only the container type is checked; element values are not inspected.
        assert isinstance(self.sources, list)

        docs = self.vectorstore.similarity_search_with_score(
            query=query, k=self.k_total, filter=self.filter
        )

        results = []
        for doc, score in docs:
            # Skip hits scoring below the threshold.
            # NOTE(review): assumes a higher score means a closer match —
            # confirm for the backing vector store.
            if score < self.threshold:
                continue
            # Mutates the returned Document's metadata in place.
            doc.metadata["similarity_score"] = score
            doc.metadata["content"] = doc.page_content
            doc.metadata["chunk_type"] = "text"
            # Page number is hard-coded to 1 for every hit.
            doc.metadata["page_number"] = 1
            results.append(doc)
        return results