File size: 1,958 Bytes
01b8e8e
27e0350
01b8e8e
 
 
 
 
 
 
39503cb
01b8e8e
39503cb
01b8e8e
 
 
 
 
 
 
39503cb
27e0350
01b8e8e
27e0350
6a6afbf
27e0350
 
01b8e8e
 
 
39503cb
01b8e8e
 
 
 
 
101be32
01b8e8e
101be32
 
6bb1fd5
 
 
 
cfc1673
6bb1fd5
 
42468fb
 
 
 
101be32
39503cb
 
 
101be32
39503cb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from haystack.schema import Document
from haystack.document_stores import BaseDocumentStore
import uuid


def format_docs(documents):
    """Convert raw document mappings into ``Document`` objects.

    Args:
        documents: Iterable of subscriptable items. Each must provide a
            ``"text"`` entry; an ``"id"`` entry is optional and may be ``None``.

    Returns:
        A ``(db_docs, doc_ids)`` tuple. ``db_docs`` is the list of ``Document``
        objects that were built, and ``doc_ids`` holds the value stored under
        ``meta["id"]`` of each of them, in the same order.
    """
    db_docs: list = []
    for doc in documents:
        # A missing "id" key is handled like an explicit None instead of
        # raising KeyError: both mean "no id supplied".
        try:
            doc_id = doc["id"]
        except KeyError:
            doc_id = None
        if doc_id is None:
            doc_id = str(uuid.uuid4())
        db_docs.append(
            Document(
                content=doc["text"],
                content_type="text",
                # The Document itself always gets a fresh id; the caller's id
                # is only kept in the metadata.
                id=str(uuid.uuid4()),
                meta={"id": doc_id},
            )
        )
    return db_docs, [db_doc.meta["id"] for db_doc in db_docs]


def index(documents, pipeline, clear_index=True):
    """Format the documents, run them through the pipeline, and return their ids.

    Args:
        documents: Raw documents, passed to ``format_docs``.
        pipeline: Pipeline that receives the formatted documents via ``run``.
        clear_index: When true, ``delete_index`` is called with the store's own
            ``index`` on every ``BaseDocumentStore`` node of the pipeline before
            the pipeline is run.

    Returns:
        The document ids produced by ``format_docs``.
    """
    prepared_docs, ids = format_docs(documents)
    if clear_index:
        for store in pipeline.get_nodes_by_class(class_type=BaseDocumentStore):
            store.delete_index(store.index)
    pipeline.run(documents=prepared_docs)
    return ids


def search(queries, pipeline):
    """Run a batch of queries through the pipeline and return matches per query.

    Args:
        queries: Queries forwarded unchanged to ``pipeline.run_batch``.
        pipeline: Object exposing ``run_batch(queries=...)`` whose result has a
            ``"documents"`` entry holding one iterable of results per query.
            Each result must provide ``content``, ``meta`` (with an ``"id"``
            key), ``id`` and ``score`` attributes.

    Returns:
        A list with one list of match dicts per entry of ``"documents"``. Every
        match has ``"text"``, ``"id"``, ``"fragment_id"`` and ``"meta"``.
        ``"score"`` is added for each result whose score is not ``None``, and
        ``"content_audio"`` for each result that has that attribute. When all
        results of a query are scored, its matches are sorted by descending
        score; otherwise the order returned by the pipeline is kept.
    """
    results = []
    batch_output = pipeline.run_batch(queries=queries)
    for matches in batch_output["documents"]:
        query_results = []
        all_scored = True
        for res in matches:
            match = {
                "text": res.content,
                "id": res.meta["id"],
                "fragment_id": res.id,
                "meta": res.meta,
            }
            if res.score is None:
                # One unscored result makes the ranking meaningless, so the
                # sort below is skipped, but scored results keep their score.
                all_scored = False
            else:
                match["score"] = res.score
            if hasattr(res, "content_audio"):
                match["content_audio"] = res.content_audio
            query_results.append(match)
        if all_scored:
            query_results.sort(key=lambda m: m["score"], reverse=True)
        results.append(query_results)
    return results