File size: 831 Bytes
c200df3
 
4ce2e5d
 
 
 
c200df3
 
 
4ce2e5d
 
 
 
 
c200df3
4ce2e5d
 
 
 
8416f29
 
 
c200df3
 
 
 
 
8416f29
4ce2e5d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
"""This module contains utility functions for the project"""

import mmh3
from haystack import Document


def get_unique_docs(dataset, unique_docs: set):
    """Build haystack Documents for dataset entries not seen before.

    An entry is skipped when its "context" is None or when its
    "context_id" is already present in ``unique_docs``.

    Args:
    dataset: iterable of dict-like entries; each is read via the keys
        "context", "context_id", "context_title" and "url"
    unique_docs: set of context ids already processed; it is updated
        in place with the id of every entry that gets converted

    Returns:
    list of haystack.Document, one per newly seen context id, in the
    order the entries appear in dataset
    """
    collected = []
    for entry in dataset:
        text = entry["context"]
        if text is None:
            continue

        context_id = entry["context_id"]
        if context_id in unique_docs:
            continue

        # Register the id before building the Document so later entries
        # with the same id are skipped.
        unique_docs.add(context_id)
        metadata = {
            "title": entry["context_title"],
            "context_id": context_id,
            "url": entry["url"],
            "source": "QASports",
        }
        collected.append(Document(content=text, meta=metadata))
    return collected