leomaurodesenv committed on
Commit
4ce2e5d
1 Parent(s): 80fa8ee

feat(utils): Create utils, add get unique documents function

Browse files
Files changed (1) hide show
  1. utils.py +22 -0
utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''This module contains utility functions for the project'''
2
+ import mmh3
3
+ from haystack import Document
4
+
5
+
6
def get_unique_docs(dataset):
    '''Build one haystack Document per distinct context in the dataset.

    Rows are processed in order. A row is skipped when its "context" value
    is None, or when its "context_id" was already seen in an earlier kept
    row, so only the first occurrence of each context_id produces a Document.

    Args:
        dataset: iterable of dictionaries; each row is read through the keys
            "context", "context_id" and "context_title".

    Returns:
        docs: list of haystack.Document, in first-seen order. Each Document
            has the row's context as its content and a meta dict holding
            'title' (from "context_title") and 'context_id'.
    '''
    seen_ids = set()
    docs = []
    for row in dataset:
        context = row["context"]
        # Rows without a context carry nothing to index; skip them before
        # touching any other key.
        if context is None:
            continue

        context_id = row["context_id"]
        # Only the first row for a given context_id is kept.
        if context_id in seen_ids:
            continue
        seen_ids.add(context_id)

        docs.append(
            Document(
                content=context,
                meta={'title': row["context_title"], 'context_id': context_id},
            )
        )
    return docs