File size: 1,735 Bytes
de37907
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8dca23
de37907
 
 
 
 
d8dca23
 
 
 
 
 
de37907
 
 
 
 
 
 
 
 
 
 
 
 
d8dca23
de37907
 
 
 
 
 
 
 
d8dca23
de37907
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import glob
import json
from tokenizing import tokenize_doc, tokenize_doc_to_str, tokenize_text
from rank_bm25 import BM25Okapi
import os
import pickle
import numpy as np
from tqdm import tqdm
import torch

# Corpus records accumulated from every processed JSON file.
docs = []
base_path = "./Data"
# Locations of the pickled BM25 index and of the pickled tokenized corpus.
bm25_path = os.path.join(base_path, "bm25.pkl")
tokenized_docs_path = os.path.join(base_path, "tokenized_docs.pkl")

# Take all JSON files whose names end in '_processed'. Each file's parsed
# content is passed to list.extend, so it is expected to be a JSON array.
# NOTE(review): glob.glob gives no ordering guarantee, while the BM25 scores
# used further down are matched to `docs` by position -- confirm that the
# pickled index was built from the files in this same order.
for path in glob.glob(f"{base_path}/*_processed.json"):
    print(path)
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        docs.extend(json.load(f))
    
# Position of a document of interest; only ever assigned by ad-hoc debugging
# code, nothing below reads it.
index = 0

# Load the prebuilt BM25 index instead of re-tokenizing the whole corpus.
# NOTE: pickle.load can execute arbitrary code contained in the file -- only
# load an index file that was generated locally by a trusted run.
with open(bm25_path, 'rb') as f:
    bm25 = pickle.load(f)

# Scores are matched to `docs` by position, so an index built from a different
# corpus would silently return the wrong documents. Warn instead of aborting
# so the script stays usable for quick experiments.
indexed_count = getattr(bm25, "corpus_size", None)
if indexed_count is not None and indexed_count != len(docs):
    print(
        f"Warning: BM25 index covers {indexed_count} documents but "
        f"{len(docs)} documents were loaded; results may be misaligned. "
        "Rebuild the index."
    )

# tokenized_docs = [tokenize_doc(doc) for doc in tqdm(docs, desc="Tokenizing documents")]

# bm25 = BM25Okapi(tokenized_docs)

# with open(tokenized_docs_path, 'wb') as f:
#     pickle.dump(tokenized_docs, f)
# with open(bm25_path, 'wb') as f:
#     pickle.dump(bm25, f)

# Number of best-matching documents to print.
TOP_K = 1

message = "tell me a joke about sandwich before eating it"
tokenized_message = tokenize_text(message)
print(tokenized_message)

# One score per indexed document. Stay in numpy for both scoring and sorting:
# wrapping the scores in a torch tensor and then calling np.argsort on it
# relies on implicit numpy/torch conversion and leaks tensor wrappers into
# the list indexing and the printed output.
scores = np.asarray(bm25.get_scores(tokenized_message))
# Ascending order: the most relevant documents are at the END of this array.
sorted_doc_indices = np.argsort(scores)

# Walk from the best match downwards. Slicing (rather than indexing with -i)
# simply yields fewer results when the corpus is smaller than TOP_K or empty.
for doc_index in sorted_doc_indices[::-1][:TOP_K]:
    doc_index = int(doc_index)  # plain int for list indexing and clean output
    print("Score:", float(scores[doc_index]))
    print(docs[doc_index])
    print("Doc number:", doc_index)

# result_docs = [docs[i] for i in sorted_doc_indices[-30:] if scores[i] > 0]

# return result_docs[::-1] # Return the top n documents in descending order which means the most relevant documents are first