File size: 1,958 Bytes
1457d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94dc00e
1457d21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
af971a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import tiktoken
from random import shuffle

# Module-level tokenizer used by `tiktoken_len` to count tokens.
# It is resolved from the 'gpt-4' model name. Note that, despite its name,
# `tokenizer_name` holds the object returned by `encoding_for_model` (not a
# plain string); its `.name` attribute is what gets passed to `get_encoding`.
tokenizer_name = tiktoken.encoding_for_model('gpt-4')
tokenizer = tiktoken.get_encoding(tokenizer_name.name)

def tiktoken_len(text):
    """Return how many tokens the module-level `tokenizer` yields for `text`.

    An empty `disallowed_special` is passed to `encode` -- presumably so that
    text containing special-token markers is counted as ordinary text instead
    of being rejected (confirm against the tiktoken documentation).
    """
    return len(tokenizer.encode(text, disallowed_special=()))


class Knowledge:
    """Collect reference passages from a vector store and render them.

    `db` must expose `similarity_search_with_score(query, k=...)` returning a
    sequence of `(document, score)` pairs whose document has a `page_content`
    attribute; that is how the results are consumed in `collect_knowledge`.
    """

    def __init__(self, db):
        self.db = db
        # Each entry is {"content": <passage text>, "score": <search score>}.
        self.contents = []

    def collect_knowledge(self, keywords_dict: dict, max_query: int) -> None:
        """Query the store once per keyword and accumulate the hits.

        Args:
            keywords_dict: Mapping whose keys are the search phrases, e.g.
                {"machine learning": 5, "language model": 2}. Only the keys
                are used here; the values are ignored.
            max_query: Maximum number of hits requested (and kept) per
                keyword. Nothing is collected when it is not positive.

        The accumulated `self.contents` list is shuffled in place afterwards.
        """
        if max_query <= 0:
            return
        for keyword in keywords_dict:
            hits = self.db.similarity_search_with_score(keyword, k=max_query)
            # Walk over what the store actually returned: it may hold fewer
            # than `max_query` matches, and indexing up to `max_query` would
            # raise IndexError in that case.
            for doc, score in hits[:max_query]:
                # todo: add more meta information; clean the page_content
                self.contents.append({
                    "content": doc.page_content.replace('\n', ' '),
                    "score": score,
                })
        # sort contents by score / shuffle
        shuffle(self.contents)

    def to_prompts(self, max_tokens=2048):
        """Render the collected passages as "Reference <idx>: <text>" lines.

        Passages are added in order while the running token count (measured
        with `tiktoken_len`) stays strictly below `max_tokens`; the first
        passage that reaches the budget, and everything after it, is dropped.

        Returns:
            The concatenated prompt text, or "" when nothing was collected.
        """
        prompts = []
        used_tokens = 0
        for idx, content in enumerate(self.contents):
            prompt = "Reference {}: {}\n".format(idx, content["content"])
            used_tokens += tiktoken_len(prompt)
            if used_tokens >= max_tokens:
                break
            prompts.append(prompt)
        return "".join(prompts)

    def to_json(self):
        """Return the collected passages as a JSON-friendly dict.

        Keys are the stringified positions in `self.contents`; each value has
        the passage "content" and its "score" converted to `str`. Returns an
        empty dict when nothing was collected.
        """
        if not self.contents:
            return {}
        output = {
            str(idx): {
                "content": content["content"],
                "score": str(content["score"]),
            }
            for idx, content in enumerate(self.contents)
        }
        # The result is also echoed to stdout (kept from the original code).
        print(output)
        return output