File size: 1,728 Bytes
f7db77c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from tqdm import tqdm
import numpy as np
from commons.Configs import configs
from commons.File import file
from commons.OpenAIClient import openaiClient


class Embeddings:
    """Generate question/answer embeddings and load them back as NumPy arrays.

    Paths are taken from ``configs``, file access goes through ``file`` and
    the embedding vectors are requested from ``openaiClient``.
    """

    def __init__(self, debug=False):
        """Store the ``debug`` flag (no method of this class reads it)."""
        self.debug = debug

    def generateEmbeddings(self):
        """Embed every dataset entry and write all results to one file.

        The dataset is read from ``configs.generatedDatasetPath``. For each
        entry, its ``'question'`` and ``'answer'`` values are sent together
        to ``openaiClient.generateEmbeddings``; the first returned item is
        stored under ``'question'``, the second under ``'answer'``, and the
        entry's position in the dataset becomes its ``'label'``. The
        collected records are written to ``configs.generatedEmbeddingsPath``.
        """
        datasetPath = configs.generatedDatasetPath
        embeddingsPath = configs.generatedEmbeddingsPath
        qaPairs = file.readJsonFile(datasetPath)
        records = []
        print("")
        # One embedding request per question/answer pair.
        for label, pair in enumerate(tqdm(qaPairs)):
            texts = [pair['question'], pair['answer']]
            vectors = openaiClient.generateEmbeddings(texts)
            record = {
                'question': vectors[0],
                'answer': vectors[1],
                'label': label,
            }
            print("Sentence: ", label, texts)
            records.append(record)
        # Persist everything in a single write once the loop has finished.
        # Default: io/generated/embeddings.json
        print("Writing embeddings to file: ", embeddingsPath)
        file.writeFile(embeddingsPath, records)

    def loadEmbeddings(self):
        """Read the stored embeddings and return them as three arrays.

        Returns:
            A tuple ``(questions, answers, labels)`` where the first two are
            ``float32`` arrays built from each record's ``'question'`` and
            ``'answer'`` values and the last is an ``int32`` array built from
            each record's ``'label'``.
        """
        records = file.readJsonFile(configs.generatedEmbeddingsPath)

        def column(key):
            # Pull one field out of every stored record, in file order.
            return [record[key] for record in records]

        questionVectors = column('question')
        answerVectors = column('answer')
        labelValues = column('label')
        # float16 would be smaller, but it caused issues on GPU; float32 is
        # kept so a later move to GPU does not hit them again.
        return (
            np.array(questionVectors, dtype=np.float32),
            np.array(answerVectors, dtype=np.float32),
            np.array(labelValues, dtype=np.int32),
        )


# Module-level instance created at import time (debug defaults to False).
embeddings = Embeddings()