# colbert-acl / knn_db_init.py
import json
import time

from openai import OpenAI

# Read the API key from a local secret file; strip any trailing newline so the
# key is usable as-is in request headers.
with open('.openai-secret', 'r') as f:
    OPENAI_API_KEY = f.read().strip()
def read_dataset():
    print("Reading dataset")
    with open('updateddata.json') as json_file:  # close the file when done
        data = json.load(json_file)
    return data
def main():
    """
    The purpose of this script is to take the articles JSON and add a field
    containing the vector embedding of each abstract.

    There is a hardcoded sleep because the OpenAI API is rate-limited to
    1M tokens / min. In total, the script takes about an hour to run.
    """
    data = read_dataset()
    print(len(data))

    batch_size = 750
    start = 0
    end = min(batch_size, len(data))  # don't overshoot on datasets smaller than one batch
    client = OpenAI(api_key=OPENAI_API_KEY)
    papers_with_embeds = []

    while end <= len(data):
        print("------------")
        print("start index", start, "end index", end)
        paper_subset = data[start:end]

        abstract_list = []
        for paper in paper_subset:
            # Truncate each abstract, then JSON-escape it so the request is pure ASCII.
            abstract = paper['abstract'][0:2048]
            abstract = json.dumps(abstract, ensure_ascii=True)
            abstract_list.append(abstract)

        # Character count of the batch: a rough proxy for token usage, not a true token count.
        total_chars = sum(len(a) for a in abstract_list)
        print("total chars (approx. token proxy):", total_chars)

        abstract_list = [a.replace("\n", " ").strip() for a in abstract_list]

        res = client.embeddings.create(
            model="text-embedding-3-small",
            input=abstract_list,
            encoding_format="float"
        )
        print("Successful API call")

        # Responses come back in input order, so zip papers with embeddings directly.
        for paper, item in zip(paper_subset, res.data):
            paper['embed'] = item.embedding
        print("Added embeds")

        papers_with_embeds.extend(paper_subset)  # extend (not append) keeps the output a flat list

        if end == len(data):
            break
        time.sleep(61)  # stay under the per-minute token rate limit
        start += batch_size
        end = min(end + batch_size, len(data))

    with open("datawithembeds.json", 'w') as f:
        json.dump(papers_with_embeds, f)
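
# --- Usage sketch (an assumption, not part of the original script) ----------
# A minimal example of how the saved file might later be queried as a KNN
# "database": brute-force cosine similarity over the stored embeddings.
# numpy, the 'title' field, and the knn_search name are illustrative
# assumptions; the embedding call mirrors the one in main().
def knn_search(query, k=5):
    import numpy as np  # local import: the init script itself doesn't need numpy

    client = OpenAI(api_key=OPENAI_API_KEY)
    res = client.embeddings.create(
        model="text-embedding-3-small",
        input=[query.replace("\n", " ").strip()],
        encoding_format="float"
    )
    q = np.array(res.data[0].embedding)

    with open('datawithembeds.json') as f:
        papers = json.load(f)

    embeds = np.array([p['embed'] for p in papers])
    # Cosine similarity = dot product divided by the product of L2 norms.
    sims = embeds @ q / (np.linalg.norm(embeds, axis=1) * np.linalg.norm(q))
    top_k = np.argsort(sims)[::-1][:k]
    return [(papers[i].get('title'), float(sims[i])) for i in top_k]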
if __name__ == '__main__':
    main()