"""Add OpenAI embedding vectors to each article in updateddata.json.

Abstracts are embedded in batches of 750 with a sleep between batches
because the OpenAI API is rate limited to ~1M tokens/min. A full run
takes about an hour; the augmented dataset is written to
datawithembeds.json.
"""
import json
import time

from openai import OpenAI

BATCH_SIZE = 750          # papers per embeddings API call
SLEEP_SECONDS = 61        # pause between batches to stay under 1M tokens/min
MAX_ABSTRACT_CHARS = 2048  # truncate long abstracts before embedding


def _load_api_key(path='.openai-secret'):
    """Read the OpenAI API key from *path*.

    Loaded lazily (only when main() runs) so importing this module has no
    filesystem side effects.
    """
    with open(path, 'r') as f:
        return f.read().strip()


def read_dataset(path='updateddata.json'):
    """Load and return the list of article dicts from *path*."""
    print("Reading dataset")
    # `with` closes the handle; the original left the file open.
    with open(path) as json_file:
        return json.load(json_file)


def _prepare_abstracts(papers):
    """Return cleaned, truncated abstract strings for one batch of papers."""
    cleaned = []
    for paper in papers:
        abstract = paper['abstract'][:MAX_ABSTRACT_CHARS]
        # NOTE: preserves the original behavior of JSON-encoding the text
        # (ASCII-escaped, surrounding quotes included) before embedding.
        abstract = json.dumps(abstract, ensure_ascii=True)
        cleaned.append(abstract.replace("\n", " ").strip())
    return cleaned


def main():
    """Embed every abstract and write the augmented dataset to disk."""
    data = read_dataset()
    print(len(data))
    client = OpenAI(api_key=_load_api_key())

    result = []
    # range() handles every dataset size, including fewer than BATCH_SIZE
    # items; the original `while max <= len(data)` loop (with max starting
    # at 750) silently processed nothing for small datasets.
    for start in range(0, len(data), BATCH_SIZE):
        end = min(start + BATCH_SIZE, len(data))
        print("------------")
        print("startind", start, "endind", end)
        paper_subset = data[start:end]

        abstract_list = _prepare_abstracts(paper_subset)
        # Rough size check only -- this counts characters, not tokens.
        print("numtokens:", sum(len(a) for a in abstract_list))

        res = client.embeddings.create(
            model="text-embedding-3-small",
            input=abstract_list,
            encoding_format="float",
        )
        print("Successful API call")

        # Attribute access instead of the deprecated res.dict() accessor.
        for paper, item in zip(paper_subset, res.data):
            paper['embed'] = item.embedding
        print("Added embeds")
        # extend keeps the output a flat list of articles; the original
        # append produced an unintended list-of-lists (one sublist per batch).
        result.extend(paper_subset)

        # No need to sleep after the final batch.
        if end < len(data):
            time.sleep(SLEEP_SECONDS)

    with open("datawithembeds.json", 'w') as f:
        json.dump(result, f)


if __name__ == '__main__':
    main()