|
import json |
|
import time |
|
from openai import OpenAI |
|
|
|
# Read the API key once at import time. .strip() removes the trailing
# newline that editors leave at the end of the secret file — without it the
# key is sent with an embedded '\n' and every API call is rejected.
with open('.openai-secret', 'r') as f:
    OPENAI_API_KEY = f.read().strip()
|
|
|
|
|
def read_dataset():
    """Load the article list from updateddata.json.

    Returns:
        The parsed JSON payload (expected to be a list of article dicts,
        each with at least an 'abstract' key — see main()).
    """
    print("Reading dataset")
    # Context manager guarantees the file handle is closed; the original
    # opened the file and never closed it.
    with open('updateddata.json') as json_file:
        return json.load(json_file)
|
|
|
|
|
def main():
    """
    Add an 'embed' field (OpenAI vector embedding of the abstract) to each
    article from updateddata.json and write the result to datawithembeds.json.

    Articles are sent in batches of 750 with a 61-second pause between API
    calls because the OpenAI API is rate limited to 1M tokens / min.
    In total, takes about an hour to run.
    """
    data = read_dataset()
    print(len(data))

    batch_size = 750  # was hard-coded as mini/max stepping in the original

    client = OpenAI(api_key=OPENAI_API_KEY)

    newjson = []

    # range(0, len(data), batch_size) visits every article, including the
    # final partial batch. (The original `while max <= len(data)` loop never
    # ran at all when the dataset had fewer than 750 entries, silently
    # writing an empty output file.)
    for start in range(0, len(data), batch_size):
        end = min(start + batch_size, len(data))
        print("------------")
        print("startind", start, "endind", end)

        paper_subset = data[start:end]

        # Truncate each abstract to 2048 chars and JSON-escape it to plain
        # ASCII before sending it to the embeddings endpoint.
        abstract_list = [
            json.dumps(paper['abstract'][:2048], ensure_ascii=True)
            for paper in paper_subset
        ]

        # Rough payload-size check for the log. NOTE(review): this counts
        # characters, not true tokens, despite the label — kept as-is so the
        # log output is unchanged.
        totallen = sum(len(a[:2048]) for a in abstract_list)
        print("numtokens:", totallen)

        abstract_list = [a.replace("\n", " ").strip() for a in abstract_list]

        res = client.embeddings.create(
            model="text-embedding-3-small",
            input=abstract_list,
            encoding_format="float",
        )
        print("Successful API call")

        # res.data preserves input order, so zip pairs each paper with its
        # embedding. Typed attribute access replaces the deprecated
        # pydantic-v1 style res.dict()['data'] lookup.
        for paper, item in zip(paper_subset, res.data):
            paper['embed'] = item.embedding
        print("Added embeds")

        # NOTE(review): append (not extend) keeps the original output shape —
        # a list of batches, not a flat list of articles. Confirm downstream
        # consumers expect the nested structure before flattening.
        newjson.append(paper_subset)

        # Rate-limit pause between batches only; no need to sleep after the
        # last one. (Replaces the fragile `max == len(data)` break check.)
        if end < len(data):
            time.sleep(61)

    with open("datawithembeds.json", 'w') as f:
        json.dump(newjson, f)
|
|
|
|
|
# Entry-point guard: run the embedding pipeline only when this file is
# executed directly, not when imported.
if __name__ == '__main__':
    main()