davidheineman
committed on
Commit
•
8e2f8d0
1
Parent(s):
6d2b619
add knn init
Browse files- .gitignore +3 -1
- knn_db_init.py +81 -0
.gitignore
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
__pycache__
|
2 |
experiments
|
3 |
.openai-secret
|
4 |
-
.mongodb-secret
|
|
|
|
|
|
1 |
__pycache__
|
2 |
experiments
|
3 |
.openai-secret
|
4 |
+
.mongodb-secret
|
5 |
+
demo.mov
|
6 |
+
.DS_Store
|
knn_db_init.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import time
|
3 |
+
from openai import OpenAI
|
4 |
+
|
5 |
+
# Load the OpenAI API key from the local secret file.
# .strip() drops the trailing newline most editors append to the file;
# a key containing a newline produces a malformed Authorization header.
with open('.openai-secret', 'r') as f:
    OPENAI_API_KEY = f.read().strip()
|
6 |
+
|
7 |
+
|
8 |
+
def read_dataset():
    """Load the articles dataset from ``updateddata.json``.

    Returns:
        The parsed JSON payload (a list of article dicts, each expected
        to carry an ``'abstract'`` key — see ``main``).
    """
    print("Reading dataset")
    # Context manager closes the handle on all paths; the original
    # opened the file and never closed it.
    with open('updateddata.json') as json_file:
        return json.load(json_file)
|
13 |
+
|
14 |
+
|
15 |
+
def main():
    """
    Take the articles JSON and add a vector-embedding field to each entry.

    Abstracts are embedded in batches because the OpenAI API is rate
    limited to 1M tokens / min; a hardcoded sleep separates batches.
    In total, takes about an hour to run.
    """
    data = read_dataset()
    print(len(data))

    # ~750 abstracts of <=2048 chars each stays under the per-minute limit.
    batch_size = 750

    client = OpenAI(api_key=OPENAI_API_KEY)

    newjson = []

    # Iterate over every batch, including a final partial one. The original
    # `while max <= len(data)` loop (with max starting at 750) never ran at
    # all for datasets smaller than one batch, silently producing no embeds.
    for start in range(0, len(data), batch_size):
        end = min(start + batch_size, len(data))
        print("------------")
        print("startind", start, "endind", end)

        paper_subset = data[start:end]

        # Truncate each abstract and JSON-escape it to plain ASCII before
        # sending it to the embeddings endpoint.
        abstract_list = [
            json.dumps(paper['abstract'][:2048], ensure_ascii=True)
            for paper in paper_subset
        ]

        # Rough size estimate, printed for rate-limit monitoring only.
        print("numtokens:", sum(len(a[:2048]) for a in abstract_list))

        abstract_list = [a.replace("\n", " ").strip() for a in abstract_list]

        res = client.embeddings.create(
            model="text-embedding-3-small",
            input=abstract_list,
            encoding_format="float",
        )
        resdata = res.dict()['data']

        print("Successful API call")

        # Attach each embedding to its source paper; the endpoint returns
        # results in input order.
        for paper, item in zip(paper_subset, resdata):
            paper['embed'] = item['embedding']

        print("Added embeds")

        # NOTE(review): batches are appended, not extended, so the output
        # file is a list of batches (list of lists) — preserved from the
        # original; confirm downstream consumers expect this shape.
        newjson.append(paper_subset)

        # Sleep between batches only (the rate-limit window resets each
        # minute); no need to wait after the final batch.
        if end < len(data):
            time.sleep(61)

    with open("datawithembeds.json", 'w') as f:
        json.dump(newjson, f)
|
78 |
+
|
79 |
+
|
80 |
+
# Script entry point: run the embedding pipeline only when executed
# directly, not when imported as a module.
if __name__ == '__main__':
    main()
|