subhankarhalder committed on
Commit
c9d9b06
·
verified ·
1 Parent(s): fa2c8db

Create train.py

Browse files
Files changed (1) hide show
  1. train.py +41 -0
train.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Build a FAISS index over the pages of a PDF.

The script loads ``ship.pdf``, embeds the text of every loaded document
with a Together AI embedding model, and writes two artifacts:

* ``faiss.index`` -- a flat L2 FAISS index holding one vector per document.
* ``list_of_texts.pkl`` -- the pickled list of document texts, in the same
  order as the vectors, so row ``i`` of the index maps to
  ``list_of_texts[i]``.
"""
import os
import pickle
import time

import faiss
import numpy as np
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain_together.embeddings import TogetherEmbeddings

PDF_PATH = "ship.pdf"
INDEX_PATH = "faiss.index"
TEXTS_PATH = "list_of_texts.pkl"
EMBEDDING_MODEL = "togethercomputer/m2-bert-80M-8k-retrieval"


def main() -> None:
    """Embed every document loaded from ``PDF_PATH`` and persist the index.

    Raises:
        SystemExit: if the loader returns no documents, because there is
            nothing to embed and the vector dimension cannot be determined.
    """
    # The key must be exported before the embeddings client is constructed.
    # ``st`` was previously referenced without being imported (NameError).
    os.environ["TOGETHER_API_KEY"] = st.secrets["together_api_key"]
    embeddings = TogetherEmbeddings(model=EMBEDDING_MODEL)

    data = PyPDFLoader(PDF_PATH).load()
    print(f'You have {len(data)} document(s) in your data')
    if not data:
        # Fail with a clear message instead of an IndexError on data[0].
        raise SystemExit(f"No documents could be loaded from {PDF_PATH!r}")
    print(f'There are {len(data[0].page_content)} characters in your sample document')
    print(f'Here is a sample: {data[0].page_content}')

    list_of_texts = [doc.page_content for doc in data]
    # embed_documents is the document-side counterpart of embed_query and
    # accepts the whole batch at once; it returns one vector per input text.
    list_of_embeddings = embeddings.embed_documents(list_of_texts)

    # FAISS expects a 2-D float32 array of shape (n_vectors, dimension).
    embeddings_array = np.asarray(list_of_embeddings, dtype="float32")
    dimension = embeddings_array.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_array)

    # Save the index
    faiss.write_index(index, INDEX_PATH)
    # Save the list of texts
    with open(TEXTS_PATH, "wb") as f:
        pickle.dump(list_of_texts, f)


if __name__ == "__main__":
    main()