jamescg committed on
Commit
6a3deac
1 Parent(s): 5d3e016

Rename ransient ingenstion_data.py to ingest_data.py

Browse files
Files changed (2) hide show
  1. ingest_data.py +23 -0
  2. ransient ingenstion_data.py +0 -8
ingest_data.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.document_loaders import UnstructuredFileLoader
3
+ from langchain.vectorstores.faiss import FAISS
4
+ from langchain.embeddings import OpenAIEmbeddings
5
+ import pickle
6
+
7
+ # Load Data
8
+ loader = UnstructuredFileLoader("state_of_the_union.txt")
9
+ raw_documents = loader.load()
10
+
11
+ # Split text
12
+ text_splitter = RecursiveCharacterTextSplitter()
13
+ documents = text_splitter.split_documents(raw_documents)
14
+
15
+
16
+ # Load Data to vectorstore
17
+ embeddings = OpenAIEmbeddings()
18
+ vectorstore = FAISS.from_documents(documents, embeddings)
19
+
20
+
21
+ # Save vectorstore
22
+ with open("vectorstore.pkl", "wb") as f:
23
+ pickle.dump(vectorstore, f)
ransient ingenstion_data.py DELETED
@@ -1,8 +0,0 @@
1
- from langchain.document_loaders import TextLoader
2
- loader = TextLoader('SMR4 publication.txt')
3
- from langchain.indexes import VectorstoreIndexCreator
4
- index = VectorstoreIndexCreator().from_loaders([loader])
5
-
6
- # Load Data
7
- loader = TextLoader("SMR4 publication.txt")
8
- raw_documents = loader.load()