angelesteban00 committed on
Commit
8b091a4
1 Parent(s): 168d589
Files changed (3) hide show
  1. app.py +2 -2
  2. load_data_from_PDF.py +34 -0
  3. requirements.txt +2 -0
app.py CHANGED
@@ -21,11 +21,11 @@ Demo based on https://www.mongodb.com/developer/products/atlas/rag-atlas-vector-
21
  ## Prerequisites:
22
  create a free DB called "langchain_demo" and a collection called "collection_of_text_blobs" in MongoDB Atlas (https://cloud.mongodb.com). After that, you have two options:
23
 
24
- **option1**) execute locally "load_data.py" to create new documents and their embeddings in MongoDB<br>
25
  **option2**) import the JSON file "langchain_demo.collection_of_text_blobs.json"
26
 
27
  ## Dataset
28
- The JSON documents in MongoDB look like:
29
  ```
30
  {
31
  "_id": {
 
21
  ## Prerequisites:
22
  create a free DB called "langchain_demo" and a collection called "collection_of_text_blobs" in MongoDB Atlas (https://cloud.mongodb.com). After that, you have two options:
23
 
24
+ **option1**) run "load_data.py" or "load_data_from_PDF.py" locally to create new documents and their embeddings in MongoDB<br>
25
  **option2**) import the JSON file "langchain_demo.collection_of_text_blobs.json"
26
 
27
  ## Dataset
28
+ The JSON documents in MongoDB look like this (the PDF https://arxiv.org/pdf/2303.08774.pdf was also split and embedded):
29
  ```
30
  {
31
  "_id": {
load_data_from_PDF.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo import MongoClient
2
+ # old import path, raises an error since Jan 2024: from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain_openai import OpenAIEmbeddings
4
+ # old import path, raises an error since Jan 2024: from langchain.vectorstores import MongoDBAtlasVectorSearch
5
+ from langchain_community.vectorstores import MongoDBAtlasVectorSearch
6
+ # old import path, raises an error since Jan 2024: from langchain.document_loaders import PyPDFLoader
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ import os
10
+
11
+ mongo_uri = os.getenv("MONGO_URI")
12
+ openai_api_key = os.getenv("OPENAI_API_KEY")
13
+
14
+ client = MongoClient(mongo_uri)
15
+ dbName = "langchain_demo"
16
+ collectionName = "collection_of_text_blobs"
17
+ collection = client[dbName][collectionName]
18
+
19
+ #loader = DirectoryLoader( './sample_files', glob="./*.txt", show_progress=True)
20
+ loader = PyPDFLoader("https://arxiv.org/pdf/2303.08774.pdf")
21
+ data = loader.load()
22
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
23
+ docs = text_splitter.split_documents(data)
24
+
25
+ #embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
26
+ #vectorStore = MongoDBAtlasVectorSearch.from_documents( data, embeddings, collection=collection, index_name="default" )
27
+
28
+ # insert the split documents and their embeddings into MongoDB Atlas Vector Search
29
+ x = MongoDBAtlasVectorSearch.from_documents(
30
+ documents=docs,
31
+ embedding=OpenAIEmbeddings(openai_api_key=openai_api_key, disallowed_special=()),
32
+ collection=collection,
33
+ index_name="default"
34
+ )
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  langchain
 
 
2
  langchain-openai
3
  pymongo[srv]==4.1.1
4
  bs4
 
1
  langchain
2
+ pypdf
3
+ python-dotenv
4
  langchain-openai
5
  pymongo[srv]==4.1.1
6
  bs4