sameemul-haque committed on
Commit
2b5265f
1 Parent(s): 377f682

feat: add MongoDB integration

Browse files
Files changed (3) hide show
  1. .env.example +2 -1
  2. app.py +37 -10
  3. requirements.txt +4 -3
.env.example CHANGED
@@ -1 +1,2 @@
1
- HUGGINGFACEHUB_API_TOKEN = "YOUR_HUGGINGFACEHUB_API_TOKEN"
 
 
1
+ HUGGINGFACEHUB_API_TOKEN = "YOUR_HUGGINGFACEHUB_API_TOKEN"
2
+ MONGODB_CONNECTION_STRING = "YOUR_MONGODB_CONNECTION_STRING"
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os, textwrap
2
  from dotenv import load_dotenv
3
  from langchain.chains import RetrievalQA
@@ -6,6 +7,7 @@ from langchain_community.llms import HuggingFaceHub
6
  from langchain_community.document_loaders import PyPDFLoader
7
  from langchain_community.document_loaders import DirectoryLoader
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
9
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
10
  from flask import Flask, request
11
 
@@ -14,31 +16,56 @@ app = Flask(__name__)
14
  @app.route('/',methods=['GET'])
15
 
16
  def main():
17
- query = request.args.get('q')
18
- # query = unquote(query)
19
-
20
  # load env
21
  load_dotenv()
 
 
 
 
 
 
 
 
 
 
22
 
23
  # load pdfs from the Documents directory
24
- loader = DirectoryLoader(f'./Documents/', glob="./*.pdf", loader_cls=PyPDFLoader)
25
- documents = loader.load()
26
 
27
  # split the documents into chunks
28
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
29
- texts = text_splitter.split_documents(documents)
30
 
31
  instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
32
 
33
  # create the retriever
34
- db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)
35
- retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})
36
  # retriever search type is similarity search
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  # query = 'What is operating system?'
39
 
40
  # Initialize the model falcon-7b
41
- os.environ["HUGGINGFACEHUB_API_TOKEN"]
42
  llm=HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature":0.1 ,"max_length":512})
43
 
44
  # create the chain to answer questions
 
1
+ import pymongo
2
  import os, textwrap
3
  from dotenv import load_dotenv
4
  from langchain.chains import RetrievalQA
 
7
  from langchain_community.document_loaders import PyPDFLoader
8
  from langchain_community.document_loaders import DirectoryLoader
9
  from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ from langchain_community.vectorstores import MongoDBAtlasVectorSearch
11
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
12
  from flask import Flask, request
13
 
 
16
  @app.route('/',methods=['GET'])
17
 
18
  def main():
 
 
 
19
  # load env
20
  load_dotenv()
21
+ mongodb_connection_string = os.getenv("MONGODB_CONNECTION_STRING")
22
+ os.environ["HUGGINGFACEHUB_API_TOKEN"]
23
+
24
+ # connect to mongodb
25
+ client = pymongo.MongoClient(mongodb_connection_string)
26
+ db = client.test_database
27
+ collection = db.textbooks
28
+
29
+ query = request.args.get('q')
30
+ # query = unquote(query)
31
 
32
  # load pdfs from the Documents directory
33
+ # loader = DirectoryLoader(f'./Documents/', glob="./*.pdf", loader_cls=PyPDFLoader)
34
+ # documents = loader.load()
35
 
36
  # split the documents into chunks
37
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
38
+ # texts = text_splitter.split_documents(documents)
39
 
40
  instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
41
 
42
  # create the retriever
43
+ # db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)
44
+ # retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})
45
  # retriever search type is similarity search
46
+
47
+ # # create the retriever and do embedding
48
+ # vector_search = MongoDBAtlasVectorSearch.from_documents(
49
+ # documents=texts,
50
+ # embedding=instructor_embeddings,
51
+ # collection=collection,
52
+ # index_name="default",
53
+ # )
54
+
55
+ vector_search = MongoDBAtlasVectorSearch.from_connection_string(
56
+ mongodb_connection_string,
57
+ "test_database" + "." + "textbooks",
58
+ instructor_embeddings,
59
+ index_name="default",
60
+ )
61
+ retriever = vector_search.as_retriever(
62
+ search_type="similarity",
63
+ search_kwargs={"k": 3},
64
+ )
65
 
66
  # query = 'What is operating system?'
67
 
68
  # Initialize the model falcon-7b
 
69
  llm=HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature":0.1 ,"max_length":512})
70
 
71
  # create the chain to answer questions
requirements.txt CHANGED
@@ -1,10 +1,11 @@
1
- Flask==3.0.2
2
  Gunicorn
3
- python-dotenv==1.0.1
4
- langchain==0.1.6
5
  pypdf==4.0.1
6
  InstructorEmbedding==1.0.1
7
  torch==2.2.1
8
  tqdm==4.66.2
9
  sentence-transformers==2.2.2
10
  faiss-cpu==1.7.4
 
 
1
+ Flask
2
  Gunicorn
3
+ python-dotenv
4
+ langchain
5
  pypdf==4.0.1
6
  InstructorEmbedding==1.0.1
7
  torch==2.2.1
8
  tqdm==4.66.2
9
  sentence-transformers==2.2.2
10
  faiss-cpu==1.7.4
11
+ pymongo