sameemul-haque committed
Commit
48f76d5
0 Parent(s):

feat: initial commit

Files changed (3)
  1. .env.example +1 -0
  2. .gitignore +4 -0
  3. app.py +87 -0
.env.example ADDED
@@ -0,0 +1 @@
+ HUGGINGFACEHUB_API_TOKEN = "YOUR_HUGGINGFACEHUB_API_TOKEN"
.gitignore ADDED
@@ -0,0 +1,4 @@
+ Documents
+ .env
+ venv
+ test
app.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ import pickle
+ import textwrap
+
+ from dotenv import load_dotenv
+ from pprint import pprint
+ from langchain.chains import RetrievalQA
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
+ from langchain_community.llms import HuggingFaceHub
+ from langchain_community.vectorstores import FAISS
+
+ # InstructorEmbedding and faiss must be installed, but langchain imports
+ # them internally, so no direct import is needed here.
+
+ # Load environment variables from .env (expects HUGGINGFACEHUB_API_TOKEN)
+ load_dotenv()
+
+ # Load every PDF in the Documents directory
+ loader = DirectoryLoader('./Documents/', glob="./*.pdf", loader_cls=PyPDFLoader)
+ documents = loader.load()
+
+ # Split the documents into overlapping chunks for retrieval
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+ texts = text_splitter.split_documents(documents)
+
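+ # With chunk_size=1000 and chunk_overlap=200, consecutive chunks share about
+ # 200 characters, so text cut at a chunk boundary is still retrievable.
+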
+ def store_embeddings(docs, embeddings, store_name, path):
+     # Build a FAISS index over the chunks and pickle it to disk
+     vectorStore = FAISS.from_documents(docs, embeddings)
+     with open(f"{path}/faiss_{store_name}.pkl", "wb") as f:
+         pickle.dump(vectorStore, f)
+
+ def load_embeddings(store_name, path):
+     # Load a previously pickled FAISS index
+     with open(f"{path}/faiss_{store_name}.pkl", "rb") as f:
+         VectorStore = pickle.load(f)
+     return VectorStore
+
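+ # Note (assumption, not part of this commit): instead of pickling the whole
+ # object, the FAISS vector store also supports save_local/load_local, e.g.:
+ #   vectorStore.save_local(f"{path}/{store_name}")
+ #   vectorStore = FAISS.load_local(f"{path}/{store_name}", embeddings)
+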
+ # Instructor embeddings (downloads hkunlp/instructor-xl on first use)
+ instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
+ Embedding_store_path = "./Embedding_store"
+
+ # store_embeddings(texts, instructor_embeddings, store_name='instructEmbeddings', path=Embedding_store_path)
+ # db_instructEmbedd = load_embeddings(store_name='instructEmbeddings', path=Embedding_store_path)
+
+ # Index the chunks and retrieve the 3 most similar ones per query;
+ # search_type defaults to "similarity"
+ db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)
+ retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 3})
+ docs = retriever.get_relevant_documents("What is Operating System?")
+ # pprint(docs[0])
+ # pprint(docs[1])
+ # pprint(docs[2])
+
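+ # Illustration (not in this commit): max-marginal-relevance retrieval can
+ # reduce near-duplicate chunks among the results:
+ #   retriever = db_instructEmbedd.as_retriever(search_type="mmr", search_kwargs={"k": 3})
+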
+ # Initialize the model
+
+ # Smaug-72B (not wired up)
+ # model_smaug = ollama.Model("smaug-72b")
+
+ # falcon-7b served by the Hugging Face Hub inference API
+ os.environ["HUGGINGFACEHUB_API_TOKEN"]  # fail fast with a KeyError if the token is missing
+ llm = HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature": 0.1, "max_length": 512})
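+ # temperature=0.1 keeps generations close to the retrieved text; max_length=512
+ # caps the generated sequence length.
+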
+ # Create the chain to answer questions; "stuff" concatenates all retrieved
+ # chunks into a single prompt
+ qa_chain_instructEmbed = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
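+ # Note (assumption, not part of this commit): chain_type can also be
+ # "map_reduce" or "refine" when the retrieved chunks would overflow a single
+ # falcon-7b prompt.
+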
+ ## Cite sources
+ def wrap_text_preserve_newlines(text, width=110):
+     # Split the input text into lines based on newline characters
+     lines = text.split('\n')
+     # Wrap each line individually
+     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
+     # Join the wrapped lines back together using newline characters
+     wrapped_text = '\n'.join(wrapped_lines)
+     return wrapped_text
+
+ def process_llm_response(llm_response):
+     # Print the answer, then the path of each source PDF it was drawn from
+     print(wrap_text_preserve_newlines(llm_response['result']))
+     print('\nSources:')
+     for source in llm_response["source_documents"]:
+         print(source.metadata['source'])
+
+ query = 'What is operating system?'
+
+ # print('-------------------Instructor Embeddings------------------\n')
+ llm_response = qa_chain_instructEmbed(query)
+ process_llm_response(llm_response)
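+ # With return_source_documents=True the response is a dict with keys 'query',
+ # 'result', and 'source_documents' (Documents whose metadata['source'] holds
+ # the originating PDF path).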