santhoshml committed on
Commit
e90e7d7
1 Parent(s): 1d36b50

committing v1

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. Dockerfile +14 -0
  3. app.py +52 -0
  4. meta-10k.pdf +3 -0
  5. requirements.txt +7 -0
  6. utils.py +63 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ meta-10k.pdf filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.11.8

WORKDIR /code

# Copy the requirements first so the dependency layer is cached across code changes.
COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# app.py is a plain script, not an ASGI application: there is no app/main.py
# and no `app` object for uvicorn to serve, so run the script directly.
# NOTE(review): app.py may prompt for the OpenAI API key on stdin; run the
# container interactively (`docker run -it`) when it does. If this is meant to
# be served as a web app on port 7860, a real server entry point is still needed.
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ import tiktoken
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.document_loaders import PyMuPDFLoader
6
+ from langchain_community.vectorstores import Qdrant
7
+ from langchain_openai.embeddings import OpenAIEmbeddings
8
+ from operator import itemgetter
9
+ from langchain.schema.output_parser import StrOutputParser
10
+ from langchain.schema.runnable import RunnablePassthrough
11
+ from utils import *
12
+ import os
13
+ import getpass
14
+ from langchain.globals import set_debug
15
+
16
+
class RAGMeta10K:
    """Retrieval-augmented question answering built on top of ``Utils``.

    Construction builds the prompt, splits the loaded documents into chunks,
    embeds them into the vector store returned by ``Utils.get_vector_store``
    and keeps a retriever over that store. ``ask_question`` then sends a
    question through retrieval, the RAG prompt and the chat model.
    """

    def __init__(self) -> None:
        """Set up the API key, prompt, chunks and retriever.

        The key is only prompted for when ``OPENAI_API_KEY`` is not already
        set, so a key supplied through the environment is respected and
        non-interactive runs do not block on ``getpass``.
        """
        if not os.environ.get("OPENAI_API_KEY"):
            os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

        # Uncomment to trace chain execution:
        # set_debug(True)

        self.UtilsObject = Utils()
        self.rag_prompt_template = self.UtilsObject.init_prompt()
        self.UtilsObject.split_into_chunks()
        self.qdrant_retriever = self.UtilsObject.get_vector_store().as_retriever()

    def ask_question(self, question: str):
        """Answer ``question`` from the retrieved context and print the answer.

        Args:
            question: The user question; it is used both as the retrieval
                query and as the ``{question}`` slot of the prompt.

        Returns:
            The chain output: a dict whose ``"response"`` entry is the chat
            model's message (its ``.content`` is the answer text) and whose
            ``"context"`` entry is whatever the retriever returned for the
            question.
        """
        retrieval_augmented_qa_chain = (
            # Step 1: retrieve context for the question, carry the question along.
            {
                "context": itemgetter("question") | self.qdrant_retriever,
                "question": itemgetter("question"),
            }
            # Step 2: re-assign "context" to itself so both keys flow onward.
            | RunnablePassthrough.assign(context=itemgetter("context"))
            # Step 3: generate the answer and keep the context next to it.
            | {
                "response": self.rag_prompt_template | self.UtilsObject.get_llm_model(),
                "context": itemgetter("context"),
            }
        )

        response = retrieval_augmented_qa_chain.invoke({"question": question})
        print("response :" + response["response"].content)
        return response
43
+
44
+
# Exercise the pipeline with the questions that were marked as working.
ragObject = RAGMeta10K()
for _question in (
    "Who are Directors?",  # works
    "what is the value of Total cash and cash equivalents ?",  # works
):
    ragObject.ask_question(_question)

# Other phrasings that were tried and left disabled:
# ragObject.ask_question("What is the value of total cash and cash equivalents?")
# ragObject.ask_question("Who are the is the Board Chair and Chief Executive Officer ?")
# ragObject.ask_question("Who is the Board Chair and Chief Executive Officer ?")
52
+
meta-10k.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8fadc2448e4f99ad0ec2dc2e41d13b864204955238cf1f7cd9c96839f274a6c
3
+ size 2481466
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
# LangChain stack (langchain-openai was previously listed twice under two spellings)
langchain
langchain_core
langchain_community
langchain-openai
tiktoken
# Needed at runtime by the Qdrant vector store and PyMuPDFLoader used in utils.py
qdrant-client
pymupdf
# `typing` removed: it is part of the standard library on Python 3.11
utils.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ import tiktoken
3
+ from langchain.document_loaders import PyMuPDFLoader
4
+ from langchain_core.prompts import ChatPromptTemplate
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_openai.embeddings import OpenAIEmbeddings
7
+ from typing import List
8
+ from langchain_core.documents.base import Document
9
+ from langchain_community.vectorstores import Qdrant
10
+
11
+
class Utils:
    """Building blocks for a RAG pipeline over a single PDF.

    Holds the chat model, the tokenizer matching that model, the documents
    loaded from the PDF and the embedding model, and exposes helpers to chunk
    the documents, build the prompt and create an in-memory Qdrant store.
    """

    def __init__(
        self,
        llm_name: str = "gpt-3.5-turbo",
        pdf_name: str = "meta-10k.pdf",
        embedding_model: str = "text-embedding-3-small",
    ) -> None:
        """Create the models and load the PDF.

        Args:
            llm_name: Chat model name; also selects the tiktoken encoding
                used to measure chunk length.
            pdf_name: Path of the PDF to load.
            embedding_model: OpenAI embedding model name.
        """
        self.openai_chat_model = ChatOpenAI(model=llm_name)
        self.enc = tiktoken.encoding_for_model(llm_name)
        self.docs = PyMuPDFLoader(pdf_name).load()
        self.embedding_model = OpenAIEmbeddings(model=embedding_model)
        # Filled by split_into_chunks(); None means "not split yet".
        self.split_chunks = None

    # Original notes on splitters to try: semanticTextSplitter, tokenRTextSplitter
    def split_into_chunks(self) -> "List[Document]":
        """Split the loaded documents into overlapping chunks.

        Chunk size (500) and overlap (50) are measured with ``tiktoken_len``,
        i.e. in tokens rather than characters.

        Returns:
            The chunk list, which is also stored on ``self.split_chunks``.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=self.tiktoken_len,
        )
        self.split_chunks = text_splitter.split_documents(self.docs)
        return self.split_chunks

    def get_llm_model(self):
        """Return the chat model created in ``__init__``."""
        return self.openai_chat_model

    def init_prompt(self) -> "ChatPromptTemplate":
        """Build the RAG prompt with ``{context}`` and ``{question}`` slots."""
        RAG_PROMPT = """
###Instruction###:
Answer the question based only on the following context. If you cannot answer the question with the context, please respond with "I don't know":

CONTEXT:
{context}

QUERY:
{question}
"""
        rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
        return rag_prompt

    def tiktoken_len(self, text) -> int:
        """Return the number of tokens in ``text``.

        Uses the encoding resolved once in ``__init__`` for the configured
        model, so the count matches ``llm_name`` instead of a hard-coded
        model and the encoding is not looked up again on every call.
        """
        return len(self.enc.encode(text))

    def get_vector_store(self):
        """Embed the chunks into a new in-memory Qdrant collection.

        If ``split_into_chunks`` has not been called yet, it is run first so
        this method no longer depends on call order.

        Returns:
            The Qdrant vector store, also stored on ``self.qdrant_vectorstore``.
        """
        if self.split_chunks is None:
            self.split_into_chunks()
        self.qdrant_vectorstore = Qdrant.from_documents(
            self.split_chunks,
            self.embedding_model,
            location=":memory:",
            collection_name="meta-10k",
        )
        return self.qdrant_vectorstore