santhoshml committed · Commit e90e7d7 · Parent(s): 1d36b50

committing v1

Files changed:
- .gitattributes +1 -0
- Dockerfile +14 -0
- app.py +52 -0
- meta-10k.pdf +3 -0
- requirements.txt +7 -0
- utils.py +63 -0

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+meta-10k.pdf filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED
@@ -0,0 +1,14 @@
+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.11.8
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
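
Note: the CMD serves app.main:app with uvicorn, but this commit only adds a top-level app.py script, so there is no app/main.py module for uvicorn to import. Below is a minimal sketch of what that module could look like, assuming fastapi and uvicorn are installed and OPENAI_API_KEY is provided as a Space secret; the /ask route and every name in it are hypothetical, not part of the commit.

# Hypothetical app/main.py -- a sketch of the ASGI app the CMD above expects.
from operator import itemgetter

from fastapi import FastAPI

from utils import Utils  # utils.py from this commit, importable from /code

helper = Utils()            # loads meta-10k.pdf plus the chat and embedding models
prompt = helper.init_prompt()
helper.split_into_chunks()  # must run before get_vector_store()
retriever = helper.get_vector_store().as_retriever()

chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")}
    | prompt
    | helper.get_llm_model()
)

app = FastAPI()

@app.get("/ask")
def ask(question: str) -> dict:
    # OPENAI_API_KEY must already be set in the environment; a getpass
    # prompt (as in app.py) would block inside a container.
    return {"answer": chain.invoke({"question": question}).content}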

app.py ADDED
@@ -0,0 +1,52 @@
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+import tiktoken
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.document_loaders import PyMuPDFLoader
+from langchain_community.vectorstores import Qdrant
+from langchain_openai.embeddings import OpenAIEmbeddings
+from operator import itemgetter
+from langchain.schema.output_parser import StrOutputParser
+from langchain.schema.runnable import RunnablePassthrough
+from utils import *
+import os
+import getpass
+from langchain.globals import set_debug
+
+
+class RAGMeta10K:
+
+    def __init__(self) -> None:
+        os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
+
+        # set_debug(True)
+
+        self.UtilsObject = Utils()
+        self.rag_prompt_template = self.UtilsObject.init_prompt()
+        self.UtilsObject.split_into_chunks()
+        self.qdrant_retriever = self.UtilsObject.get_vector_store().as_retriever()
+
+    def ask_question(self, question: str):
+        retrieval_augmented_qa_chain = (
+            {"context": itemgetter("question") | self.qdrant_retriever, "question": itemgetter("question")}
+            | RunnablePassthrough.assign(context=itemgetter("context"))
+            | {"response": self.rag_prompt_template | self.UtilsObject.get_llm_model(), "context": itemgetter("context")}
+        )
+
+        response = retrieval_augmented_qa_chain.invoke({"question": question})
+        print("response: " + response["response"].content)
+        # print("*******")
+        # for context in response["context"]:
+        #     print("Context:")
+        #     print(context)
+        #     print("----")
+
+
+ragObject = RAGMeta10K()
+ragObject.ask_question("Who are Directors?")  # works
+ragObject.ask_question("what is the value of Total cash and cash equivalents ?")  # works
+
+# ragObject.ask_question("What is the value of total cash and cash equivalents?")
+# ragObject.ask_question("Who is the Board Chair and Chief Executive Officer ?")
+# ragObject.ask_question("Who is the Board Chair and Chief Executive Officer ?")
+
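
The chain in ask_question fans the question out in parallel -- once through the Qdrant retriever to collect context, once passed through unchanged -- and returns a dict carrying both the model's AIMessage ("response") and the retrieved documents ("context"). StrOutputParser is imported but never applied; below is a sketch of a string-returning variant, under the same assumptions (Utils from utils.py on the path, OPENAI_API_KEY set in the environment).

# Sketch: the same retrieval chain, collapsed to a plain string with the
# StrOutputParser that app.py imports but never uses.
from operator import itemgetter

from langchain.schema.output_parser import StrOutputParser

from utils import Utils

helper = Utils()
prompt = helper.init_prompt()
helper.split_into_chunks()
retriever = helper.get_vector_store().as_retriever()

string_chain = (
    {"context": itemgetter("question") | retriever,
     "question": itemgetter("question")}
    | prompt
    | helper.get_llm_model()
    | StrOutputParser()  # AIMessage -> its .content string
)

print(string_chain.invoke({"question": "Who are Directors?"}))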

meta-10k.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e8fadc2448e4f99ad0ec2dc2e41d13b864204955238cf1f7cd9c96839f274a6c
+size 2481466

requirements.txt ADDED
@@ -0,0 +1,7 @@
+langchain-openai
+langchain-core
+tiktoken
+langchain
+langchain-community
+pymupdf
+qdrant-client
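
PyMuPDFLoader needs the pymupdf distribution (import name fitz) and Qdrant.from_documents needs qdrant-client, which is why both appear in the list; the stdlib typing module never needs installing, and pip treats langchain_openai and langchain-openai as the same distribution. A quick smoke test, meant to be run inside the built image, that the set covers every third-party import the code touches (a sketch; fastapi and uvicorn from the Dockerfile CMD are assumed to be installed separately):

# Import smoke test for the dependency set above.
import langchain
import langchain_community
import langchain_core
import langchain_openai
import tiktoken
import fitz          # pymupdf's import name
import qdrant_client

print("all runtime imports resolve")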

utils.py ADDED
@@ -0,0 +1,63 @@
+from langchain_openai import ChatOpenAI
+import tiktoken
+from langchain.document_loaders import PyMuPDFLoader
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai.embeddings import OpenAIEmbeddings
+from typing import List
+from langchain_core.documents.base import Document
+from langchain_community.vectorstores import Qdrant
+
+
+class Utils:
+    def __init__(
+        self,
+        llm_name: str = "gpt-3.5-turbo",
+        pdf_name: str = "meta-10k.pdf",
+        embedding_model: str = "text-embedding-3-small",
+    ) -> None:
+        self.openai_chat_model = ChatOpenAI(model=llm_name)
+        self.enc = tiktoken.encoding_for_model(llm_name)
+        self.docs = PyMuPDFLoader(pdf_name).load()
+        self.embedding_model = OpenAIEmbeddings(model=embedding_model)
+
+    # semanticTextSplitter
+    # tokenRTextSplitter
+    def split_into_chunks(self):
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500, chunk_overlap=50, length_function=self.tiktoken_len
+        )
+        self.split_chunks = text_splitter.split_documents(self.docs)
+        return self.split_chunks
+
+    def get_llm_model(self):
+        return self.openai_chat_model
+
+    def init_prompt(self) -> ChatPromptTemplate:
+        RAG_PROMPT = """
+        ###Instruction###:
+        Answer the question based only on the following context. If you cannot answer the question with the context, please respond with "I don't know":
+
+        CONTEXT:
+        {context}
+
+        QUERY:
+        {question}
+        """
+        rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)
+        return rag_prompt
+
+    def tiktoken_len(self, text) -> int:
+        self.tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode(
+            text,
+        )
+        return len(self.tokens)
+
+    def get_vector_store(self):
+        self.qdrant_vectorstore = Qdrant.from_documents(
+            self.split_chunks,
+            self.embedding_model,
+            location=":memory:",
+            collection_name="meta-10k",
+        )
+        return self.qdrant_vectorstore
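
Because length_function is tiktoken_len, the splitter's chunk_size=500 and chunk_overlap=50 are measured in gpt-3.5-turbo tokens rather than characters. A small sketch of the difference:

# Token length vs. character length for the splitter above.
import tiktoken

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
sample = "Total cash and cash equivalents are reported in the consolidated balance sheets."
print(len(sample))              # character count
print(len(enc.encode(sample)))  # token count -- what chunk_size=500 actually limits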