Upload 8 files
- Pipfile +20 -0
- Pipfile.lock +0 -0
- __pycache__/consts.cpython-39.pyc +0 -0
- app.py +85 -0
- backend/core.py +28 -0
- consts.py +1 -0
- dataIngestionFromPdfToPinecone.py +38 -0
- requirements.txt +84 -0
Pipfile
ADDED
@@ -0,0 +1,20 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pypdf = "*"
+langchain = "*"
+pinecone-client = "*"
+python-dotenv = "*"
+openai = "*"
+tiktoken = "*"
+streamlit = "*"
+typing = "*"
+streamlit-chat = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.9"
Pipfile.lock
ADDED
The diff for this file is too large to render.
__pycache__/consts.cpython-39.pyc
ADDED
Binary file (182 Bytes).
app.py
ADDED
@@ -0,0 +1,85 @@
+from typing import Set
+
+from backend.core import run_llm
+import streamlit as st
+from streamlit_chat import message
+from PIL import Image
+from io import BytesIO
+import base64
+
+# def add_bg_from_local(image_file):
+#     with open(image_file, "rb") as image_file:
+#         encoded_string = base64.b64encode(image_file.read())
+#     st.markdown(
+#         f"""
+#         <style>
+#         .stApp {{
+#             background-image: url(data:{"jpeg"};base64,{encoded_string.decode()});
+#             background-size: cover
+#         }}
+#         </style>
+#         """,
+#         unsafe_allow_html=True
+#     )
+# background_image = "bg2.jpeg"
+# add_bg_from_local(background_image)
+
+
+st.header("Get to know Anuj !!!")
+
+if "user_prompt_history" not in st.session_state:
+    st.session_state["user_prompt_history"] = []
+
+if "chat_answers_history" not in st.session_state:
+    st.session_state["chat_answers_history"] = []
+
+if "chat_history" not in st.session_state:
+    st.session_state["chat_history"] = []
+
+
+def create_sources_string(source_urls: Set[str]) -> str:
+    if not source_urls:
+        return ""
+    sources_list = list(source_urls)
+    sources_list.sort()
+    sources_string = "sources:\n"
+    for i, source in enumerate(sources_list):
+        sources_string += f"{i+1}. {source}\n"
+    return sources_string
+
+
+prompt = st.text_input("Prompt", placeholder="Enter your question here (eg. Who is Anuj Mahajan, What is his work experience ?)...")
+
+
+if prompt:
+    with st.spinner("Generating response..."):
+        generated_response = run_llm(
+            query=prompt, chat_history=st.session_state["chat_history"]
+        )
+        sources = set(
+            [doc.metadata["source"] for doc in generated_response["source_documents"]]
+        )
+        formatted_response = (
+            f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
+        )
+
+        st.session_state.user_prompt_history.append(prompt)
+        st.session_state.chat_answers_history.append(formatted_response)
+        st.session_state.chat_history.append((prompt, generated_response["answer"]))
+
+if st.session_state["chat_answers_history"]:
+    for generated_response, user_query in zip(
+        st.session_state["chat_answers_history"],
+        st.session_state["user_prompt_history"],
+    ):
+        message(
+            user_query,
+            is_user=True,
+            avatar_style="adventurer",
+            seed=123,
+        )
+        # message(generated_response)
+        st.write(
+            f'<div style="word-wrap: break-word;">{generated_response}</div>',
+            unsafe_allow_html=True,
+        )
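To preview the chat UI without live OpenAI and Pinecone credentials, one option is to temporarily point app.py at a stub that returns the same dict shape it reads from run_llm (an "answer" string plus "source_documents" whose metadata carries a "source" key). The sketch below is purely illustrative; fake_run_llm and FakeDoc are hypothetical names and are not part of this upload.

from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple

@dataclass
class FakeDoc:
    # Mirrors the only attribute app.py touches on each source document.
    metadata: Dict[str, Any] = field(default_factory=dict)

def fake_run_llm(query: str, chat_history: List[Tuple[str, str]]) -> Dict[str, Any]:
    # Canned response with the same keys app.py unpacks from run_llm's output.
    return {
        "answer": f"(stubbed answer for: {query})",
        "source_documents": [FakeDoc(metadata={"source": "resume.pdf"})],
    }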
backend/core.py
ADDED
@@ -0,0 +1,28 @@
+import os
+from typing import Any, Dict, List
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.chat_models import ChatOpenAI
+from langchain.chains import ConversationalRetrievalChain
+from langchain.vectorstores import Pinecone
+import pinecone
+
+from dotenv import load_dotenv
+
+from consts import INDEX_NAME
+load_dotenv()
+
+pinecone.init(
+    api_key=os.environ.get("PINECONE_API_KEY"),
+    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
+)
+
+def run_llm(query, chat_history):
+    embeddings = OpenAIEmbeddings()
+    docSearch = Pinecone.from_existing_index(index_name = INDEX_NAME, embedding = embeddings)
+    chat = ChatOpenAI(verbose = True, temperature = 0)
+
+    qa = ConversationalRetrievalChain.from_llm(llm = chat, retriever = docSearch.as_retriever(), return_source_documents = True)
+
+    return qa({"question" : query, "chat_history" : chat_history})
+
+
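A minimal smoke test of run_llm, assuming OPENAI_API_KEY, PINECONE_API_KEY and PINECONE_ENVIRONMENT_REGION are set (directly or via a .env file) and the pdf-parser index already contains the ingested resume; the question text is only an example.

from backend.core import run_llm

result = run_llm(query="Who is Anuj Mahajan?", chat_history=[])
print(result["answer"])
# return_source_documents=True makes the chain include the retrieved docs.
for doc in result["source_documents"]:
    print(doc.metadata["source"])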
consts.py
ADDED
@@ -0,0 +1 @@
+INDEX_NAME = "pdf-parser"
dataIngestionFromPdfToPinecone.py
ADDED
@@ -0,0 +1,38 @@
+import os
+
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.chains import RetrievalQA
+from langchain.llms import OpenAI
+
+from langchain.vectorstores import Pinecone
+import pinecone
+from dotenv import load_dotenv
+
+from consts import INDEX_NAME
+
+load_dotenv()
+
+pinecone.init(
+    api_key=os.environ.get("PINECONE_API_KEY"),
+    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
+)
+
+def ingestDataFromPdfIntoPinecone():
+    print('Reading Data from PDF')
+    pdf_path = "/Users/anujmahajan/Desktop/Anuj Documents/Resume/PDF/Amazon/Anuj Mahajan - IUB MS CS - CV.pdf"
+    loader = PyPDFLoader(file_path=pdf_path)
+    documents = loader.load()
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000, chunk_overlap=30, separators=["\n\n", "\n", " ", ""]
+    )
+    documents = text_splitter.split_documents(documents=documents)
+
+    print(f"Going to insert {len(documents)} to Pinecone")
+    embeddings = OpenAIEmbeddings()
+    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
+    print("****** Added to Pinecone vectorstore vectors")
+
+if __name__ == "__main__":
+    ingestDataFromPdfIntoPinecone()
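Note that pdf_path above is an absolute path on the author's machine, so the ingestion script only runs as-is there. One hedged alternative is to read the path from an environment variable; PDF_PATH and the fallback file name below are assumed for illustration and are not defined anywhere in this upload.

import os

# Hypothetical: fall back to a PDF next to the script if PDF_PATH is unset.
pdf_path = os.environ.get("PDF_PATH", "resume.pdf")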
requirements.txt
ADDED
@@ -0,0 +1,84 @@
+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+async-timeout==4.0.3
+attrs==23.2.0
+blinker==1.7.0
+cachetools==5.3.2
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+dataclasses-json==0.6.3
+distro==1.9.0
+dnspython==2.4.2
+exceptiongroup==1.2.0
+frozenlist==1.4.1
+gitdb==4.0.11
+GitPython==3.1.40
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.26.0
+idna==3.6
+importlib-metadata==6.11.0
+Jinja2==3.1.2
+jsonpatch==1.33
+jsonpointer==2.4
+jsonschema==4.20.0
+jsonschema-specifications==2023.12.1
+langchain==0.0.354
+langchain-community==0.0.8
+langchain-core==0.1.5
+langsmith==0.0.77
+loguru==0.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+marshmallow==3.20.1
+mdurl==0.1.2
+multidict==6.0.4
+mypy-extensions==1.0.0
+numpy==1.26.3
+openai==1.6.1
+packaging==23.2
+pandas==2.1.4
+pillow==10.2.0
+pinecone-client==2.2.4
+protobuf==4.25.1
+pyarrow==14.0.2
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydeck==0.8.1b0
+Pygments==2.17.2
+pypdf==3.17.4
+python-dateutil==2.8.2
+python-dotenv==1.0.0
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.0
+regex==2023.12.25
+requests==2.31.0
+rich==13.7.0
+rpds-py==0.16.2
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.0
+SQLAlchemy==2.0.25
+streamlit==1.29.0
+streamlit-chat==0.1.1
+tenacity==8.2.3
+tiktoken==0.5.2
+toml==0.10.2
+toolz==0.12.0
+tornado==6.4
+tqdm==4.66.1
+typing==3.7.4.3
+typing-inspect==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+tzlocal==5.2
+urllib3==2.1.0
+validators==0.22.0
+yarl==1.9.4
+zipp==3.17.0