anujmaha commited on
Commit
6951a91
1 Parent(s): ff58a3a

Upload 8 files

Browse files
Pipfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Pipenv manifest for the resume-chatbot app (Streamlit UI + LangChain backend).

[[source]]
# Resolve all packages from the public PyPI index over verified HTTPS.
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pypdf = "*"            # PDF parsing used by the ingestion script
langchain = "*"        # LLM orchestration: chains, embeddings, vectorstores
pinecone-client = "*"  # Pinecone vector-database client
python-dotenv = "*"    # Loads API keys from a local .env file
openai = "*"           # OpenAI chat + embedding models
tiktoken = "*"         # Tokenizer used by OpenAI embeddings
streamlit = "*"        # Web UI framework
typing = "*"           # NOTE(review): the PyPI 'typing' backport is redundant on Python 3.9 — consider removing
streamlit-chat = "*"   # Chat-bubble message widget for the Streamlit UI

[dev-packages]

[requires]
python_version = "3.9"
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
__pycache__/consts.cpython-39.pyc ADDED
Binary file (182 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Set

from backend.core import run_llm
import streamlit as st
from streamlit_chat import message
from PIL import Image
from io import BytesIO
import base64

# Disabled experiment: paint a local JPEG as the full-page background by
# inlining it as a base64 data-URI inside injected CSS.
# def add_bg_from_local(image_file):
#     with open(image_file, "rb") as image_file:
#         encoded_string = base64.b64encode(image_file.read())
#         st.markdown(
#         f"""
#         <style>
#         .stApp {{
#             background-image: url(data:{"jpeg"};base64,{encoded_string.decode()});
#             background-size: cover
#         }}
#         </style>
#         """,
#         unsafe_allow_html=True
#         )
# background_image = "bg2.jpeg"
# add_bg_from_local(background_image)


# Page title shown above the chat widgets.
st.header("Get to know Anuj !!!")

# Initialise the three conversation buffers once per browser session.
# Streamlit reruns this script on every interaction, so each key is
# guarded to avoid wiping history on rerun.
if "user_prompt_history" not in st.session_state:
    # Raw prompts the user has typed, in order.
    st.session_state["user_prompt_history"] = []

if "chat_answers_history" not in st.session_state:
    # Formatted answers (answer text + sources listing) for display.
    st.session_state["chat_answers_history"] = []

if "chat_history" not in st.session_state:
    # (question, answer) tuples fed back to run_llm for conversational context.
    st.session_state["chat_history"] = []
def create_sources_string(source_urls: Set[str]) -> str:
    """Render a set of source URLs as a numbered, alphabetically sorted list.

    Returns an empty string for an empty set; otherwise a block that starts
    with "sources:" followed by one "N. url" line per source.
    """
    if not source_urls:
        return ""
    numbered = [
        f"{position}. {url}\n"
        for position, url in enumerate(sorted(source_urls), start=1)
    ]
    return "sources:\n" + "".join(numbered)
49
+
50
+
51
# Single-line prompt box; empty string (falsy) until the user submits text.
prompt = st.text_input("Prompt", placeholder="Enter your question here (eg. Who is Anuj Mahajan, What is his work experience ?)...")


if prompt:
    with st.spinner("Generating response..."):
        # Ask the retrieval chain, passing prior (question, answer) pairs
        # so the model can resolve follow-up questions.
        generated_response = run_llm(
            query=prompt, chat_history=st.session_state["chat_history"]
        )
        # De-duplicate the source paths of the retrieved documents.
        sources = set(
            [doc.metadata["source"] for doc in generated_response["source_documents"]]
        )
        # Answer text followed by the numbered sources listing.
        formatted_response = (
            f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
        )

        # Append to all three session buffers so history survives reruns.
        st.session_state.user_prompt_history.append(prompt)
        st.session_state.chat_answers_history.append(formatted_response)
        st.session_state.chat_history.append((prompt, generated_response["answer"]))

# Replay the whole conversation (oldest first) on every rerun.
if st.session_state["chat_answers_history"]:
    for generated_response, user_query in zip(
        st.session_state["chat_answers_history"],
        st.session_state["user_prompt_history"],
    ):
        # User bubble rendered by streamlit_chat; fixed avatar seed keeps
        # the same avatar across reruns.
        message(
            user_query,
            is_user=True,
            avatar_style="adventurer",
            seed=123,
        )
        # message(generated_response)
        # Raw HTML div instead of message() so long answers wrap.
        st.write(
            f'<div style="word-wrap: break-word;">{generated_response}</div>',
            unsafe_allow_html=True,
        )
backend/core.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from typing import Any, Dict, List
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.vectorstores import Pinecone
import pinecone

from dotenv import load_dotenv

from consts import INDEX_NAME
# Pull API keys / region settings from a local .env file into os.environ.
load_dotenv()

# Initialise the Pinecone client once at import time so run_llm() can read
# the existing index. Both variables are expected in the environment / .env;
# os.environ.get returns None if missing — pinecone.init would then fail later.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
19
def run_llm(query: str, chat_history: List[Any]) -> Dict[str, Any]:
    """Answer *query* against the Pinecone-backed resume index.

    Args:
        query: The user's current question.
        chat_history: Prior (question, answer) pairs for conversational
            context (the Streamlit app passes a list of 2-tuples).

    Returns:
        The chain's result dict; callers read ``"answer"`` and
        ``"source_documents"`` from it.
    """
    # Embeddings must match the model used at ingestion time so query
    # vectors live in the same space as the stored document vectors.
    embeddings = OpenAIEmbeddings()
    doc_search = Pinecone.from_existing_index(index_name=INDEX_NAME, embedding=embeddings)
    # temperature=0 keeps answers deterministic and grounded in the docs.
    chat = ChatOpenAI(verbose=True, temperature=0)

    qa = ConversationalRetrievalChain.from_llm(
        llm=chat,
        retriever=doc_search.as_retriever(),
        return_source_documents=True,
    )

    return qa({"question": query, "chat_history": chat_history})
consts.py ADDED
@@ -0,0 +1 @@
 
 
1
# Name of the Pinecone index holding the embedded resume chunks; shared by
# the ingestion script (writer) and backend/core.py (reader).
INDEX_NAME = "pdf-parser"
dataIngestionFromPdfToPinecone.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

from langchain.vectorstores import Pinecone
import pinecone
from dotenv import load_dotenv

from consts import INDEX_NAME

# Pull API keys / region settings from a local .env file into os.environ.
load_dotenv()

# Initialise the Pinecone client at import time; the ingestion function
# below writes into the index named by INDEX_NAME.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
22
def ingestDataFromPdfIntoPinecone(
    pdf_path: str = "/Users/anujmahajan/Desktop/Anuj Documents/Resume/PDF/Amazon/Anuj Mahajan - IUB MS CS - CV.pdf",
) -> None:
    """Load a PDF, split it into overlapping chunks, embed, and upsert to Pinecone.

    Args:
        pdf_path: Path of the PDF to ingest. Defaults to the original
            hard-coded resume location, so existing callers are unaffected;
            new callers can point it at any PDF.
    """
    print('Reading Data from PDF')
    loader = PyPDFLoader(file_path=pdf_path)
    documents = loader.load()
    # Recursive splitter falls back through the separators list; 30-char
    # overlap preserves context across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=30, separators=["\n\n", "\n", " ", ""]
    )
    documents = text_splitter.split_documents(documents=documents)

    print(f"Going to insert {len(documents)} to Pinecone")
    # Embedding model must match the one used at query time in backend/core.py.
    embeddings = OpenAIEmbeddings()
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("****** Added to Pinecone vectorstore vectors")

if __name__ == "__main__":
    ingestDataFromPdfIntoPinecone()
requirements.txt ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.1
2
+ aiosignal==1.3.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.2.0
6
+ async-timeout==4.0.3
7
+ attrs==23.2.0
8
+ blinker==1.7.0
9
+ cachetools==5.3.2
10
+ certifi==2023.11.17
11
+ charset-normalizer==3.3.2
12
+ click==8.1.7
13
+ dataclasses-json==0.6.3
14
+ distro==1.9.0
15
+ dnspython==2.4.2
16
+ exceptiongroup==1.2.0
17
+ frozenlist==1.4.1
18
+ gitdb==4.0.11
19
+ GitPython==3.1.40
20
+ greenlet==3.0.3
21
+ h11==0.14.0
22
+ httpcore==1.0.2
23
+ httpx==0.26.0
24
+ idna==3.6
25
+ importlib-metadata==6.11.0
26
+ Jinja2==3.1.2
27
+ jsonpatch==1.33
28
+ jsonpointer==2.4
29
+ jsonschema==4.20.0
30
+ jsonschema-specifications==2023.12.1
31
+ langchain==0.0.354
32
+ langchain-community==0.0.8
33
+ langchain-core==0.1.5
34
+ langsmith==0.0.77
35
+ loguru==0.7.2
36
+ markdown-it-py==3.0.0
37
+ MarkupSafe==2.1.3
38
+ marshmallow==3.20.1
39
+ mdurl==0.1.2
40
+ multidict==6.0.4
41
+ mypy-extensions==1.0.0
42
+ numpy==1.26.3
43
+ openai==1.6.1
44
+ packaging==23.2
45
+ pandas==2.1.4
46
+ pillow==10.2.0
47
+ pinecone-client==2.2.4
48
+ protobuf==4.25.1
49
+ pyarrow==14.0.2
50
+ pydantic==2.5.3
51
+ pydantic_core==2.14.6
52
+ pydeck==0.8.1b0
53
+ Pygments==2.17.2
54
+ pypdf==3.17.4
55
+ python-dateutil==2.8.2
56
+ python-dotenv==1.0.0
57
+ pytz==2023.3.post1
58
+ PyYAML==6.0.1
59
+ referencing==0.32.0
60
+ regex==2023.12.25
61
+ requests==2.31.0
62
+ rich==13.7.0
63
+ rpds-py==0.16.2
64
+ six==1.16.0
65
+ smmap==5.0.1
66
+ sniffio==1.3.0
67
+ SQLAlchemy==2.0.25
68
+ streamlit==1.29.0
69
+ streamlit-chat==0.1.1
70
+ tenacity==8.2.3
71
+ tiktoken==0.5.2
72
+ toml==0.10.2
73
+ toolz==0.12.0
74
+ tornado==6.4
75
+ tqdm==4.66.1
76
+ typing==3.7.4.3
77
+ typing-inspect==0.9.0
78
+ typing_extensions==4.9.0
79
+ tzdata==2023.4
80
+ tzlocal==5.2
81
+ urllib3==2.1.0
82
+ validators==0.22.0
83
+ yarl==1.9.4
84
+ zipp==3.17.0