Spaces:
Sleeping
Sleeping
Initial commit for whole code
Browse files- .gitattributes +1 -0
- Langchain Document Helper Architecture.png +3 -0
- Pipfile +26 -0
- Pipfile.lock +0 -0
- app.py +85 -0
- backend/core.py +34 -0
- consts.py +1 -0
- ingestion.py +42 -0
- requirements.txt +110 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Langchain[[:space:]]Document[[:space:]]Helper[[:space:]]Architecture.png filter=lfs diff=lfs merge=lfs -text
|
Langchain Document Helper Architecture.png
ADDED
![]() |
Git LFS Details
|
Pipfile
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[[source]]
|
2 |
+
url = "https://pypi.org/simple"
|
3 |
+
verify_ssl = true
|
4 |
+
name = "pypi"
|
5 |
+
|
6 |
+
[packages]
|
7 |
+
langchain = "*"
|
8 |
+
beautifulsoup4 = "*"
|
9 |
+
black = "*"
|
10 |
+
tiktoken = "*"
|
11 |
+
openai = "*"
|
12 |
+
pinecone-client = "*"
|
13 |
+
unstructured = "*"
|
14 |
+
nltk = "*"
|
15 |
+
fastapi = "*"
|
16 |
+
jinja2 = "*"
|
17 |
+
uvicorn = "*"
|
18 |
+
streamlit = "*"
|
19 |
+
streamlit-chat = "*"
|
20 |
+
tqdm = "*"
|
21 |
+
|
22 |
+
[dev-packages]
|
23 |
+
|
24 |
+
[requires]
|
25 |
+
python_version = "3.11"
|
26 |
+
python_full_version = "3.11.0"
|
Pipfile.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
app.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Set
|
2 |
+
|
3 |
+
from backend.core import run_llm
|
4 |
+
import streamlit as st
|
5 |
+
from streamlit_chat import message
|
6 |
+
from PIL import Image
|
7 |
+
from io import BytesIO
|
8 |
+
import base64
|
9 |
+
|
10 |
+
# def add_bg_from_local(image_file):
|
11 |
+
# with open(image_file, "rb") as image_file:
|
12 |
+
# encoded_string = base64.b64encode(image_file.read())
|
13 |
+
# st.markdown(
|
14 |
+
# f"""
|
15 |
+
# <style>
|
16 |
+
# .stApp {{
|
17 |
+
# background-image: url(data:{"jpeg"};base64,{encoded_string.decode()});
|
18 |
+
# background-size: cover
|
19 |
+
# }}
|
20 |
+
# </style>
|
21 |
+
# """,
|
22 |
+
# unsafe_allow_html=True
|
23 |
+
# )
|
24 |
+
# background_image = "bg2.jpeg"
|
25 |
+
# add_bg_from_local(background_image)
|
26 |
+
|
27 |
+
|
28 |
+
st.header("LangChain 🦜🔗 Documentation - Helper ChatBot")

# Lazily create the per-session conversation buffers on first page load so
# reruns of the script never clobber an in-progress chat.
for _state_key in ("user_prompt_history", "chat_answers_history", "chat_history"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
|
38 |
+
|
39 |
+
|
40 |
+
def create_sources_string(source_urls: Set[str]) -> str:
    """Render a sorted, numbered "sources:" list for the given URLs.

    Returns an empty string when no sources were supplied; otherwise a
    block of the form ``sources:\n1. <url>\n2. <url>\n...``.
    """
    if not source_urls:
        return ""
    numbered = [
        f"{idx}. {url}" for idx, url in enumerate(sorted(source_urls), start=1)
    ]
    return "sources:\n" + "\n".join(numbered) + "\n"
|
49 |
+
|
50 |
+
|
51 |
+
prompt = st.text_input("Prompt", placeholder="Enter your message here...")


if prompt:
    # Run the retrieval chain and record the exchange for this session.
    with st.spinner("Generating response..."):
        generated_response = run_llm(
            query=prompt, chat_history=st.session_state["chat_history"]
        )
        sources = {
            doc.metadata["source"]
            for doc in generated_response["source_documents"]
        }
        formatted_response = (
            f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
        )

        st.session_state.user_prompt_history.append(prompt)
        st.session_state.chat_answers_history.append(formatted_response)
        st.session_state.chat_history.append((prompt, generated_response["answer"]))

if st.session_state["chat_answers_history"]:
    # Replay the whole conversation, oldest exchange first.
    for generated_response, user_query in zip(
        st.session_state["chat_answers_history"],
        st.session_state["user_prompt_history"],
    ):
        message(
            user_query,
            is_user=True,
            avatar_style="adventurer",
            seed=123,
        )
        # Raw HTML wrapper so long answers wrap instead of overflowing.
        st.write(
            f'<div style="word-wrap: break-word;">{generated_response}</div>',
            unsafe_allow_html=True,
        )
|
backend/core.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Any, Dict, List
|
3 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
4 |
+
from langchain.chat_models import ChatOpenAI
|
5 |
+
from langchain.chains import RetrievalQA
|
6 |
+
from langchain.chains import ConversationalRetrievalChain
|
7 |
+
from langchain.vectorstores import Pinecone
|
8 |
+
import pinecone
|
9 |
+
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
|
12 |
+
from consts import INDEX_NAME
|
13 |
+
# Load OpenAI/Pinecone credentials from a local .env file into the process
# environment before the client below reads them.
load_dotenv()

# Initialise the global Pinecone client once at import time so run_llm()
# can call Pinecone.from_existing_index() without further setup.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
|
19 |
+
|
20 |
+
def run_llm(query, chat_history):
    """Answer *query* against the Pinecone-indexed LangChain documentation.

    ``chat_history`` is the list of (question, answer) tuples accumulated so
    far; it is forwarded to the conversational retrieval chain so follow-up
    questions keep their context. Returns the chain's result dict, which
    includes the "answer" text and the retrieved "source_documents".
    """
    embeddings = OpenAIEmbeddings()
    doc_search = Pinecone.from_existing_index(
        index_name=INDEX_NAME, embedding=embeddings
    )
    chat = ChatOpenAI(verbose=True, temperature=0)

    # qa = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=doc_search.as_retriever(), return_source_documents=True)

    qa = ConversationalRetrievalChain.from_llm(
        llm=chat,
        retriever=doc_search.as_retriever(),
        return_source_documents=True,
    )
    return qa({"question": query, "chat_history": chat_history})


# if __name__ == "__main__":
#     print(run_llm(query = "What is RetrievalQA Chain ? "), )
|
30 |
+
|
31 |
+
# if __name__ == "__main__":
|
32 |
+
# print(run_llm(query = "What is RetrievalQA Chain ? "), )
|
33 |
+
|
34 |
+
|
consts.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Name of the Pinecone index holding the embedded LangChain documentation;
# shared by ingestion.py (writer) and backend/core.py (reader).
INDEX_NAME = "langchain-doc-index"
|
ingestion.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from langchain.document_loaders import ReadTheDocsLoader
|
3 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
+
from langchain.embeddings import OpenAIEmbeddings
|
5 |
+
from langchain.vectorstores import Pinecone
|
6 |
+
import pinecone
|
7 |
+
from dotenv import load_dotenv
|
8 |
+
|
9 |
+
from consts import INDEX_NAME
|
10 |
+
|
11 |
+
# Pull API keys for OpenAI/Pinecone from a local .env file.
load_dotenv()

# Configure the global Pinecone client before ingest_docs() runs.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
|
17 |
+
|
18 |
+
|
19 |
+
def ingest_docs() -> None:
    """Load the scraped LangChain docs, chunk them, and upsert into Pinecone.

    Reads HTML pages from the local ReadTheDocs mirror, splits them into
    ~1000-character chunks with 100-character overlap, rewrites each chunk's
    ``source`` metadata back to its public https:// URL, embeds the chunks
    with OpenAI, and stores the vectors in the ``INDEX_NAME`` Pinecone index.
    """
    loader = ReadTheDocsLoader(path="langchain-docs/langchain.readthedocs.io/en/latest")
    raw_documents = loader.load()
    # Fixed: message previously rendered as e.g. "loaded 42documents".
    print(f"loaded {len(raw_documents)} documents")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split into {len(documents)} chunks")

    # The docs were mirrored under "langchain-docs/<host>/..."; replacing the
    # prefix with "https:/" plus the path's own leading "/" restores the
    # original public URL.
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs", "https:/")
        doc.metadata.update({"source": new_url})

    print(f"Going to insert {len(documents)} to Pinecone")
    embeddings = OpenAIEmbeddings()
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("****** Added to Pinecone vectorstore vectors")
39 |
+
|
40 |
+
|
41 |
+
if __name__ == "__main__":
|
42 |
+
ingest_docs()
|
requirements.txt
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp==3.9.1
|
2 |
+
aiosignal==1.3.1
|
3 |
+
altair==5.2.0
|
4 |
+
annotated-types==0.6.0
|
5 |
+
anyio==4.2.0
|
6 |
+
async-timeout==4.0.3
|
7 |
+
attrs==23.2.0
|
8 |
+
backoff==2.2.1
|
9 |
+
beautifulsoup4==4.12.2
|
10 |
+
black==23.12.1
|
11 |
+
blinker==1.7.0
|
12 |
+
cachetools==5.3.2
|
13 |
+
certifi==2023.11.17
|
14 |
+
chardet==5.2.0
|
15 |
+
charset-normalizer==3.3.2
|
16 |
+
click==8.1.7
|
17 |
+
dataclasses-json==0.6.3
|
18 |
+
distro==1.9.0
|
19 |
+
dnspython==2.4.2
|
20 |
+
emoji==2.9.0
|
21 |
+
exceptiongroup==1.2.0
|
22 |
+
fastapi==0.108.0
|
23 |
+
filetype==1.2.0
|
24 |
+
frozenlist==1.4.1
|
25 |
+
gitdb==4.0.11
|
26 |
+
GitPython==3.1.40
|
27 |
+
greenlet==3.0.3
|
28 |
+
h11==0.14.0
|
29 |
+
httpcore==1.0.2
|
30 |
+
httpx==0.26.0
|
31 |
+
idna==3.6
|
32 |
+
importlib-metadata==6.11.0
|
33 |
+
install==1.3.5
|
34 |
+
Jinja2==3.1.2
|
35 |
+
joblib==1.3.2
|
36 |
+
jsonpatch==1.33
|
37 |
+
jsonpath-python==1.0.6
|
38 |
+
jsonpointer==2.4
|
39 |
+
jsonschema==4.20.0
|
40 |
+
jsonschema-specifications==2023.12.1
|
41 |
+
langchain==0.0.354
|
42 |
+
langchain-community==0.0.8
|
43 |
+
langchain-core==0.1.5
|
44 |
+
langdetect==1.0.9
|
45 |
+
langsmith==0.0.77
|
46 |
+
loguru==0.7.2
|
47 |
+
lxml==5.0.0
|
48 |
+
markdown-it-py==3.0.0
|
49 |
+
MarkupSafe==2.1.3
|
50 |
+
marshmallow==3.20.1
|
51 |
+
mdurl==0.1.2
|
52 |
+
multidict==6.0.4
|
53 |
+
mypy-extensions==1.0.0
|
54 |
+
nltk==3.8.1
|
55 |
+
numpy==1.26.3
|
56 |
+
openai==1.6.1
|
57 |
+
packaging==23.2
|
58 |
+
pandas==2.1.4
|
59 |
+
pathspec==0.12.1
|
60 |
+
pillow==10.2.0
|
61 |
+
pinecone-client==2.2.4
|
62 |
+
platformdirs==4.1.0
|
63 |
+
protobuf==4.25.1
|
64 |
+
pyarrow==14.0.2
|
65 |
+
pydantic==2.5.3
|
66 |
+
pydantic_core==2.14.6
|
67 |
+
pydeck==0.8.1b0
|
68 |
+
Pygments==2.17.2
|
69 |
+
PyMuPDF==1.23.8
|
70 |
+
PyMuPDFb==1.23.7
|
71 |
+
python-dateutil==2.8.2
|
72 |
+
python-dotenv==1.0.0
|
73 |
+
python-iso639==2024.1.2
|
74 |
+
python-magic==0.4.27
|
75 |
+
pytz==2023.3.post1
|
76 |
+
PyYAML==6.0.1
|
77 |
+
rapidfuzz==3.6.1
|
78 |
+
referencing==0.32.0
|
79 |
+
regex==2023.12.25
|
80 |
+
requests==2.31.0
|
81 |
+
rich==13.7.0
|
82 |
+
rpds-py==0.16.2
|
83 |
+
six==1.16.0
|
84 |
+
smmap==5.0.1
|
85 |
+
sniffio==1.3.0
|
86 |
+
soupsieve==2.5
|
87 |
+
SQLAlchemy==2.0.25
|
88 |
+
starlette==0.32.0.post1
|
89 |
+
streamlit==1.29.0
|
90 |
+
streamlit-chat==0.1.1
|
91 |
+
tabulate==0.9.0
|
92 |
+
tenacity==8.2.3
|
93 |
+
tiktoken==0.5.2
|
94 |
+
toml==0.10.2
|
95 |
+
tomli==2.0.1
|
96 |
+
toolz==0.12.0
|
97 |
+
tornado==6.4
|
98 |
+
tqdm==4.66.1
|
99 |
+
typing-inspect==0.9.0
|
100 |
+
typing_extensions==4.9.0
|
101 |
+
tzdata==2023.4
|
102 |
+
tzlocal==5.2
|
103 |
+
unstructured==0.11.7
|
104 |
+
unstructured-client==0.15.1
|
105 |
+
urllib3==2.1.0
|
106 |
+
uvicorn==0.25.0
|
107 |
+
validators==0.22.0
|
108 |
+
wrapt==1.16.0
|
109 |
+
yarl==1.9.4
|
110 |
+
zipp==3.17.0
|