anujmaha commited on
Commit
dcc1634
1 Parent(s): 88251b8

Initial commit for whole code

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Langchain[[:space:]]Document[[:space:]]Helper[[:space:]]Architecture.png filter=lfs diff=lfs merge=lfs -text
Langchain Document Helper Architecture.png ADDED

Git LFS Details

  • SHA256: f8448b68d8f7cc0bdcd3648ca70211cb7f30e948637a2411c6409652da9b02b1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
Pipfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [[source]]
2
+ url = "https://pypi.org/simple"
3
+ verify_ssl = true
4
+ name = "pypi"
5
+
6
+ [packages]
7
+ langchain = "*"
8
+ beautifulsoup4 = "*"
9
+ black = "*"
10
+ tiktoken = "*"
11
+ openai = "*"
12
+ pinecone-client = "*"
13
+ unstructured = "*"
14
+ nltk = "*"
15
+ fastapi = "*"
16
+ jinja2 = "*"
17
+ uvicorn = "*"
18
+ streamlit = "*"
19
+ streamlit-chat = "*"
20
+ tqdm = "*"
21
+
22
+ [dev-packages]
23
+
24
+ [requires]
25
+ python_version = "3.11"
26
+ python_full_version = "3.11.0"
Pipfile.lock ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Set
2
+
3
+ from backend.core import run_llm
4
+ import streamlit as st
5
+ from streamlit_chat import message
6
+ from PIL import Image
7
+ from io import BytesIO
8
+ import base64
9
+
10
+ # def add_bg_from_local(image_file):
11
+ # with open(image_file, "rb") as image_file:
12
+ # encoded_string = base64.b64encode(image_file.read())
13
+ # st.markdown(
14
+ # f"""
15
+ # <style>
16
+ # .stApp {{
17
+ # background-image: url(data:{"jpeg"};base64,{encoded_string.decode()});
18
+ # background-size: cover
19
+ # }}
20
+ # </style>
21
+ # """,
22
+ # unsafe_allow_html=True
23
+ # )
24
+ # background_image = "bg2.jpeg"
25
+ # add_bg_from_local(background_image)
26
+
27
+
28
st.header("LangChain 🦜🔗 Documentation - Helper ChatBot")

# Ensure the per-session conversation buffers exist before first use;
# Streamlit reruns the whole script on every interaction, so each list must
# only be created once per session.
for _state_key in ("user_prompt_history", "chat_answers_history", "chat_history"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
38
+
39
+
40
def create_sources_string(source_urls: Set[str]) -> str:
    """Format a set of source URLs as a numbered "sources:" list.

    Returns an empty string when there are no sources; otherwise a block
    headed by "sources:" with one numbered URL per line, sorted alphabetically.
    """
    if not source_urls:
        return ""
    numbered = [
        f"{index}. {url}"
        for index, url in enumerate(sorted(source_urls), start=1)
    ]
    return "sources:\n" + "\n".join(numbered) + "\n"
49
+
50
+
51
# Main chat input. Streamlit reruns this script top-to-bottom on every user
# interaction, so all conversation state lives in st.session_state.
prompt = st.text_input("Prompt", placeholder="Enter your message here...")


if prompt:
    with st.spinner("Generating response..."):
        # Ask the backend chain; chat_history supplies conversational context.
        generated_response = run_llm(
            query=prompt, chat_history=st.session_state["chat_history"]
        )
        # Distinct source URLs of the retrieved documents (de-duplicated by set).
        sources = set(
            [doc.metadata["source"] for doc in generated_response["source_documents"]]
        )
        formatted_response = (
            f"{generated_response['answer']} \n\n {create_sources_string(sources)}"
        )

        # Persist this turn so it survives the next Streamlit rerun.
        st.session_state.user_prompt_history.append(prompt)
        st.session_state.chat_answers_history.append(formatted_response)
        st.session_state.chat_history.append((prompt, generated_response["answer"]))

if st.session_state["chat_answers_history"]:
    # Replay the whole conversation (oldest first) on every rerun.
    for generated_response, user_query in zip(
        st.session_state["chat_answers_history"],
        st.session_state["user_prompt_history"],
    ):
        message(
            user_query,
            is_user=True,
            avatar_style="adventurer",
            seed=123,
        )
        # message(generated_response)
        # Render the answer as raw HTML so long lines wrap instead of overflowing.
        st.write(
            f'<div style="word-wrap: break-word;">{generated_response}</div>',
            unsafe_allow_html=True,
        )
backend/core.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Any, Dict, List
3
+ from langchain.embeddings.openai import OpenAIEmbeddings
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.chains import RetrievalQA
6
+ from langchain.chains import ConversationalRetrievalChain
7
+ from langchain.vectorstores import Pinecone
8
+ import pinecone
9
+
10
+ from dotenv import load_dotenv
11
+
12
+ from consts import INDEX_NAME
13
# Load API keys and settings from a local .env file into the environment.
load_dotenv()

# Module-level Pinecone connection, configured from the environment.
# NOTE(review): os.environ.get returns None when a variable is unset —
# presumably pinecone.init then fails; confirm both env vars are always set.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
19
+
20
def run_llm(query, chat_history):
    """Answer `query` against the Pinecone-backed LangChain docs index.

    Args:
        query: The user's question, as plain text.
        chat_history: Prior (question, answer) pairs providing conversational
            context (as appended by the Streamlit front end).

    Returns:
        The chain's result dict, including "answer" and — because
        return_source_documents=True — "source_documents".
    """
    embeddings = OpenAIEmbeddings()
    doc_search = Pinecone.from_existing_index(
        index_name=INDEX_NAME, embedding=embeddings
    )
    # temperature=0 keeps answers deterministic and grounded in the docs.
    chat = ChatOpenAI(verbose=True, temperature=0)

    qa = ConversationalRetrievalChain.from_llm(
        llm=chat,
        retriever=doc_search.as_retriever(),
        return_source_documents=True,
    )
    return qa({"question": query, "chat_history": chat_history})
30
+
31
+ # if __name__ == "__main__":
32
+ # print(run_llm(query = "What is RetrievalQA Chain ? "), )
33
+
34
+
consts.py ADDED
@@ -0,0 +1 @@
 
 
1
# Name of the Pinecone index holding the embedded LangChain documentation
# (written by ingestion.py, read by backend/core.py).
INDEX_NAME = "langchain-doc-index"
ingestion.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from langchain.document_loaders import ReadTheDocsLoader
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+ from langchain.embeddings import OpenAIEmbeddings
5
+ from langchain.vectorstores import Pinecone
6
+ import pinecone
7
+ from dotenv import load_dotenv
8
+
9
+ from consts import INDEX_NAME
10
+
11
# Load API keys and settings from a local .env file into the environment.
load_dotenv()

# Module-level Pinecone connection, configured from the environment.
# NOTE(review): os.environ.get returns None when a variable is unset —
# presumably pinecone.init then fails; confirm both env vars are always set.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),
    environment=os.environ.get("PINECONE_ENVIRONMENT_REGION"),
)
17
+
18
+
19
def ingest_docs() -> None:
    """Load the scraped LangChain docs, split them into chunks, and index them
    in the Pinecone vector store named by INDEX_NAME.

    Expects the ReadTheDocs HTML dump under
    langchain-docs/langchain.readthedocs.io/en/latest.
    """
    loader = ReadTheDocsLoader(path="langchain-docs/langchain.readthedocs.io/en/latest")
    raw_documents = loader.load()
    # Fix: original f-string rendered as e.g. "loaded 35documents" (missing space).
    print(f"loaded {len(raw_documents)} documents")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""]
    )
    documents = text_splitter.split_documents(documents=raw_documents)
    print(f"Split into {len(documents)} chunks")

    # Rewrite local filesystem paths back into public documentation URLs so the
    # chatbot can cite clickable sources:
    # "langchain-docs/host/..." -> "https://host/...".
    for doc in documents:
        old_path = doc.metadata["source"]
        new_url = old_path.replace("langchain-docs", "https:/")
        doc.metadata.update({"source": new_url})

    print(f"Going to insert {len(documents)} to Pinecone")
    embeddings = OpenAIEmbeddings()
    Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
    print("****** Added to Pinecone vectorstore vectors")


if __name__ == "__main__":
    ingest_docs()
requirements.txt ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.1
2
+ aiosignal==1.3.1
3
+ altair==5.2.0
4
+ annotated-types==0.6.0
5
+ anyio==4.2.0
6
+ async-timeout==4.0.3
7
+ attrs==23.2.0
8
+ backoff==2.2.1
9
+ beautifulsoup4==4.12.2
10
+ black==23.12.1
11
+ blinker==1.7.0
12
+ cachetools==5.3.2
13
+ certifi==2023.11.17
14
+ chardet==5.2.0
15
+ charset-normalizer==3.3.2
16
+ click==8.1.7
17
+ dataclasses-json==0.6.3
18
+ distro==1.9.0
19
+ dnspython==2.4.2
20
+ emoji==2.9.0
21
+ exceptiongroup==1.2.0
22
+ fastapi==0.108.0
23
+ filetype==1.2.0
24
+ frozenlist==1.4.1
25
+ gitdb==4.0.11
26
+ GitPython==3.1.40
27
+ greenlet==3.0.3
28
+ h11==0.14.0
29
+ httpcore==1.0.2
30
+ httpx==0.26.0
31
+ idna==3.6
32
+ importlib-metadata==6.11.0
33
+ install==1.3.5
34
+ Jinja2==3.1.2
35
+ joblib==1.3.2
36
+ jsonpatch==1.33
37
+ jsonpath-python==1.0.6
38
+ jsonpointer==2.4
39
+ jsonschema==4.20.0
40
+ jsonschema-specifications==2023.12.1
41
+ langchain==0.0.354
42
+ langchain-community==0.0.8
43
+ langchain-core==0.1.5
44
+ langdetect==1.0.9
45
+ langsmith==0.0.77
46
+ loguru==0.7.2
47
+ lxml==5.0.0
48
+ markdown-it-py==3.0.0
49
+ MarkupSafe==2.1.3
50
+ marshmallow==3.20.1
51
+ mdurl==0.1.2
52
+ multidict==6.0.4
53
+ mypy-extensions==1.0.0
54
+ nltk==3.8.1
55
+ numpy==1.26.3
56
+ openai==1.6.1
57
+ packaging==23.2
58
+ pandas==2.1.4
59
+ pathspec==0.12.1
60
+ pillow==10.2.0
61
+ pinecone-client==2.2.4
62
+ platformdirs==4.1.0
63
+ protobuf==4.25.1
64
+ pyarrow==14.0.2
65
+ pydantic==2.5.3
66
+ pydantic_core==2.14.6
67
+ pydeck==0.8.1b0
68
+ Pygments==2.17.2
69
+ PyMuPDF==1.23.8
70
+ PyMuPDFb==1.23.7
71
+ python-dateutil==2.8.2
72
+ python-dotenv==1.0.0
73
+ python-iso639==2024.1.2
74
+ python-magic==0.4.27
75
+ pytz==2023.3.post1
76
+ PyYAML==6.0.1
77
+ rapidfuzz==3.6.1
78
+ referencing==0.32.0
79
+ regex==2023.12.25
80
+ requests==2.31.0
81
+ rich==13.7.0
82
+ rpds-py==0.16.2
83
+ six==1.16.0
84
+ smmap==5.0.1
85
+ sniffio==1.3.0
86
+ soupsieve==2.5
87
+ SQLAlchemy==2.0.25
88
+ starlette==0.32.0.post1
89
+ streamlit==1.29.0
90
+ streamlit-chat==0.1.1
91
+ tabulate==0.9.0
92
+ tenacity==8.2.3
93
+ tiktoken==0.5.2
94
+ toml==0.10.2
95
+ tomli==2.0.1
96
+ toolz==0.12.0
97
+ tornado==6.4
98
+ tqdm==4.66.1
99
+ typing-inspect==0.9.0
100
+ typing_extensions==4.9.0
101
+ tzdata==2023.4
102
+ tzlocal==5.2
103
+ unstructured==0.11.7
104
+ unstructured-client==0.15.1
105
+ urllib3==2.1.0
106
+ uvicorn==0.25.0
107
+ validators==0.22.0
108
+ wrapt==1.16.0
109
+ yarl==1.9.4
110
+ zipp==3.17.0