Spaces:
Sleeping
Sleeping
SalehAhmad
committed on
Commit
•
93aa82a
1
Parent(s):
7ca28e9
Upload 5 files
Browse files- .gitattributes +1 -0
- RAG.py +137 -0
- app.py +27 -0
- requirements.txt +110 -0
- vector_db/.milvus_example.db.lock +0 -0
- vector_db/milvus_example.db +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
vector_db/milvus_example.db filter=lfs diff=lfs merge=lfs -text
|
RAG.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
from dotenv import load_dotenv
|
5 |
+
load_dotenv()
|
6 |
+
import shutil
|
7 |
+
|
8 |
+
from langchain_milvus import Milvus
|
9 |
+
from langchain_ollama import OllamaEmbeddings
|
10 |
+
from langchain_openai import OpenAIEmbeddings
|
11 |
+
from langchain_openai import ChatOpenAI
|
12 |
+
from git import Repo
|
13 |
+
from langchain_community.document_loaders import GitLoader
|
14 |
+
|
15 |
+
class GitHubGPT:
    """RAG chatbot that answers questions about a cloned GitHub codebase.

    Pipeline: clone (or reload) a repository under ./Data/Repos, embed its
    files into a local Milvus-Lite vector store, retrieve the top-k chunks
    for a user prompt, and ask an OpenAI chat model with that context.
    Requires OPENAI_API_KEY in the environment (loaded via dotenv upstream).
    """

    def __init__(self):
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        self.embeddings = self.__initialize_embeddings()
        self.vector_db = self.__initialize_vector_db()
        self.llm = self.__initialize_llm()
        self.system_prompt = self.__initialize_system_prompt()

    def __initialize_embeddings(self):
        """Build the OpenAI embedding model used for indexing and retrieval."""
        return OpenAIEmbeddings(
            model="text-embedding-3-small",
            openai_api_key=self.OPENAI_API_KEY,
        )

    def __initialize_vector_db(self):
        """Create (or connect to) the local Milvus-Lite vector store."""
        # exist_ok avoids a crash if the directory already exists
        # (the original exists-check could race with another process).
        os.makedirs("./vector_db", mode=0o777, exist_ok=True)
        return Milvus(
            embedding_function=self.embeddings,
            connection_args={"uri": "./vector_db/milvus_example.db"},
            auto_id=True,
            collection_name="github_gpt",
        )

    def __initialize_llm(self):
        """Configure the chat model used to generate answers."""
        return ChatOpenAI(
            model="gpt-4o",
            temperature=0.25,
            max_tokens=None,
            timeout=None,
            max_retries=3,
        )

    def __initialize_system_prompt(self):
        """Return the system prompt that frames every LLM call."""
        # Typos fixed vs. the original prompt ("genetate", "relavant").
        return '''
        What are you? A well informed, intelligent chatbot which can talk to a given codebase.
        What do you do? You are always given some file content from a codebase and a question/prompt. Your job is to generate a response.
        What should be the tone of your output? It should be friendly, helpful, confident, narrative.
        What outputs can we expect from you? You can be asked to generate documentations, code, or anything else only relevant to the given codebase content.
        '''

    @staticmethod
    def __clean_repo_name(name):
        """Make a repo name filesystem/collection friendly ('-' -> '_')."""
        return name.replace('-', '_')

    @staticmethod
    def __declean_repo_name(name):
        """Inverse of __clean_repo_name ('_' -> '-')."""
        return name.replace('_', '-')

    def __add_repo_data_to_db(self):
        """Load all documents via self.loader and index them in Milvus."""
        data = self.loader.load()
        print(f'Length of Data to Add: {len(data)}')
        print('Adding Data to Milvus Vector DB')
        self.vector_db.add_documents(documents=data)
        print('Done Adding Data to Milvus Vector DB')

    def add_repo(self, repo_url, branch="master"):
        """Clone *repo_url*, wiping any previously cloned repo, and index it.

        Args:
            repo_url: HTTPS/SSH URL of the repository to clone.
            branch: Branch to check out. Defaults to "master" for backward
                compatibility; pass "main" (or any branch) for repos that
                use a different default branch.
        """
        repo_name = repo_url.split('/')[-1]
        repo_save_path = "./Data/Repos"
        # Start from a clean slate so stale files are never indexed.
        if os.path.exists(repo_save_path):
            shutil.rmtree(repo_save_path)
        os.makedirs(repo_save_path)
        repo_save_path = repo_save_path + "/" + self.__clean_repo_name(repo_name)

        print(f'Cloning the repo from: {repo_url}')
        repo = Repo.clone_from(
            repo_url,
            to_path=repo_save_path,
            branch=branch
        )
        print(f'Repo Cloned to: {repo_save_path}')
        self.repo_save_path = repo_save_path
        self.branch = repo.head.reference
        self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
        self.__add_repo_data_to_db()

    def load_repo(self, branch="master"):
        """Index the first repo already present under ./Data/Repos.

        Args:
            branch: Branch to read documents from; defaults to "master"
                for backward compatibility.
        """
        repo_save_path = "./Data/Repos"
        # NOTE(review): assumes exactly one repo has been cloned; the first
        # directory listing entry is taken as the repo.
        repo_name = os.listdir(repo_save_path)[0]
        self.repo_save_path = repo_save_path + "/" + repo_name
        self.branch = branch
        print(f'Loading repo: {repo_name}')
        print(f'Branch: {self.branch}')
        print(f'Repo path: {self.repo_save_path}')
        self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
        self.__add_repo_data_to_db()

    def __retrieve_documents(self, prompt, k=3):
        """Return the top-*k* most similar documents to *prompt*."""
        return self.vector_db.similarity_search(
            prompt,
            k=k
        )

    @staticmethod
    def __concatenate_documents(documents):
        """Join retrieved documents into one labeled context string."""
        print(f'Length of docs to concatenate: {len(documents)}')
        All_content = ''
        for idx, doc in enumerate(documents):
            print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
            All_content += "Chunk:" + str(idx) + ":\n" + doc.page_content + "\n\n"
        print("\n\n")
        return All_content

    def query(self, prompt):
        """Answer *prompt* using retrieved codebase context; returns text."""
        retrieved_documents = self.__retrieve_documents(prompt)
        context = self.__concatenate_documents(retrieved_documents)

        messages = [
            (
                "system",
                f"{self.system_prompt}",
            ),
            (
                "human",
                f"Context from codebase:{context}\nUser query prompt:{prompt}\nResponse:\n",
            )
        ]

        response = self.llm.invoke(messages)
        return response.content
app.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from dotenv import load_dotenv
import os

# Assuming the GitHubGPT class is in the same directory
from RAG import GitHubGPT

# Load environment variables
load_dotenv()

# Cache the chatbot in session state: Streamlit re-executes this whole
# script on every widget interaction, so a bare `GitHubGPT()` would
# rebuild the embeddings client and reopen the Milvus DB on each rerun.
if "gpt_bot" not in st.session_state:
    st.session_state.gpt_bot = GitHubGPT()
gpt_bot = st.session_state.gpt_bot

# Streamlit UI
st.title("GitHubGPT Chatbot")
st.write("Interact with your codebase through this RAG-based chatbot!")

# User input
user_input = st.text_input("Ask a question about the codebase:")

if st.button("Get Response"):
    if user_input:
        # Get response from the chatbot
        response = gpt_bot.query(user_input)
        st.write("Response:", response)
    else:
        st.write("Please enter a question.")
|
requirements.txt
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohappyeyeballs==2.4.0
|
2 |
+
aiohttp==3.10.5
|
3 |
+
aiosignal==1.3.1
|
4 |
+
altair==5.4.1
|
5 |
+
annotated-types==0.7.0
|
6 |
+
anyio==4.4.0
|
7 |
+
asttokens==2.4.1
|
8 |
+
attrs==24.2.0
|
9 |
+
blinker==1.8.2
|
10 |
+
cachetools==5.5.0
|
11 |
+
certifi==2024.7.4
|
12 |
+
charset-normalizer==3.3.2
|
13 |
+
click==8.1.7
|
14 |
+
comm==0.2.2
|
15 |
+
dataclasses-json==0.6.7
|
16 |
+
debugpy==1.8.5
|
17 |
+
decorator==5.1.1
|
18 |
+
distro==1.9.0
|
19 |
+
environs==9.5.0
|
20 |
+
executing==2.0.1
|
21 |
+
frozenlist==1.4.1
|
22 |
+
gitdb==4.0.11
|
23 |
+
GitPython==3.1.43
|
24 |
+
greenlet==3.0.3
|
25 |
+
grpcio==1.63.0
|
26 |
+
h11==0.14.0
|
27 |
+
httpcore==1.0.5
|
28 |
+
httpx==0.27.2
|
29 |
+
idna==3.8
|
30 |
+
ipykernel==6.29.5
|
31 |
+
ipython==8.26.0
|
32 |
+
jedi==0.19.1
|
33 |
+
Jinja2==3.1.4
|
34 |
+
jiter==0.5.0
|
35 |
+
jsonpatch==1.33
|
36 |
+
jsonpointer==3.0.0
|
37 |
+
jsonschema==4.23.0
|
38 |
+
jsonschema-specifications==2023.12.1
|
39 |
+
jupyter_client==8.6.2
|
40 |
+
jupyter_core==5.7.2
|
41 |
+
langchain==0.2.15
|
42 |
+
langchain-community==0.2.13
|
43 |
+
langchain-core==0.2.35
|
44 |
+
langchain-milvus==0.1.4
|
45 |
+
langchain-ollama==0.1.1
|
46 |
+
langchain-openai==0.1.23
|
47 |
+
langchain-text-splitters==0.2.2
|
48 |
+
langsmith==0.1.105
|
49 |
+
markdown-it-py==3.0.0
|
50 |
+
MarkupSafe==2.1.5
|
51 |
+
marshmallow==3.22.0
|
52 |
+
matplotlib-inline==0.1.7
|
53 |
+
mdurl==0.1.2
|
54 |
+
milvus-lite==2.4.9
|
55 |
+
multidict==6.0.5
|
56 |
+
mypy-extensions==1.0.0
|
57 |
+
narwhals==1.5.5
|
58 |
+
nest-asyncio==1.6.0
|
59 |
+
numpy==1.26.4
|
60 |
+
ollama==0.3.1
|
61 |
+
openai==1.42.0
|
62 |
+
orjson==3.10.7
|
63 |
+
packaging==24.1
|
64 |
+
pandas==2.2.2
|
65 |
+
parso==0.8.4
|
66 |
+
pexpect==4.9.0
|
67 |
+
pillow==10.4.0
|
68 |
+
platformdirs==4.2.2
|
69 |
+
prompt_toolkit==3.0.47
|
70 |
+
protobuf==5.27.3
|
71 |
+
psutil==6.0.0
|
72 |
+
ptyprocess==0.7.0
|
73 |
+
pure_eval==0.2.3
|
74 |
+
pyarrow==17.0.0
|
75 |
+
pydantic==2.8.2
|
76 |
+
pydantic_core==2.20.1
|
77 |
+
pydeck==0.9.1
|
78 |
+
Pygments==2.18.0
|
79 |
+
pymilvus==2.4.5
|
80 |
+
python-dateutil==2.9.0.post0
|
81 |
+
python-dotenv==1.0.1
|
82 |
+
pytz==2024.1
|
83 |
+
PyYAML==6.0.2
|
84 |
+
pyzmq==26.2.0
|
85 |
+
referencing==0.35.1
|
86 |
+
regex==2024.7.24
|
87 |
+
requests==2.32.3
|
88 |
+
rich==13.8.0
|
89 |
+
rpds-py==0.20.0
|
90 |
+
scipy==1.14.1
|
91 |
+
six==1.16.0
|
92 |
+
smmap==5.0.1
|
93 |
+
sniffio==1.3.1
|
94 |
+
SQLAlchemy==2.0.32
|
95 |
+
stack-data==0.6.3
|
96 |
+
streamlit==1.38.0
|
97 |
+
tenacity==8.5.0
|
98 |
+
tiktoken==0.7.0
|
99 |
+
toml==0.10.2
|
100 |
+
tornado==6.4.1
|
101 |
+
tqdm==4.66.5
|
102 |
+
traitlets==5.14.3
|
103 |
+
typing-inspect==0.9.0
|
104 |
+
typing_extensions==4.12.2
|
105 |
+
tzdata==2024.1
|
106 |
+
ujson==5.10.0
|
107 |
+
urllib3==2.2.2
|
108 |
+
watchdog==4.0.2
|
109 |
+
wcwidth==0.2.13
|
110 |
+
yarl==1.9.4
|
vector_db/.milvus_example.db.lock
ADDED
File without changes
|
vector_db/milvus_example.db
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee45ed323d10fe46a53948bc0376bb78f33801725a318856b70196bee23fb3fc
|
3 |
+
size 19869696
|