SalehAhmad committed
Commit 93aa82a
1 Parent(s): 7ca28e9

Upload 5 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+vector_db/milvus_example.db filter=lfs diff=lfs merge=lfs -text
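Note: the added rule routes the bundled Milvus Lite database (vector_db/milvus_example.db, added later in this commit, roughly 20 MB) through Git LFS, so the regular Git history stores only a small pointer file. As an aside not taken from this commit, a rule of exactly this form is what running git lfs track vector_db/milvus_example.db appends to .gitattributes.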
RAG.py ADDED
@@ -0,0 +1,137 @@
+import numpy as np
+import pandas as pd
+import os
+from dotenv import load_dotenv
+load_dotenv()
+import shutil
+
+from langchain_milvus import Milvus
+from langchain_ollama import OllamaEmbeddings
+from langchain_openai import OpenAIEmbeddings
+from langchain_openai import ChatOpenAI
+from git import Repo
+from langchain_community.document_loaders import GitLoader
+
+class GitHubGPT:
+    def __init__(self):
+        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+        self.embeddings = self.__initialize_embeddings()
+        self.vector_db = self.__initialize_vector_db()
+        self.llm = self.__initialize_llm()
+        self.system_prompt = self.__initialize_system_prompt()
+
+    def __initialize_embeddings(self):
+        return OpenAIEmbeddings(
+            model="text-embedding-3-small",
+            openai_api_key=self.OPENAI_API_KEY
+        )
+
+    def __initialize_vector_db(self):
+        if not os.path.exists("./vector_db"):
+            os.makedirs("./vector_db", mode=0o777)
+
+        return Milvus(
+            embedding_function=self.embeddings,
+            connection_args={"uri": "./vector_db/milvus_example.db"},
+            auto_id=True,
+            collection_name="github_gpt",
+        )
+
+    def __initialize_llm(self):
+        llm = ChatOpenAI(model="gpt-4o",
+                         temperature=0.25,
+                         max_tokens=None,
+                         timeout=None,
+                         max_retries=3)
+        return llm
+
+    def __initialize_system_prompt(self):
+        return '''
+        What are you? A well-informed, intelligent chatbot that can talk to a given codebase.
+        What do you do? You are always given some file content from a codebase and a question/prompt. Your job is to generate a response.
+        What should be the tone of your output? It should be friendly, helpful, confident, narrative.
+        What outputs can we expect from you? You can be asked to generate documentation, code, or anything else relevant only to the given codebase content.
+        '''
+
+    @staticmethod
+    def __clean_repo_name(name):
+        return name.replace('-', '_')
+
+    @staticmethod
+    def __declean_repo_name(name):
+        return name.replace('_', '-')
+
+    def __add_repo_data_to_db(self):
+        data = self.loader.load()
+        print(f'Length of Data to Add: {len(data)}')
+        print('Adding Data to Milvus Vector DB')
+        self.vector_db.add_documents(documents=data)
+        print('Done Adding Data to Milvus Vector DB')
+
+    def add_repo(self, repo_url):
+        repo_name = repo_url.split('/')[-1]
+        repo_save_path = "./Data/Repos"
+        if not os.path.exists(repo_save_path):
+            os.makedirs(repo_save_path)
+        else:
+            shutil.rmtree(repo_save_path)
+            os.makedirs(repo_save_path)
+        repo_save_path = repo_save_path + "/" + self.__clean_repo_name(repo_name)
+
+        print(f'Cloning the repo from: {repo_url}')
+        repo = Repo.clone_from(
+            repo_url,
+            to_path=repo_save_path,
+            branch="master"
+        )
+        print(f'Repo Cloned to: {repo_save_path}')
+        self.repo_save_path = repo_save_path
+        self.branch = repo.head.reference
+        self.loader = GitLoader(repo_path=repo_save_path, branch=self.branch)
+        self.__add_repo_data_to_db()
+
+    def load_repo(self):
+        repo_save_path = "./Data/Repos"
+        repo_name = os.listdir(repo_save_path)[0]
+        self.repo_save_path = repo_save_path + "/" + repo_name
+        self.branch = "master"
+        print(f'Loading repo: {repo_name}')
+        print(f'Branch: {self.branch}')
+        print(f'Repo path: {self.repo_save_path}')
+        self.loader = GitLoader(repo_path=self.repo_save_path, branch=self.branch)
+        self.__add_repo_data_to_db()
+
+    def __retrieve_documents(self, prompt, k=3):
+        retrieved_documents = self.vector_db.similarity_search(
+            prompt,
+            k=k
+        )
+        return retrieved_documents
+
+    @staticmethod
+    def __concatenate_documents(documents):
+        print(f'Length of docs to concatenate: {len(documents)}')
+        All_content = ''
+        for idx, doc in enumerate(documents):
+            print(f"Retrieved Document: {idx} --- [{doc.metadata}]")
+            All_content += "Chunk:" + str(idx) + ":\n" + doc.page_content + "\n\n"
+        print("\n\n")
+        return All_content
+
+    def query(self, prompt):
+        retrieved_documents = self.__retrieve_documents(prompt)
+        context = self.__concatenate_documents(retrieved_documents)
+
+        messages = [
+            (
+                "system",
+                f"{self.system_prompt}",
+            ),
+            (
+                "human",
+                f"Context from codebase:{context}\nUser query prompt:{prompt}\nResponse:\n",
+            )
+        ]
+
+        response = self.llm.invoke(messages)
+        return response.content
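For orientation, here is a minimal driver sketch (not part of the commit) showing how the GitHubGPT class in RAG.py is intended to be used. The repository URL is a placeholder, and the sketch assumes OPENAI_API_KEY is set in a local .env file and that the target repo has a master branch, since add_repo clones that branch explicitly.

from RAG import GitHubGPT

bot = GitHubGPT()                                    # builds the embeddings client, Milvus Lite store, and GPT-4o chat model
bot.add_repo("https://github.com/<user>/<repo>")     # placeholder URL: clones into ./Data/Repos and indexes the files
# bot.load_repo()                                    # alternative: re-index a repo already present under ./Data/Repos
answer = bot.query("Give me a high-level overview of this codebase.")
print(answer)                                        # the top-3 retrieved chunks are passed to the LLM as context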
app.py ADDED
@@ -0,0 +1,27 @@
+import streamlit as st
+from dotenv import load_dotenv
+import os
+
+# Assuming the GitHubGPT class is in the same directory
+from RAG import GitHubGPT
+
+# Load environment variables
+load_dotenv()
+
+# Initialize the chatbot object
+gpt_bot = GitHubGPT()
+
+# Streamlit UI
+st.title("GitHubGPT Chatbot")
+st.write("Interact with your codebase through this RAG-based chatbot!")
+
+# User input
+user_input = st.text_input("Ask a question about the codebase:")
+
+if st.button("Get Response"):
+    if user_input:
+        # Get response from the chatbot
+        response = gpt_bot.query(user_input)
+        st.write("Response:", response)
+    else:
+        st.write("Please enter a question.")
requirements.txt ADDED
@@ -0,0 +1,110 @@
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+altair==5.4.1
+annotated-types==0.7.0
+anyio==4.4.0
+asttokens==2.4.1
+attrs==24.2.0
+blinker==1.8.2
+cachetools==5.5.0
+certifi==2024.7.4
+charset-normalizer==3.3.2
+click==8.1.7
+comm==0.2.2
+dataclasses-json==0.6.7
+debugpy==1.8.5
+decorator==5.1.1
+distro==1.9.0
+environs==9.5.0
+executing==2.0.1
+frozenlist==1.4.1
+gitdb==4.0.11
+GitPython==3.1.43
+greenlet==3.0.3
+grpcio==1.63.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+idna==3.8
+ipykernel==6.29.5
+ipython==8.26.0
+jedi==0.19.1
+Jinja2==3.1.4
+jiter==0.5.0
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+jupyter_client==8.6.2
+jupyter_core==5.7.2
+langchain==0.2.15
+langchain-community==0.2.13
+langchain-core==0.2.35
+langchain-milvus==0.1.4
+langchain-ollama==0.1.1
+langchain-openai==0.1.23
+langchain-text-splitters==0.2.2
+langsmith==0.1.105
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.22.0
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+milvus-lite==2.4.9
+multidict==6.0.5
+mypy-extensions==1.0.0
+narwhals==1.5.5
+nest-asyncio==1.6.0
+numpy==1.26.4
+ollama==0.3.1
+openai==1.42.0
+orjson==3.10.7
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+pexpect==4.9.0
+pillow==10.4.0
+platformdirs==4.2.2
+prompt_toolkit==3.0.47
+protobuf==5.27.3
+psutil==6.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydeck==0.9.1
+Pygments==2.18.0
+pymilvus==2.4.5
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+pytz==2024.1
+PyYAML==6.0.2
+pyzmq==26.2.0
+referencing==0.35.1
+regex==2024.7.24
+requests==2.32.3
+rich==13.8.0
+rpds-py==0.20.0
+scipy==1.14.1
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+SQLAlchemy==2.0.32
+stack-data==0.6.3
+streamlit==1.38.0
+tenacity==8.5.0
+tiktoken==0.7.0
+toml==0.10.2
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.2
+watchdog==4.0.2
+wcwidth==0.2.13
+yarl==1.9.4
vector_db/.milvus_example.db.lock ADDED
File without changes
vector_db/milvus_example.db ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee45ed323d10fe46a53948bc0376bb78f33801725a318856b70196bee23fb3fc
+size 19869696