# nile / util.py
# Hugging Face file-viewer header (page residue, not program code): last commit
# "Update util.py" by Divyanshh, 65037fc (verified); no virus flagged; 4.45 kB.
import os
import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
# import os
import google.generativeai as genai
import git # pip install gitpython
# Authenticate the Gemini client. The key is looked up directly in the
# environment, so an unset GOOGLE_API_KEY raises KeyError at import time.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# Embedding model shared by index construction and querying; pinned to the CPU.
model_kwargs = dict(device="cpu")
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

# Answers are generated by the hosted Gemini model. An earlier revision ran a
# local 4-bit quantized "mistralai/Mistral-7B-Instruct-v0.2" through a
# transformers text-generation pipeline (max_new_tokens=1000) wrapped in
# HuggingFacePipeline; that path is disabled.
#
# The code to index is read from the "githubCode" directory. An earlier
# revision filled it with git.Repo.clone_from(repo, "githubCode") whenever the
# directory did not exist yet; that cloning step is disabled as well.
llm = genai.GenerativeModel("gemini-pro")
def get_folder_paths(directory="githubCode", skip_dirs=(".git",)):
    """Return the path of every sub-directory found beneath ``directory``.

    The tree is walked top-down. Directories whose name appears in
    ``skip_dirs`` are neither reported nor descended into. ``directory``
    itself is not part of the result; only the folders below it are.

    Args:
        directory: Root of the tree to walk.
        skip_dirs: Directory name, or iterable of names, to prune at every
            level. Defaults to ``(".git",)``.

    Returns:
        A list of paths, each built by joining the walked parent directory
        with the sub-directory name. The list is empty when ``directory``
        has no sub-directories or cannot be listed (``os.walk`` ignores
        listing errors by default).
    """
    if isinstance(skip_dirs, str):
        # A bare string would otherwise be treated as a set of characters.
        skip_dirs = (skip_dirs,)
    excluded = frozenset(skip_dirs)

    folder_paths = []
    for root, dirs, _files in os.walk(directory):
        # Assigning through the slice edits the very list os.walk holds,
        # which is what stops it from descending into pruned directories.
        dirs[:] = [name for name in dirs if name not in excluded]
        folder_paths.extend(os.path.join(root, name) for name in dirs)
    return folder_paths
# Directories whose files are dumped into the corpus: every folder below
# "githubCode" plus the local "Code" folder.
# NOTE(review): get_folder_paths() does not return "githubCode" itself, so
# files sitting directly in that directory are not collected here -- confirm
# that this is intended.
directory_paths = get_folder_paths()
directory_paths.append("Code")
print("directory_paths: ", directory_paths)

# Flatten every collected source/documentation file into one text file that
# is indexed afterwards.
with open("Code.txt", "w", encoding='utf-8') as output:
    for directory_path in directory_paths:
        # A missing folder (typically the hard-coded "Code") used to abort
        # the whole import with FileNotFoundError, while a missing
        # "githubCode" was silently tolerated. Skip it so the directories
        # that do exist are still indexed.
        if not os.path.isdir(directory_path):
            print("Skipping missing directory: ", directory_path)
            continue
        for filename in os.listdir(directory_path):
            filepath = os.path.join(directory_path, filename)
            if filename.endswith((".py", ".ipynb", ".js", ".ts")):
                # Source files are labelled with their path.
                header = f"Filepath: {filepath}:\n\n"
            elif filename.endswith(".txt"):
                # Plain-text files are labelled as documentation.
                header = "Documentation list:\n\n"
            else:
                continue
            with open(filepath, "r", encoding="utf-8") as file:
                code = file.read()
            output.write(header)
            output.write(code + "\n\n")
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
# Load the flattened corpus as a single document. load() is used instead of
# load_and_split(): the latter already chunks the text with a default
# splitter, so the splitter configured below only ever saw pre-cut pieces --
# its chunk_overlap was effectively bypassed and start_index was relative to
# each piece rather than to Code.txt.
loader = TextLoader("Code.txt", encoding="utf-8")
pages = loader.load()

# Split data into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,  # record each chunk's offset in its metadata
)
chunks = text_splitter.split_documents(pages)

# Store data into database
# NOTE(review): from_documents adds to whatever collection already exists in
# "test_index", so re-running against a kept directory presumably accumulates
# duplicate chunks -- confirm whether the directory should be cleared first.
db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory="test_index")
db.persist()

# Load the database
vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)

# Load the retriever; each query returns the 3 closest chunks.
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
# Generate the assistant's response to a question from the retrieved context.
def generate_assistant_response(question):
    """Answer ``question`` from the indexed code via retrieval plus Gemini.

    The module-level ``retriever`` fetches the chunks most relevant to the
    question, their text is placed in the prompt as context, and the
    module-level ``llm`` generates the reply.

    Args:
        question: The user's question as plain text.

    Returns:
        The text of the model's response.
    """
    context = retriever.get_relevant_documents(question)
    # Put the chunks' text in the prompt. Interpolating the list itself
    # inserts the repr of the Document objects (escaped newlines, metadata
    # dicts) instead of readable source code.
    context_text = "\n\n".join(doc.page_content for doc in context)
    qna_prompt_template = f"""### [INST] Instruction: You will be provided with questions and context. Your task is to find the answers to the questions using the given data. If the data doesn't contain the answer to the question, then you must return 'Not enough information.'
Context: ```
{context_text}
```
### Question: {question} [/INST]"""
    # Debug output: the raw retrieved documents, including their metadata.
    print("Context: ", context)
    answer = llm.generate_content(qna_prompt_template).text
    return answer
# print(generate_assistant_response("Tell me about the instructor_embeddings function."))