import os

import streamlit as st
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
import google.generativeai as genai
import git  # pip install gitpython

genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# Embed on CPU with the default sentence-transformers model.
model_kwargs = {'device': 'cpu'}
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map='auto', quantization_config=quantization_config)
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1000)
# llm = HuggingFacePipeline(pipeline=pipe)

# def clone_repo(repo):
#     if os.path.exists("githubCode") and os.path.isdir("githubCode"):
#         print("File already exists!!")
#         pass
#     else:
#         print("Cloning repo!!")
#         git.Repo.clone_from(repo, "githubCode")

# git.Repo.clone_from("https://github.com/Divyansh3021/Github_code_assistant.git", "githubCode")

llm = genai.GenerativeModel('gemini-pro')


def get_folder_paths(directory="githubCode"):
    # Start with the root itself so top-level files are indexed as well.
    folder_paths = [directory]
    for root, dirs, files in os.walk(directory):
        if '.git' in dirs:
            # Skip the .git folder entirely
            dirs.remove('.git')
        for dir_name in dirs:
            folder_paths.append(os.path.join(root, dir_name))
    return folder_paths


directory_paths = get_folder_paths()
directory_paths.append("Code")
print("directory_paths: ", directory_paths)

# Concatenate every source file into a single text file for indexing.
with open("Code.txt", "w", encoding='utf-8') as output:
    for directory_path in directory_paths:
        if not os.path.isdir(directory_path):
            continue
        for filename in os.listdir(directory_path):
            if filename.endswith((".py", ".ipynb", ".js", ".ts")):
                filepath = os.path.join(directory_path, filename)
                with open(filepath, "r", encoding='utf-8') as file:
                    code = file.read()
                output.write(f"Filepath: {filepath}:\n\n")
                output.write(code + "\n\n")

# for filename in os.listdir(directory_path):
#     if filename.endswith(".txt"):  # Only process .txt files
#         file_path = os.path.join(directory_path, filename)

loader = TextLoader("Code.txt", encoding="utf-8")
pages = loader.load_and_split()

# Split data into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=20,
    add_start_index=True,
)
chunks = text_splitter.split_documents(pages)

# Store data into the vector database
db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory="test_index")
db.persist()

# Load the database
vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)

# Load the retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 3})


# Generate the assistant's response: retrieve relevant chunks, then ask Gemini.
def generate_assistant_response(question):
    context = retriever.get_relevant_documents(question)
    qna_prompt_template = f"""### [INST] Instruction: You will be provided with questions and context. Your task is to find the answers to the questions using the given data.
Context: ``` {context} ```
### Question: {question} [/INST]"""
    print("Context: ", context)
    answer = llm.generate_content(qna_prompt_template).text
    return answer

# print(generate_assistant_response("Tell me about the instructor_embeddings function."))
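
# --- Minimal usage sketch (not part of the original flow) ---
# A hedged example of wiring generate_assistant_response into a Streamlit chat
# UI, since streamlit is imported above but never used. Assumes Streamlit >= 1.24
# (st.chat_message / st.chat_input); the title and placeholder strings are
# illustrative only. In a real app the indexing steps above should be cached
# (e.g. behind st.cache_resource) so they do not rerun on every interaction.

st.title("GitHub Code Assistant")

# Keep the chat history across Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay earlier turns.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Handle a new question from the user.
if question := st.chat_input("Ask about the codebase"):
    st.session_state.messages.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.markdown(question)

    answer = generate_assistant_response(question)
    st.session_state.messages.append({"role": "assistant", "content": answer})
    with st.chat_message("assistant"):
        st.markdown(answer)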