import os

import streamlit as st
import google.generativeai as genai
import git  # pip install gitpython
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Alternative local-LLM setup, kept for reference:
# import torch
# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )

# Sentence-transformers embeddings on CPU (uses the library's default model).
model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# model = AutoModelForCausalLM.from_pretrained(
#     "mistralai/Mistral-7B-Instruct-v0.2",
#     device_map="auto",
#     quantization_config=quantization_config,
# )
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1000)
# llm = HuggingFacePipeline(pipeline=pipe)


def clone_repo(repo, dest="githubCode"):
    """Clone `repo` into `dest`, skipping the clone if it already exists."""
    if os.path.isdir(dest):
        print("Repo already cloned!")
    else:
        print("Cloning repo!")
        git.Repo.clone_from(repo, dest)


# clone_repo("https://github.com/Divyansh3021/Github_code_assistant.git")

llm = genai.GenerativeModel("gemini-pro")


def get_folder_paths(directory="githubCode"):
    """Collect `directory` and every subfolder under it, skipping .git."""
    folder_paths = [directory]
    for root, dirs, files in os.walk(directory):
        if ".git" in dirs:
            dirs.remove(".git")  # don't descend into the .git folder
        for dir_name in dirs:
            folder_paths.append(os.path.join(root, dir_name))
    return folder_paths


directory_paths = get_folder_paths()
directory_paths.append("Code")
print("directory_paths: ", directory_paths)

# Concatenate every source and documentation file into a single Code.txt.
with open("Code.txt", "w", encoding="utf-8") as output:
    for directory_path in directory_paths:
        if not os.path.isdir(directory_path):
            continue  # skip listed paths (e.g. "Code") that don't exist
        for filename in os.listdir(directory_path):
            filepath = os.path.join(directory_path, filename)
            if filename.endswith((".py", ".ipynb", ".js", ".ts")):
                with open(filepath, "r", encoding="utf-8") as file:
                    code = file.read()
                output.write(f"Filepath: {filepath}:\n\n")
                output.write(code + "\n\n")
            elif filename.endswith(".txt"):
                with open(filepath, "r", encoding="utf-8") as file:
                    code = file.read()
                output.write("Documentation list:\n\n")
                output.write(code + "\n\n")

loader = TextLoader("Code.txt", encoding="utf-8")
pages = loader.load_and_split()

# Split the data into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)
chunks = text_splitter.split_documents(pages)

# Embed the chunks and store them in a persistent Chroma index.
db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory="test_index")
db.persist()

# Reload the database and build a top-3 retriever over it.
vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)
retriever = vectordb.as_retriever(search_kwargs={"k": 3})
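# A hedged sketch (not in the original script): a quick way to eyeball what the
# retriever returns before involving the LLM. The query string is illustrative
# only; uncomment to run it once the index above has been built.
# for doc in retriever.get_relevant_documents("Where is the vector store built?"):
#     print(doc.metadata.get("source"), doc.metadata.get("start_index"))
#     print(doc.page_content[:200], "\n---")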
def generate_assistant_response(question):
    """Retrieve the top-k relevant chunks and ask Gemini to answer from them."""
    context = retriever.get_relevant_documents(question)
    qna_prompt_template = f"""### [INST] Instruction: You will be provided with questions and context. Your task is to find the answers to the questions using the given data. If the data doesn't contain the answer to the question, then you must return 'Not enough information.'

Context:
```
{context}
```

### Question:
{question} [/INST]"""
    print("Context: ", context)
    answer = llm.generate_content(qna_prompt_template).text
    return answer


# print(generate_assistant_response("Tell me about the instructor_embeddings function."))
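# A hedged sketch: a minimal Streamlit chat loop wired to the function above.
# The script imports streamlit but never uses it; this is one plausible way to
# hook it up, and the widget labels and session-state keys are assumptions.
st.title("GitHub Code Assistant")

if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay the conversation so far.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Accept a new question, answer it, and record both turns.
if question := st.chat_input("Ask a question about this codebase"):
    st.session_state.messages.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.markdown(question)
    answer = generate_assistant_response(question)
    st.session_state.messages.append({"role": "assistant", "content": answer})
    with st.chat_message("assistant"):
        st.markdown(answer)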