Divyanshh committed
Commit d445e66
1 Parent(s): 8336356

Update util.py

Files changed (1)
  1. util.py +18 -63
util.py CHANGED
@@ -1,48 +1,17 @@
 import os
 import streamlit as st
-from langchain_community.embeddings import HuggingFaceEmbeddings
-from sentence_transformers import SentenceTransformer
+from langchain_community.embeddings import HuggingFaceHubEmbeddings
 from langchain_community.vectorstores import Chroma
-# from langchain.llms.huggingface_pipeline import HuggingFacePipeline
-# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
-from langchain.chains.question_answering import load_qa_chain
-from langchain.prompts import PromptTemplate
+from langchain.chains import RetrievalQA
+from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 
-# import os
-import google.generativeai as genai
+import git
 
-import git # pip install gitpython
+from chromadb.utils import embedding_functions
 
-genai.configure(api_key = os.environ['GOOGLE_API_KEY'])
+embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=os.environ['GOOGLE_API_KEY'], task_type="retrieval_query")
 
-# quantization_config = BitsAndBytesConfig(
-#     load_in_4bit=True,
-#     bnb_4bit_compute_dtype=torch.bfloat16
-# )
-
-
-model_kwargs = {'device': 'cpu'}
-embeddings = HuggingFaceEmbeddings(model_name="michaelfeil/ct2fast-e5-small", model_kwargs=model_kwargs)
-# embeddings = SentenceTransformer(model_name_or_path="All-MiniLM-L6-v2")
-
-# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
-# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map='auto', quantization_config = quantization_config)
-
-# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens = 1000)
-# llm = HuggingFacePipeline(pipeline=pipe)
-
-
-# def clone_repo(repo):
-#     if os.path.exists("githubCode") and os.path.isdir("githubCode"):
-#         print("File already exists!!")
-#         pass
-#     else:
-#         print("Cloning repo!!")
-#         git.Repo.clone_from(repo, "githubCode")
-
-# git.Repo.clone_from("https://github.com/Divyansh3021/Github_code_assistant.git", "githubCode")
-
-llm = genai.GenerativeModel('gemini-pro')
+model = ChatGoogleGenerativeAI(model="gemini-pro", google_api_key=os.environ['GOOGLE_API_KEY'], temperature=0.2, convert_system_message_to_human=True)
 
 def get_folder_paths(directory = "githubCode"):
     folder_paths = []
@@ -71,40 +40,26 @@ with open("Code.txt", "w", encoding='utf-8') as output:
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import TextLoader
 
-# for filename in os.listdir(directory_path):
-#     if filename.endswith(".txt"): # Only process PD files
-#         file_path = os.path.join(directory_path, filename)
 loader = TextLoader("Code.txt", encoding="utf-8")
 pages = loader.load_and_split()
 
 # Split data into chunks
-text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size = 2000,
-    chunk_overlap = 20,
-    add_start_index = True,
-)
-chunks = text_splitter.split_documents(pages)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
+context = "\n\n".join(str(p.page_content) for p in pages)
+texts = text_splitter.split_text(context)
 
-# Store data into database
-db = Chroma.from_documents(chunks, embedding=embeddings, persist_directory="test_index")
-db.persist()
+vector_index = Chroma.from_texts(texts, embeddings).as_retriever(search_kwargs={"k": 5})
 
-# Load the database
-vectordb = Chroma(persist_directory="test_index", embedding_function = embeddings)
-
-# Load the retriver
-retriever = vectordb.as_retriever(search_kwargs = {"k": 3})
+qa_chain = RetrievalQA.from_chain_type(
+    model,
+    retriever=vector_index,
+    return_source_documents=True
+
+)
 
 # Function to generate assistant's response using ask function
 def generate_assistant_response(question):
-    context = retriever.get_relevant_documents(question)
-    qna_prompt_template = f"""### [INST] Instruction: You will be provided with questions and context. Your task is to find the answers to the questions using the given data.'
-    Context: ```
-    {context}
-    ```
-    ### Question: {question} [/INST]"""
-    print("Context: ", context)
-    answer = llm.generate_content(qna_prompt_template).text
-    return answer
+    answer = qa_chain({"query": question})
+    return answer['result']
 
 # print(generate_assistant_response("Tell me about the instructor_embeddings function."))
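Taken together, the code after this commit implements the following retrieval flow. The block below is a minimal reconstruction, not part of the commit, assuming GOOGLE_API_KEY is set and Code.txt has already been written by the repo-walking code above; the query string is illustrative:

    import os

    from langchain.chains import RetrievalQA
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.document_loaders import TextLoader
    from langchain_community.vectorstores import Chroma
    from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

    # Gemini embeddings for retrieval and gemini-pro for answering, as in the commit.
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=os.environ["GOOGLE_API_KEY"],
        task_type="retrieval_query",
    )
    model = ChatGoogleGenerativeAI(
        model="gemini-pro",
        google_api_key=os.environ["GOOGLE_API_KEY"],
        temperature=0.2,
        convert_system_message_to_human=True,
    )

    # Load the concatenated repo dump and split it into overlapping chunks.
    pages = TextLoader("Code.txt", encoding="utf-8").load_and_split()
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    texts = splitter.split_text("\n\n".join(p.page_content for p in pages))

    # Build an in-memory Chroma index and wrap it as a top-5 retriever.
    retriever = Chroma.from_texts(texts, embeddings).as_retriever(search_kwargs={"k": 5})
    qa_chain = RetrievalQA.from_chain_type(model, retriever=retriever, return_source_documents=True)

    result = qa_chain({"query": "Tell me about the instructor_embeddings function."})
    print(result["result"])                 # answer text
    print(len(result["source_documents"]))  # retrieved chunks backing the answer

Because return_source_documents=True is set, the chain returns a dict rather than a bare string, which is why the new generate_assistant_response indexes answer['result'].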
 
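One behavioral change worth flagging: the removed code persisted the Chroma index to test_index and reloaded it on start, while the new code re-embeds Code.txt and rebuilds the index in memory on every run. It also leaves HuggingFaceHubEmbeddings, git, and chromadb.utils.embedding_functions imported but unused. If re-embedding on each start becomes too slow, the old persistence pattern still works with the new embeddings; a sketch continuing from the one above, with the test_index directory name carried over from the removed code:

    # First run: build the index once and write it to disk.
    db = Chroma.from_texts(texts, embeddings, persist_directory="test_index")
    db.persist()

    # Later runs: reopen the persisted index instead of re-embedding Code.txt.
    vectordb = Chroma(persist_directory="test_index", embedding_function=embeddings)
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})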