satendra4u2022 committed on
Commit
ae31157
1 Parent(s): 57508b4

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ chroma/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
chroma/2c377c44-f8d1-490a-87ff-86458f8adab8/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a56bfe039be3845d219acd5b590595ec7caa0e9a5d4bd61aa4c9407b5781eb34
3
+ size 25136000
chroma/2c377c44-f8d1-490a-87ff-86458f8adab8/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df6086851e0ed6b004edb9fcd6d976b1aee7d2424bc955702e7373dbdb5753f3
3
+ size 100
chroma/2c377c44-f8d1-490a-87ff-86458f8adab8/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87170a517300fef8206aeca2a7217e542e4f6f6af9aa0bda929e9eeed20515e2
3
+ size 230019
chroma/2c377c44-f8d1-490a-87ff-86458f8adab8/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:502471760e7e55abba071e374c9304b326d97d731ef4c5d91542be343a67530b
3
+ size 16000
chroma/2c377c44-f8d1-490a-87ff-86458f8adab8/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fa1bf89e90612633fb2e23793b030e1989531339a26e8c21a2a695501ecc4d
3
+ size 34768
chroma/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29c0d6469614d37b7eed2c39a30c5d1106fe23d895ae7b946fee6b732bf031c5
3
+ size 46448640
compare_embeddings.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain.embeddings import OpenAIEmbeddings
from langchain.evaluation import load_evaluator


def main():
    """Embed a sample word and compare the embeddings of two related words."""
    # Embed a single word and inspect the resulting vector.
    embedder = OpenAIEmbeddings()
    apple_vector = embedder.embed_query("apple")
    print(f"Vector for 'apple': {apple_vector}")
    print(f"Vector length: {len(apple_vector)}")

    # Score the pairwise embedding distance between two words.
    distance_evaluator = load_evaluator("pairwise_embedding_distance")
    words = ("apple", "iphone")
    comparison = distance_evaluator.evaluate_string_pairs(
        prediction=words[0], prediction_b=words[1]
    )
    print(f"Comparing ({words[0]}, {words[1]}): {comparison}")


if __name__ == "__main__":
    main()
create_database.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import DirectoryLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.schema import Document
4
+ from langchain.embeddings import OpenAIEmbeddings
5
+ from langchain.vectorstores.chroma import Chroma
6
+ import os
7
+ import shutil
8
+
9
# Directory where the persisted Chroma vector store is written.
CHROMA_PATH = "chroma"
# Directory containing the markdown source documents to index.
DATA_PATH = "data/final_crawl"
11
+
12
+
13
def main():
    """Entry point: build the Chroma vector store from the markdown corpus."""
    generate_data_store()
15
+
16
+
17
def generate_data_store():
    """Load the source documents, chunk them, and persist them to Chroma."""
    save_to_chroma(split_text(load_documents()))
21
+
22
+
23
def load_documents():
    """Read every markdown file under DATA_PATH into Document objects."""
    return DirectoryLoader(DATA_PATH, glob="*.md").load()
27
+
28
+
29
def split_text(documents: list[Document]):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        documents: Loaded source documents.

    Returns:
        The list of chunk Documents produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print a sample chunk for debugging. BUG FIX: the original indexed
    # chunks[10] unconditionally, which raises IndexError for any corpus
    # producing fewer than 11 chunks; clamp to the available range instead.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks
44
+
45
+
46
def save_to_chroma(chunks: list[Document]):
    """Persist the given chunks into a fresh Chroma database on disk."""
    # Wipe any previous database so the store reflects only this run.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Embed the chunks and write them to the persistent store.
    database = Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    database.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
57
+
58
+
59
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
query_data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from langchain.vectorstores.chroma import Chroma
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from llamaapi import LlamaAPI
5
+ from langchain.prompts import ChatPromptTemplate
6
+
7
# Directory where the persisted Chroma vector store lives.
CHROMA_PATH = "chroma"

# Prompt used to ground the model's answer in the retrieved context.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""
18
+
19
def generate_reworded_question(prompt):
    """Send *prompt* to the Llama chat API and return the generated replies.

    Args:
        prompt: The user prompt (question plus optional context) to send.

    Returns:
        A list of generated message strings, or an empty list on any error
        (callers rely on always receiving a list).
    """
    import os

    # SECURITY FIX: the API key was previously hard-coded in source, i.e. a
    # leaked credential checked into the repository. Read it from the
    # environment instead; the old key must be revoked and replaced.
    api_key = os.environ.get("LLAMA_API_KEY")
    if not api_key:
        print("Error generating reworded questions: LLAMA_API_KEY is not set")
        return []
    llama = LlamaAPI(api_key)

    # API Request
    api_request_json = {
        "model": "llama-13b-chat",
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 200,  # Set max_tokens to control the length of the generated question
        "temperature": 0.1,  # Adjust temperature to control the creativity of the generated question
        "top_p": 0.9  # Adjust top_p to control the diversity of the generated question
    }

    try:
        # Run llama
        response = llama.run(api_request_json)
        response_json = response.json()
        reworded_questions = [choice['message']['content'] for choice in response_json['choices']]
        return reworded_questions
    except Exception as e:
        # Best-effort: network/API failures degrade to an empty result.
        print(f"Error generating reworded questions: {e}")
        return []  # Return an empty list if there's an error
42
+
43
def main():
    """CLI entry point: answer a query using the Chroma store plus Llama."""
    # Parse the single positional CLI argument.
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    query_text = parser.parse_args().query_text

    # Open the persisted vector store with the same embedding function
    # that was used to build it.
    db = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=OpenAIEmbeddings(),
    )

    # Retrieve the three most relevant chunks with their relevance scores.
    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    if not results or results[0][1] < 0.7:
        # Nothing relevant enough: fall back to querying the model directly.
        response_text = generate_reworded_question(query_text)
        formatted_response = f"Response: {response_text}"
    else:
        # Build a context-grounded prompt from the retrieved chunks.
        context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
        prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
            context=context_text, question=query_text
        )
        response_text = generate_reworded_question(prompt)
        sources = [doc.metadata.get("source", None) for doc, _score in results]
        formatted_response = f"\n\nResponse:\n {response_text}\n\nSources: {sources}"

    print(formatted_response)


# Call the main function
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
+ unstructured # Document loading
+ chromadb # Vector storage
+ openai # For embeddings
+ tiktoken # For embeddings
+ llamaapi # Llama API client (imported by query_data.py, previously missing)