Commit 9bf726b · feat: added comments to class
Parent: 4c0b3a6

Files changed:
- README.md (+13, -0)
- model.py (+55, -4)
- pages/upload_file.py (+2, -1)
- pages/upload_url.py (+2, -1)
- utilis.py (+10, -14)
README.md (new file)

```diff
@@ -0,0 +1,13 @@
+# Drake
+**Make notes without the mess**
+
+DrakeLM helps students make notes from videos and documents with LLMs. Utilising RAG, it produces quick notes along with a Q&A bot. Books, YouTube tutorials, or videos: Drake supports all of your sources.
+
+## Features
+- **Quick Notes**: Make notes quickly with Drake.
+- **Q&A Bot**: Ask questions and get answers from Drake.
+
+## Upcoming Features
+- **Image Support**: Querying images on similarity criteria.
+- **Image for Context**: Using images as context in multimodal models like Llava.
+- **Completely Open Source**: Running the app entirely on open-source models such as Llava and Llama.
```
model.py

```diff
@@ -10,6 +10,15 @@ from langchain_core.documents.base import Document
 
 class DrakeLM:
     def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
+        """
+        Parameters:
+            model_path (str): The path to the model when running Llama
+            db (DeepLake): The DeepLake DB object
+            config (dict): The configuration for the Llama model
+            llm_model (str): The LLM model type
+
+        Initialize the DrakeLM model
+        """
         self.llm_model = llm_model
 
         if llm_model == "llama":
@@ -25,7 +34,18 @@ class DrakeLM:
         self.notes_prompt = load_prompt("prompt_templates/notes_prompt.yaml")
         self.chat_prompt = load_prompt("prompt_templates/chat_prompt.yaml")
 
-    def _chat_prompt(self, query: str, context: str):
+    def _chat_prompt(self, query: str, context: str) -> (PromptTemplate, str):
+        """
+        Parameters:
+            query (str): The question asked by the user
+            context (str): The context retrieved from the DB
+
+        Returns:
+            PromptTemplate: The prompt template for the chat
+            prompt (str): The prompt string for the chat
+
+        Create the chat prompt for the LLM model
+        """
         prompt = """You are assisting a student to understand topics. \n\n
         You have to answer the below question by utilising the below context to answer the question. \n\n
         Note to follow the rules given below \n\n
@@ -46,7 +66,19 @@ class DrakeLM:
         prompt = prompt.format(query=query, context=context, rules=rules)
         return PromptTemplate.from_template(prompt), prompt
 
-    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos"):
+    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos") -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+            k (int): The number of documents to retrieve
+            distance_metric (str): The distance metric for retrieval
+
+        Returns:
+            str: The context retrieved from the DB
+
+        Retrieve the context from the DB
+        """
         self.retriever.search_kwargs["distance_metric"] = distance_metric
         self.retriever.search_kwargs["k"] = k
 
@@ -65,7 +97,17 @@ class DrakeLM:
 
         return context
 
-    def ask_llm(self, query: str, metadata_filter: dict = None):
+    def ask_llm(self, query: str, metadata_filter: dict = None) -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+
+        Returns:
+            str: The response from the LLM model
+
+        Ask the LLM model a question
+        """
         context = self._retrieve(query, metadata_filter)
         print("Retrieved context")
         prompt_template, prompt_string = self._chat_prompt(query, context)
@@ -89,7 +131,16 @@ class DrakeLM:
 
         return self.chat_history.messages[-1].content
 
-    def create_notes(self, documents: List[Document]):
+    def create_notes(self, documents: List[Document]) -> str:
+        """
+        Parameters:
+            documents (List[Document]): The list of documents to create notes from
+
+        Returns:
+            str: The notes generated from the LLM model
+
+        Create notes from the LLM model
+        """
         rules = """
         - Follow the Markdown format for creating notes as shown in the example.
         - The heading of the content should be the title of the markdown file.
```
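Taken together, the new docstrings describe DrakeLM's whole public surface: construct it against a DeepLake store, call `ask_llm` for Q&A, and `create_notes` for note generation. A minimal usage sketch under those assumptions; the import path, dataset path, and empty `config` are placeholders rather than values from this commit:

```python
from langchain_community.vectorstores import DeepLake  # assumed import path

from model import DrakeLM

# Hypothetical dataset path; the embedding function is omitted for brevity.
db = DeepLake(dataset_path="database/example_course", read_only=True)

# model_path and config only matter when llm_model="llama", per the docstring.
drake = DrakeLM(model_path="", db=db, config={}, llm_model="gemini-pro")

# ask_llm retrieves top-k chunks via _retrieve (k=3, cosine by default),
# wraps them with _chat_prompt, and returns the answer as a plain string.
answer = drake.ask_llm("Summarise gradient descent", metadata_filter=None)
print(answer)
```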
pages/upload_file.py

```diff
@@ -16,7 +16,8 @@ if st.button("Youtube/Video URL"):
 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
 allow_make_notes = st.toggle('Make Complete Notes!')
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
+llm_model = st.selectbox('Choose LLM Model', 'gemini-pro')
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 
```
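One caveat with the replacement line: Streamlit's `selectbox` treats `options` as an iterable, and a bare string is itself iterable, so `'gemini-pro'` will likely be rendered as one option per character. A one-element tuple avoids this; a hedged sketch:

```python
import streamlit as st

# The trailing comma makes this a one-element tuple, so 'gemini-pro' stays
# a single option instead of being split into 'g', 'e', 'm', ...
llm_model = st.selectbox('Choose LLM Model', ('gemini-pro',))
```

The same pattern, and the same fix, applies to the identical change in pages/upload_url.py below.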
pages/upload_url.py

```diff
@@ -13,7 +13,8 @@ if st.button("PDF/Transcript"):
 
 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
+llm_model = st.selectbox('Choose LLM Model', 'gemini-pro')
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 allow_make_notes = st.toggle('Make Complete Notes!')
```
utilis.py

```diff
@@ -13,16 +13,12 @@ from typing import Dict
 import uuid
 
 
-
 class Processing:
-    def __init__(self, dataset_path: str, embedding_model_name: str,
-                 device='cpu', chunk_size=500, chunk_overlap=5):
+    def __init__(self, dataset_path: str, embedding_model_name: str, chunk_size=500, chunk_overlap=5):
         """
         Parameters:
             dataset_path (str): Path to the dataset in the Vector-DB
-            file_path (str): Path to the file to be processed
             embedding_model_name (str): Name of the HuggingFace model to be used for embeddings
-            device (str): Device to run the embedding model on
             chunk_size (int): Size of each chunk to be processed
             chunk_overlap (int): Overlap between each chunk
 
@@ -34,7 +30,6 @@ class Processing:
 
         self.embedding_model = HuggingFaceEmbeddings(
             model_name=embedding_model_name,
-            model_kwargs={'device': device},
             encode_kwargs={'normalize_embeddings': False}
         )
 
@@ -43,8 +38,8 @@ class Processing:
             exec_option="compute_engine"
         )
 
-    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
-                      course_tag=""):
+    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
+                      course_tag="") -> (List[Document], Dict[str, str]):
         """
         Parameters:
             documents (List[Document]): List of documents to add metadata to
@@ -54,7 +49,7 @@ class Processing:
             course_tag (str): Tag to identify the course the documents belongs to
 
         Returns:
-            documents (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Add metadata to the documents
         """
@@ -69,10 +64,10 @@ class Processing:
         doc.metadata = metadata
         return documents, metadata
 
-    def load_pdf(self, text):
+    def load_pdf(self, text) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Load PDF file, split into chunks and add metadata
         """
@@ -83,7 +78,7 @@ class Processing:
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Load transcript, split into chunks and add metadata
         """
@@ -91,12 +86,13 @@ class Processing:
         print("Transcribed")
         transcript_chunk = self.text_splitter.create_documents([transcript.text])
         print("Created transcript chunks")
-        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video", file_type="transcript")
+        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
+                                  file_type="transcript")
 
     def load_yt_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Load YouTube transcript, split into chunks and add metadata
         """
```
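A side note on the return annotations this commit adds throughout: forms like `-> (List[Document], Dict[str, str])` evaluate as plain tuples at runtime and are not recognised by static type checkers. If stricter typing is wanted later, the conventional spelling is `typing.Tuple`; a minimal sketch reusing the imports utilis.py already has:

```python
from typing import Dict, List, Tuple

from langchain_core.documents.base import Document


class Processing:
    # Checker-friendly equivalent of "-> (List[Document], Dict[str, str])";
    # the body is elided, only the signature is the point here.
    def load_pdf(self, text) -> Tuple[List[Document], Dict[str, str]]:
        ...
```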