kausthubkannan17 committed
Commit 9bf726b · Parent: 4c0b3a6

feat: added comments to class

Files changed (5)
  1. README.md +13 -0
  2. model.py +55 -4
  3. pages/upload_file.py +2 -1
  4. pages/upload_url.py +2 -1
  5. utilis.py +10 -14
README.md CHANGED
@@ -0,0 +1,13 @@
+# Drake
+**Make Notes without mess**
+
+DrakeLLM helps students make notes from videos and documents using LLMs. Utilising RAG, Drake produces quick notes and doubles as a Q&A bot. Books, YouTube tutorials, or videos: Drake supports all your sources.
+
+## Features
+- **Quick Notes**: Make notes quickly with Drake.
+- **Q&A Bot**: Ask questions and get answers from Drake.
+
+## Upcoming Features
+- **Image Support**: Querying images on similarity criteria.
+- **Image for Context**: Using images as context in multimodal models like LLaVA.
+- **Completely Open Source**: Running the app entirely on open-source models like LLaVA and Llama.
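
Taken together with the code below, the flow the README describes is: chunk a source into tagged documents, store them in DeepLake, then retrieve and prompt an LLM. A minimal sketch under stated assumptions: the dataset path and embedding model are placeholders, and the `db` attribute on `Processing` is an assumption (its constructor is only partially visible in this diff).

```python
# Hypothetical end-to-end sketch of the Drake flow; not code from this repo.
# Assumed: the dataset path, the embedding model name, and that Processing
# exposes its DeepLake store as `db`.
from utilis import Processing
from model import DrakeLM

processing = Processing(
    dataset_path="hub://your-org/drake-notes",  # placeholder DeepLake path
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed model
)

# Split raw text into tagged chunks (see utilis.py below).
documents, metadata = processing.load_pdf(text="...text extracted from a PDF...")

# model_path only matters for the llama backend; gemini-pro is the default.
drake = DrakeLM(model_path="", db=processing.db, config={})
notes = drake.create_notes(documents)                             # quick notes
answer = drake.ask_llm("What is RAG?", metadata_filter=metadata)  # Q&A bot
```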
model.py CHANGED
@@ -10,6 +10,15 @@ from langchain_core.documents.base import Document
 
 class DrakeLM:
     def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
+        """
+        Parameters:
+            model_path (str): The path to the model in case running Llama
+            db (DeepLake): The DeepLake DB object
+            config (dict): The configuration for the llama model
+            llm_model (str): The LLM model type
+
+        Initialize the DrakeLM model
+        """
         self.llm_model = llm_model
 
         if llm_model == "llama":
@@ -25,7 +34,18 @@ class DrakeLM:
         self.notes_prompt = load_prompt("prompt_templates/notes_prompt.yaml")
         self.chat_prompt = load_prompt("prompt_templates/chat_prompt.yaml")
 
-    def _chat_prompt(self, query: str, context: str):
+    def _chat_prompt(self, query: str, context: str) -> (PromptTemplate, str):
+        """
+        Parameters:
+            query (str): The question asked by the user
+            context (str): The context retrieved from the DB
+
+        Returns:
+            PromptTemplate: The prompt template for the chat
+            prompt (str): The prompt string for the chat
+
+        Create the chat prompt for the LLM model
+        """
         prompt = """You are assisting a student to understand topics. \n\n
         You have to answer the below question by utilising the below context to answer the question. \n\n
         Note to follow the rules given below \n\n
@@ -46,7 +66,19 @@ class DrakeLM:
         prompt = prompt.format(query=query, context=context, rules=rules)
         return PromptTemplate.from_template(prompt), prompt
 
-    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos"):
+    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos") -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+            k (int): The number of documents to retrieve
+            distance_metric (str): The distance metric for retrieval
+
+        Returns:
+            str: The context retrieved from the DB
+
+        Retrieve the context from the DB
+        """
         self.retriever.search_kwargs["distance_metric"] = distance_metric
         self.retriever.search_kwargs["k"] = k
 
@@ -65,7 +97,17 @@ class DrakeLM:
 
         return context
 
-    def ask_llm(self, query: str, metadata_filter: dict = None):
+    def ask_llm(self, query: str, metadata_filter: dict = None) -> str:
+        """
+        Parameters:
+            query (str): The question asked by the user
+            metadata_filter (dict): The metadata filter for the DB
+
+        Returns:
+            str: The response from the LLM model
+
+        Ask the LLM model a question
+        """
         context = self._retrieve(query, metadata_filter)
         print("Retrieved context")
         prompt_template, prompt_string = self._chat_prompt(query, context)
@@ -89,7 +131,16 @@ class DrakeLM:
 
         return self.chat_history.messages[-1].content
 
-    def create_notes(self, documents: List[Document]):
+    def create_notes(self, documents: List[Document]) -> str:
+        """
+        Parameters:
+            documents (List[Document]): The list of documents to create notes from
+
+        Returns:
+            str: The notes generated from the LLM model
+
+        Create notes from the LLM model
+        """
         rules = """
         - Follow the Markdown format for creating notes as shown in the example.
         - The heading of the content should be the title of the markdown file.
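
The new docstrings pin down the retrieval contract: `_retrieve` mutates the retriever's `search_kwargs` before each query. Here is a standalone sketch of that same pattern; the import paths assume a recent LangChain layout, and the dataset path and embedding model are placeholders.

```python
# Sketch of the search_kwargs pattern used by DrakeLM._retrieve.
# Import paths vary by LangChain version; dataset path and model are placeholders.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import DeepLake

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = DeepLake(dataset_path="./drake_db", embedding=embeddings, read_only=True)

retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"  # cosine, as in _retrieve
retriever.search_kwargs["k"] = 3                    # top-3 chunks, the default above

docs = retriever.get_relevant_documents("What is retrieval-augmented generation?")
context = "\n\n".join(doc.page_content for doc in docs)  # fed into the chat prompt
```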
pages/upload_file.py CHANGED
@@ -16,7 +16,8 @@ if st.button("Youtube/Video URL"):
 st.subheader('Upload the file')
 uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
 allow_make_notes = st.toggle('Make Complete Notes!')
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama', 'Mobile phone'))
+llm_model = st.selectbox('Choose LLM Model', ('gemini-pro',))
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 
pages/upload_url.py CHANGED
@@ -13,7 +13,8 @@ if st.button("PDF/Transcript"):
 
 st.subheader('Enter the Video URL')
 video_url = st.text_input(label="Enter the URL")
-llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama', 'Mobile phone'))
+llm_model = st.selectbox('Choose LLM Model', ('gemini-pro',))
+# llm_model = st.selectbox('Choose LLM Model', ('gemini-pro', 'llama'))
 drake.llm_model = llm_model
 
 allow_make_notes = st.toggle('Make Complete Notes!')
utilis.py CHANGED
@@ -13,16 +13,12 @@ from typing import Dict
 import uuid
 
 
-
 class Processing:
-    def __init__(self, dataset_path: str, embedding_model_name: str,
-                 device='cpu', chunk_size=500, chunk_overlap=5):
+    def __init__(self, dataset_path: str, embedding_model_name: str, chunk_size=500, chunk_overlap=5):
         """
         Parameters:
             dataset_path (str): Path to the dataset in the Vector-DB
-            file_path (str): Path to the file to be processed
             embedding_model_name (str): Name of the HuggingFace model to be used for embeddings
-            device (str): Device to run the embedding model on
             chunk_size (int): Size of each chunk to be processed
             chunk_overlap (int): Overlap between each chunk
 
@@ -34,7 +30,6 @@ class Processing:
 
         self.embedding_model = HuggingFaceEmbeddings(
             model_name=embedding_model_name,
-            model_kwargs={'device': device},
             encode_kwargs={'normalize_embeddings': False}
         )
 
@@ -43,8 +38,8 @@ class Processing:
             exec_option="compute_engine"
         )
 
-    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str, course_tag="") -> (List[
-        Document], Dict[str, str]):
+    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str, file_type: str,
+                      course_tag="") -> (List[Document], Dict[str, str]):
         """
         Parameters:
             documents (List[Document]): List of documents to add metadata to
@@ -54,7 +49,7 @@ class Processing:
             course_tag (str): Tag to identify the course the documents belong to
 
         Returns:
-            documents (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Add metadata to the documents
         """
@@ -69,10 +64,10 @@ class Processing:
             doc.metadata = metadata
         return documents, metadata
 
-    def load_pdf(self, name, text) -> (List[Document], Dict[str, str]):
+    def load_pdf(self, text) -> (List[Document], Dict[str, str]):
         """
         Returns:
-            pdf_chunk (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Load PDF file, split into chunks and add metadata
         """
@@ -83,7 +78,7 @@ class Processing:
     def load_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-            transcript_chunk (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Load transcript, split into chunks and add metadata
         """
@@ -91,12 +86,13 @@ class Processing:
         print("Transcribed")
         transcript_chunk = self.text_splitter.create_documents([transcript.text])
         print("Created transcript chunks")
-        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video", file_type="transcript")
+        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video",
+                                  file_type="transcript")
 
     def load_yt_transcript(self, url) -> (List[Document], Dict[str, str]):
         """
         Returns:
-            yt_transcript_chunk (List[Document]): List of documents with metadata added
+            documents (List[Document], Dict[str, str]): List of documents with metadata added, along with the metadata
 
         Load YouTube transcript, split into chunks and add metadata
         """