kausthubkannan17 committed
Commit a828a8b • 1 parent: 5adde41
feat: interface and model pipeline
- .gitignore +2 -0
- app.py +46 -0
- examples/chat_examples/example_1.json +6 -0
- examples/chat_examples/example_2.json +6 -0
- examples/notes_examples/example_1.json +5 -0
- examples/notes_examples/example_2.json +5 -0
- model.py +106 -0
- pages/chat.py +43 -0
- pages/upload_file.py +69 -0
- pages/upload_url.py +65 -0
- prompt_templates/chat_prompt.yaml +179 -0
- prompt_templates/notes_prompt.yaml +208 -0
- requirements.txt +10 -0
- requiremnts.txt +0 -1
- utilis.py +157 -0
.gitignore
CHANGED
@@ -1,3 +1,5 @@
+.streamlit/secrets.toml
 .venv
 .idea
 __pycache__
+dev_clear.py
app.py
CHANGED
@@ -0,0 +1,46 @@
+import streamlit as st
+from model import DrakeLM
+from utilis import Processing
+
+initial_page = "pages/upload_url.py"
+
+
+@st.cache_resource()
+def initialize_models():
+    # Build the embedding/vector-store pipeline and the LLM wrapper once;
+    # st.cache_resource shares them across pages and reruns.
+    processing = Processing(
+        dataset_path=st.secrets["DEEPLAKE_DB_URL"],
+        embedding_model_name="mixedbread-ai/mxbai-embed-large-v1",
+        chunk_size=1300,
+    )
+    config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
+    drake = DrakeLM(
+        model_path="Mistral/mistral-7b-instruct-v0.2.Q5_K_S.gguf",
+        config=config,
+        db=processing.db,
+    )
+
+    return processing, drake
+
+
+def disable_sidebar():
+    # Collapse and hide the default multipage sidebar navigation.
+    st.set_page_config(page_title="Upload",
+                       page_icon=None,
+                       layout="centered",
+                       initial_sidebar_state="collapsed",
+                       )
+    no_sidebar_style = """
+        <style>
+            div[data-testid="stSidebarNav"] {display: none;}
+        </style>
+    """
+    st.markdown(no_sidebar_style, unsafe_allow_html=True)
+
+
+def main():
+    disable_sidebar()
+    initialize_models()
+    st.switch_page(initial_page)
+
+
+if __name__ == "__main__":
+    main()
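
For context, the DEEPLAKE_DB_URL read above comes from .streamlit/secrets.toml, which this commit adds to .gitignore. A minimal sketch of the expected entry (the value is a placeholder, not taken from the commit):

# .streamlit/secrets.toml  (kept out of version control)
# DEEPLAKE_DB_URL = "hub://<org>/<dataset>"   # placeholder Deep Lake path

import streamlit as st

dataset_path = st.secrets["DEEPLAKE_DB_URL"]  # fails at startup if the entry is missing
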
examples/chat_examples/example_1.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "query": "Assess DBSCAN, Dense cells and CLIQUE with appropriate steps. (8 marks)",
+  "context": "DBSACN\nStep 1: Randomly select a point p. Compute distance between P and ail other points '\nStep 2: Find all points]from p with respect to its neighbourhoud and check whether it has minimum number of points m. If 80, it is marked as a core point\nStep 3: If it is a core point, then a new cluster is formed, or existing cluster 1s enlarged.\nStep 4: [fit is a border point, then the algorithm moves to the next point and marks it as visited\nStep 5: If it is a noise point, they are removed.\nStep 6: Merge the clusters if it is mergeable, dist (cc )< ¢.\nStep 7: Repeat the process 3-6 till all Points are processed. \n\nDense Cell\nStep 1: Defining a set of grid points and assigning the given data points on the grid.\nStep 2: Determine the dense and sparse cells. lf the number of points in a cell exceeds the threshold\nvalue t, the cell is categorized as a dense cell. Sparse cells are removed from the list.\nStep 3: Merge the dense cells if they are adjacent.\nStep 4: Form a list of grid cells for every subspace as output.\n\nCLIQUE\nStage 1\nStep 1: Identify the dense cells\nStep 2: Merge dense cells c. and c, if they share the same interval.\nStep 3: Generate Apriori rule to generate (k + 1)\" cell tor higher dimension. Then, check\nwhether the number of points across the threshold This 1s repeated till there are no\ndense cells or a new generation of dense cells\n\nStage 2\nStep 1: Merging of dense cells into a cluster is carried out in each subspace using maximal regions to cover dense cells The maximal region is a hyperrectangle where all cells fall into.\nStep 2; Maximal region tries to cover all dense cells to form clusters.",
+  "rules": "- If the question says answer for X number of marks, you have to provide X number of points.\n - Each point has to be explained in 3-4 sentences.\n - In case the context express a mathematical equation, provide the equation in LaTeX format as shown in the example.\n - In case the user requests for a code snippet, provide the code snippet in the language specified in the example.- If the user requests to summarise or use the previous message as context ignoring the explicit context given in the message.",
+  "answer": "# Dense Cells\nStep 1: Define a set of grid points and assign the given data points on the grid.\nStep 2: Determine the dense and sparse cells. If the number of points in a cell exceeds the threshold value t, the cell is categorized as dense cell. Sparse cells are removed from the list.\nStep 3: Merge the dense cells if they are adjacent.\nStep 4: Form a list of grid cells for every subspace as output.\n\n#CILQUE\n **Stage 1:**\n- Step 1: Identify the dense cells.\n- Step 2: Merge dense cells c₁ and c₂ if they share the same interval.\n- Step 3: Generate a particle rule to generate (k + 1)th cell for higher dimension. Then, check whether the number of points cross the threshold. This is repeated till there are no dense cells or new generation of dense cells.\n\n**Stage 2:**\n- Step 1: Merging of dense cells into a cluster is carried out in each subspace using maximal regions to cover dense cells. The maximal region is an hyperrectangle where all cells fall into.\n- Step 2: Maximal region tries to cover all dense cells to form clusters.\n\n # DBSCAN\n- Step 1: Randomly select a point p. Compute distance between p and all other points.\n- Step 2: Find all points from p with respect to its neighborhood and check whether it has minimum number of points m. If so, it is marked as a core point.\n- Step 3: If it is a core point, then a new cluster is formed, or existing cluster is enlarged.\n- Step 4: If it is a border point, then the algorithm moves to the next point and marks it as visited.\n- Step 5: If it is a noise point, they are removed.\n- Step 6: Merge the clusters if it is mergeable, dist (c, c) < ɛ.\n- Step 7: Repeat the process 3-6 till all points are processed."
+}
examples/chat_examples/example_2.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "query": "Explain sharding in system design along with its benefits. (10 marks)",
+  "context": "It is a very important concept that helps the system to keep data in different resources\naccording to the sharding process. The word “Shard” means “a small part of a\nwhole“. Hence Sharding means dividing a larger part into smaller parts. In DBMS,\nSharding is a type of DataBase partitioning in which a large database is divided or\n\npartitioned into smaller data and different nodes. These shards are not only smaller,\nbut also faster and hence easily manageable.\nHow does Sharding work?\nIn a sharded system, the data is partitioned into shards based on a predetermined\ncriterion. For example, a sharding scheme may divide the data based on geographic\nlocation, user ID, or time period. Once the data is partitioned, it is distributed across\nmultiple servers or nodes. Each server or node is responsible for storing and processing a\nsubset of the data.\nExample:\n\nTo query data from a sharded database, the system needs to know which shard contains\nthe required data. This is achieved using a shard key, which is a unique identifier that is\nused to map the data to its corresponding shard. When a query is received, the system\nuses the shard key to determine which shard contains the required data and then sends\nthe query to the appropriate server or node.\nFeatures of Sharding:\n\uF0B7 Sharding makes the Database smaller\n\uF0B7 Sharding makes the Database faster\n\uF0B7 Sharding makes the Database much more easily manageable\n\uF0B7 Sharding can be a complex operation sometimes\n\uF0B7 Sharding reduces the transaction cost of the Database\n\uF0B7 Each shard reads and writes its own data.\n\uF0B7 Many NoSQL databases offer auto-sharding.\n\uF0B7 Failure of one shard doesn’t effect the data processing of other shards.\nBenefits of Sharding:\n1. Improved Scalability: Sharding allows the system to scale horizontally by adding more\nservers or nodes as the data grows. This improves the system’s capacity to handle\nlarge volumes of data and requests.\n\n2. Increased Performance: Sharding distributes the data across multiple servers or\nnodes, which improves the system’s performance by reducing the load on each server\nor node. This results in faster response times and better throughput.\n3. Fault Tolerance: Sharding provides a degree of fault tolerance as the system can\ncontinue to function even if one or more servers or nodes fail. This is because the data\nis replicated across multiple servers or nodes, and if one fails, the others can continue\nto serve the requests.\n4. Reduced Costs: Sharding allows the system to scale horizontally, which can be more\ncost-effective than scaling vertically by upgrading hardware. This is because horizontal\nscaling can be done",
+  "rules": "- If the question says answer for X number of marks, you have to provide X number of points.\n - Each point has to be explained in 3-4 sentences.\n - In case the context express a mathematical equation, provide the equation in LaTeX format as shown in the example.\n - In case the user requests for a code snippet, provide the code snippet in the language specified in the example.- If the user requests to summarise or use the previous message as context ignoring the explicit context given in the message.\n",
+  "answer": "Sharding is a technique for dividing a large database into smaller, manageable parts called shards, which are stored across multiple servers or nodes. This process enhances scalability, performance, and fault tolerance by distributing data and processing load. Sharding works by partitioning data based on criteria like geographic location, user ID, or time period, and each shard is responsible for a subset of the data. This method allows for horizontal scaling, improving the system's capacity to handle large volumes of data and requests efficiently.\n\nThe system uses a shard key to identify which shard contains the required data for a query. The shard key is a unique identifier that maps data to its corresponding shard. Upon receiving a query, the system determines the appropriate shard and forwards the query to the correct server or node.\n\n**Features of Sharding:**\n- Sharding makes the database smaller, faster, and more manageable.\n- It can be complex to implement.\n- Sharding reduces transaction costs and allows each shard to read and write its own data.\n- Many NoSQL databases offer auto-sharding.\n- Failure of one shard does not affect the data processing of other shards.\n\n**Benefits of Sharding:**\n1. **Improved Scalability:** Sharding allows horizontal scaling by adding more servers or nodes, enhancing the system's capacity to handle large volumes of data and requests.\n2. **Increased Performance:**By distributing data across multiple servers or nodes, sharding improves performance, resulting in faster response times and better throughput.\n3. **Fault Tolerance:** Sharding provides fault tolerance as the system can continue to function even if one or more servers or nodes fail, thanks to data replication across multiple servers or nodes.\n4. **Reduced Costs:** Horizontal scaling with sharding can be more cost-effective than vertical scaling by upgrading hardware, as it can be done using commodity hardware, which is typically less expensive than high-end servers."
+}
examples/notes_examples/example_1.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "content_chunk": "The cyclic redundancy check codes are popularly employed in LANs and WANs for error correction. The principle of operation of CRC encoders and decoders can be better explained with the following examples. CRC is the most powerful and easy to implement technique.CRC is based on binary division. In CRC, a sequence of redundant bits, are appended to the end of data unit so that the resulting data unit becomes exactly divisible by a second, predetermined binary number. At the destination, the incoming data unit is divided by the same number. If at this step there is no remainder, the data unit is assumed to be correct and is therefore accepted. A remainder indicates that the data unit has been damaged in transit and therefore must be rejected. The binary number, which is (r+1) bit in length, can also be considered as the coefficients of a polynomial, called Generator Polynomial. PERFORMANCE OF CRC CRC is a very effective error detection technique. If the divisor is chosen according to the previously mentioned rules, its performance can be summarized as follows. CRC can detect all single-bit errors and double bit errors (three 1’s). CRC can detect any odd number of errors (X+1) and it can also detect all burst errors of less than the degree of the polynomial. 1. The Sender follows the given steps: 2. The block unit is divided into k sections, and each of n bits. 3. 4. 5. All the k sections are added together by using one's complement to get the sum. The sum is complemented and it becomes the checksum field. The original data and checksum field are sent across the network. Checksum Checker A Checksum is verified at the receiving side. The receiver subdivides the incoming data into equal segments of n bits each, and all these segments are added together, and then this sum is complemented. If the complement of the sum is zero, then the data is accepted otherwise data is rejected. 1. The Receiver follows the given steps: 2. The block unit is divided into k sections and each of n bits. 3. 4. 5. All the k sections are added together by using one's complement algorithm to get the sum. The sum is complemented. If the result of the sum is zero, then the data is accepted otherwise the data is discarded. Cyclic Redundancy Check (CRC) CRC is a redundancy error technique used to determine the error. Following are the steps used in CRC for error detection: - In CRC technique, a string of n 0s is appended to the data unit, and this n number is less than the number of bits in a predetermined number, known as division which is n+1 bits. - Secondly, the newly extended data is divided by a divisor using a process is known as binary division. The remainder generated from this division is known as CRC remainder. - Thirdly, the CRC remainder replaces the appended 0s at the end of the original data. This newly generated unit is sent to the receiver. - The receiver receives the data followed by the CRC remainder. The receiver will treat this whole unit as a single unit, and it is divided by the same divisor that was used to find the CRC remainder. If the resultant of this division is zero which means that it has no error, and the data is accepted. If the resultant of this division is not zero which means that the data consists of an error. Therefore, the data is discarded.",
+  "rules": "- Follow the Markdown format for creating notes as shown in the example. \n - The heading of the content should be the title of the markdown file. \n - Create subheadings for each section. \n - Use numbered bullet points for each point.",
+  "notes": "# CRC (Cyclic Redundancy Check) Encoder and Decoder \n- They are popularly employed in LANs and WANs for error correction. \n- Based on binary division - a sequence of redundant bits are appended to the end of the data unit so that the resulting data unit becomes exactly divisible by a predetermined binary number.\n- At destination, if there is no remainder, it is assumed to be correct. A remainder indicates data unit is damaged and therefore rejected. \n- The binary number (r+1 bits by length) can be considered coefficients of a polynomial, called Generator Polynomial. \n- It is very effective, since it can detect all single-bit errors and double bit errors, odd number of errors and also burst errors of lesser degree than the polynomial. \n- It consists of two components, a generator and a checker.\n 1. **CRC Generator**: uses modulo 2 division. n 0s are appended to the end if divisor contains n+1 units. The remainder generated is called CRC remainder which replaces the appended string of 0s with itself and the final string is sent across the network.\n 2. **CRC Checker:** performs modulo 2 division to the number received from generator by the same divisor, and data is accepted if remainder is zero. \n\n- **CRC Algorithm:**\n 1. Append a string of n 0s to the data unit, where n < the predetermined number of bits in the divisor (n+1 bits).\n 2. Perform binary division between the extended data and the divisor to obtain the CRC remainder.\n 3. Replace the appended 0s at the end of the original data with the CRC remainder.\n 4. Transmit the newly generated unit to the receiver.\n 5. Upon reception, treat the data and CRC remainder as a single unit and divide it by the same divisor used earlier to determine the CRC remainder."
+}
examples/notes_examples/example_2.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "content_chunk": "Meaning/Defination: A content delivery network (CDN) is a group of geographically\ndistributed servers that speed up the delivery of web content by bringing it closer to where\nusers are.\n It is a network of strategically positioned servers aimed at enhancing the speed and\ndependability of delivering content to users in different locations. These servers store cached\ncopies of content, ranging from web pages to videos, guaranteeing that when a user makes\na request, it’s served from the closest server available. This reduces delays and speeds up\nloading times.\nWhen a user requests specific content, CDN architecture comes into play. It directs the\nrequest to the nearest server, taking into account factors like server health and proximity.\nThis approach minimizes data travel distance, resulting in fast and efficient content delivery.\nAnalogy: You could think of a CDN like an ATM. If your money were only available from one\nbank in town, you’d have to make a time-consuming trip and stand in a long line every time\nyou wanted to withdraw cash. However, with a cash machine on practically every corner,\nyou have fast and easy access to your money any time you need it.\n\nWhat is The Use of a Content Distribution Network?\nCDNs are designed to optimize the delivery of web content, and some of the main\nadvantages that they provide to a company and its users include:\n\uF0B7 Faster Load Times: CDNs cache content geographically close to its users,\ndecreasing the distance that requests and responses need to travel. As a result,\nusers experience faster load times for webpages, which can increase conversions\nand decrease bounce rates. How does a CDN improve page load time?: As\nmentioned earlier, it is a globally distributed network of servers that store (commonly\nreferred to as \"cache\") and deliver some or all of your website's content. Each of\nthese servers in the CDN's network is called a Point of Presence (PoP) or an edge\nserver.\n\uF0B7 Reduced Bandwidth Costs: Serving all requested content from the origin server\nrequires significant bandwidth and processing power at the origin. CDNs reduce load\nand bandwidth requirements at the bandwidth by caching static content and\nperforming other optimizations. Of course, this helps to greatly reduce costs.\n\uF0B7 Improved Availability and Redundancy: Reliance on centralized infrastructure —\nsuch as serving all content from the origin server — increases the risk of downtime\ndue to hardware failures, network outages, and other events. CDNs distribute content\nand requests across multiple locations, reducing the impact of a localized outage.\nWith a CDN coming into the picture, it does two things. One, a lot of traffic doesn't\neven come to your servers. The edge server of the CDN serves a lot of content from\nits cache. So, you need a slightly fewer number of servers.\nSecond, as long as the content is available in the CDNs cache, even if your actual servers\nare not working, the CDN will keep serving the content. This gives you some buffer time to\nfix issues on your servers while the CDN serves whatever content it can from its cache.\n\n\uF0B7 Enhanced Website Security: In addition to optimizing access to web content, a\nCDN may incorporate security functionality. By blocking distributed denial-of-service\n(DDoS) attacks, enhancing digital certificate security, and other security controls,\nCDNs can reduce the probability and impact of a cyberattack.\n\uF0B7 Web security: if a CDN can isolate bad traffic from good traffic, it can stop all the\nbad traffic from coming to your servers. Your servers only respond to the \"good\"\nrequests coming from actual users.",
+  "rules": "- Follow the Markdown format for creating notes as shown in the example. \n - The heading of the content should be the title of the markdown file. \n - Create subheadings for each section. \n - Use numbered bullet points for each point.",
+  "notes": "# CDNs\n- A content delivery network (CDN) is a group of geographically distributed servers that speed up the delivery of web content by bringing it closer to where users are. \n- It is a network of strategically positioned servers aimed at enhancing the speed and dependability of delivering content to users in different locations. \n- These servers store cached copies of content, ranging from web pages to videos, guaranteeing that when a user makes a request, it’s served from the closest server available. This reduces delays and speeds up loading times.\n- When a user requests specific content, CDN architecture comes into play. It directs the request to the nearest server, taking into account factors like server health and proximity. This approach minimizes data travel distance, resulting in fast and efficient content delivery.\n- CDNs are designed to optimize the delivery of web content, and some of the main advantages that they provide to a company and its users include:\n- Faster Load Times: CDNs cache content geographically close to its users, decreasing the distance that requests and responses need to travel. As a result, users experience faster load times for webpages, which can increase conversions and decrease bounce rates. How does a CDN improve page load time?: As mentioned earlier, it is a globally distributed network of servers that store (commonly referred to as \"cache\") and deliver some or all of your website's content. Each of these servers in the CDN's network is called a Point of Presence (PoP) or an edge server.\n- **Reduced Bandwidth Costs:** Serving all requested content from the origin server requires significant bandwidth and processing power at the origin. CDNs reduce load and bandwidth requirements at the bandwidth by caching static content and performing other optimizations. Of course, this helps to greatly reduce costs.\n- **Improved Availability and Redundancy:** Reliance on centralized infrastructure — such as serving all content from the origin server — increases the risk of downtime due to hardware failures, network outages, and other events. CDNs distribute content and requests across multiple locations, reducing the impact of a localized outage. With a CDN coming into the picture, it does two things. One, a lot of traffic doesn't even come to your servers. The edge server of the CDN serves a lot of content from its cache. So, you need a slightly fewer number of servers.\n- Second, as long as the content is available in the CDNs cache, even if your actual servers are not working, the CDN will keep serving the content. This gives you some buffer time to fix issues on your servers while the CDN serves whatever content it can from its cache.\n- Enhanced Website Security: In addition to optimizing access to web content, a CDN may incorporate security functionality. By blocking distributed denial-of-service(DDoS) attacks, enhancing digital certificate security, and other security controls, CDNs can reduce the probability and impact of a cyberattack.\n- Web security: if a CDN can isolate bad traffic from good traffic, it can stop all the bad traffic from coming to your servers. Your servers only respond to the \"good\" requests coming from actual users. "
+}
model.py
ADDED
@@ -0,0 +1,106 @@
+from langchain_community.chat_message_histories.in_memory import ChatMessageHistory
+from langchain_community.llms.ctransformers import CTransformers
+from langchain_community.vectorstores import DeepLake
+from langchain_core.messages import AIMessage
+from langchain_core.prompts import PromptTemplate, load_prompt
+from langchain_google_genai import ChatGoogleGenerativeAI
+from typing import List
+from langchain_core.documents.base import Document
+
+
+class DrakeLM:
+    def __init__(self, model_path: str, db: DeepLake, config: dict, llm_model="gemini-pro"):
+        self.llm_model = llm_model
+
+        # The local GGUF model is loaded only when explicitly requested;
+        # Gemini is always initialised and acts as the default backend.
+        if llm_model == "llama":
+            self.llama = CTransformers(
+                model=model_path,
+                model_type="llama",
+                config=config
+            )
+        self.gemini = ChatGoogleGenerativeAI(model="gemini-pro", convert_system_message_to_human=True)
+        self.retriever = db.as_retriever()
+        self.chat_history = ChatMessageHistory()
+        self.chat_history.add_user_message("You are assisting a student to understand topics.")
+        self.notes_prompt = load_prompt("prompt_templates/notes_prompt.yaml")
+        self.chat_prompt = load_prompt("prompt_templates/chat_prompt.yaml")
+
+    def _chat_prompt(self, query: str, context: str):
+        # Builds the plain (non few-shot) prompt; the raw string is returned
+        # alongside the template so it can be stored in the chat history.
+        prompt = """You are assisting a student to understand topics. \n\n
+        You have to answer the below question by utilising the below context to answer the question. \n\n
+        Note to follow the rules given below \n\n
+        Question: {query} \n\n
+        Context: {context} \n\n
+        Rules: {rules} \n\n
+        Answer:
+        """
+
+        rules = """
+        - If the question says answer for X number of marks, you have to provide X number of points.
+        - Each point has to be explained in 3-4 sentences.
+        - In case the context express a mathematical equation, provide the equation in LaTeX format as shown in the example.
+        - In case the user requests for a code snippet, provide the code snippet in the language specified in the example.
+        - If the user requests to summarise or use the previous message as context ignoring the explicit context given in the message.
+        """
+
+        prompt = prompt.format(query=query, context=context, rules=rules)
+        return PromptTemplate.from_template(prompt), prompt
+
+    def _retrieve(self, query: str, metadata_filter, k=3, distance_metric="cos"):
+        # Configure the DeepLake retriever, optionally restricting the search
+        # to a single document via its metadata id, then concatenate the top-k hits.
+        self.retriever.search_kwargs["distance_metric"] = distance_metric
+        self.retriever.search_kwargs["k"] = k
+
+        if metadata_filter:
+            self.retriever.search_kwargs["filter"] = {
+                "metadata": {
+                    "id": metadata_filter["id"]
+                }
+            }
+
+        retrieved_docs = self.retriever.get_relevant_documents(query)
+
+        context = ""
+        for rd in retrieved_docs:
+            context += "\n" + rd.page_content
+
+        return context
+
+    def ask_llm(self, query: str, metadata_filter: dict = None):
+        context = self._retrieve(query, metadata_filter)
+        print("Retrieved context")
+        prompt_template, prompt_string = self._chat_prompt(query, context)
+        self.chat_history.add_user_message(prompt_string)
+        print("Generating response...")
+
+        rules = """
+        - If the question says answer for X number of marks, you have to provide X number of points.
+        - Each point has to be explained in 3-4 sentences.
+        - In case the context express a mathematical equation, provide the equation in LaTeX format as shown in the example.
+        - In case the user requests for a code snippet, provide the code snippet in the language specified in the example.
+        - If the user requests to summarise or use the previous message as context ignoring the explicit context given in the message.
+        """
+
+        # The few-shot template loaded from chat_prompt.yaml supersedes the plain
+        # prompt built above; only this formatted string is sent to the LLM.
+        prompt_template = self.chat_prompt.format(query=query, context=context, rules=rules)
+
+        if self.llm_model == "llama":
+            self.chat_history.add_ai_message(AIMessage(content=self.llama.invoke(prompt_template)))
+        else:
+            self.chat_history.add_ai_message(AIMessage(content=self.gemini.invoke(prompt_template).content))
+
+        return self.chat_history.messages[-1].content
+
+    def create_notes(self, documents: List[Document]):
+        rules = """
+        - Follow the Markdown format for creating notes as shown in the example.
+        - The heading of the content should be the title of the markdown file.
+        - Create subheadings for each section.
+        - Use numbered bullet points for each point.
+        """
+
+        # Notes are generated chunk by chunk with Gemini and joined at the end.
+        notes_chunk = []
+        for doc in documents:
+            prompt = self.notes_prompt.format(content_chunk=doc.page_content, rules=rules)
+            response = self.gemini.invoke(prompt)
+            notes_chunk.append(response.content)
+
+        return '\n'.join(notes_chunk)
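
A minimal usage sketch of DrakeLM as wired up in app.py (the Processing class lives in utilis.py, which is not shown in full in this commit view; the dataset path and document id below are placeholders):

from model import DrakeLM
from utilis import Processing

processing = Processing(
    dataset_path="hub://<org>/<dataset>",  # placeholder; the app reads this from st.secrets
    embedding_model_name="mixedbread-ai/mxbai-embed-large-v1",
    chunk_size=1300,
)
config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}
drake = DrakeLM(
    model_path="Mistral/mistral-7b-instruct-v0.2.Q5_K_S.gguf",
    config=config,
    db=processing.db,
)

# Unfiltered retrieval over the whole vector store ("Universal Chat"):
print(drake.ask_llm("Explain sharding in system design along with its benefits. (10 marks)"))

# Retrieval restricted to one uploaded document via its metadata id:
print(drake.ask_llm("Summarise the document.", metadata_filter={"id": "<doc-id>"}))
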
pages/chat.py
ADDED
@@ -0,0 +1,43 @@
+import streamlit as st
+from app import disable_sidebar, initialize_models
+from model import DrakeLM
+from utilis import Processing
+
+disable_sidebar()
+col1, col2 = st.columns([1.2, 0.3])
+
+col1.title('Chat with Drake!')
+if col2.button("Home"):
+    st.switch_page("app.py")
+
+universal_chat = st.toggle("Universal Chat")
+st.caption("Note: Universal Chat uses the complete DB to retrieve context; use it with caution.")
+
+st.divider()
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+if prompt := st.chat_input("Ask Drake your questions"):
+    # Display user message in chat message container
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    # Add user message to chat history
+    st.session_state.messages.append({"role": "user", "content": prompt})
+
+    with st.spinner("Drake is thinking..."):
+        query = f"{prompt}"
+        _, drake = initialize_models()
+        if universal_chat:
+            response = drake.ask_llm(query)
+        else:
+            response = drake.ask_llm(query, metadata_filter=st.session_state["metadata"])
+
+    with st.chat_message("assistant"):
+        st.markdown(response)
+    st.session_state.messages.append({"role": "assistant", "content": response})
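
One caveat worth noting: st.session_state["metadata"] is only set by the upload pages, so reaching this page directly with Universal Chat off would raise a KeyError. A small defensive variant of the branch above (a sketch, not part of the commit):

if universal_chat or "metadata" not in st.session_state:
    response = drake.ask_llm(query)
else:
    response = drake.ask_llm(query, metadata_filter=st.session_state["metadata"])
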
pages/upload_file.py
ADDED
@@ -0,0 +1,69 @@
+import streamlit as st
+import io
+import PyPDF2
+from app import disable_sidebar, initialize_models
+from model import DrakeLM
+from utilis import Processing
+
+disable_sidebar()
+st.title('Drake')
+st.subheader('Learn without the mess of making notes!')
+st.divider()
+
+if st.button("YouTube/Video URL"):
+    st.switch_page("pages/upload_url.py")
+
+st.subheader('Upload the file')
+uploaded_file = st.file_uploader(label="Choose a file", type=['pdf', 'doc'])
+allow_make_notes = st.toggle('Make Complete Notes!')
+
+
+if uploaded_file:
+    if st.button("Upload to DB"):
+
+        # Chunking the file
+        with st.spinner('Please wait, the file is being chunked ...'):
+            try:
+                pdf_stream = io.BytesIO(uploaded_file.read())
+                pdf_reader = PyPDF2.PdfReader(pdf_stream)
+
+                text = ""
+                for page in pdf_reader.pages:
+                    text += page.extract_text()
+
+                processing, drake = initialize_models()
+                # NOTE: the document title is hard-coded here
+                documents, metadata = processing.load_pdf("hello world", text)
+                st.session_state["metadata"] = metadata
+                st.success("Successfully chunked the file")
+
+            except Exception as e:
+                st.error("Error in chunking")
+                st.stop()  # the steps below need `processing` and `documents`
+
+        # Uploading to DB
+        with st.spinner('Please wait, the file is being uploaded ...'):
+            try:
+                processing.upload_to_db(documents)
+            except Exception as e:
+                st.error("Error in uploading")
+
+        # Generating Notes
+        if allow_make_notes:
+            with st.spinner('Please wait, notes are being generated ...'):
+                try:
+                    config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}  # unused; generation config is set in initialize_models()
+                    notes = drake.create_notes(documents)
+                    encoded_text = notes.encode('utf-8')
+                    st.success("Notes generated successfully")
+                    # download_button returns True only on the rerun after a click
+                    if st.download_button(
+                            label="Download data as Markdown",
+                            data=encoded_text,
+                            file_name='your_notes.md',
+                            mime='text/markdown',
+                    ):
+                        st.switch_page("pages/chat.py")
+                except Exception as e:
+                    print(e)
+                    st.error("Error in generating notes")
+
+        else:
+            st.switch_page("pages/chat.py")
pages/upload_url.py
ADDED
@@ -0,0 +1,65 @@
+import streamlit as st
+
+from app import disable_sidebar, initialize_models
+from model import DrakeLM
+from utilis import Processing
+
+# Upload Template
+disable_sidebar()
+processing, drake = initialize_models()
+st.title('Drake')
+st.subheader('Learn without the mess of making notes!')
+st.divider()
+
+if st.button("PDF/Transcript"):
+    st.switch_page("pages/upload_file.py")
+
+st.subheader('Enter the Video URL')
+video_url = st.text_input(label="Enter the URL")
+allow_make_notes = st.toggle('Make Complete Notes!')
+
+
+if video_url:
+    # Upload to DB
+    if st.button("Upload to DB"):
+
+        # Chunking the transcript
+        with st.spinner('Please wait, the transcript is being chunked ...'):
+            try:
+                documents, metadata = processing.load_yt_transcript(video_url)
+                st.session_state["metadata"] = {"id": metadata["id"]}
+                st.success("Successfully chunked the file")
+            except Exception as e:
+                print(e)
+                st.error("Error in chunking")
+                st.stop()  # the steps below need `documents`
+
+        # Uploading to DB (currently disabled)
+        # with st.spinner('Please wait, documents uploading ...'):
+        #     try:
+        #         processing.upload_to_db(documents)
+        #     except Exception as e:
+        #         st.error("Error in uploading")
+
+        # Generating Notes
+        if allow_make_notes:
+            with st.spinner('Please wait, notes are being generated ...'):
+                try:
+                    config = {"max_new_tokens": 4096, "context_length": 8192, "temperature": 0.3}  # unused; generation config is set in initialize_models()
+                    notes = drake.create_notes(documents)
+                    encoded_text = notes.encode('utf-8')
+                    st.success("Notes generated successfully")
+                    # download_button returns True only on the rerun after a click
+                    if st.download_button(
+                            label="Download data as Markdown",
+                            data=encoded_text,
+                            file_name='your_notes.md',
+                            mime='text/markdown',
+                    ):
+                        st.switch_page("pages/chat.py")
+                except Exception as e:
+                    print(e)
+                    st.error("Error in generating notes")
+
+        else:
+            st.switch_page("pages/chat.py")
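
utilis.py (+157 lines in this commit) is not shown above, so the following is purely an assumption about the shape of Processing.load_yt_transcript, sketched with langchain_community's YoutubeLoader and the chunk_size=1300 configured in app.py; the helper name, chunk overlap, and id scheme are hypothetical, and import paths may vary with the LangChain version:

from langchain_community.document_loaders import YoutubeLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def load_yt_transcript_sketch(video_url: str):
    # Hypothetical stand-in for Processing.load_yt_transcript (real code in utilis.py)
    docs = YoutubeLoader.from_youtube_url(video_url).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1300, chunk_overlap=100)
    documents = splitter.split_documents(docs)
    metadata = {"id": video_url}  # placeholder id scheme
    return documents, metadata
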
prompt_templates/chat_prompt.yaml
ADDED
@@ -0,0 +1,179 @@
+_type: few_shot
+example_prompt:
+  _type: prompt
+  input_types: {}
+  input_variables:
+  - answer
+  - context
+  - query
+  - rules
+  metadata: null
+  name: null
+  output_parser: null
+  partial_variables: {}
+  tags: null
+  template: "\n<Question> {query}\n \n<Context> {context}\n \n<Rules> {rules}\n \n\
+    <Answer> {answer}\n"
+  template_format: f-string
+  validate_template: false
+example_selector: null
+example_separator: '
+
+
+  '
+examples:
+- answer: "# Dense Cells\nStep 1: Define a set of grid points and assign the given\
+    \ data points on the grid.\nStep 2: Determine the dense and sparse cells. If the\
+    \ number of points in a cell exceeds the threshold value t, the cell is categorized\
+    \ as dense cell. Sparse cells are removed from the list.\nStep 3: Merge the dense\
+    \ cells if they are adjacent.\nStep 4: Form a list of grid cells for every subspace\
+    \ as output.\n\n#CILQUE\n **Stage 1:**\n- Step 1: Identify the dense cells.\n\
+    - Step 2: Merge dense cells c\u2081 and c\u2082 if they share the same interval.\n\
+    - Step 3: Generate a particle rule to generate (k + 1)th cell for higher dimension.\
+    \ Then, check whether the number of points cross the threshold. This is repeated\
+    \ till there are no dense cells or new generation of dense cells.\n\n**Stage 2:**\n\
+    - Step 1: Merging of dense cells into a cluster is carried out in each subspace\
+    \ using maximal regions to cover dense cells. The maximal region is an hyperrectangle\
+    \ where all cells fall into.\n- Step 2: Maximal region tries to cover all dense\
+    \ cells to form clusters.\n\n # DBSCAN\n- Step 1: Randomly select a point p. Compute\
+    \ distance between p and all other points.\n- Step 2: Find all points from p with\
+    \ respect to its neighborhood and check whether it has minimum number of points\
+    \ m. If so, it is marked as a core point.\n- Step 3: If it is a core point, then\
+    \ a new cluster is formed, or existing cluster is enlarged.\n- Step 4: If it is\
+    \ a border point, then the algorithm moves to the next point and marks it as visited.\n\
+    - Step 5: If it is a noise point, they are removed.\n- Step 6: Merge the clusters\
+    \ if it is mergeable, dist (c, c) < \u025B.\n- Step 7: Repeat the process 3-6\
+    \ till all points are processed."
+  context: "DBSACN\nStep 1: Randomly select a point p. Compute distance between P\
+    \ and ail other points '\nStep 2: Find all points]from p with respect to its neighbourhoud\
+    \ and check whether it has minimum number of points m. If 80, it is marked as\
+    \ a core point\nStep 3: If it is a core point, then a new cluster is formed, or\
+    \ existing cluster 1s enlarged.\nStep 4: [fit is a border point, then the algorithm\
+    \ moves to the next point and marks it as visited\nStep 5: If it is a noise point,\
+    \ they are removed.\nStep 6: Merge the clusters if it is mergeable, dist (cc )<\
+    \ \xA2.\nStep 7: Repeat the process 3-6 till all Points are processed. \n\nDense\
+    \ Cell\nStep 1: Defining a set of grid points and assigning the given data points\
+    \ on the grid.\nStep 2: Determine the dense and sparse cells. lf the number of\
+    \ points in a cell exceeds the threshold\nvalue t, the cell is categorized as\
+    \ a dense cell. Sparse cells are removed from the list.\nStep 3: Merge the dense\
+    \ cells if they are adjacent.\nStep 4: Form a list of grid cells for every subspace\
+    \ as output.\n\nCLIQUE\nStage 1\nStep 1: Identify the dense cells\nStep 2: Merge\
+    \ dense cells c. and c, if they share the same interval.\nStep 3: Generate Apriori\
+    \ rule to generate (k + 1)\" cell tor higher dimension. Then, check\nwhether the\
+    \ number of points across the threshold This 1s repeated till there are no\ndense\
+    \ cells or a new generation of dense cells\n\nStage 2\nStep 1: Merging of dense\
+    \ cells into a cluster is carried out in each subspace using maximal regions to\
+    \ cover dense cells The maximal region is a hyperrectangle where all cells fall\
+    \ into.\nStep 2; Maximal region tries to cover all dense cells to form clusters."
+  query: Assess DBSCAN, Dense cells and CLIQUE with appropriate steps. (8 marks)
+  rules: "- If the question says answer for X number of marks, you have to provide\
+    \ X number of points.\n - Each point has to be explained in 3-4 sentences.\n -\
+    \ In case the context express a mathematical equation, provide the equation in\
+    \ LaTeX format as shown in the example.\n - In case the user requests for a code\
+    \ snippet, provide the code snippet in the language specified in the example.-\
+    \ If the user requests to summarise or use the previous message as context ignoring\
+    \ the explicit context given in the message."
+- answer: 'Sharding is a technique for dividing a large database into smaller, manageable
+    parts called shards, which are stored across multiple servers or nodes. This process
+    enhances scalability, performance, and fault tolerance by distributing data and
+    processing load. Sharding works by partitioning data based on criteria like geographic
+    location, user ID, or time period, and each shard is responsible for a subset
+    of the data. This method allows for horizontal scaling, improving the system''s
+    capacity to handle large volumes of data and requests efficiently.
+
+
+    The system uses a shard key to identify which shard contains the required data
+    for a query. The shard key is a unique identifier that maps data to its corresponding
+    shard. Upon receiving a query, the system determines the appropriate shard and
+    forwards the query to the correct server or node.
+
+
+    **Features of Sharding:**
+
+    - Sharding makes the database smaller, faster, and more manageable.
+
+    - It can be complex to implement.
+
+    - Sharding reduces transaction costs and allows each shard to read and write its
+    own data.
+
+    - Many NoSQL databases offer auto-sharding.
+
+    - Failure of one shard does not affect the data processing of other shards.
+
+
+    **Benefits of Sharding:**
+
+    1. **Improved Scalability:** Sharding allows horizontal scaling by adding more
+    servers or nodes, enhancing the system''s capacity to handle large volumes of
+    data and requests.
+
+    2. **Increased Performance:**By distributing data across multiple servers or nodes,
+    sharding improves performance, resulting in faster response times and better throughput.
+
+    3. **Fault Tolerance:** Sharding provides fault tolerance as the system can continue
+    to function even if one or more servers or nodes fail, thanks to data replication
+    across multiple servers or nodes.
+
+    4. **Reduced Costs:** Horizontal scaling with sharding can be more cost-effective
+    than vertical scaling by upgrading hardware, as it can be done using commodity
+    hardware, which is typically less expensive than high-end servers.'
+  context: "It is a very important concept that helps the system to keep data in different\
+    \ resources\naccording to the sharding process. The word \u201CShard\u201D means\
+    \ \u201Ca small part of a\nwhole\u201C. Hence Sharding means dividing a larger\
+    \ part into smaller parts. In DBMS,\nSharding is a type of DataBase partitioning\
+    \ in which a large database is divided or\n\npartitioned into smaller data and\
+    \ different nodes. These shards are not only smaller,\nbut also faster and hence\
+    \ easily manageable.\nHow does Sharding work?\nIn a sharded system, the data is\
+    \ partitioned into shards based on a predetermined\ncriterion. For example, a\
+    \ sharding scheme may divide the data based on geographic\nlocation, user ID,\
+    \ or time period. Once the data is partitioned, it is distributed across\nmultiple\
+    \ servers or nodes. Each server or node is responsible for storing and processing\
+    \ a\nsubset of the data.\nExample:\n\nTo query data from a sharded database, the\
+    \ system needs to know which shard contains\nthe required data. This is achieved\
+    \ using a shard key, which is a unique identifier that is\nused to map the data\
+    \ to its corresponding shard. When a query is received, the system\nuses the shard\
+    \ key to determine which shard contains the required data and then sends\nthe\
+    \ query to the appropriate server or node.\nFeatures of Sharding:\n\uF0B7 Sharding\
+    \ makes the Database smaller\n\uF0B7 Sharding makes the Database faster\n\uF0B7\
+    \ Sharding makes the Database much more easily manageable\n\uF0B7 Sharding can\
+    \ be a complex operation sometimes\n\uF0B7 Sharding reduces the transaction cost\
+    \ of the Database\n\uF0B7 Each shard reads and writes its own data.\n\uF0B7 Many\
+    \ NoSQL databases offer auto-sharding.\n\uF0B7 Failure of one shard doesn\u2019\
+    t effect the data processing of other shards.\nBenefits of Sharding:\n1. Improved\
+    \ Scalability: Sharding allows the system to scale horizontally by adding more\n\
+    servers or nodes as the data grows. This improves the system\u2019s capacity to\
+    \ handle\nlarge volumes of data and requests.\n\n2. Increased Performance: Sharding\
+    \ distributes the data across multiple servers or\nnodes, which improves the system\u2019\
+    s performance by reducing the load on each server\nor node. This results in faster\
+    \ response times and better throughput.\n3. Fault Tolerance: Sharding provides\
+    \ a degree of fault tolerance as the system can\ncontinue to function even if\
+    \ one or more servers or nodes fail. This is because the data\nis replicated across\
+    \ multiple servers or nodes, and if one fails, the others can continue\nto serve\
+    \ the requests.\n4. Reduced Costs: Sharding allows the system to scale horizontally,\
+    \ which can be more\ncost-effective than scaling vertically by upgrading hardware.\
+    \ This is because horizontal\nscaling can be done"
+  query: Explain sharding in system design along with its benefits. (10 marks)
+  rules: "- If the question says answer for X number of marks, you have to provide\
+    \ X number of points.\n - Each point has to be explained in 3-4 sentences.\n -\
+    \ In case the context express a mathematical equation, provide the equation in\
+    \ LaTeX format as shown in the example.\n - In case the user requests for a code\
+    \ snippet, provide the code snippet in the language specified in the example.-\
+    \ If the user requests to summarise or use the previous message as context ignoring\
+    \ the explicit context given in the message.\n"
+input_types: {}
+input_variables:
+- context
+- query
+- rules
+metadata: null
+name: null
+output_parser: null
+partial_variables: {}
+prefix: "\nYou are assisting a student to understand topics.\n \nYou have to answer\
+  \ the below question by utilising the below context to answer the question.\nNote\
+  \ to follow the rules given below.\n"
+suffix: "\n<Question> {query}\n \n<Context> {context}\n \n<Rules> {rules}\n <Answer>"
+tags: null
+template_format: f-string
+validate_template: false
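
For reference, model.py consumes this file via load_prompt, which rebuilds a FewShotPromptTemplate (_type: few_shot) from the YAML; formatting it stitches the prefix, the two serialized examples, and the suffix into one prompt string. A short sketch (query and placeholders below are illustrative):

from langchain_core.prompts import load_prompt

chat_prompt = load_prompt("prompt_templates/chat_prompt.yaml")
prompt_string = chat_prompt.format(
    query="Explain sharding in system design along with its benefits. (10 marks)",
    context="...retrieved chunks...",  # placeholder context
    rules="- If the question says answer for X number of marks, ...",  # truncated for brevity
)
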
prompt_templates/notes_prompt.yaml
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_type: few_shot
|
2 |
+
example_prompt:
|
3 |
+
_type: prompt
|
4 |
+
input_types: {}
|
5 |
+
input_variables:
|
6 |
+
- content_chunk
|
7 |
+
- notes
|
8 |
+
- rules
|
9 |
+
metadata: null
|
10 |
+
name: null
|
11 |
+
output_parser: null
|
12 |
+
partial_variables: {}
|
13 |
+
tags: null
|
14 |
+
template: "\n<Content Chunk> {content_chunk}\n \n<Rules> {rules}\n \n<Notes> {notes}\n"
|
15 |
+
template_format: f-string
|
16 |
+
validate_template: false
|
17 |
+
example_selector: null
|
18 |
+
example_separator: '
|
19 |
+
|
20 |
+
|
21 |
+
'
|
22 |
+
examples:
|
23 |
+
- content_chunk: "The cyclic redundancy check codes are popularly employed in LANs\
|
24 |
+
\ and WANs for error correction. The principle of operation of CRC encoders and\
|
25 |
+
\ decoders can be better explained with the following examples. CRC is the most\
|
26 |
+
\ powerful and easy to implement technique.CRC is based on binary division. In\
|
27 |
+
\ CRC, a sequence of redundant bits, are appended to the end of data unit so that\
|
28 |
+
\ the resulting data unit becomes exactly divisible by a second, predetermined\
|
29 |
+
\ binary number. At the destination, the incoming data unit is divided by the\
|
30 |
+
\ same number. If at this step there is no remainder, the data unit is assumed\
|
31 |
+
\ to be correct and is therefore accepted. A remainder indicates that the data\
|
32 |
+
\ unit has been damaged in transit and therefore must be rejected. The binary\
|
33 |
+
\ number, which is (r+1) bit in length, can also be considered as the coefficients\
|
34 |
+
\ of a polynomial, called Generator Polynomial. PERFORMANCE OF CRC CRC is a very\
|
35 |
+
\ effective error detection technique. If the divisor is chosen according to the\
|
36 |
+
\ previously mentioned rules, its performance can be summarized as follows. CRC\
|
37 |
+
\ can detect all single-bit errors and double bit errors (three 1\u2019s). CRC\
|
38 |
+
\ can detect any odd number of errors (X+1) and it can also detect all burst errors\
|
39 |
+
\ of less than the degree of the polynomial. 1. The Sender follows the given steps:\
|
40 |
+
\ 2. The block unit is divided into k sections, and each of n bits. 3. \
|
41 |
+
\ 4. 5. All the k sections are added together by using one's complement to\
|
42 |
+
\ get the sum. The sum is complemented and it becomes the checksum field. \
|
43 |
+
\ The original data and checksum field are sent across the network. Checksum\
|
44 |
+
\ Checker A Checksum is verified at the receiving side. The receiver subdivides\
|
45 |
+
\ the incoming data into equal segments of n bits each, and all these segments\
|
46 |
+
\ are added together, and then this sum is complemented. If the complement of\
|
47 |
+
\ the sum is zero, then the data is accepted otherwise data is rejected. 1. The\
|
48 |
+
\ Receiver follows the given steps: 2. The block unit is divided into k sections\
|
49 |
+
\ and each of n bits. 3. 4. 5. All the k sections are added together by\
|
50 |
+
\ using one's complement algorithm to get the sum. The sum is complemented.\
|
51 |
+
\ If the result of the sum is zero, then the data is accepted otherwise the\
|
52 |
+
\ data is discarded. Cyclic Redundancy Check (CRC) CRC is a redundancy error\
|
53 |
+
\ technique used to determine the error. Following are the steps used in CRC for\
|
54 |
+
\ error detection: - In CRC technique, a string of n 0s is appended to the data\
|
55 |
+
\ unit, and this n number is less than the number of bits in a predetermined number,\
|
56 |
+
\ known as division which is n+1 bits. - Secondly, the newly extended data is\
|
57 |
+
\ divided by a divisor using a process is known as binary division. The remainder\
|
58 |
+
\ generated from this division is known as CRC remainder. - Thirdly, the CRC remainder\
|
59 |
+
\ replaces the appended 0s at the end of the original data. This newly generated\
|
60 |
+
\ unit is sent to the receiver. - The receiver receives the data followed by the\
|
61 |
+
\ CRC remainder. The receiver will treat this whole unit as a single unit, and\
|
62 |
+
\ it is divided by the same divisor that was used to find the CRC remainder. If\
|
63 |
+
\ the resultant of this division is zero which means that it has no error, and\
|
64 |
+
\ the data is accepted. If the resultant of this division is not zero which means\
|
65 |
+
\ that the data consists of an error. Therefore, the data is discarded."
|
66 |
+
notes: "# CRC (Cyclic Redundancy Check) Encoder and Decoder \n- They are popularly\
|
67 |
+
\ employed in LANs and WANs for error correction. \n- Based on binary division\
|
68 |
+
\ - a sequence of redundant bits are appended to the end of the data unit so that\
|
69 |
+
\ the resulting data unit becomes exactly divisible by a predetermined binary\
|
70 |
+
\ number.\n- At destination, if there is no remainder, it is assumed to be correct.\
|
71 |
+
\ A remainder indicates data unit is damaged and therefore rejected. \n- The binary\
|
72 |
+
\ number (r+1 bits by length) can be considered coefficients of a polynomial,\
|
73 |
+
\ called Generator Polynomial. \n- It is very effective, since it can detect all\
|
74 |
+
\ single-bit errors and double bit errors, odd number of errors and also burst\
|
75 |
+
\ errors of lesser degree than the polynomial. \n- It consists of two components,\
|
76 |
+
\ a generator and a checker.\n 1. **CRC Generator**: uses modulo 2 division.\
|
77 |
+
\ n 0s are appended to the end if divisor contains n+1 units. The remainder generated\
|
78 |
+
\ is called CRC remainder which replaces the appended string of 0s with itself\
|
79 |
+
\ and the final string is sent across the network.\n 2. **CRC Checker:** performs\
|
80 |
+
\ modulo 2 division to the number received from generator by the same divisor,\
|
81 |
+
\ and data is accepted if remainder is zero. \n\n- **CRC Algorithm:**\n 1. Append\
|
82 |
+
\ a string of n 0s to the data unit, where n < the predetermined number of bits\
|
83 |
+
\ in the divisor (n+1 bits).\n 2. Perform binary division between the extended\
|
84 |
+
\ data and the divisor to obtain the CRC remainder.\n 3. Replace the appended\
|
85 |
+
\ 0s at the end of the original data with the CRC remainder.\n 4. Transmit the\
|
86 |
+
\ newly generated unit to the receiver.\n 5. Upon reception, treat the data and\
|
87 |
+
\ CRC remainder as a single unit and divide it by the same divisor used earlier\
|
88 |
+
\ to determine the CRC remainder."
|
89 |
+
rules: "- Follow the Markdown format for creating notes as shown in the example.\
|
90 |
+
\ \n - The heading of the content should be the title of the markdown file. \n\
|
91 |
+
\ - Create subheadings for each section. \n - Use numbered bullet points for each\
|
92 |
+
\ point."
|
93 |
+
- content_chunk: "Meaning/Defination: A content delivery network (CDN) is a group\
|
94 |
+
\ of geographically\ndistributed servers that speed up the delivery of web content\
|
95 |
+
\ by bringing it closer to where\nusers are.\n\_It is a network of strategically\
|
96 |
+
\ positioned servers aimed at enhancing the speed and\ndependability of delivering\
|
97 |
+
\ content to users in different locations. These servers store cached\ncopies\
|
98 |
+
\ of content, ranging from web pages to videos, guaranteeing that when a user\
|
99 |
+
\ makes\na request, it\u2019s served from the closest server available. This reduces\
|
100 |
+
\ delays and speeds up\nloading times.\nWhen a user requests specific content,\
|
101 |
+
\ CDN architecture comes into play. It directs the\nrequest to the nearest server,\
|
102 |
+
\ taking into account factors like server health and proximity.\nThis approach\
|
103 |
+
\ minimizes data travel distance, resulting in fast and efficient content delivery.\n\
|
104 |
+
Analogy: You could think of a CDN like an ATM. If your money were only available\
|
105 |
+
\ from one\nbank in town, you\u2019d have to make a time-consuming trip and stand\
|
106 |
+
\ in a long line every time\nyou wanted to withdraw cash. However, with a cash\
|
107 |
+
\ machine on practically every corner,\nyou have fast and easy access to your\
|
108 |
+
\ money any time you need it.\n\nWhat is The Use of a Content Distribution Network?\n\
|
109 |
+
CDNs are designed to optimize the delivery of web content, and some of the main\n\
|
110 |
+
advantages that they provide to a company and its users include:\n\uF0B7 Faster\
|
111 |
+
\ Load Times:\_CDNs cache content geographically close to its users,\ndecreasing\
|
112 |
+
\ the distance that requests and responses need to travel.\_ As a result,\nusers\
|
113 |
+
\ experience faster load times for webpages, which can increase conversions\n\
|
114 |
+
and decrease bounce rates. How does a CDN improve page load time?: As\nmentioned\
|
115 |
+
\ earlier, it is a globally distributed network of servers that store (commonly\n\
|
116 |
+
referred to as "cache") and deliver some or all of your website's\
|
117 |
+
\ content. Each of\nthese servers in the CDN's network is called a Point of\
|
118 |
+
\ Presence (PoP) or an edge\nserver.\n\uF0B7 Reduced Bandwidth Costs:\_Serving\
|
119 |
+
\ all requested content from the origin server\nrequires significant bandwidth\
|
120 |
+
\ and processing power at the origin. CDNs reduce load\nand bandwidth requirements\
|
121 |
+
\ at the bandwidth by caching static content and\nperforming other optimizations.\
|
122 |
+
\ Of course, this helps to greatly reduce costs.\n\uF0B7 Improved Availability\
|
123 |
+
\ and Redundancy:\_Reliance on centralized infrastructure \u2014\nsuch as serving\
|
124 |
+
\ all content from the origin server \u2014 increases the risk of downtime\ndue\
|
125 |
+
\ to hardware failures, network outages, and other events. CDNs distribute content\n\
|
126 |
+
and requests across multiple locations, reducing the impact of a localized outage.\n\
|
127 |
+
With a CDN coming into the picture, it does two things. One, a lot of traffic\
|
128 |
+
\ doesn't\neven come to your servers. The edge server of the CDN serves a\
|
129 |
+
\ lot of content from\nits cache. So, you need a slightly fewer number of servers.\n\
|
130 |
+
Second, as long as the content is available in the CDNs cache, even if your actual\
|
131 |
+
\ servers\nare not working, the CDN will keep serving the content. This gives\
|
132 |
+
\ you some buffer time to\nfix issues on your servers while the CDN serves whatever\
|
133 |
+
\ content it can from its cache.\n\n\uF0B7 Enhanced Website Security:\_In addition\
|
134 |
+
\ to optimizing access to web content, a\nCDN may incorporate security functionality.\_\
|
135 |
+
\ By blocking distributed denial-of-service\n(DDoS) attacks, enhancing digital\
|
136 |
+
\ certificate security, and other security controls,\nCDNs can reduce the probability\
|
137 |
+
\ and impact of a cyberattack.\n\uF0B7 Web security: if a CDN can isolate bad\
|
138 |
+
\ traffic from good traffic, it can stop all the\nbad traffic from coming to your\
|
139 |
+
\ servers. Your servers only respond to the "good"\nrequests coming\
|
140 |
+
\ from actual users."
|
141 |
+
notes: "# CDNs\n- A content delivery network (CDN) is a group of geographically\
|
142 |
+
\ distributed servers that speed up the delivery of web content by bringing it\
|
143 |
+
\ closer to where users are. \n- It is a network of strategically positioned servers\
|
144 |
+
\ aimed at enhancing the speed and dependability of delivering content to users\
|
145 |
+
\ in different locations. \n- These servers store cached copies of content, ranging\
|
146 |
+
\ from web pages to videos, guaranteeing that when a user makes a request, it\u2019\
|
147 |
+
s served from the closest server available. This reduces delays and speeds up\
|
148 |
+
\ loading times.\n- When a user requests specific content, CDN architecture comes\
|
149 |
+
\ into play. It directs the request to the nearest server, taking into account\
|
150 |
+
\ factors like server health and proximity. This approach minimizes data travel\
|
151 |
+
\ distance, resulting in fast and efficient content delivery.\n- CDNs are designed\
|
152 |
+
\ to optimize the delivery of web content, and some of the main advantages that\
|
153 |
+
\ they provide to a company and its users include:\n- Faster Load Times:\_CDNs\
|
154 |
+
\ cache content geographically close to its users, decreasing the distance that\
|
155 |
+
\ requests and responses need to travel.\_ As a result, users experience faster\
|
156 |
+
\ load times for webpages, which can increase conversions and decrease bounce\
|
157 |
+
\ rates. How does a CDN improve page load time?: As mentioned earlier, it is a\
|
158 |
+
\ globally distributed network of servers that store (commonly referred to as\
|
159 |
+
\ \"cache\") and deliver some or all of your website's content. Each of these\
|
160 |
+
\ servers in the CDN's network is called a Point of Presence (PoP) or an edge\
|
161 |
+
\ server.\n- **Reduced Bandwidth Costs:**\_Serving all requested content from\
|
162 |
+
\ the origin server requires significant bandwidth and processing power at the\
|
163 |
+
\ origin. CDNs reduce load and bandwidth requirements at the bandwidth by caching\
|
164 |
+
\ static content and performing other optimizations. Of course, this helps to\
|
165 |
+
\ greatly reduce costs.\n- **Improved Availability and Redundancy:**\_Reliance\
|
166 |
+
\ on centralized infrastructure \u2014 such as serving all content from the origin\
|
167 |
+
\ server \u2014 increases the risk of downtime due to hardware failures, network\
|
168 |
+
\ outages, and other events. CDNs distribute content and requests across multiple\
|
169 |
+
\ locations, reducing the impact of a localized outage. With a CDN coming into\
|
170 |
+
\ the picture, it does two things. One, a lot of traffic doesn't even come\
|
171 |
+
\ to your servers. The edge server of the CDN serves a lot of content from its\
|
172 |
+
\ cache. So, you need a slightly fewer number of servers.\n- Second, as long as\
|
173 |
+
\ the content is available in the CDNs cache, even if your actual servers are\
|
174 |
+
\ not working, the CDN will keep serving the content. This gives you some buffer\
|
175 |
+
\ time to fix issues on your servers while the CDN serves whatever content it\
|
176 |
+
\ can from its cache.\n- Enhanced Website Security:\_In addition to optimizing\
|
177 |
+
\ access to web content, a CDN may incorporate security functionality.\_ By blocking\
|
178 |
+
\ distributed denial-of-service(DDoS) attacks, enhancing digital certificate security,\
|
179 |
+
\ and other security controls, CDNs can reduce the probability and impact of a\
|
180 |
+
\ cyberattack.\n- Web security: if a CDN can isolate bad traffic from good traffic,\
|
181 |
+
\ it can stop all the bad traffic from coming to your servers. Your servers only\
|
182 |
+
\ respond to the \"good\" requests coming from actual users. "
|
183 |
+
rules: "- Follow the Markdown format for creating notes as shown in the example.\
|
184 |
+
\ \n - The heading of the content should be the title of the markdown file. \n\
|
185 |
+
\ - Create subheadings for each section. \n - Use numbered bullet points for each\
|
186 |
+
\ point."
|
187 |
+
input_types: {}
input_variables:
- content_chunk
- rules
metadata: null
name: null
output_parser: null
partial_variables: {}
prefix: '
  You are assigned a task to create notes for a student.

  You have to create notes based on the content chunk below, which is part of a larger document.

  Make sure to follow the rules given below.
  '
suffix: "\n<Content Chunk> {content_chunk}\n \n<Rules> {rules}\n \n<Notes>\n"
tags: null
template_format: f-string
validate_template: false
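The checksum and CRC procedures spelled out in the example content above translate almost line-for-line into code. Below is a minimal illustrative sketch in Python; it is not part of this commit, and the 10-bit data word and 5-bit divisor at the bottom are arbitrary assumptions chosen only to exercise the functions.

# Sketch of the one's complement checksum and the CRC check described in
# the notes_prompt.yaml example above. Illustrative only; the data word
# and divisor below are arbitrary assumptions, not values from this repo.

def ones_complement_checksum(sections, n_bits=8):
    """Add k sections with one's complement arithmetic, then complement the sum."""
    total = 0
    mask = (1 << n_bits) - 1
    for s in sections:
        total += s
        # Wrap any carry back into the low bits (one's complement addition).
        while total > mask:
            total = (total & mask) + (total >> n_bits)
    return (~total) & mask  # the complemented sum becomes the checksum field

def crc_remainder(data_bits, divisor_bits):
    """Modulo-2 (XOR) division: append len(divisor)-1 zeros, return the remainder."""
    data = data_bits + [0] * (len(divisor_bits) - 1)
    for i in range(len(data_bits)):
        if data[i]:  # divide only where the current leading bit is 1
            for j, d in enumerate(divisor_bits):
                data[i + j] ^= d
    return data[-(len(divisor_bits) - 1):]

# Sender: data 1101011011 with divisor 10011 yields remainder 1110,
# which replaces the appended 0s before transmission.
rem = crc_remainder([1, 1, 0, 1, 0, 1, 1, 0, 1, 1], [1, 0, 0, 1, 1])
# Receiver: dividing data + remainder by the same divisor leaves all zeros.
check = crc_remainder([1, 1, 0, 1, 0, 1, 1, 0, 1, 1] + rem, [1, 0, 0, 1, 1])
assert not any(check)  # zero remainder -> accept the data

Running the snippet end-to-end leaves `check` as all zeros, which is exactly the "remainder is zero, accept the data" rule from the notes.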
requirements.txt
ADDED
@@ -0,0 +1,10 @@
PyPDF2
streamlit
langchain
deeplake
assemblyai
sentence-transformers
youtube-transcript-api
modal
ctransformers
langchain-google-genai
requiremnts.txt
DELETED
@@ -1 +0,0 @@
streamlit
utilis.py
ADDED
@@ -0,0 +1,157 @@
import os
import json
import uuid
from typing import Dict, List, Tuple
import assemblyai as aai
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain_community.vectorstores import DeepLake
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate


class Processing:
    def __init__(self, dataset_path: str, embedding_model_name: str,
                 device='cpu', chunk_size=500, chunk_overlap=5):
        """
        Parameters:
            dataset_path (str): Path to the dataset in the Vector-DB
            embedding_model_name (str): Name of the HuggingFace model to be used for embeddings
            device (str): Device to run the embedding model on
            chunk_size (int): Size of each chunk to be processed
            chunk_overlap (int): Overlap between each chunk

        Initialize the embedding model, text splitter, transcriber and Vector-DB
        """
        self.dataset_path = dataset_path
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.transcriber = aai.Transcriber()

        self.embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': False}
        )

        self.db = DeepLake(dataset_path=f"hub://{self.dataset_path}",
                           embedding=self.embedding_model,
                           exec_option="compute_engine"
                           )

    def _add_metadata(self, documents: List[Document], url: str, id: str, source: str,
                      file_type: str, course_tag="") -> Tuple[List[Document], Dict[str, str]]:
        """
        Parameters:
            documents (List[Document]): List of documents to add metadata to
            url (str): URL of the source, "NaN" if not applicable
            id (str): ID of the documents
            source (str): Source of the documents
            file_type (str): Type of the documents
            course_tag (str): Tag to identify the course the documents belong to

        Returns:
            documents (List[Document]): List of documents with metadata added
            metadata (Dict[str, str]): The metadata attached to each document

        Add metadata to the documents
        """
        metadata = {
            "id": id,
            "source": source,
            "url": url,
            "file_type": file_type,
            "course_tag": course_tag
        }
        for doc in documents:
            doc.metadata = metadata
        return documents, metadata

    def load_pdf(self, name, text) -> Tuple[List[Document], Dict[str, str]]:
        """
        Returns:
            pdf_chunk (List[Document]): List of documents with metadata added

        Load a PDF file, split it into chunks and add metadata
        """
        pdf_chunk = self.text_splitter.create_documents([text])
        print("Created document chunks")
        return self._add_metadata(pdf_chunk, url="NaN", id=str(uuid.uuid4()), source="document", file_type="pdf")

    def load_transcript(self, url) -> Tuple[List[Document], Dict[str, str]]:
        """
        Returns:
            transcript_chunk (List[Document]): List of documents with metadata added

        Transcribe a video/audio URL, split the transcript into chunks and add metadata
        """
        transcript = self.transcriber.transcribe(url)
        print("Transcribed")
        transcript_chunk = self.text_splitter.create_documents([transcript.text])
        print("Created transcript chunks")
        return self._add_metadata(transcript_chunk, url="NaN", id=str(uuid.uuid4()), source="custom_video", file_type="transcript")

    def load_yt_transcript(self, url) -> Tuple[List[Document], Dict[str, str]]:
        """
        Returns:
            yt_transcript_chunk (List[Document]): List of documents with metadata added

        Load a YouTube transcript, split it into chunks and add metadata
        """
        if url.startswith("https://www.youtube.com/watch?v="):
            video_id = url.replace("https://www.youtube.com/watch?v=", "")
        else:
            video_id = url.replace("https://youtu.be/", "")

        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        print("Downloaded transcript")
        transcript = [line['text'] for line in transcript]
        transcript_text = ' '.join(transcript)
        yt_transcript_chunk = self.text_splitter.create_documents([transcript_text])
        print("Created YouTube transcript chunks")
        return self._add_metadata(yt_transcript_chunk, url=url, id=video_id, source="youtube", file_type="transcript")

    def upload_to_db(self, documents: List[Document]):
        """
        Parameters:
            documents (List[Document]): List of documents to upload to the Vector-DB

        Embed the documents and upload them to the Vector-DB
        """
        print("Embedding and uploading to Vector-DB...")
        self.db.add_documents(documents)
        print("Uploaded to Vector-DB")


class PromptCreate:
    def __init__(self, example_path, save_path):
        self.examples = []
        self.example_prompt = None
        self.few_shot_prompt = None
        self.example_path = example_path
        self.file_name = "example_{i}.json"
        self.save_path = save_path

    def load_examples(self):
        """Load the few-shot examples from the numbered JSON files in example_path."""
        for i in range(1, len(os.listdir(self.example_path)) + 1):
            filename = os.path.join(self.example_path, self.file_name.format(i=i))
            try:
                with open(filename, "r") as json_file:
                    self.examples.append(json.load(json_file))
            except FileNotFoundError:
                print(f"File {filename} not found.")
            except json.JSONDecodeError:
                print(f"Error decoding JSON from file {filename}.")

    def create_prompt_template(self, input_variables, template_string):
        """Build the PromptTemplate used to format a single example."""
        self.example_prompt = PromptTemplate(input_variables=input_variables, template=template_string)

    def create_few_shot_prompt(self, prefix, suffix):
        """Assemble the few-shot prompt from the loaded examples."""
        self.few_shot_prompt = FewShotPromptTemplate(
            examples=self.examples, example_prompt=self.example_prompt, prefix=prefix, suffix=suffix
        )

    def save_prompt(self):
        """Serialize the few-shot prompt to save_path (e.g. a YAML file)."""
        self.few_shot_prompt.save(self.save_path)
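For orientation, the prompt_templates/*.yaml files committed above look like the output of PromptCreate. A minimal driver might look like the sketch below; the example template string and the exact input_variables are assumptions for illustration, not code from this commit:

# Hypothetical driver for PromptCreate, sketching how a file like
# prompt_templates/notes_prompt.yaml could be regenerated from the JSON
# examples. The template string and variable names are assumptions.
from utilis import PromptCreate

creator = PromptCreate(example_path="examples/notes_examples",
                       save_path="prompt_templates/notes_prompt.yaml")
creator.load_examples()  # reads examples/notes_examples/example_{i}.json in order
creator.create_prompt_template(
    input_variables=["content_chunk", "notes", "rules"],
    template_string="<Content Chunk> {content_chunk}\n<Rules> {rules}\n<Notes> {notes}",
)
creator.create_few_shot_prompt(
    prefix="You are assigned a task to create notes for a student.",
    suffix="\n<Content Chunk> {content_chunk}\n \n<Rules> {rules}\n \n<Notes>\n",
)
creator.save_prompt()  # serializes the few-shot prompt to save_path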