File size: 1,685 Bytes
ba97fa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from huggingface_hub import hf_hub_download


# loading the data
def load_data(path):
    """Load the documents found under ``path`` with ``PyPDFDirectoryLoader``.

    Args:
        path: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(path).load()`` returns (the loaded
        documents).
    """
    return PyPDFDirectoryLoader(path).load()


#Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the result of ``load_data``.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously
            hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)


#download embedding model
def download_hf_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build a ``HuggingFaceEmbeddings`` object for the given model.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, the model
            that was previously hard-coded, so calls without arguments behave
            as before.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Download a PDF from the web

import os
import requests

def download_pdf(url, timeout=30):
    """Download the PDF at ``url`` into the local ``data`` directory.

    The file name is the last segment of the URL *path*; the query string
    and fragment are ignored, so ``.../paper.pdf?download=1`` is saved as
    ``paper.pdf``. When the path has no usable last segment (for example it
    ends in ``/``), ``download.pdf`` is used instead.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the call forever.

    Returns:
        The path the PDF was written to, or ``None`` when the server replied
        with a status code other than 200.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...). These are deliberately not swallowed.
    """
    # Function-scope import: the block needs urlsplit, which the file does
    # not import at module level.
    from urllib.parse import urlsplit

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('data', exist_ok=True)

    # Use only the path component so "?query" and "#fragment" never end up
    # in the file name.
    filename = os.path.basename(urlsplit(url).path)
    if filename in ('', '.', '..'):
        # No usable name in the URL; an empty name would make open() target
        # the directory itself.
        filename = 'download.pdf'
    save_path = os.path.join('data', filename)

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download PDF. Status code: {response.status_code}")
        return None

    with open(save_path, 'wb') as file:
        file.write(response.content)
    print(f"PDF downloaded and saved to {save_path}")
    return save_path





def download_hf_model(model_name_or_path, model_basename):
    """Fetch one file from a Hugging Face Hub repository.

    Args:
        model_name_or_path: Passed to ``hf_hub_download`` as ``repo_id``.
        model_basename: Passed to ``hf_hub_download`` as ``filename``.

    Returns:
        The value returned by ``hf_hub_download`` (the path of the
        downloaded file).
    """
    return hf_hub_download(
        repo_id=model_name_or_path,
        filename=model_basename,
    )