medicure / src /
history blame
No virus
1.71 kB
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from huggingface_hub import hf_hub_download
# Load the PDF documents from a directory
def load_data(path):
    """Load documents from the directory at ``path``.

    Args:
        path: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(path).load()`` produces.
    """
    return PyPDFDirectoryLoader(path).load()
# Split the loaded documents into text chunks
def text_split(extracted_data, *, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into chunks.

    Args:
        extracted_data: Documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value this function previously hard-coded.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value this function previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)
# Load the Hugging Face sentence-embedding model
def download_hf_embeddings():
    """Build the embedding object used by the project.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
# Download a PDF from a web URL
import os
import requests
def download_pdf(url: str) -> None:
    """Download the PDF at ``url`` into the local ``data`` directory.

    The file name is the last segment of the URL's path component. When the
    server answers with HTTP 200 the response body is written to
    ``data/<file name>`` and a confirmation is printed; for any other status
    a failure message is printed and no file is written.

    Args:
        url: Address of the PDF to fetch.
    """
    from urllib.parse import urlparse

    # Create the target directory up front; ``exist_ok`` makes this a no-op
    # when it is already there.
    os.makedirs('data', exist_ok=True)

    # Take the name from the path component only, so a query string or
    # fragment never leaks into the file name. Fall back to a fixed name when
    # the path ends in "/" and the last segment is therefore empty.
    filename = urlparse(url).path.rsplit("/", 1)[-1] or 'download.pdf'
    save_path = os.path.join('data', filename)

    # A timeout keeps an unresponsive server from blocking the caller forever.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f"PDF downloaded and saved to {save_path}")
    else:
        print(f"Failed to download PDF. Status code: {response.status_code}")
def download_hf_model(model_name_or_path, model_basename):
    """Fetch one file from a Hugging Face Hub repository.

    Args:
        model_name_or_path: Passed to ``hf_hub_download`` as ``repo_id``.
        model_basename: Passed to ``hf_hub_download`` as ``filename``.

    Returns:
        The value returned by ``hf_hub_download``.
    """
    return hf_hub_download(
        repo_id=model_name_or_path,
        filename=model_basename,
    )