Spaces:
Sleeping
Sleeping
from langchain_community.document_loaders import PyPDFDirectoryLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from huggingface_hub import hf_hub_download | |
# loading the data | |
def load_data(path): | |
loader = PyPDFDirectoryLoader(path) | |
extracted_data = loader.load() | |
return extracted_data | |
#Create text chunks | |
def text_split(extracted_data): | |
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20) | |
text_chunks = text_splitter.split_documents(extracted_data) | |
return text_chunks | |
#download embedding model | |
def download_hf_embeddings(): | |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") | |
return embeddings | |
# downloading any pdf on web | |
import os | |
import requests | |
def download_pdf(url): | |
if not os.path.exists('data'): | |
os.makedirs('data') | |
pdf_url = url | |
# Get the filename from the URL | |
filename = pdf_url.split("/")[-1] | |
# Full path where the PDF will be saved | |
save_path = os.path.join('data', filename) | |
# Download the PDF | |
response = requests.get(pdf_url) | |
# Check if the request was successful | |
if response.status_code == 200: | |
# Write the content to a file | |
with open(save_path, 'wb') as file: | |
file.write(response.content) | |
print(f"PDF downloaded and saved to {save_path}") | |
else: | |
print(f"Failed to download PDF. Status code: {response.status_code}") | |
def download_hf_model(model_name_or_path, model_basename): | |
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename) | |
return model_path | |