Spaces:

4darsh-Dev
/

medicure

Sleeping

App Files Files Community

medicure / src /helper.py

adarsh

updated

5839dd5 3 months ago

raw

history blame

No virus

1.71 kB

	from langchain_community.document_loaders import PyPDFDirectoryLoader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from huggingface_hub import hf_hub_download


	# loading the data
	def load_data(path):
	    """Load the documents found in a directory of PDFs.

	    Args:
	        path: Directory handed to ``PyPDFDirectoryLoader``.

	    Returns:
	        The result of ``PyPDFDirectoryLoader(path).load()``.
	    """
	    return PyPDFDirectoryLoader(path).load()


	# Create text chunks
	def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
	    """Split loaded documents into smaller, overlapping text chunks.

	    Args:
	        extracted_data: Documents to split, e.g. the value returned by
	            ``load_data``.
	        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
	            ``chunk_size``. Defaults to 500, the value that was
	            previously hard-coded.
	        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
	            ``chunk_overlap``. Defaults to 20, the value that was
	            previously hard-coded.

	    Returns:
	        The result of ``split_documents(extracted_data)``.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )
	    return splitter.split_documents(extracted_data)


	# Download the embedding model
	def download_hf_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
	    """Build a ``HuggingFaceEmbeddings`` object for the given model.

	    Args:
	        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
	            Defaults to ``"sentence-transformers/all-MiniLM-L6-v2"``, the
	            model that was previously hard-coded, so existing zero-argument
	            calls behave exactly as before.

	    Returns:
	        The constructed ``HuggingFaceEmbeddings`` instance.
	    """
	    return HuggingFaceEmbeddings(model_name=model_name)


	# Download a PDF from the web

	import os
	import requests

	def download_pdf(url, timeout=30):
	    """Download the PDF at ``url`` into the local ``data`` directory.

	    The file name is the last segment of the URL *path*; any query string
	    or fragment is ignored so that ``.../paper.pdf?download=1`` is saved
	    as ``paper.pdf``. When the URL has no usable last segment (for example
	    it ends with ``/``), the file is saved as ``downloaded.pdf``.

	    Args:
	        url: Address of the PDF to fetch.
	        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
	            stalled server cannot block the caller indefinitely.

	    Returns:
	        The path the PDF was written to, or ``None`` when the server
	        answered with a status code other than 200.

	    Raises:
	        Any exception raised by ``requests.get`` (connection failure,
	        timeout, ...) or by writing the file is propagated unchanged.
	    """
	    # Local import keeps this block self-contained; the module does not
	    # import urllib.parse at file level.
	    from urllib.parse import urlparse

	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs('data', exist_ok=True)

	    # Take the name from the URL path only, so "?query" and "#fragment"
	    # never end up in the saved file name.
	    filename = os.path.basename(urlparse(url).path)
	    if not filename or filename in {".", ".."}:
	        # No usable segment: open() on "data/" or "data/.." would fail.
	        filename = "downloaded.pdf"
	    save_path = os.path.join('data', filename)

	    response = requests.get(url, timeout=timeout)

	    if response.status_code != 200:
	        print(f"Failed to download PDF. Status code: {response.status_code}")
	        return None

	    with open(save_path, 'wb') as file:
	        file.write(response.content)
	    print(f"PDF downloaded and saved to {save_path}")
	    return save_path





	def download_hf_model(model_name_or_path, model_basename):
	    """Fetch a single file from a Hugging Face Hub repository.

	    Args:
	        model_name_or_path: Passed to ``hf_hub_download`` as ``repo_id``.
	        model_basename: Passed to ``hf_hub_download`` as ``filename``.

	    Returns:
	        The value returned by ``hf_hub_download``.
	    """
	    return hf_hub_download(
	        repo_id=model_name_or_path,
	        filename=model_basename,
	    )