# RAG pipeline demo: LangChain + Hugging Face + FAISS (notebook export).
# (The original scraped page header read "Spaces: Runtime error" — residue
# from the hosting page, not part of the code.)
!pip install -q langchain | |
!pip install -q torch | |
!pip install -q transformers | |
!pip install -q sentence-transformers | |
!pip install -q datasets | |
!pip install -q faiss-cpu | |
# Document loading, splitting, embedding, retrieval, and generation components.
# Fixed: trailing " | |" scraping artifacts removed; the duplicated
# `AutoTokenizer` import (it appeared on two lines) merged into one.
# NOTE(review): in newer LangChain releases these classes live under
# langchain_community.* — confirm against the pinned langchain version.
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
# Load the Dolly 15k instruction dataset from the Hugging Face Hub,
# using its "context" column as the document text.
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # column that becomes Document.page_content

# Create a loader instance and download/parse the dataset (network I/O).
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
data = loader.load()

# Peek at the first two documents. Bare expression: this only renders
# output in a notebook cell; it has no effect in a plain script.
data[:2]
# Split the loaded documents into overlapping chunks for embedding:
# 1000 characters per chunk with a 150-character overlap so that
# sentences straddling a boundary appear in both neighboring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the loaded documents; produce the chunked document list.
docs = text_splitter.split_documents(data)
# Configuration for the sentence-transformer embedding model.
# NOTE(review): the canonical Hub repo id is "all-MiniLM-L6-v2" (capital L);
# the lowercase variant used here resolves today — confirm before changing.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Run the encoder on CPU (no GPU assumed in this environment).
model_kwargs = {'device': 'cpu'}

# Keep raw (unnormalized) embedding vectors; set True for cosine-ready output.
encode_kwargs = {'normalize_embeddings': False}
# Initialize an instance of HuggingFaceEmbeddings with the specified parameters | |
embeddings = HuggingFaceEmbeddings( | |
model_name=modelPath, # Provide the pre-trained model's path | |
model_kwargs=model_kwargs, # Pass the model configuration options | |
encode_kwargs=encode_kwargs # Pass the encoding options |