!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
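# Together these imports cover the retrieval-augmented QA pipeline assembled below:
# the loader pulls raw documents, the splitter chunks them, HuggingFaceEmbeddings
# turns chunks into vectors, FAISS indexes those vectors for similarity search, and
# HuggingFacePipeline + RetrievalQA wrap a Hugging Face model as the answering LLM.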
# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first two entries
data[:2]
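# A quick sanity check (my own addition, not part of the original listing): each
# entry returned by the loader is a LangChain Document, so its text and metadata
# can be inspected separately; the other dataset columns typically land in metadata.
print(data[0].page_content[:200])  # first 200 characters of the "context" column
print(data[0].metadata)            # e.g. the instruction/response/category fields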
# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the text you want to split; split it into documents using the text splitter.
docs = text_splitter.split_documents(data)
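# Optional check (an assumption on my part, not shown in the original): confirm how
# many chunks the splitter produced and preview the start of the first one.
print(f"Split {len(data)} documents into {len(docs)} chunks")
print(docs[0].page_content[:300])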
# Define the pre-trained embedding model you want to use (Hugging Face Hub id)
modelPath = "sentence-transformers/all-MiniLM-L6-v2"
# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device': 'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,        # Provide the pre-trained model's path
    model_kwargs=model_kwargs,   # Pass the model configuration options
    encode_kwargs=encode_kwargs  # Pass the encoding options