impossiblecisne committed
Commit 735d745 · verified · 1 Parent(s): aef163f

Create app.py

Files changed (1)
  app.py +53 -0
app.py CHANGED
@@ -0,0 +1,53 @@
+ !pip install -q langchain
+ !pip install -q torch
+ !pip install -q transformers
+ !pip install -q sentence-transformers
+ !pip install -q datasets
+ !pip install -q faiss-cpu
+
+ from langchain.document_loaders import HuggingFaceDatasetLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.vectorstores import FAISS
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
+ from langchain import HuggingFacePipeline
+ from langchain.chains import RetrievalQA
+
+ # Specify the dataset name and the column containing the content
+ dataset_name = "databricks/databricks-dolly-15k"
+ page_content_column = "context"  # or any other column you're interested in
+
+ # Create a loader instance
+ loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)
+
+ # Load the data
+ data = loader.load()
+
+ # Display the first 2 entries
+ data[:2]
+
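Each loaded entry is a LangChain Document: page_content holds the "context" column, and the dataset's remaining columns land in metadata. A rough sketch of the shape (field values are illustrative, not actual dataset contents):

Document(page_content='...', metadata={'instruction': '...', 'response': '...', 'category': '...'})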
+ # Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
+ # It splits text into chunks of 1000 characters each with a 150-character overlap.
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
+
+ # 'data' holds the text you want to split; split it into documents using the text splitter.
+ docs = text_splitter.split_documents(data)
+
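A quick sanity check on the split, as a minimal sketch (the printed values will depend on the dataset revision):

print(len(docs))                    # number of chunks produced
print(docs[0].page_content[:100])   # first 100 characters of the first chunk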
+ # Define the path to the pre-trained model you want to use
+ modelPath = "sentence-transformers/all-MiniLM-L6-v2"
+
+ # Create a dictionary with model configuration options, specifying to use the CPU for computations
+ model_kwargs = {'device': 'cpu'}
+
+ # Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
+ encode_kwargs = {'normalize_embeddings': False}
+
+ # Initialize an instance of HuggingFaceEmbeddings with the specified parameters
+ embeddings = HuggingFaceEmbeddings(
+     model_name=modelPath,        # Provide the pre-trained model's path
+     model_kwargs=model_kwargs,   # Pass the model configuration options
+     encode_kwargs=encode_kwargs) # Pass the encoding options
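
The remaining imports (FAISS, AutoModelForQuestionAnswering, pipeline, HuggingFacePipeline, RetrievalQA) are never used, so the committed file stops short of a working retrieval pipeline. A minimal sketch of how they would typically be wired together follows; the google/flan-t5-base model and the text2text-generation task are the editor's assumptions (the AutoModelForQuestionAnswering import hints the author intended an extractive QA model, but LangChain's HuggingFacePipeline wraps text-generation-style pipelines):

# Editor's sketch under the assumptions above, not part of the commit.

# Embed a sample query; all-MiniLM-L6-v2 produces 384-dimensional vectors
query_result = embeddings.embed_query("What is cheesemaking?")
print(len(query_result))

# Build a FAISS index over the chunked documents
db = FAISS.from_documents(docs, embeddings)

# Expose the index as a retriever returning the top 4 chunks per query
retriever = db.as_retriever(search_kwargs={"k": 4})

# Wrap a Hugging Face text2text pipeline as a LangChain LLM
# (model choice is an assumption, swapped in for the unused QA imports)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
text2text = pipeline("text2text-generation", model="google/flan-t5-base",
                     tokenizer=tokenizer, max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=text2text)

# Combine retriever and LLM into a question-answering chain
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
print(qa.run("Who was Thomas Jefferson?"))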