Chris4K committed
Commit b871802 · verified · 1 parent: d545add

Update app.py

Files changed (1):
  1. app.py  +9 -2
app.py CHANGED
@@ -72,12 +72,13 @@ from langchain.chains import RetrievalQA
 
 from langchain.memory import ConversationBufferMemory
 
+from langchain_community.document_transformers import BeautifulSoupTransformer
 
 
 # Function to index URLs in RAG
 def index_urls_in_rag(urls=[]):
     # Load the RAG model
-    rag_model = "BAAI/bge-base-en-v1.5"
+    rag_model = "jinaai/jina-embeddings-v2-base-de"
     encode_kwargs = {
         "normalize_embeddings": True
     } # set True to compute cosine similarity
@@ -96,9 +97,15 @@ def index_urls_in_rag(urls=[]):
         loader = WebBaseLoader(url)
         document = loader.load()
 
+        # Transform
+        bs_transformer = BeautifulSoupTransformer()
+        docs_transformed = bs_transformer.transform_documents(
+            document, class_to_extract=["p", "li", "div", "a"]
+        )
+
         # Split the document into chunks
         text_splitter = RecursiveCharacterTextSplitter()
-        document_chunks = text_splitter.split_documents(document)
+        document_chunks = text_splitter.split_documents(docs_transformed)
         print(document_chunks)
         # Index document chunks into the vector store
         vector_store.add_documents(document_chunks)
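
For context, a minimal sketch of how the new German embedding model could be wired into a sentence-transformers-backed LangChain wrapper. The embedding class, device settings, and vector store that app.py actually uses are outside this diff, so the HuggingFaceEmbeddings wiring and the trust_remote_code flag below are assumptions, not the app's confirmed code.

# Sketch only: assumes langchain_community's HuggingFaceEmbeddings and a
# sentence-transformers version that accepts trust_remote_code (>= 2.3),
# which the Jina v2 models typically need because they ship custom model code.
from langchain_community.embeddings import HuggingFaceEmbeddings

rag_model = "jinaai/jina-embeddings-v2-base-de"
encode_kwargs = {"normalize_embeddings": True}  # unit-length vectors, so dot product == cosine similarity

embeddings = HuggingFaceEmbeddings(
    model_name=rag_model,
    model_kwargs={"trust_remote_code": True},  # assumption: required for Jina v2 custom code
    encode_kwargs=encode_kwargs,
)

print(len(embeddings.embed_query("Hallo Welt")))  # prints the embedding dimensionality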
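
And a standalone sketch of the new transform-then-split step, runnable outside the app. Note that langchain_community's BeautifulSoupTransformer.transform_documents names its tag filter tags_to_extract; the class_to_extract keyword used in the commit appears to fall through to **kwargs, leaving the default tag list in effect (which happens to match the one passed). The URL below is a placeholder.

# Sketch only: placeholder URL; assumes langchain_community's WebBaseLoader and
# BeautifulSoupTransformer plus LangChain's RecursiveCharacterTextSplitter.
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the raw page as LangChain documents
documents = WebBaseLoader("https://example.com").load()

# Strip the page down to the text inside the listed HTML tags
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(
    documents, tags_to_extract=["p", "li", "div", "a"]
)

# Chunk the cleaned text before handing it to the vector store
text_splitter = RecursiveCharacterTextSplitter()
document_chunks = text_splitter.split_documents(docs_transformed)
print(len(document_chunks))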