Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -72,12 +72,13 @@ from langchain.chains import RetrievalQA
|
|
72 |
|
73 |
from langchain.memory import ConversationBufferMemory
|
74 |
|
|
|
75 |
|
76 |
|
77 |
# Function to index URLs in RAG
|
78 |
def index_urls_in_rag(urls=[]):
|
79 |
# Load the RAG model
|
80 |
-
rag_model = "
|
81 |
encode_kwargs = {
|
82 |
"normalize_embeddings": True
|
83 |
} # set True to compute cosine similarity
|
@@ -96,9 +97,15 @@ def index_urls_in_rag(urls=[]):
|
|
96 |
loader = WebBaseLoader(url)
|
97 |
document = loader.load()
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
# Split the document into chunks
|
100 |
text_splitter = RecursiveCharacterTextSplitter()
|
101 |
-
document_chunks = text_splitter.split_documents(
|
102 |
print(document_chunks)
|
103 |
# Index document chunks into the vector store
|
104 |
vector_store.add_documents(document_chunks)
|
|
|
72 |
|
73 |
from langchain.memory import ConversationBufferMemory
|
74 |
|
75 |
+
from langchain_community.document_transformers import BeautifulSoupTransformer
|
76 |
|
77 |
|
78 |
# Function to index URLs in RAG
|
79 |
def index_urls_in_rag(urls=[]):
|
80 |
# Load the RAG model
|
81 |
+
rag_model = "jinaai/jina-embeddings-v2-base-de"
|
82 |
encode_kwargs = {
|
83 |
"normalize_embeddings": True
|
84 |
} # set True to compute cosine similarity
|
|
|
97 |
loader = WebBaseLoader(url)
|
98 |
document = loader.load()
|
99 |
|
100 |
+
# Transform
|
101 |
+
bs_transformer = BeautifulSoupTransformer()
|
102 |
+
docs_transformed = bs_transformer.transform_documents(
|
103 |
+
document, class_to_extract=["p", "li", "div", "a"]
|
104 |
+
)
|
105 |
+
|
106 |
# Split the document into chunks
|
107 |
text_splitter = RecursiveCharacterTextSplitter()
|
108 |
+
document_chunks = text_splitter.split_documents(docs_transformed)
|
109 |
print(document_chunks)
|
110 |
# Index document chunks into the vector store
|
111 |
vector_store.add_documents(document_chunks)
|