Spaces:
Runtime error
Runtime error
Paul-Joshi
committed on
Commit
•
febd687
1
Parent(s):
0e981b6
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import streamlit as st
|
2 |
from langchain_community.document_loaders import WebBaseLoader
|
3 |
-
from langchain.text_splitter import CharacterTextSplitter
|
4 |
from langchain_community.vectorstores import Chroma
|
5 |
from langchain_nomic.embeddings import NomicEmbeddings
|
6 |
|
@@ -11,9 +11,11 @@ from langchain_core.runnables import RunnablePassthrough
|
|
11 |
from langchain_core.output_parsers import StrOutputParser
|
12 |
from langchain_core.prompts import ChatPromptTemplate
|
13 |
|
|
|
14 |
|
15 |
def method_get_website_text(urls):
|
16 |
# Convert string of URLs to list
|
|
|
17 |
urls_list = urls.split("\n")
|
18 |
docs = [WebBaseLoader(url).load() for url in urls_list]
|
19 |
docs_list = [item for sublist in docs for item in sublist]
|
@@ -22,7 +24,9 @@ def method_get_website_text(urls):
|
|
22 |
|
23 |
def method_get_text_chunks(text):
|
24 |
#split the text into chunks
|
25 |
-
|
|
|
|
|
26 |
doc_splits = text_splitter.split_documents(text)
|
27 |
return doc_splits
|
28 |
|
@@ -31,7 +35,8 @@ def method_get_vectorstore(document_chunks):
|
|
31 |
#convert text chunks into embeddings and store in vector database
|
32 |
|
33 |
# create the open-source embedding function
|
34 |
-
embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
|
|
|
35 |
|
36 |
# create a vectorstore from the chunks
|
37 |
vector_store = Chroma.from_documents(document_chunks, embeddings)
|
@@ -51,7 +56,7 @@ def get_context_retriever_chain(vector_store, question):
|
|
51 |
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
|
52 |
|
53 |
# Initialize the Hugging Face language model (LLM)
|
54 |
-
llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2")
|
55 |
|
56 |
# Construct the RAG pipeline
|
57 |
after_rag_chain = (
|
|
|
1 |
import streamlit as st
|
2 |
from langchain_community.document_loaders import WebBaseLoader
|
3 |
+
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
|
4 |
from langchain_community.vectorstores import Chroma
|
5 |
from langchain_nomic.embeddings import NomicEmbeddings
|
6 |
|
|
|
11 |
from langchain_core.output_parsers import StrOutputParser
|
12 |
from langchain_core.prompts import ChatPromptTemplate
|
13 |
|
14 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
15 |
|
16 |
def method_get_website_text(urls):
|
17 |
# Convert string of URLs to list
|
18 |
+
|
19 |
urls_list = urls.split("\n")
|
20 |
docs = [WebBaseLoader(url).load() for url in urls_list]
|
21 |
docs_list = [item for sublist in docs for item in sublist]
|
|
|
24 |
|
25 |
def method_get_text_chunks(text):
|
26 |
#split the text into chunks
|
27 |
+
|
28 |
+
#text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
|
29 |
+
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
|
30 |
doc_splits = text_splitter.split_documents(text)
|
31 |
return doc_splits
|
32 |
|
|
|
35 |
#convert text chunks into embeddings and store in vector database
|
36 |
|
37 |
# create the open-source embedding function
|
38 |
+
#embeddings = NomicEmbeddings(model="nomic-embed-text-v1.5")
|
39 |
+
embeddings = HuggingFaceEmbeddings()
|
40 |
|
41 |
# create a vectorstore from the chunks
|
42 |
vector_store = Chroma.from_documents(document_chunks, embeddings)
|
|
|
56 |
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
|
57 |
|
58 |
# Initialize the Hugging Face language model (LLM)
|
59 |
+
llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.2", model_kwargs={"temperature":0.6, "max_length":512})
|
60 |
|
61 |
# Construct the RAG pipeline
|
62 |
after_rag_chain = (
|