Spaces:
Sleeping
Sleeping
File size: 5,330 Bytes
8329090 e2fe55a 8329090 e2fe55a 2e4daca e2fe55a 8329090 e2fe55a 8329090 e2fe55a 8329090 e48d908 8329090 e2fe55a 8329090 e2fe55a 8329090 e2fe55a 8329090 e2fe55a 8329090 e2fe55a 8329090 e2fe55a 8329090 e2fe55a 8329090 e48d908 8329090 e48d908 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import streamlit as st
from utils.config import document_store_configs, model_configs
from haystack import Pipeline
from haystack.schema import Answer
from haystack.document_stores import BaseDocumentStore
from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode, PreProcessor
from milvus_haystack import MilvusDocumentStore
# Use this file to set up your Haystack pipeline and to run queries against it.
@st.cache_resource(show_spinner=False)
def start_preprocessor_node():
    """Create the Haystack ``PreProcessor`` node.

    The node is configured with all three cleaning switches enabled
    (empty lines, whitespace, header/footer) and splits by word with a
    ``split_length`` of 100 while respecting sentence boundaries.
    ``remove_substrings``, ``split_overlap`` and ``max_chars_check`` are
    not passed and therefore keep the library defaults.

    The result is wrapped in ``st.cache_resource`` so it is reused rather
    than rebuilt.

    Returns:
        The configured ``PreProcessor`` instance.
    """
    print('initializing preprocessor node')
    preprocessor_options = dict(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=100,
        split_respect_sentence_boundary=True,
    )
    return PreProcessor(**preprocessor_options)
@st.cache_resource(show_spinner=False)
def start_document_store(type: str):
    """Create the document store backend selected by ``type``.

    Connection settings for the remote backends are read from
    ``document_store_configs``. The result is wrapped in
    ``st.cache_resource`` so each distinct ``type`` is built only once.

    Args:
        type: Backend name; one of ``'inmemory'``, ``'opensearch'``,
            ``'weaviate'`` or ``'milvus'``. (The name shadows the builtin
            but is kept so existing keyword callers continue to work.)

    Returns:
        The newly constructed document store.

    Raises:
        ValueError: If ``type`` is not one of the supported backend names.
            Previously this case surfaced as an ``UnboundLocalError`` on
            the final ``return``.
        KeyError: If a required entry is missing from
            ``document_store_configs``.
    """
    print('initializing document store')

    if type == 'inmemory':
        # NOTE(review): embedding_dim=384 is hard-coded here while the other
        # backends read it from config — presumably it matches the configured
        # embedding model; confirm. No sample documents are written.
        return InMemoryDocumentStore(use_bm25=True, embedding_dim=384)

    if type == 'opensearch':
        return OpenSearchDocumentStore(
            scheme=document_store_configs['OPENSEARCH_SCHEME'],
            username=document_store_configs['OPENSEARCH_USERNAME'],
            password=document_store_configs['OPENSEARCH_PASSWORD'],
            host=document_store_configs['OPENSEARCH_HOST'],
            port=document_store_configs['OPENSEARCH_PORT'],
            index=document_store_configs['OPENSEARCH_INDEX'],
            embedding_dim=document_store_configs['OPENSEARCH_EMBEDDING_DIM'],
        )

    if type == 'weaviate':
        return WeaviateDocumentStore(
            host=document_store_configs['WEAVIATE_HOST'],
            port=document_store_configs['WEAVIATE_PORT'],
            index=document_store_configs['WEAVIATE_INDEX'],
            embedding_dim=document_store_configs['WEAVIATE_EMBEDDING_DIM'],
        )

    if type == 'milvus':
        return MilvusDocumentStore(
            uri=document_store_configs['MILVUS_URI'],
            index=document_store_configs['MILVUS_INDEX'],
            embedding_dim=document_store_configs['MILVUS_EMBEDDING_DIM'],
            return_embedding=True,
        )

    # Fail loudly with an actionable message instead of falling through to
    # an unbound local variable.
    raise ValueError(
        f"Unsupported document store type: {type!r}; expected one of "
        "'inmemory', 'opensearch', 'weaviate', 'milvus'"
    )
# Cached so that the index and models are loaded only once, at startup.
@st.cache_resource(show_spinner=False)
def start_retriever(_document_store: BaseDocumentStore):
    """Create the ``EmbeddingRetriever`` bound to ``_document_store``.

    The embedding model name comes from ``model_configs['EMBEDDING_MODEL']``
    and the retriever is created with ``top_k=5``.

    Args:
        _document_store: Store the retriever searches. The leading
            underscore follows Streamlit's convention for arguments that
            should not be hashed for the cache key.

    Returns:
        The configured ``EmbeddingRetriever``.
    """
    print('initializing retriever')
    # Embeddings are not refreshed here; run
    # ``_document_store.update_embeddings(retriever)`` separately if the
    # store's embeddings need to be (re)computed.
    return EmbeddingRetriever(
        document_store=_document_store,
        embedding_model=model_configs['EMBEDDING_MODEL'],
        top_k=5,
    )
@st.cache_resource(show_spinner=False)
def start_reader():
    """Create the ``FARMReader`` used for extractive question answering.

    The model is taken from ``model_configs['EXTRACTIVE_MODEL']``. The
    result is wrapped in ``st.cache_resource`` so it is reused rather than
    rebuilt.

    Returns:
        The configured ``FARMReader``.
    """
    print('initializing reader')
    return FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
# Cached so that the index and models are loaded only once, at startup.
@st.cache_resource(show_spinner=False)
def start_haystack_extractive(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, _reader: FARMReader):
    """Assemble the extractive QA pipeline: Query -> Retriever -> Reader.

    Args:
        _document_store: Accepted for signature compatibility; not used
            inside this function.
        _retriever: Node registered as ``"Retriever"``, fed by ``"Query"``.
        _reader: Node registered as ``"Reader"``, fed by ``"Retriever"``.

    Returns:
        The assembled ``Pipeline``.
    """
    print('initializing pipeline')
    extractive_pipeline = Pipeline()
    # (component, node name, upstream inputs), in wiring order.
    stages = (
        (_retriever, "Retriever", ["Query"]),
        (_reader, "Reader", ["Retriever"]),
    )
    for component, node_name, upstream in stages:
        extractive_pipeline.add_node(component=component, name=node_name, inputs=upstream)
    return extractive_pipeline
@st.cache_resource(show_spinner=False)
def start_haystack_rag(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever):
    """Assemble the generative (RAG) pipeline: Query -> Retriever -> PromptNode.

    The ``PromptNode`` uses the ``"deepset/question-answering"`` prompt
    template, the model named in ``model_configs['GENERATIVE_MODEL']`` and
    the API key in ``model_configs['OPENAI_KEY']``.

    Args:
        _document_store: Accepted for signature compatibility; not used
            inside this function.
        _retriever: Node registered as ``"Retriever"``, fed by ``"Query"``.

    Returns:
        The assembled ``Pipeline``.
    """
    generator = PromptNode(
        default_prompt_template="deepset/question-answering",
        model_name_or_path=model_configs['GENERATIVE_MODEL'],
        api_key=model_configs['OPENAI_KEY'],
    )
    rag_pipeline = Pipeline()
    # (component, node name, upstream inputs), in wiring order.
    stages = (
        (_retriever, "Retriever", ["Query"]),
        (generator, "PromptNode", ["Retriever"]),
    )
    for component, node_name, upstream in stages:
        rag_pipeline.add_node(component=component, name=node_name, inputs=upstream)
    return rag_pipeline
#@st.cache_data(show_spinner=True)
def query(_pipeline, question):
    """Run ``question`` through ``_pipeline`` and return the raw results.

    Args:
        _pipeline: Object exposing ``run(query, params=...)``.
        question: The query passed as the first positional argument to
            ``run``.

    Returns:
        Whatever ``_pipeline.run`` returns, unchanged.
    """
    # A fresh, empty params dict is passed on every call.
    return _pipeline.run(question, params={})
def initialize_pipeline(task, document_store, retriever, reader):
    """Build the pipeline that matches ``task``.

    Args:
        task: ``'extractive'`` selects the retriever + reader pipeline;
            ``'rag'`` selects the retriever + prompt-node pipeline.
        document_store: Forwarded to the selected pipeline builder.
        retriever: Forwarded to the selected pipeline builder.
        reader: Only forwarded for the ``'extractive'`` task.

    Returns:
        The pipeline built for ``task``, or ``None`` when ``task`` is
        neither ``'extractive'`` nor ``'rag'``.
    """
    if task == 'extractive':
        return start_haystack_extractive(document_store, retriever, reader)
    if task == 'rag':
        return start_haystack_rag(document_store, retriever)
    # Unknown task: no pipeline is built.
    return None
|