# semantic_search / app.py
# seduerr's picture
# changed the theme to HF
# a3921be
import gradio as gr
from haystack.nodes import FARMReader, PreProcessor, PDFToTextConverter, TfidfRetriever
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
# Shared store that holds the chunks of the PDF currently being queried.
document_store = InMemoryDocumentStore()

# Reader model used to extract answer spans.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

# Cleaning and splitting options: word-based chunks of 100 with an overlap
# of 3, split on sentence boundaries.
_PREPROCESSOR_OPTIONS = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": True,
    "split_by": "word",
    "split_length": 100,
    "split_respect_sentence_boundary": True,
    "split_overlap": 3,
}
preprocessor = PreProcessor(**_PREPROCESSOR_OPTIONS)
def print_answers(results):
    """Reduce a pipeline result to a list of plain ``answer``/``score`` dicts.

    Despite its name, this function prints nothing; it returns the data.

    Args:
        results: Subscriptable result whose ``"answers"`` entry is an
            iterable of answer objects; fields are read as attributes.

    Returns:
        One dict per answer, in the original order, containing only those
        of the ``answer`` and ``score`` fields whose value is not ``None``.
        An empty list when ``results`` has no ``"answers"`` entry or the
        entry is ``None``.
    """
    fields = ("answer", "score")  # "context" is currently not exposed

    try:
        answers = results["answers"]
    except KeyError:
        answers = None
    if answers is None:
        return []

    filtered_answers = []
    for ans in answers:
        filtered_ans = {}
        for field in fields:
            # Look each attribute up once; an attribute that is missing
            # altogether is treated like a None value and skipped.
            value = getattr(ans, field, None)
            if value is not None:
                filtered_ans[field] = value
        filtered_answers.append(filtered_ans)
    return filtered_answers
def pdf_to_document_store(pdf_file):
    """Replace the contents of the shared document store with one PDF.

    Empties ``document_store``, converts the file at ``pdf_file.name`` to
    text, runs the first converted document through ``preprocessor`` and
    writes the resulting documents into the store.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.

    Returns:
        None.
    """
    # Clear whatever was written by an earlier call before adding new content.
    document_store.delete_documents()

    pdf_converter = PDFToTextConverter(
        remove_numeric_tables=True,
        valid_languages=["en"],
    )
    converted = pdf_converter.convert(file_path=pdf_file.name, meta=None)

    # Only the first converted document is processed and stored.
    chunks = preprocessor.process([converted[0]])
    document_store.write_documents(chunks)
def predict(question, pdf_file):
    """Answer ``question`` from the contents of the given PDF.

    The PDF is (re)indexed into the shared document store on every call,
    then a retriever/reader pipeline is run over it.

    Args:
        question: The query text passed to the pipeline.
        pdf_file: Object whose ``name`` attribute is the path of the PDF,
            or ``None`` when no file was supplied.

    Returns:
        A list of dicts with the ``answer`` and ``score`` of each extracted
        answer, or a short explanatory message (``str``) when ``pdf_file``
        is ``None``.
    """
    if pdf_file is None:
        # Presumably what the UI passes when nothing was uploaded. Without
        # this guard the call fails with AttributeError on ``pdf_file.name``.
        return "Please upload a PDF file first."

    pdf_to_document_store(pdf_file)

    # The retriever is created after indexing, once per request.
    # NOTE(review): presumably it has to see the freshly written documents
    # at construction time - confirm against the Haystack docs.
    retriever = TfidfRetriever(document_store=document_store)
    pipe = ExtractiveQAPipeline(reader, retriever)
    result = pipe.run(
        query=question,
        params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 3}},
    )
    return print_answers(result)
title = "Search PDF Business Reports with Sparse Passage Retrieval"

# Every sample question is offered once per sample report.
_sample_questions = [
    'What are the strategic initiatives?',
    'What efforts are being made in regards to digital experiences?',
    'How much does the company grow?',
]
_sample_pdfs = [
    'samples/walmart-10k.pdf',
    'samples/pfizer-10k.pdf',
]

# Input widgets: the free-text question and the PDF to search.
_question_box = gr.inputs.Textbox(
    lines=3,
    label='Ask an open question starting with ... What/When/Where/Who?',
)
_pdf_upload = gr.inputs.File(
    file_count="single",
    type="file",
    label="Upload a pdf",
)

iface = gr.Interface(
    fn=predict,
    inputs=[_question_box, _pdf_upload],
    outputs="text",
    title=title,
    theme="huggingface",
    # Grouped by report: all questions for the first PDF, then the second.
    examples=[
        [question, pdf_path]
        for pdf_path in _sample_pdfs
        for question in _sample_questions
    ],
)

# Start the web UI as soon as the module is executed.
iface.launch()