"""Extractive question answering over a single PDF, served through Gradio.

The script converts a PDF to text, splits it into overlapping word windows,
indexes the passages in a FAISS document store with DPR embeddings, prints the
answers to a few sample questions, and finally launches a web UI in which a
user can ask their own questions against the same index.
"""

import gradio as gr
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import (
    DensePassageRetriever,
    DocxToTextConverter,
    FARMReader,
    PDFToTextConverter,
    PreProcessor,
    TextConverter,
)
from haystack.pipelines import ExtractiveQAPipeline

# PDF that is indexed once at start-up; every question is answered from it.
PDF_PATH = "statistics-for-machine-learning.pdf"

# --- Convert the PDF to text and split it into passages ---------------------
pdf_converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
converted = pdf_converter.convert(file_path=PDF_PATH, meta=None)

preprocessor = PreProcessor(
    split_by="word",
    split_length=200,
    split_overlap=10,
)
preprocessed = preprocessor.process(converted)

# --- Index the passages ------------------------------------------------------
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)
# Start from an empty store so that re-running the script does not index the
# same passages twice.
document_store.delete_all_documents()
document_store.write_documents(preprocessed)

retriever = DensePassageRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2-distilled', use_gpu=False)
# Embeddings can only be computed once the documents are in the store.
document_store.update_embeddings(retriever)

pipeline = ExtractiveQAPipeline(reader, retriever)

# --- Smoke test: answer a few sample questions on the console ---------------
questions = [
    'What is linear regression?',
    'What is machine learning?',
    'What are the steps in machine learning model development and deployment?',
    'What is classification?',
]

answers = [pipeline.run(query=question) for question in questions]

for answer in answers:
    print('Q:', answer['query'])
    if answer['answers']:
        top = answer['answers'][0]
        print('A:', top.answer)
        print('Context: ', top.context)
        print('score: ', top.score)
    else:
        # The reader may return no candidates; avoid an IndexError on [0].
        print('A:', None)
    print('\n')


def correct(question):
    """Answer *question* from the indexed PDF and return displayable text.

    Args:
        question: The question typed by the user.

    Returns:
        A string holding the top-ranked answer, the context it was extracted
        from and its score, or a short message when the question is blank or
        the pipeline produced no answer.
    """
    if not question or not question.strip():
        return "Please enter a question."

    prediction = pipeline.run(query=question)
    candidates = prediction['answers']
    if not candidates:
        return "No answer found."

    best = candidates[0]
    return f"Answer: {best.answer}\n\nContext: {best.context}\n\nScore: {best.score}"


# --- Web UI ------------------------------------------------------------------
# The index is built from PDF_PATH above, so the UI only asks for a question;
# the number of inputs must match the single parameter of `correct`.
app_inputs = gr.inputs.Textbox(lines=10, label="Question")
interface = gr.Interface(
    fn=correct,
    inputs=app_inputs,
    outputs=gr.outputs.Textbox(label="Answer"),
    title='PDF QA system',
)
interface.launch(share=True)