# File size: 3,017 Bytes
# b19c8bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116

# %%
import os
from time import sleep

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import launch_es

# Start Elasticsearch through Haystack's launch_es() helper, then pause for a
# fixed interval before connecting (presumably to let the server come up).
ES_STARTUP_WAIT_SECONDS = 30
launch_es()
sleep(ES_STARTUP_WAIT_SECONDS)
# %%
# Switch Haystack telemetry off for this process via its environment flag.
os.environ.update({"HAYSTACK_TELEMETRY_ENABLED": "False"})

# Document store backed by the local Elasticsearch "document" index,
# connecting with empty credentials.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)
# %%
import pandas as pd

# Load the articles table; head() only has a visible effect when the cell is
# run interactively.
df_document = pd.read_csv("data/articles.csv")
df_document.head()
# %%
# CSV columns copied verbatim into each document's metadata.
meta_columns = ("chapter_name", "article_page", "article_number", "article_name")

# One document dict per CSV row: the DataFrame index is the id, the "article"
# column is the content, and the remaining columns travel along as metadata.
articles = [
    {
        "id": row_index,
        "content": record["article"],
        "meta": {column: record[column] for column in meta_columns},
    }
    for row_index, record in df_document.iterrows()
]

# Index everything into the "document" index and report the resulting count.
document_store.write_documents(articles, index="document")
print(f"Loaded {document_store.get_document_count()} documents")
# %%
from haystack.nodes import BM25Retriever

# BM25 retriever that searches the documents held in `document_store`.
retriever = BM25Retriever(document_store=document_store)
# %%
from haystack.nodes import FARMReader

# Model identifier handed to the reader (the name indicates a Spanish
# SQuAD2-style fine-tune).
model_ckpt = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"

# Reader settings gathered in one mapping so they are easy to scan and tweak.
reader_options = {
    "progress_bar": False,
    "max_seq_len": 384,
    "doc_stride": 128,
    "return_no_answer": False,
    "use_gpu": False,
}
reader = FARMReader(model_name_or_path=model_ckpt, **reader_options)
# %%
from haystack.pipelines import ExtractiveQAPipeline

# Extractive QA pipeline combining the reader with the retriever.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# %%
from textwrap import fill


def run_qa_pipeline(question, retriever_top_k=10, reader_top_k=5):
    """Run the module-level QA pipeline ``pipe`` for one question.

    Args:
        question: Query text passed to ``pipe.run`` as ``query``.
        retriever_top_k: ``top_k`` value sent to the "Retriever" node.
            Defaults to 10, the value previously hard-coded here.
        reader_top_k: ``top_k`` value sent to the "Reader" node.
            Defaults to 5, the value previously hard-coded here.

    Returns:
        Whatever ``pipe.run`` returns; callers in this file read its
        ``"answers"`` entry.
    """
    return pipe.run(
        query=question,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )

def results_as_markdown(results):
    """Render the answers of a pipeline result as a single Markdown string.

    For each answer in ``results["answers"]`` the source article is looked up
    in ``document_store`` by the answer's ``document_id``. Each entry is a
    bold header line (chapter name, article number, article name and page,
    all read from the answer's ``meta``) followed by the full article text
    wrapped at 80 columns.

    Answers whose article cannot be found in the store are skipped instead of
    raising ``AttributeError`` on the missing document.

    Args:
        results: Mapping with an ``"answers"`` entry; every answer exposes
            ``document_id`` and ``meta``.

    Returns:
        The rendered entries joined by a blank line, or ``""`` when there is
        nothing to render.
    """
    # The leading spaces on the second and third pieces are part of the
    # emitted text, so they are spelled out explicitly here.
    template = (
        "**Capítulo: {}.\t número: {}.\t nombre: {}.\t página: {}.**\n"
        "        {}\n"
        "        "
    )

    top_answers = []
    for result in results["answers"]:
        article = document_store.get_document_by_id(result.document_id)
        if article is None:
            # The store has no document for this id; nothing to display.
            continue
        meta = result.meta
        top_answers.append(
            template.format(
                meta["chapter_name"],
                meta["article_number"],
                meta["article_name"],
                meta["article_page"],
                fill(article.content, 80),
            )
        )

    return "\n\n".join(top_answers)

def query_qa_pipeline(question):
    """Answer a question and return the result formatted as Markdown.

    Args:
        question: Query text typed by the user. ``None``, an empty string or
            a whitespace-only string is treated as "no question".

    Returns:
        The Markdown produced by ``results_as_markdown`` for the pipeline
        result, or ``""`` when no question was given.
    """
    # A blank submission carries nothing to search for, so skip the
    # retriever/reader round trip entirely.
    if question is None or not question.strip():
        return ""
    return results_as_markdown(run_qa_pipeline(question))

# %%
import gradio as gr

# Markdown heading shown at the top of the page.
title = "**CONSOLIDADO NORMAS APROBADAS PARA LA PROPUESTA CONSTITUCIONAL POR EL PLENO DE LA CONVENCIÓN**"
# Example query; used only as the textbox placeholder, never submitted.
default_question = "educación gratuita"

# Page layout: heading, then a column holding the question box and the search
# button (one row each), then a row with the Markdown area for the answers.
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Column():
        with gr.Row():
            question = gr.Textbox(lines=2, max_lines=3, label="Pregunta:", placeholder=default_question)
        with gr.Row():
            btn = gr.Button("Buscar")
    with gr.Row():
        answers = gr.Markdown()
    # Clicking the button sends the textbox value to query_qa_pipeline and
    # writes the returned Markdown into the answers area.
    btn.click(
        fn=query_qa_pipeline,
        inputs=question,
        outputs=answers,
    )

# NOTE(review): share=True requests a public share link per the Gradio docs —
# confirm that exposing the demo publicly is intended.
demo.launch(share=True)

# %%