# ViquipediaQA / app.py
from transformers import pipeline
import wikipedia
wikipedia.set_lang("es")
import random
import gradio as gr
import textwrap
# Hugging Face model id; the same id is used for both the model weights and
# the tokenizer of the question-answering pipeline below.
model_name = "PlanTL-GOB-ES/roberta-base-bne-sqac"

# Shared pipeline instance, called by get_answer and recursive_rank with a
# {'question': ..., 'context': ...} dict.
nlp = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
)
def get_wiki_article(topic):
    """Fetch the Wikipedia article that best matches ``topic``.

    The top search hit for ``topic`` is loaded with ``wikipedia.page``. When
    that title resolves to a disambiguation error, one of the offered
    alternatives is chosen at random and loaded instead. Errors raised by
    that second lookup are not handled here and propagate to the caller.

    Args:
        topic: Free-text subject to search for.

    Returns:
        A ``(content, url)`` tuple taken from the resolved page.

    Raises:
        IndexError: If the search yields no hit, or a disambiguation error
            offers no usable alternative.
    """

    def pick_alternative(error):
        # Skip meta entries and the topic itself so the retry does not land
        # on the same ambiguous title again.
        # NOTE(review): these markers are English while the module selects
        # the Spanish Wikipedia via set_lang("es") -- confirm they still
        # match the option titles returned for that language.
        choices = [
            option
            for option in error.options
            if 'disambiguation' not in option
            and 'All pages' not in option
            and option != topic
        ]
        if not choices:
            raise IndexError(
                "No usable disambiguation option for topic {!r}".format(topic)
            ) from error
        return random.choice(choices)

    try:
        hits = wikipedia.search(topic, results=1)
        if not hits:
            # Same exception type as the bare [0] lookup used to raise, but
            # with a message that names the actual problem.
            raise IndexError(
                "No Wikipedia article found for topic {!r}".format(topic)
            )
        search = hits[0]
    except wikipedia.DisambiguationError as e:
        # NOTE(review): search() is not documented to raise this error; the
        # handler is kept so existing behavior is preserved -- confirm and
        # drop if it is truly unreachable.
        search = pick_alternative(e)

    try:
        page = wikipedia.page(search)
    except wikipedia.exceptions.DisambiguationError as e:
        page = wikipedia.page(pick_alternative(e))
    return page.content, page.url
def get_answer(topic, question):
    """Answer ``question`` from the Wikipedia article found for ``topic``.

    Articles shorter than 7000 characters are sent to the QA pipeline in a
    single call ("one-pass"); longer ones are delegated to
    ``recursive_rank`` ("recursive").

    Args:
        topic: Subject used to look up the article via ``get_wiki_article``.
        question: Question to ask about that article.

    Returns:
        A 5-tuple ``(answer, context, link_html, confidence, search_type)``:
        the answer string, the answer marked with ``-->``/``<--`` inside up
        to 500 characters of text on either side, an ``<a>`` tag pointing at
        the article URL, a ``{'confidence': score}`` dict, and either
        ``"one-pass"`` or ``"recursive"``.
    """
    w_art, w_url = get_wiki_article(topic)
    if len(w_art) < 7000:
        res = nlp({'question': question, 'context': w_art})
        res['type'] = "one-pass"
        # Default first: a blank answer used to leave 'text' unset, which
        # made the return statement below fail with KeyError.
        res['text'] = ''
        if res['answer'] not in ['\n', ' ', '']:
            start, end = res["start"], res["end"]
            # max() clamps the window at the beginning of the article; the
            # slice end may overshoot, which Python slicing tolerates.
            res['text'] = (
                w_art[max(0, start - 500):start]
                + " --> " + res['answer'] + " <-- "
                + w_art[end:end + 500]
            )
    else:
        res = recursive_rank(w_art, question)
        res['type'] = "recursive"
    link = '<a href="' + w_url + '">' + w_url + '</a>'
    return res['answer'], res['text'], link, {'confidence': res['score']}, res['type']
def recursive_rank(w_art, question):
    """Answer ``question`` over a long article by querying it chunk by chunk.

    The article is wrapped with ``textwrap.wrap`` into pieces of roughly
    5000 characters. The QA pipeline runs on every piece that is at least
    500 characters long, and the highest-scoring non-blank answer wins (the
    earliest chunk on ties).

    Args:
        w_art: Full article text.
        question: Question to ask about the article.

    Returns:
        The pipeline result dict of the winning chunk with an added
        ``'text'`` key: the answer marked with ``-->``/``<--`` inside up to
        500 characters of chunk text on either side. If no chunk produced an
        answer, a placeholder dict with an empty ``'answer'`` and ``'text'``
        and a ``'score'`` of ``0.0`` is returned instead.
    """
    # Clamp both values: text shorter than 2500 characters rounds to zero
    # parts (division by zero), and empty text rounds to a zero wrap width,
    # which textwrap rejects.
    parts = max(1, round(len(w_art) / 5000))
    size = max(1, round(len(w_art) / parts))

    best = None
    for chunk in textwrap.wrap(w_art, size):
        if len(chunk) < 500:
            continue
        res = nlp({'question': question, 'context': chunk})
        if res['answer'] in ['\n', ' ', '']:
            continue
        start, end = res["start"], res["end"]
        res['text'] = (
            chunk[max(0, start - 500):start]
            + " --> " + res['answer'] + " <-- "
            + chunk[end:end + 500]
        )
        # Strict comparison keeps the earliest chunk when scores tie, which
        # is what a stable descending sort followed by [0] would select.
        if best is None or res['score'] > best['score']:
            best = res

    if best is None:
        # Nothing usable was found; hand back an empty result so the caller
        # can still read 'answer', 'text' and 'score'.
        return {'score': 0.0, 'start': 0, 'end': 0, 'answer': '', 'text': ''}
    return best
# Two five-line text boxes, in the order of get_answer's parameters
# (topic, question).
inputs = [
    gr.inputs.Textbox(lines=5, label=caption)
    for caption in ("Tema", "Pregunta")
]

# One component per element of the tuple returned by get_answer: answer,
# answer context, article link, confidence, and search type.
outputs = [
    gr.outputs.Textbox(type="str", label="Resposta"),
    gr.outputs.Textbox(type="str", label="Context de la resposta"),
    gr.outputs.HTML(label="Article de Referencia"),
    gr.outputs.Label(
        type="confidences",
        label="Confiança en la resposta (assumint el context i la pàgina del tòpic son correctes)",
    ),
    gr.outputs.Textbox(type="str", label="Tipus de cerca"),
]
# Heading passed as title= to gr.Interface below.
title = "Pregunta/Respuesta en la Wikipedia en Castellano"
# Introductory text passed as description= to gr.Interface.
# NOTE(review): the text mentions articles of more than 8 thousand
# characters, while get_answer switches to the chunked search at 7000
# characters -- confirm which figure is intended.
description = """
PROVES VERSIÓ CASTELLÁ ...
Pots fer preguntes tipus: que, qui, quin, quan, etc.
Els articles de mes de 8 mil caracteres podrien trigar força per que es fa una cerca recursiva. Si la resposta no es correcta, o la pàgina trobada no és rellevant, intenta replantejar la pregunta o triar com a tema algun terme mes específic.
"""
# HTML passed as article= to gr.Interface; currently an empty <center> element.
article = """<center>
</center>
"""
# Sample [topic, question] pairs passed as examples= to gr.Interface; the
# order matches get_answer's parameters.
examples = [
['Invasión de Yugoslavia', '¿Cómo denominaron las fuerzas del Eje la invasión de Yugoslavia?'],
['Duna de Pilat', '¿Cómo son las pequeñas dunas que se encuentran junto a la Duna de Pilat?'],
['Fútbol', '¿Cuándo empezó a celebrarse la Copa Mundial de Fútbol?'],
['Batalla del Ebro', '¿Quién era jefe del Estado Mayor republicano?'],
['Batalla del Ebro', '¿Qué bando ganó la Batalla?'],
]
# Wire get_answer to the components and texts defined above, then start the
# app without a public share link and without request queueing.
gr.Interface(
    fn=get_answer,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    article=article,
    theme="default",
    flagging_options=["incorrect", "correct"],
).launch(share=False, enable_queue=False)