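# NOTE: the six lines below are file-viewer residue from the hosting page, not program text.
# luiscgp's picture
# incrementa base de verdades
# 589bb0b
# raw
# history blame
# 11 kB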
from typing import List, Optional
import torch
import streamlit as st
import pandas as pd
import random
import time
import logging
import shutil
from json import JSONDecodeError
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.modeling.utils import initialize_device_settings
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import Pipeline
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from config import (
RETRIEVER_TOP_K,
RETRIEVER_MODEL,
NLI_MODEL,
)
class EntailmentChecker(BaseComponent):
    """
    Pipeline node that checks the entailment between every document and a statement.

    Each processed document gets ``meta["entailment_info"]``: a dict mapping the
    model's lower-cased labels to softmax probabilities. The node also returns
    aggregate entailment information computed over the documents that were
    conclusive enough to be considered.
    """

    outgoing_edges = 1

    def __init__(
        self,
        model_name_or_path: str = "roberta-large-mnli",
        model_version: Optional[str] = None,
        tokenizer: Optional[str] = None,
        use_gpu: bool = True,
        batch_size: int = 100,
        entailment_contradiction_consideration: float = 0.6,
        entailment_contradiction_threshold: float = 0.8,
    ):
        """
        Load a Natural Language Inference model from Transformers.

        :param model_name_or_path: Directory of a saved model or the name of a public model.
            See https://huggingface.co/models for the full list of available models.
        :param model_version: The version of the model to use from the HuggingFace model hub.
            Can be a tag name, branch name, or commit hash.
        :param tokenizer: Name of the tokenizer (usually the same as the model).
        :param use_gpu: Whether to use GPU (if available).
        :param batch_size: Number of documents sent through the model at a time.
            A non-positive value disables chunking.
        :param entailment_contradiction_consideration: A document contributes to the
            aggregate only if its entailment or contradiction probability is greater
            than this value.
        :param entailment_contradiction_threshold: Stop looking at further documents as
            soon as the running mean entailment or contradiction is greater than this value.
        :raises ValueError: If the model's labels do not include "entailment".
        """
        super().__init__()
        self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=False)
        tokenizer = tokenizer or model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path, revision=model_version
        )
        self.batch_size = batch_size
        self.entailment_contradiction_threshold = entailment_contradiction_threshold
        self.entailment_contradiction_consideration = entailment_contradiction_consideration
        self.model.to(str(self.devices[0]))

        # Read the labels from the model that was actually loaded, so that they always
        # match the requested ``model_version`` (a separate config load ignored it).
        id2label = self.model.config.id2label
        # Sorted by class id so that position i in a probability row maps to labels[i].
        self.labels = [id2label[k].lower() for k in sorted(id2label)]
        if "entailment" not in self.labels:
            raise ValueError("The model config must contain entailment value in the id2label dict.")

    def run(self, query: str, documents: List[Document]):
        """
        Score ``documents`` against ``query`` and aggregate the conclusive ones.

        :param query: The statement, used as the NLI hypothesis.
        :param documents: Retrieved documents; each ``content`` is used as a premise.
        :return: ``(result, "output_1")`` where ``result["documents"]`` holds the considered
            documents and ``result["aggregate_entailment_info"]`` the mean contradiction /
            neutral / entailment probabilities over them, rounded to 2 decimals. When no
            document is considered, the aggregate is all zeros and the list is empty.
        """
        no_evidence = {"contradiction": 0.0, "neutral": 0.0, "entailment": 0.0}
        if not documents:
            # Nothing to score: avoid calling the tokenizer with an empty batch.
            return {"documents": [], "aggregate_entailment_info": no_evidence}, "output_1"

        entailment_info_batch = self.get_entailment_batch(
            premise_batch=[doc.content for doc in documents],
            hypothesis_batch=[query] * len(documents),
        )

        scores, agg_con, agg_neu, agg_ent = 0, 0, 0, 0
        considered_documents = []
        for doc, entailment_info in zip(documents, entailment_info_batch):
            doc.meta["entailment_info"] = entailment_info
            con, neu, ent = (
                entailment_info["contradiction"],
                entailment_info["neutral"],
                entailment_info["entailment"],
            )
            conclusive = (con > self.entailment_contradiction_consideration) or (
                ent > self.entailment_contradiction_consideration
            )
            if not conclusive:
                continue

            considered_documents.append(doc)
            agg_con += con
            agg_neu += neu
            agg_ent += ent
            scores += 1
            # If the first documents already give strong evidence of entailment or
            # contradiction, there is no need to consider less relevant documents.
            # The check lives here because the running means only change (and
            # ``scores`` is only guaranteed non-zero) after a document is considered.
            if max(agg_con, agg_ent) / scores > self.entailment_contradiction_threshold:
                break

        if scores:
            aggregate_entailment_info = {
                "contradiction": round(agg_con / scores, 2),
                "neutral": round(agg_neu / scores, 2),
                "entailment": round(agg_ent / scores, 2),
            }
        else:
            # No document was conclusive: report zeros instead of dividing by zero.
            aggregate_entailment_info = no_evidence

        entailment_checker_result = {
            "documents": considered_documents,
            "aggregate_entailment_info": aggregate_entailment_info,
        }
        return entailment_checker_result, "output_1"

    def run_batch(self, queries: List[str], documents: List[Document]):
        """
        Score ``documents[i]`` against ``queries[i]``, one result per pair.

        :param queries: Statements used as hypotheses.
        :param documents: Documents whose ``content`` is used as premise, paired
            positionally with ``queries``.
        :return: ``(results, "output_1")`` where each result holds the single document and
            its own probabilities rounded to 2 decimals, matching the format of ``run``.
        """
        entailment_checker_result_batch = []
        # Premises must be the document texts, not the Document objects themselves.
        entailment_info_batch = self.get_entailment_batch(
            premise_batch=[doc.content for doc in documents], hypothesis_batch=queries
        )
        for doc, entailment_info in zip(documents, entailment_info_batch):
            doc.meta["entailment_info"] = entailment_info
            # With a single document per query the aggregate is the document's own
            # probabilities; no division by ``doc.score`` (which may be None or 0).
            aggregate_entailment_info = {
                "contradiction": round(entailment_info["contradiction"], 2),
                "neutral": round(entailment_info["neutral"], 2),
                "entailment": round(entailment_info["entailment"], 2),
            }
            entailment_checker_result_batch.append(
                {
                    "documents": [doc],
                    "aggregate_entailment_info": aggregate_entailment_info,
                }
            )
        return entailment_checker_result_batch, "output_1"

    def get_entailment_dict(self, probs):
        """Map one row of class probabilities to ``{label: probability}``."""
        return {k.lower(): v for k, v in zip(self.labels, probs)}

    def get_entailment_batch(self, premise_batch: List[str], hypothesis_batch: List[str]):
        """
        Run the NLI model over premise/hypothesis pairs.

        :param premise_batch: Premise texts.
        :param hypothesis_batch: Hypothesis texts, paired positionally with the premises
            (extra items on the longer side are ignored, as with ``zip``).
        :return: One ``{label: probability}`` dict per pair, in input order.
        """
        # NOTE(review): pairs are joined with a single sep_token in one string rather than
        # passed as text pairs to the tokenizer; kept as-is so that scores do not change.
        formatted_texts = [
            f"{premise}{self.tokenizer.sep_token}{hypothesis}"
            for premise, hypothesis in zip(premise_batch, hypothesis_batch)
        ]
        if not formatted_texts:
            return []

        # Honour ``batch_size`` so that a large document list is not pushed through the
        # model in one pass; a non-positive value means "everything at once".
        chunk_size = self.batch_size if self.batch_size and self.batch_size > 0 else len(formatted_texts)

        entailment_dicts = []
        with torch.inference_mode():
            for start in range(0, len(formatted_texts), chunk_size):
                inputs = self.tokenizer(
                    formatted_texts[start : start + chunk_size],
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                ).to(self.devices[0])
                logits = self.model(**inputs).logits
                probs_batch = torch.nn.functional.softmax(logits, dim=-1).detach().cpu().numpy()
                entailment_dicts.extend(self.get_entailment_dict(probs) for probs in probs_batch)
        return entailment_dicts
# Cached so that the index and the models are loaded only once, at start-up.
@st.cache_resource
def start_haystack():
    """
    Load the document store, retriever and entailment checker and create the pipeline.

    :return: A Pipeline with the nodes "retriever" (fed by "Query") and "ec"
        (the entailment checker, fed by "retriever").
    """
    # Drop a stale working copy of the SQL database before refreshing it. The target
    # must be the copy in the working directory: removing the file under ./data would
    # delete the very source of the copy below. Best effort only, since shutil.copy
    # overwrites an existing destination anyway.
    try:
        os.remove("./faiss_document_store.db")
    except OSError:
        pass
    # NOTE(review): presumably the saved index config expects the database in the
    # working directory - confirm against ./data/my_faiss_index.json.
    shutil.copy("./data/faiss_document_store.db", ".")

    document_store = FAISSDocumentStore(
        faiss_index_path="./data/my_faiss_index.faiss",
        faiss_config_path="./data/my_faiss_index.json",
    )
    print(f"Index size: {document_store.get_document_count()}")

    retriever = EmbeddingRetriever(
        document_store=document_store,
        embedding_model=RETRIEVER_MODEL,
    )
    entailment_checker = EntailmentChecker(
        model_name_or_path=NLI_MODEL,
        use_gpu=False,
    )

    pipe = Pipeline()
    pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
    pipe.add_node(component=entailment_checker, name="ec", inputs=["retriever"])
    return pipe
# Module-level pipeline used by check_statement; start_haystack is wrapped in
# st.cache_resource, so the heavy loading is not repeated on every script run.
pipe = start_haystack()
@st.cache_resource
def check_statement(statement: str, retriever_top_k: int = 5):
    """
    Run the module-level pipeline on ``statement`` and return its output.

    The retriever node is asked for ``retriever_top_k`` documents. The call is
    wrapped in ``st.cache_resource``.
    """
    return pipe.run(statement, params={"retriever": {"top_k": retriever_top_k}})
def set_state_if_absent(key, value):
    """Store ``value`` under ``key`` in the session state unless the key already exists."""
    if key in st.session_state:
        return
    st.session_state[key] = value
# Callback that resets the interface when the text of the question changes.
def reset_results(*args):
    """Set the "answer", "results" and "raw_json" session entries back to None."""
    for stale_key in ("answer", "results", "raw_json"):
        st.session_state[stale_key] = None
def create_df_for_relevant_snippets(docs):
    """
    Create a styled dataframe that contains all relevant snippets.

    :param docs: Documents exposing ``content`` and ``meta["entailment_info"]`` with the
        keys "contradiction", "neutral" and "entailment".
    :return: A pandas Styler over the columns "Content", "con", "neu" and "ent", with the
        content wrapped at 75 characters and the score columns colour-coded.
    """
    columns = ["Content", "con", "neu", "ent"]
    rows = []
    for doc in docs:
        info = doc.meta["entailment_info"]
        rows.append(
            {
                "Content": doc.content,
                "con": f"{info['contradiction']:.2f}",
                "neu": f"{info['neutral']:.2f}",
                "ent": f"{info['entailment']:.2f}",
            }
        )
    # Naming the columns explicitly keeps them present when ``docs`` is empty;
    # otherwise the "Content" lookup below raises KeyError on a column-less frame.
    df = pd.DataFrame(rows, columns=columns)
    df["Content"] = df["Content"].str.wrap(75)
    return df.style.apply(highlight_cols)
def highlight_cols(s):
    """
    Return one CSS string per element of the column ``s``.

    The "con", "neu" and "ent" columns each get a fixed background colour;
    any other column gets empty strings (no styling).
    """
    palette = {"con": "#FFA07A", "neu": "#E5E4E2", "ent": "#a9d39e"}
    colour = palette.get(s.name)
    css = "" if colour is None else "background-color: {}".format(colour)
    return [css] * len(s)
def main():
    """
    Render the page: title, statement input, "Run" button and the latest results.

    A check is triggered when the button is pressed or the typed statement differs
    from the one kept in the session state; failures are reported with ``st.error``.
    """
    # Persistent state
    for state_key, initial in (
        ("statement", ""),
        ("answer", ""),
        ("results", None),
        ("raw_json", None),
    ):
        set_state_if_absent(state_key, initial)

    st.write("# Verificação de Sentenças sobre Amazônia Azul")
    st.write()
    st.markdown("\n##### Insira uma sentença sobre a amazônia azul.\n")

    # Search bar
    typed = st.text_input(
        "", value=st.session_state.statement, max_chars=100, on_change=reset_results
    )
    st.markdown("<style>.stButton button {width:100%;}</style>", unsafe_allow_html=True)
    run_pressed = st.button("Run")
    run_query = run_pressed or typed != st.session_state.statement

    # Get results for query
    if run_query and typed:
        started_at = time.time()
        reset_results()
        st.session_state.statement = typed
        with st.spinner("&nbsp;&nbsp; Procurando a Similaridade no banco de sentenças..."):
            try:
                st.session_state.results = check_statement(typed, RETRIEVER_TOP_K)
                print(f"S: {typed}")
                finished_at = time.time()
                print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))
                print(f"elapsed time: {finished_at - started_at}")
            except JSONDecodeError:
                st.error("👓 &nbsp;&nbsp; Erro na document store.")
                return
            except Exception as err:
                logging.exception(err)
                st.error("🐞 &nbsp;&nbsp; Erro Genérico.")
                return

    # Display results
    if not st.session_state.results:
        return
    docs = st.session_state.results["documents"]
    agg_entailment_info = st.session_state.results["aggregate_entailment_info"]
    st.markdown("###### Aggregate entailment information:")
    st.write(agg_entailment_info)
    st.markdown("###### Most Relevant snippets:")
    st.dataframe(create_df_for_relevant_snippets(docs))
# Render the page: main() is called unconditionally each time this script is executed.
main()