import pandas as pd
import streamlit as st
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForQuestionAnswering

from carga_articulos import cargar_articulos
from preprocesamiento_articulos import limpieza_articulos
from entrenamiento_modelo import term_document_matrix, tf_idf_score
from resultados_consulta import resultados_consulta, detalles_resultados
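
# Streamlit app: keyword search and extractive question answering over articles
# from Dominican newspapers. crear_indice() builds a TF-IDF index offline;
# main() loads that index and a Spanish QA model to serve queries.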


def crear_indice():
    # Build the search index: a TF-IDF-weighted term-document matrix over article titles
    df = cargar_articulos()
    vocab = limpieza_articulos(df)

    td_matrix = term_document_matrix(df, vocab, 'ID', 'titulo')
    tf_idf_matrix = tf_idf_score(td_matrix, df.ID.values)

    # Persist the index so the app can load it without recomputing
    tf_idf_matrix.to_csv('articulos_indexados.csv')
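

# For reference, a minimal sketch of TF-IDF weighting. Illustrative only: it is
# never called by the app, it assumes terms as rows and documents as columns,
# and entrenamiento_modelo.tf_idf_score may normalize differently.
def _tf_idf_referencia(td_matrix):
    import numpy as np  # local import so the sketch stays self-contained
    tf_ = td_matrix.div(td_matrix.sum(axis=0), axis=1)  # term frequency per document
    df_t = (td_matrix > 0).sum(axis=1)                  # document frequency of each term
    idf = np.log(td_matrix.shape[1] / df_t)             # inverse document frequency
    return tf_.mul(idf, axis=0)                         # TF-IDF = tf * idf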


@st.cache_resource  # keep the model in memory across Streamlit reruns (recent Streamlit)
def load_qa_model():
    # Spanish DistilBERT fine-tuned on SQuAD2-es for extractive question answering;
    # from_pt=True converts the PyTorch checkpoint to TensorFlow on load
    tokenizer = AutoTokenizer.from_pretrained(
        'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es', use_fast=False
    )
    model = TFDistilBertForQuestionAnswering.from_pretrained(
        'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es', from_pt=True
    )
    return tokenizer, model
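
# Illustrative usage of the pair returned above (main() below does the real inference):
#   tokenizer, model = load_qa_model()
#   enc = tokenizer("¿Qué es un texto?", "Un texto es una unidad de sentido.", return_tensors="tf")
#   out = model(**enc)
#   # the answer is the token span [argmax(out.start_logits), argmax(out.end_logits)]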


def main():
    st.set_page_config(page_title="Buscador de noticias de periódicos dominicanos", page_icon="📰")
    st.header('El Repartidor Dominicano')
    st.image('repartidor_periodicos.jpeg', width=150)

    # Load the raw articles, the precomputed TF-IDF index, and the QA model
    df = cargar_articulos()
    articulos_indexados = pd.read_csv('articulos_indexados.csv')
    articulos_indexados = articulos_indexados.set_index('Unnamed: 0')
    tokenizer, qa_model = load_qa_model()

    query = st.text_input(
        "Escribe tus términos de búsqueda o haz una pregunta terminando con el carácter ?:"
    )

    if query:
        if '?' in query:
            st.write("Contestando a: ", query)
            # Fixed demo context: the model extracts the answer span from this text
            text = 'Un texto es una composición de signos codificados en un sistema de escritura que forma una unidad de sentido.'
            inputs = tokenizer(query, text, return_tensors='tf')

            # Pass input_ids and attention_mask together; without labels there is no loss to read
            outputs = qa_model(**inputs)
            # The answer is the token span between the most probable start and end positions
            answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
            answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
            predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
            answer = tokenizer.decode(predict_answer_tokens)

            st.info(answer)

        else:
            st.write("Buscando: ", query)
            result = resultados_consulta(df, articulos_indexados, query)

            if result.empty:
                st.info("No se encontraron artículos para la búsqueda solicitada")
            else:
                df_results = detalles_resultados(df, result)
                N_cards_per_row = 1
                # Render each match as a card: source, section, date, title, summary, link
                for n_row, row in df_results.reset_index().iterrows():
                    i = n_row % N_cards_per_row
                    if i == 0:
                        st.write("---")
                        cols = st.columns(N_cards_per_row, gap="large")
                    with cols[i]:
                        st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()}")
                        st.markdown(f"**{row['titulo'].strip()}**")
                        st.markdown(f"{row['resumen'].strip()}")
                        st.markdown(f"{row['link']}")
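
# Note: crear_indice() must have been run at least once so that
# articulos_indexados.csv exists when main() reads it.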


if __name__ == "__main__":
    main()