Spaces:

Lisibonny
/

buscador-periodicos-dominicanos

Sleeping

File size: 3,404 Bytes

c188624
 
cd22fcf
c188624
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c8e2bd5
 
fb4de31
3a6124d
c188624
 
 
 
266c60c
c188624
 
 
 
 
 
 
 
3a6124d
083cdef
c188624
 
 
 
 
 
 
 
 
a49cecf
c46cd83
6d45eee
73b59b1
699c8d1
 
 
 
 
c188624

import pandas as pd
import streamlit as st
from transformers import *
from carga_articulos import cargar_articulos
from preprocesamiento_articulos import limpieza_articulos
from entrenamiento_modelo import term_document_matrix, tf_idf_score
from resultados_consulta import resultados_consulta, detalles_resultados

def crear_indice():
    """Build the TF-IDF index of the articles and save it to disk.

    Loads the articles with ``cargar_articulos``, obtains the vocabulary from
    ``limpieza_articulos``, builds the term-document matrix and its TF-IDF
    scores, and writes the result to ``articulos_indexados.csv`` in the
    current working directory.

    NOTE(review): ``'ID'`` and ``'titulo'`` are presumably the id and text
    column names expected by ``term_document_matrix`` -- confirm against
    ``entrenamiento_modelo``.
    """
    articulos = cargar_articulos()
    # Cleaning runs before the matrix is built, on the same frame object.
    vocabulario = limpieza_articulos(articulos)

    matriz_td = term_document_matrix(articulos, vocabulario, 'ID', 'titulo')
    tf_idf_score(matriz_td, articulos.ID.values).to_csv('articulos_indexados.csv')

def load_qa_model():
    """Load the Spanish question-answering tokenizer and model.

    Both objects are created with ``from_pretrained`` from the same
    checkpoint name.  The model is instantiated as
    ``TFDistilBertForQuestionAnswering`` with ``from_pt=True``.

    Returns:
        tuple: ``(tokenizer, model)``.

    NOTE(review): every call goes through ``from_pretrained`` again; if the
    installed Streamlit version offers a resource cache, wrapping this
    function with it would avoid repeating the load -- confirm the version
    before adding it.
    """
    model_name = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'

    # use_fast has to be a real boolean: a non-empty string such as "false"
    # is truthy and would therefore select the fast tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = TFDistilBertForQuestionAnswering.from_pretrained(model_name, from_pt=True)
    return tokenizer, model

# 4. Use streamlit to create a web app
def main():
    """Render the news-search page and handle the user's query.

    The page shows a header and an image, loads the articles and the index
    stored in ``articulos_indexados.csv``, and reads one text input.  A query
    containing ``?`` is handled as a question by the question-answering
    model; any other non-empty query is passed to ``resultados_consulta`` and
    the matching articles are drawn as cards.
    """
    # Uncomment to regenerate 'articulos_indexados.csv' before it is read below.
    #crear_indice()

    st.set_page_config(page_title="Buscador de noticias periodicos dominicanos", page_icon="📰")
    st.header('El Repartidor Dominicano')
    st.image('repartidor_periodicos.jpeg', width=150)

    df = cargar_articulos()
    articulos_indexados = pd.read_csv('articulos_indexados.csv')
    # The first CSV column has no header, so pandas names it 'Unnamed: 0'.
    articulos_indexados = articulos_indexados.set_index('Unnamed: 0')

    query = st.text_input(
        "Escribe tus términos de búsqueda o haz una pregunta terminando con el caracter ?:"
    )

    if not query:
        return

    if '?' in query:
        st.write("Contestando a: ", query)
        _responder_pregunta(query)
        return

    st.write("Buscando: ", query)
    result = resultados_consulta(df, articulos_indexados, query)

    if result.empty:
        st.info("No se encontraron artículos para la búsqueda solicitada")
        return

    _mostrar_resultados(detalles_resultados(df, result))


def _responder_pregunta(query):
    """Run the question-answering model on *query* and display the answer text."""
    # Loaded here instead of in main() so that keyword searches and the empty
    # page do not have to load the model at all.
    tokenizer, qa_model = load_qa_model()

    # Fixed context passage the answer is extracted from.
    text = 'Manuel Romero está colaborando activamente con huggingface/transformers para traer el poder de las últimas técnicas de procesamiento de lenguaje natural al idioma español'
    inputs = tokenizer(query, text, return_tensors='tf')
    outputs = qa_model(input_ids=inputs['input_ids'])

    # Highest-scoring start and end token positions.  The tensors' own
    # .numpy() is used so that no `tf` module name has to be in scope.
    answer_start_index = int(outputs.start_logits.numpy().argmax(axis=-1)[0])
    answer_end_index = int(outputs.end_logits.numpy().argmax(axis=-1)[0])
    predict_answer_tokens = inputs['input_ids'][0, answer_start_index:answer_end_index + 1]

    # Turn the token ids back into readable text; the slice is empty when the
    # predicted end position precedes the start position.
    answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True).strip()
    if answer:
        st.info(answer)
    else:
        st.info("No se encontró una respuesta para la pregunta")


def _mostrar_resultados(df_results, cards_per_row=1):
    """Draw one card per row of *df_results*, *cards_per_row* cards per line."""
    for n_row, row in df_results.reset_index().iterrows():
        col_idx = n_row % cards_per_row
        if col_idx == 0:
            # Start a new line of cards, preceded by a separator.
            st.write("---")
            cols = st.columns(cards_per_row, gap="large")
        # draw the card
        with cols[col_idx]:
            st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()} ")
            st.markdown(f"**{row['titulo'].strip()}**")
            st.markdown(f"{row['resumen'].strip()}")
            st.markdown(f"{row['link']}")
# Run the app only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()