File size: 3,404 Bytes
c188624 cd22fcf c188624 c8e2bd5 fb4de31 3a6124d c188624 266c60c c188624 3a6124d 083cdef c188624 a49cecf c46cd83 6d45eee 73b59b1 699c8d1 c188624 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import pandas as pd
import streamlit as st
from transformers import *
from carga_articulos import cargar_articulos
from preprocesamiento_articulos import limpieza_articulos
from entrenamiento_modelo import term_document_matrix, tf_idf_score
from resultados_consulta import resultados_consulta, detalles_resultados
def crear_indice():
    """Rebuild the article index and save it as ``articulos_indexados.csv``.

    Loads the articles with ``cargar_articulos``, obtains the vocabulary from
    ``limpieza_articulos``, builds the term-document matrix over the ``ID``
    and ``titulo`` columns, scores it with ``tf_idf_score`` and writes the
    scored matrix to CSV in the current working directory.
    """
    articulos = cargar_articulos()
    vocabulario = limpieza_articulos(articulos)
    matriz_terminos = term_document_matrix(articulos, vocabulario, 'ID', 'titulo')
    # Score the matrix against the article IDs and persist it in one step.
    tf_idf_score(matriz_terminos, articulos.ID.values).to_csv('articulos_indexados.csv')
def load_qa_model():
    """Load the Spanish question-answering tokenizer and model.

    Both objects are created with ``from_pretrained`` from the same
    checkpoint; the model weights are converted from PyTorch
    (``from_pt=True``).

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    checkpoint = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'
    # use_fast must be the boolean False: the string "false" is non-empty and
    # therefore truthy, so it would not disable the fast tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
    model = TFDistilBertForQuestionAnswering.from_pretrained(checkpoint, from_pt=True)
    return tokenizer, model
# 4. Use streamlit to create a web app
def _responder_pregunta(query, tokenizer, qa_model):
    """Run the QA model on ``query`` and display the decoded answer span."""
    st.write("Contestando a: ", query)
    # The context passage is hard-coded: the question is answered against
    # this fixed text, not against the loaded articles.
    text = 'Manuel Romero está colaborando activamente con huggingface/transformers para traer el poder de las últimas técnicas de procesamiento de lenguaje natural al idioma español'
    inputs = tokenizer(query, text, return_tensors='tf')
    st.info(inputs)
    outputs = qa_model(input_ids=inputs['input_ids'])
    # Take the arg-max through the tensors' own numpy view so that no `tf`
    # name has to be in scope (the file never imports tensorflow explicitly).
    answer_start_index = int(outputs.start_logits.numpy().argmax(axis=-1)[0])
    answer_end_index = int(outputs.end_logits.numpy().argmax(axis=-1)[0])
    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    # Show readable text instead of the raw token ids.
    st.info(tokenizer.decode(predict_answer_tokens, skip_special_tokens=True))


def _mostrar_resultados(df_results, cards_per_row=1):
    """Render each row of ``df_results`` as a card, ``cards_per_row`` per line."""
    for n_row, row in df_results.reset_index().iterrows():
        col_idx = n_row % cards_per_row
        if col_idx == 0:
            # Start a new visual row: separator plus a fresh set of columns.
            st.write("---")
            cols = st.columns(cards_per_row, gap="large")
        # draw the card
        with cols[col_idx]:
            st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()} ")
            st.markdown(f"**{row['titulo'].strip()}**")
            st.markdown(f"{row['resumen'].strip()}")
            st.markdown(f"{row['link']}")


def main():
    """Streamlit entry point: search the indexed articles or answer a question.

    A query containing ``?`` is sent to the question-answering model; any
    other non-empty query is looked up in the article index through
    ``resultados_consulta`` and the matches are shown as cards.
    """
    # NOTE: 'articulos_indexados.csv' is produced by crear_indice(); run it
    # beforehand if the file does not exist yet.
    st.set_page_config(page_title="Buscador de noticias periodicos dominicanos", page_icon="📰")
    st.header('El Repartidor Dominicano')
    st.image('repartidor_periodicos.jpeg', width=150)

    df = cargar_articulos()
    articulos_indexados = pd.read_csv('articulos_indexados.csv')
    articulos_indexados = articulos_indexados.set_index('Unnamed: 0')
    tokenizer, qa_model = load_qa_model()

    query = st.text_input(
        "Escribe tus términos de búsqueda o haz una pregunta terminando con el caracter ?:"
    )
    if not query:
        return

    if '?' in query:
        _responder_pregunta(query, tokenizer, qa_model)
        return

    st.write("Buscando: ", query)
    result = resultados_consulta(df, articulos_indexados, query)
    if result.empty:
        st.info("No se encontraron artículos para la búsqueda solicitada")
        return

    _mostrar_resultados(detalles_resultados(df, result))
# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|