import pandas as pd
import streamlit as st
import tensorflow as tf
from transformers import AutoTokenizer, TFDistilBertForQuestionAnswering

from carga_articulos import cargar_articulos
from preprocesamiento_articulos import limpieza_articulos
from entrenamiento_modelo import term_document_matrix, tf_idf_score
from resultados_consulta import resultados_consulta, detalles_resultados
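
# Run the app with `streamlit run <this_file>.py` (placeholder name): Streamlit
# apps are started through the streamlit CLI rather than plain `python`.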

def crear_indice():
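    """Build the term-document matrix and TF-IDF scores for the loaded
    articles and persist the index to 'articulos_indexados.csv'."""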
    df = cargar_articulos()
    vocab = limpieza_articulos(df)

    td_matrix = term_document_matrix(df, vocab, 'ID', 'titulo')
    tf_idf_matrix = tf_idf_score(td_matrix, df.ID.values)

    tf_idf_matrix.to_csv('articulos_indexados.csv')

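# Cache the tokenizer and model across Streamlit reruns so they are loaded only
# once per session (assumes Streamlit >= 1.18; remove the decorator otherwise).
@st.cache_resource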
def load_qa_model():
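    """Load the tokenizer and the Spanish DistilBERT QA checkpoint
    ('mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'),
    converting its PyTorch weights to TensorFlow via from_pt=True."""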

    # use_fast must be a boolean; the string "false" is truthy and would
    # silently enable the fast tokenizer.
    tokenizer = AutoTokenizer.from_pretrained('mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es', use_fast=False)
    model = TFDistilBertForQuestionAnswering.from_pretrained("mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es", from_pt=True)
    return tokenizer, model

# 4. Use Streamlit to create a web app
def main():
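    """Render the Streamlit UI: extractive question answering when the query
    contains '?', otherwise a TF-IDF search over the indexed articles."""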

    # st.set_page_config must be the first Streamlit command executed, so it
    # is called here rather than further down.
    st.set_page_config(page_title="Buscador de noticias de periódicos dominicanos", page_icon="📰")

    # crear_indice()  # Uncomment to rebuild the TF-IDF index from the source articles.

    with st.expander("About the #30DaysOfStreamlit"):
        st.markdown(
            """
        The **#30DaysOfStreamlit** is a coding challenge designed to help you get started in building Streamlit apps.
        
        Particularly, you'll be able to:
        - Set up a coding environment for building Streamlit apps
        - Build your first Streamlit app
        - Learn about all the awesome input/output widgets to use for your Streamlit app
        """
        )
    
    # Sidebar
    st.sidebar.header("About")
    st.sidebar.markdown(
        "[Streamlit](https://streamlit.io) is a Python library that allows the creation of interactive, data-driven web applications in Python."
    )
    
    st.sidebar.header("Resources")
    st.sidebar.markdown(
        """
    - [Streamlit Documentation](https://docs.streamlit.io/)
    - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet)
    - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science)
    - [Blog](https://blog.streamlit.io/how-to-master-streamlit-for-data-science/) (How to master Streamlit for data science)
    """
    )
    
    st.sidebar.header("Deploy")
    st.sidebar.markdown(
        "You can quickly deploy Streamlit apps using [Streamlit Community Cloud](https://streamlit.io/cloud) in just a few clicks."
    )


    st.header('El Repartidor Dominicano')
    st.image('repartidor_periodicos.jpeg', width=150)

    df = cargar_articulos()
    # Restore the saved index from the first (unnamed) CSV column.
    articulos_indexados = pd.read_csv('articulos_indexados.csv', index_col=0)
    tokenizer, qa_model = load_qa_model()

    query = st.text_input(
        "Escribe tus términos de búsqueda o haz una pregunta terminando con el caracter ?:"
    )

    if query:

        if '?' in query:
            st.write("Contestando a: ", query)
            # Fixed demo context for the QA model; a fuller version would pass in
            # text from the retrieved articles instead.
            text = 'Un texto es una composición de signos codificados en un sistema de escritura que forma una unidad de sentido.'
            inputs = tokenizer(query, text, return_tensors='tf')
            outputs = qa_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
            # The model scores each token as a possible answer start/end; take the
            # argmax of each and decode the tokens spanning the two positions.
            answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
            answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
            predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
            answer = tokenizer.decode(predict_answer_tokens)
            st.info(answer)

        else:

            st.write("Buscando: ", query)
            result = resultados_consulta(df, articulos_indexados, query)

            if result.empty:
                st.info("No se encontraron artículos para la búsqueda solicitada")
            else:
                df_results = detalles_resultados(df, result)
                N_cards_per_row = 1
                for n_row, row in df_results.reset_index().iterrows():
                    i = n_row % N_cards_per_row
                    if i == 0:
                        # Start a new row of cards.
                        st.write("---")
                        cols = st.columns(N_cards_per_row, gap="large")
                    # Draw the result card.
                    with cols[i]:
                        st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()}")
                        st.markdown(f"**{row['titulo'].strip()}**")
                        st.markdown(f"{row['resumen'].strip()}")
                        st.markdown(f"{row['link']}")


if __name__ == "__main__":
    main()