File size: 5,093 Bytes
c188624
 
cd22fcf
c188624
 
 
 
3285b2f
c188624
 
 
 
 
 
 
 
 
 
 
c8e2bd5
 
fb4de31
3a6124d
c188624
 
 
 
a17b65d
 
8ca54ac
 
 
 
 
 
a17b65d
 
f80ab29
a17b65d
 
 
 
c90738f
a17b65d
 
 
 
 
 
 
 
 
f80ab29
a17b65d
 
 
 
faccbfc
c90738f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c188624
 
 
 
3a6124d
083cdef
c188624
 
 
 
 
 
 
 
 
aa06dc2
c46cd83
bc5855e
699c8d1
 
 
4b58234
 
c188624
 
 
 
 
 
 
 
 
 
0bb7e36
c188624
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import pandas as pd
import streamlit as st
from transformers import *
from carga_articulos import cargar_articulos
from preprocesamiento_articulos import limpieza_articulos
from entrenamiento_modelo import term_document_matrix, tf_idf_score
from resultados_consulta import resultados_consulta, detalles_resultados
import tensorflow as tf

def crear_indice():
    """Rebuild the article index and save it to 'articulos_indexados.csv'.

    Loads the articles, derives a vocabulary from them, builds a
    term-document matrix over the 'titulo' column keyed by 'ID', scores it
    with ``tf_idf_score`` and writes the scored matrix to disk as CSV.
    """
    articles = cargar_articulos()
    vocabulary = limpieza_articulos(articles)

    # Term-document matrix first, then the scored version of it.
    term_doc = term_document_matrix(articles, vocabulary, 'ID', 'titulo')
    scored = tf_idf_score(term_doc, articles.ID.values)

    scored.to_csv('articulos_indexados.csv')

def load_qa_model():
    """Load the tokenizer and question-answering model used by the app.

    Both objects are created with ``from_pretrained`` from the same model
    identifier. The model is instantiated as
    ``TFDistilBertForQuestionAnswering`` with ``from_pt=True``.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    model_name = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'

    # use_fast must be a real boolean: a non-empty string such as "false" is
    # truthy, so it would select the fast tokenizer instead of disabling it.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = TFDistilBertForQuestionAnswering.from_pretrained(model_name, from_pt=True)
    return tokenizer, model

# 4. Use streamlit to create a web app
def main():
    """Render the Streamlit page and handle one search or question.

    The page shows a header, an image and a sidebar, then a text input.
    A query that ends with '?' is answered by the question-answering model;
    any other non-empty query is looked up in the article index and the
    matching articles are rendered as cards.
    """
    # Call crear_indice() here when 'articulos_indexados.csv' has to be
    # regenerated; it is not run on normal page loads.

    st.set_page_config(page_title="Buscador de noticias periodicos dominicanos", page_icon="📰")
    st.header('El Repartidor Dominicano')
    st.image('repartidor_periodicos.jpeg', width=150)

    _render_sidebar()

    df = cargar_articulos()
    articulos_indexados = pd.read_csv('articulos_indexados.csv')
    articulos_indexados = articulos_indexados.set_index('Unnamed: 0')

    query = st.text_input(
        "Escribe tus términos de búsqueda o haz una pregunta terminando con el caracter ?:"
    )

    if not query:
        return

    # The prompt asks for questions that END with '?', so test the ending
    # (ignoring surrounding whitespace) rather than any occurrence of '?'.
    if query.strip().endswith('?'):
        _answer_question(query)
        return

    st.write("Buscando: ", query)
    result = resultados_consulta(df, articulos_indexados, query)

    if result.empty:
        st.info("No se encontraron artículos para la búsqueda solicitada")
        return

    _render_results(detalles_resultados(df, result))


def _render_sidebar():
    """Draw the sidebar sections and the PayPal donate button."""
    st.sidebar.header("Acerca de")
    st.sidebar.markdown(
        "[Streamlit](https://streamlit.io) is a Python library that allows the creation of interactive, data-driven web applications in Python."
    )

    st.sidebar.header("Artículos Indexados")
    st.sidebar.markdown(
        """
    - [Streamlit Documentation](https://docs.streamlit.io/)
    - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet)
    - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science)
    - [Blog](https://blog.streamlit.io/how-to-master-streamlit-for-data-science/) (How to master Streamlit for data science)
    """
    )

    st.sidebar.header("Disclaimer")
    st.sidebar.markdown(
        "You can quickly deploy Streamlit apps using [Streamlit Community Cloud](https://streamlit.io/cloud) in just a few clicks."
    )

    st.sidebar.header("¿Te gustó mi sitio? ¡Cómprame una café!")
    # NOTE(review): this markup relies on <script> tags. Confirm that st.html
    # executes JavaScript in the deployed Streamlit version; if it does not,
    # the donate button will not render and an iframe-based component is needed.
    with st.sidebar:
        st.html(
                """
                <div id="donate-button-container">
                <div id="donate-button"></div>
                <script src="https://www.paypalobjects.com/donate/sdk/donate-sdk.js" charset="UTF-8"></script>
                <script>
                PayPal.Donation.Button({
                env:'production',
                hosted_button_id:'VK5ZAB52ZYDNA',
                image: {
                src:'https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif',
                alt:'Donate with PayPal button',
                title:'PayPal - The safer, easier way to pay online!',
                }
                }).render('#donate-button');
                </script>
                </div>
                """
        )


def _answer_question(query):
    """Run the question-answering model on ``query`` and display the answer."""
    st.write("Contestando a: ", query)

    # NOTE: the context passage is a fixed literal, not text taken from the
    # loaded articles, so answers can only come from this sentence.
    text = 'Un texto es una composición de signos codificados en un sistema de escritura que forma una unidad de sentido.'

    # Load the model only when a question is actually asked, and keep it in
    # Streamlit's resource cache so script reruns do not reload it.
    tokenizer, qa_model = st.cache_resource(load_qa_model)()

    inputs = tokenizer(query, text, return_tensors='tf')
    outputs = qa_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

    # Most likely start and end token positions of the answer span.
    answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
    answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

    predict_answer_tokens = inputs['input_ids'][0, answer_start_index:answer_end_index + 1]
    # Drop special tokens so a "no span" prediction does not surface as a
    # bare marker token; an end index before the start yields an empty span.
    answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True).strip()

    if answer:
        st.info(answer)
    else:
        st.info("No se encontró una respuesta para la pregunta.")


def _render_results(df_results, cards_per_row=1):
    """Draw one card per row of ``df_results``, ``cards_per_row`` per line."""
    cols = []
    for n_row, row in df_results.reset_index().iterrows():
        position = n_row % cards_per_row
        if position == 0:
            # Start a new visual row: separator plus a fresh set of columns.
            st.write("---")
            cols = st.columns(cards_per_row, gap="large")
        with cols[position]:
            st.caption(f"{row['feed'].strip()} - {row['seccion'].strip()} - {row['fecha'].strip()} ")
            st.markdown(f"**{row['titulo'].strip()}**")
            st.markdown(f"{row['resumen'].strip()}")
            st.markdown(f"{row['link']}")


# Script entry point: build the page only when this file is run as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()