|
import pandas as pd |
|
import numpy as np |
|
from preprocesamiento_articulos import remove_URL, remove_html_markup, eliminar_puntuacion, eliminar_stopwords, obtener_raices |
|
import streamlit as st |
|
|
|
|
|
|
|
|
|
def query_processing(query):
    """Normalise a raw search query for scoring.

    The text goes through ``eliminar_puntuacion``, is stripped of
    surrounding whitespace and lower-cased, and is then passed through
    ``eliminar_stopwords`` and ``obtener_raices`` in that order.

    Args:
        query: Raw query string as typed by the user.

    Returns:
        Whatever ``obtener_raices`` returns for the cleaned text.
    """
    sin_puntuacion = eliminar_puntuacion(query)
    normalizada = sin_puntuacion.strip().lower()
    return obtener_raices(eliminar_stopwords(normalizada))
|
|
|
|
|
|
|
|
|
|
|
|
|
def query_score(vocab_index, query):
    """Write the query's TF-IDF weights into ``vocab_index``.

    For each distinct whitespace-separated token of ``query`` that is a label
    of ``vocab_index.index``, the weight
    ``log2(1 + term_frequency) * log2(inverse_document_frequency)`` is stored
    in the ``query_tf_idf`` column. Rows for all other terms end up as 0.

    Args:
        vocab_index: DataFrame indexed by vocabulary term with an
            ``inverse_document_frequency`` column. It is modified in place.
        query: Pre-processed query string; tokens are split on whitespace.

    Returns:
        The same ``vocab_index`` object. The ``query_tf_idf`` column is only
        created when at least one query token is in the vocabulary, so callers
        can test for the column to detect a query with no known terms.
    """
    # The frame is mutated in place and may be reused between queries; clear
    # any weights left over from an earlier call so they cannot leak into
    # this query's vector.
    if "query_tf_idf" in vocab_index.columns:
        vocab_index["query_tf_idf"] = 0.0

    # Count whole tokens. ``str.count`` would count substrings, so "car"
    # would also be counted inside "carpet".
    words, counts = np.unique(query.split(), return_counts=True)
    for word, freq in zip(words, counts):
        if word in vocab_index.index:
            idf = vocab_index.loc[word, "inverse_document_frequency"]
            vocab_index.loc[word, "query_tf_idf"] = np.log2(1 + freq) * np.log2(idf)

    # Creating the column through ``.loc`` leaves NaN in every other row.
    # Assign the filled column back instead of a chained ``inplace`` call,
    # which does not update the frame under pandas Copy-on-Write.
    if "query_tf_idf" in vocab_index.columns:
        vocab_index["query_tf_idf"] = vocab_index["query_tf_idf"].fillna(0)

    return vocab_index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cosine_similarity(vocab_index, document_index, query_scores):
    """Score every document against the query by cosine similarity.

    Args:
        vocab_index: DataFrame with one row per vocabulary term. It must hold
            the query weights in the column named by ``query_scores`` and one
            weight column per document, named ``str(doc)``.
        document_index: Iterable of document identifiers.
        query_scores: Name of the column holding the query weights.

    Returns:
        A float ``pd.Series`` of similarities keyed by document identifier.
        A document (or query) whose weight vector has zero norm scores 0.0
        rather than NaN.
    """
    query_vec = vocab_index[query_scores].to_numpy(dtype=float)
    query_norm = np.sqrt(np.dot(query_vec, query_vec))

    cosine_scores = {}
    for doc in document_index:
        doc_vec = vocab_index[str(doc)].to_numpy(dtype=float)
        denominator = query_norm * np.sqrt(np.dot(doc_vec, doc_vec))
        if denominator == 0:
            # A zero vector has no direction; 0/0 would produce NaN, and a
            # single NaN score is enough to break mean-based cutoffs applied
            # to the scores afterwards.
            cosine_scores[doc] = 0.0
        else:
            cosine_scores[doc] = np.dot(doc_vec, query_vec) / denominator

    # Explicit dtype keeps the result numeric even when there are no documents.
    return pd.Series(cosine_scores, dtype=float)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def retrieve_index(data, cosine_scores, document_index, topn=10, std_factor=0.75):
    """Return the row positions of the documents that stand out by score.

    The scores are attached to ``data`` by matching the labels of
    ``cosine_scores`` against the ``document_index`` column. The ``topn``
    best rows are kept, and of those only the rows scoring strictly above
    ``mean + std_factor * std`` (population standard deviation) of the kept
    scores are returned.

    Args:
        data: DataFrame containing the ``document_index`` column. It is not
            modified; the work is done on a re-indexed copy.
        cosine_scores: Series of scores keyed by document identifier.
        document_index: Name of the identifier column in ``data``.
        topn: Maximum number of best-scoring rows considered.
        std_factor: Number of standard deviations above the mean a score
            must exceed to be returned.

    Returns:
        ``pd.Index`` of 0-based row positions within ``data``, best score
        first. NOTE(review): callers that pass these to ``.loc`` rely on
        ``data`` having a default ``RangeIndex`` - confirm.
    """
    ranked = data.set_index(document_index)
    ranked['scores'] = cosine_scores  # aligned on the document identifier
    top = ranked.reset_index().sort_values('scores', ascending=False).head(topn)

    # Rows without a usable score must not take part in the statistics:
    # a single NaN would turn the mean, and so the cutoff, into NaN and every
    # comparison below would be False.
    valid = top['scores'].dropna()
    if valid.empty:
        return top.index[:0]

    cutoff = np.average(valid) + std_factor * np.std(valid)
    return top[top['scores'] > cutoff].index
|
|
|
|
|
|
|
def resultados_consulta(df, articulos_indexados, query):
    """Run a search query and return the positions of the matching rows.

    The query is normalised with ``query_processing`` and scored against
    ``articulos_indexados`` with ``query_score``. If that produced a
    ``query_tf_idf`` column, every ``df['ID']`` is ranked by
    ``cosine_similarity`` and filtered by ``retrieve_index`` over all rows.

    Args:
        df: Articles DataFrame; must contain an ``ID`` column.
        articulos_indexados: Term-indexed weights frame handed to
            ``query_score`` and ``cosine_similarity``.
        query: Raw query string.

    Returns:
        The index returned by ``retrieve_index``, or an empty int64
        ``pd.Index`` when no ``query_tf_idf`` column is present.
    """
    terminos = query_processing(query)
    puntuado = query_score(articulos_indexados, terminos)

    # No query column means there is nothing to rank against.
    if 'query_tf_idf' not in puntuado.columns:
        return pd.Index([], dtype='int64')

    similitudes = cosine_similarity(puntuado, df['ID'].values, 'query_tf_idf')
    return retrieve_index(df, similitudes, 'ID', len(df))
|
|
|
def detalles_resultados(df, indices):
    """Build the display table for the selected result rows.

    Rows are selected from ``df`` by label. Each ``resumen`` value is passed
    through ``remove_html_markup`` and then ``remove_URL``, and is cut to its
    first 600 characters when longer than that.

    Args:
        df: Articles DataFrame containing the columns listed below.
        indices: Index labels of the rows to show.

    Returns:
        DataFrame with the columns ``titulo``, ``link``, ``fecha``,
        ``resumen``, ``seccion`` and ``feed`` for the selected rows.
    """
    columnas = ['titulo', 'link', 'fecha', 'resumen', 'seccion', 'feed']
    longitud_maxima = 600

    def limpiar(texto):
        # Markup is removed before URLs, then the text is truncated.
        texto = remove_URL(remove_html_markup(texto))
        if len(texto) > longitud_maxima:
            return texto[0:longitud_maxima]
        return texto

    seleccion = df.loc[indices]
    seleccion['resumen'] = seleccion['resumen'].apply(limpiar)
    return seleccion.loc[:, columnas]
|
|