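"""Streamlit demo: run a Google Programmable Search query, fetch each result's
page, and display a DistilBART summary of it. Search results, page content,
and summaries are cached on disk so Streamlit reruns stay cheap."""
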
import json
import uuid
from os import remove
from os.path import exists

import streamlit as st
from googleapiclient.discovery import build
from slugify import slugify
from transformers import pipeline

from beautiful_soup.beautiful_soup import get_url_content
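
# Assumed third-party dependencies for this Space: streamlit,
# google-api-python-client, python-slugify, and transformers (plus a backend
# such as torch for the summarization pipeline).
#
# get_url_content comes from the Space's local beautiful_soup module; judging
# by the "Delete content cache" button in main(), it presumably caches the
# extracted page text under page-content/<url_id>.txt.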


def google_search_api_request(query):
    """Query the Google Custom Search JSON API for the top five results,
    excluding PDFs. Credentials are read from Streamlit secrets."""
    api_key = st.secrets["google_search_api_key"]
    cx = st.secrets["google_search_engine_id"]
    service = build(
        "customsearch",
        "v1",
        developerKey=api_key,
        cache_discovery=False,
    )
    # Exclude PDFs from search results.
    query = query + ' -filetype:pdf'
    return service.cse().list(
        q=query,
        cx=cx,
        num=5,
    ).execute()
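
# Only two fields of the raw CSE response are used below:
# searchInformation.totalResults (returned as a string, hence the int() cast)
# and items (the list of result objects).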


def search_results(query):
    """Return cached results for a query, or call the API and cache them.
    Raises if the query produced no results."""
    file_path = 'search-results/' + slugify(query) + '.json'
    results = []
    if exists(file_path):
        with open(file_path, 'r') as results_file:
            results = json.load(results_file)
    else:
        search_result = google_search_api_request(query)
        if int(search_result['searchInformation']['totalResults']) > 0:
            results = search_result['items']
        # Cache whatever came back, including an empty list.
        with open(file_path, 'w') as results_file:
            json.dump(results, results_file)
    if len(results) == 0:
        raise Exception('No results found.')
    return results


def content_summary(url_id, content):
    """Return a cached summary for a page, or summarize and cache it."""
    file_path = 'summaries/' + url_id + '.json'
    if exists(file_path):
        with open(file_path, 'r') as file:
            summary = json.load(file)
    else:
        summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
        summary = summarizer(content, max_length=130, min_length=30, do_sample=False, truncation=True)
        with open(file_path, 'w') as file:
            json.dump(summary, file)
    return summary
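
# content_summary() returns the raw pipeline output: a list of
# {'summary_text': ...} dicts, which main() iterates below.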


def main():
    """Search UI: one text input, then a summarized card per result."""
    st.title('Google Search')
    query = st.text_input('Search query')

    if query:
        with st.spinner('Loading search results...'):
            try:
                results = search_results(query)
            except Exception as exception:
                st.exception(exception)
                return

        number_of_results = len(results)
        st.success('Found {} results.'.format(number_of_results))

        with st.expander("Search results JSON"):
            st.json(results)

        progress_bar = st.progress(0)

        for index, result in enumerate(results):
            with st.container():
                # Deterministic per-URL id, so cache files stay stable across runs.
                url_id = uuid.uuid5(uuid.NAMESPACE_URL, result['link']).hex
                st.write(result['link'])
                st.write(url_id)

                try:
                    content = get_url_content(result['link'])
                except Exception as exception:
                    st.exception(exception)
                    progress_bar.progress((index + 1) / number_of_results)
                    continue

                summary = content_summary(url_id, content)
                for sentence in summary:
                    st.write(sentence['summary_text'])

                progress_bar.progress((index + 1) / number_of_results)

                col1, col2 = st.columns([.5, 1])
                with col1:
                    if st.button('Delete summary cache', key=url_id + 'summary'):
                        remove('summaries/' + url_id + '.json')
                with col2:
                    if st.button('Delete content cache', key=url_id + 'content'):
                        remove('page-content/' + url_id + '.txt')


if __name__ == '__main__':
    main()
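
# Local usage (assuming this file is the Space's app.py):
#   streamlit run app.py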