# Filtir / step42_api_fetch_google_search_evidence.py
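"""Step 42: fetch Google search results as evidence for each extracted claim.

For every claim in the queryset, this module queries Google, checks each result's
robots.txt to confirm FiltirBot may index the page, downloads the page and extracts
its text, and keeps the first `num_search_results_to_keep` usable results.
"""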
import requests
from bs4 import BeautifulSoup
from zsvision.zs_utils import BlockTimer
import json
import json5
import argparse
import multiprocessing as mp
from zsvision.zs_multiproc import starmap_with_kwargs
from datetime import datetime
import urllib.error
import urllib.parse
import urllib.request
import urllib.robotparser
from urllib.parse import urlunparse
from utils import get_google_search_results
import time
from random import randint
from fake_useragent import UserAgent
from newspaper import Article, Config


class GoogleEvidence:
    def __init__(
        self,
        model="gpt-3.5-turbo",
        limit=0,
        refresh=False,
        num_search_results_to_keep=3,
        filter_str="",
        processes=8,
    ):
        self.model = model
        self.limit = limit
        self.refresh = refresh
        self.num_search_results_to_keep = num_search_results_to_keep
        self.filter_str = filter_str
        self.processes = processes

    def can_index(self, url, user_agent_name):
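        """Return True if `url` (a parsed URL) may be fetched by `user_agent_name`.

        The check downloads the site's robots.txt; if it is missing or unreachable,
        the page is assumed to be indexable, while any other error skips the page.
        """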
        rp = urllib.robotparser.RobotFileParser()
        robots_url = f"{url.scheme}://{url.netloc}/robots.txt"
        headers = {
            "User-Agent": user_agent_name,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        try:
            req = urllib.request.Request(robots_url, headers=headers)
            with urllib.request.urlopen(req) as response:
                rp.parse(response.read().decode("utf-8").splitlines())
            ok_to_index = rp.can_fetch(user_agent_name, url.geturl())
        except urllib.error.URLError:
            # If there is no robots.txt or there is an error accessing it,
            # assume it's okay to index.
            ok_to_index = True
        except Exception as e:
            print(f"An unexpected error occurred in step42: {e}")
            # going the safe route
            ok_to_index = False
        return ok_to_index

    def fetch_search_results_to_gather_evidence(
        self,
        queryset: list,
    ):
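        """For each claim in `queryset`, fetch Google results and the page texts.

        Returns a dict with the updated documents under "documents" and the date
        the search results were fetched under "dates".
        """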
        user_agent = UserAgent()
        config = Config()
        config.fetch_images = False
        user_agent_name = "FiltirBot/1.0 (+https://filtir.com/filtirbot-info)"
        headers = {
            "User-Agent": user_agent_name,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        # Fetch a few extra results: some sites won't permit indexing, so we'll skip those.
        num_results = self.num_search_results_to_keep + 5
        print(f"Found {len(queryset)} claims to fetch search results for")
        for queryset_idx, item in enumerate(queryset):
            with BlockTimer(
                f"Fetching search results from Google {queryset_idx + 1}/{len(queryset)}"
            ):
                search_results = get_google_search_results(
                    query_str=item["claim"], num_results=num_results
                )
            if search_results == [{"Result": "No good Google Search Result was found"}]:
                item["search_results"] = []
                continue
            parsed_results = []
            for search_result in search_results:
                if not self.can_index(
                    urllib.parse.urlparse(search_result["link"]),
                    user_agent_name=user_agent_name,
                ):
                    print(
                        f"Skipping {search_result['link']} because it doesn't permit indexing"
                    )
                    continue
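                # Try newspaper's Article extractor first; fall back to a plain
                # requests.get + BeautifulSoup text dump if it fails.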
                try:
                    config.browser_user_agent = user_agent.random
                    article = Article(
                        search_result["link"], language="en", config=config
                    )
                    article.download()
                    article.parse()
                    text = article.text
                except Exception as e:
                    print(f"Error parsing article: {e}, trying with requests.get...")
                    try:
                        response = requests.get(
                            search_result["link"], timeout=15, headers=headers
                        )
                        html = response.text
                        soup = BeautifulSoup(html, features="html.parser")
                        text = soup.get_text()
                    except Exception as exception:
                        print(f"Error parsing article: {exception}, skipping")
                        continue
                search_result["text"] = text
                parsed_results.append(search_result)
                if len(parsed_results) == self.num_search_results_to_keep:
                    break
            item["search_results"] = parsed_results
        # Update the queryset with the newly fetched evidence and the fetch date.
        date_str = datetime.now().strftime("%Y-%m-%d")
        results = {"documents": queryset, "dates": {"search_results_fetched": date_str}}
        print(f"Returning web pages of search results for {len(queryset)} claims")
        return results
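

# Example usage (a sketch; assumes `utils.get_google_search_results` is configured with
# valid search-API credentials and that `claims` is a list of dicts with a "claim" key):
#
#     fetcher = GoogleEvidence(num_search_results_to_keep=3)
#     evidence = fetcher.fetch_search_results_to_gather_evidence(queryset=claims)
#     print(evidence["dates"]["search_results_fetched"])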