# Bookworm-websearch / scraper.py
import asyncio
import os
import time

import aiohttp
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# BRAVE_KEY must be set in the environment (or in a local .env file).
load_dotenv()
brave_key = os.getenv("BRAVE_KEY")

# Maximum number of characters kept from each scraped page.
MAX_SCRAPED_LEN = 1024

def fetch_urls(response):
    """Extract the result URLs from a Brave web-search API response."""
    results_dict = response.json()
    return [res['url'] for res in results_dict['web']['results']]

async def fetch_content(session, url):
    """Fetch a single page and return its visible text, or None on failure."""
    try:
        async with session.get(url) as response:
            if response.status == 200:
                return await async_remove_tags(await response.read())
    except Exception as e:
        print(f"Error fetching content from {url}: {e}")
    return None

async def fetch_all(urls):
    """Fetch all URLs concurrently and return their extracted text."""
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_content(session, url) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=True)

def fetch_context(query):
    """Search Brave for `query`, scrape the top results, and return truncated page texts."""
    url = "https://api.search.brave.com/res/v1/web/search"
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": brave_key,
    }
    params = {
        "q": query,
        "count": 4,
    }
    total_content = []
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        urls = fetch_urls(response)
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        results = loop.run_until_complete(fetch_all(urls))
        # Keep at most MAX_SCRAPED_LEN characters of each successfully scraped
        # page; gather may also return exceptions, which are skipped here.
        for content in results:
            if content and not isinstance(content, Exception):
                total_content.append(content[:MAX_SCRAPED_LEN])
    else:
        print("Failed to fetch real-time data. Status code:", response.status_code)
    return total_content
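
# Note: asyncio.get_event_loop() is deprecated outside a running event loop on
# newer Python versions; if fetch_context is never called from async code, an
# assumed-equivalent alternative is results = asyncio.run(fetch_all(urls)).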

def remove_tags(html):
    """Drop <style>/<script> blocks and return the remaining visible text."""
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script']):
        data.decompose()
    return ' '.join(soup.stripped_strings)


async def async_remove_tags(html):
    """Async wrapper around remove_tags for use inside coroutines."""
    return remove_tags(html)

def fetch_images(query):
    """Search Brave Images for `query` and return thumbnail URLs that resolve."""
    url = "https://api.search.brave.com/res/v1/images/search"
    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip",
        "X-Subscription-Token": brave_key,
    }
    titles = [" + ".join(query.split(','))]
    url_list = []
    for q in titles:
        params = {
            "q": q,
            "count": 10,
        }
        print(f"Image Query: {q}")
        tries = 3
        for _ in range(tries):
            response = requests.get(url, headers=headers, params=params)
            try:
                if response.status_code == 200:
                    results_dict = response.json()
                    thumb_urls = [res['thumbnail']['src'] for res in results_dict['results']]
                    # Keep only thumbnails that can actually be downloaded.
                    for img_url in thumb_urls:
                        try:
                            img_response = requests.get(img_url)
                            if img_response.status_code == 200:
                                url_list.append(img_url)
                        except requests.RequestException:
                            print(f"Invalid url: {img_url}")
                    break  # Got a result, exit the retry loop
                else:
                    print("Failed to fetch real-time data. Status code:", response.status_code)
            except Exception as e:
                print(f"Can't retrieve: {e}")
    return url_list

if __name__ == "__main__":
    query = "Suggest 3 books by Enid Blyton"
    start_ts = time.time()
    total_content = fetch_context(query)
    for c in total_content:
        print("=" * 100)
        print(c)
        print("=" * 100)
    end_ts = time.time()
    print(f"Time taken {end_ts - start_ts} seconds")