Spaces:
Running
Running
from typing import List | |
from haystack import component, Document | |
from newspaper import Article | |
import requests | |
class HackernewsFetcher():
    """Fetch the articles linked from the current Hacker News top stories.

    NOTE(review): `component` is imported at file level but is not applied to
    this class in the visible source -- confirm whether a `@component`
    decorator was intended.
    """

    # Seconds to wait on each Hacker News API call. Without a timeout,
    # `requests` waits indefinitely on a stalled connection.
    _REQUEST_TIMEOUT_SECONDS = 10

    def run(self, top_k: int) -> dict:
        """Download and parse the articles behind the first `top_k` top stories.

        The first `top_k` story ids are taken from the top-stories listing.
        Stories that carry no external `url` are dropped, so fewer than
        `top_k` documents may be returned. Articles that fail to download or
        parse are reported on stdout and skipped.

        Args:
            top_k: Number of story ids to take from the front of the listing.

        Returns:
            A dict with a single key, ``'articles'``, holding a list of
            `Document` objects. Each document's content is the article text
            and its meta has the keys ``'title'`` and ``'url'``.

        Raises:
            requests.exceptions.RequestException: If a Hacker News API call
                fails or times out.
        """
        listing = requests.get(
            url='https://hacker-news.firebaseio.com/v0/topstories.json?print=pretty',
            timeout=self._REQUEST_TIMEOUT_SECONDS,
        )
        story_ids = listing.json() or []

        urls = []
        for story_id in story_ids[0:top_k]:
            response = requests.get(
                url=f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json?print=pretty",
                timeout=self._REQUEST_TIMEOUT_SECONDS,
            )
            # The item endpoint can answer with JSON `null`; treat that as an
            # item without a URL instead of failing on `'url' in None`.
            item = response.json() or {}
            if 'url' in item:
                urls.append(item['url'])

        docs = []
        for url in urls:
            try:
                page = Article(url)
                page.download()
                page.parse()
                docs.append(Document(content=page.text, meta={'title': page.title, 'url': url}))
            except Exception:
                # Best-effort: one unreachable or unparsable article must not
                # abort the batch. `Exception` (not a bare except) lets
                # KeyboardInterrupt and SystemExit propagate.
                print(f"Couldn't download {url}, skipped")
        return {'articles': docs}