import requests
from bs4 import BeautifulSoup
import string

# Sentinel inserted after terminal punctuation so the article can be split
# into sentences while keeping the punctuation attached to each sentence.
# (The original code had lost this marker, leaving no-op replace() calls and
# a crashing split("") — ValueError: empty separator.)
_EOS = "<eos>"

# Greedy packing limit: maximum whitespace-separated words per chunk
# (sized for typical summarization-model input limits).
_MAX_CHUNK_WORDS = 500


def _split_sentences(article: str) -> list:
    """Split *article* into sentences on '.', '!' and '?', keeping the punctuation."""
    marked = article
    for punct in (".", "!", "?"):
        marked = marked.replace(punct, punct + _EOS)
    return marked.split(_EOS)


def _chunk_sentences(sentences, max_words: int = _MAX_CHUNK_WORDS) -> list:
    """Greedily pack *sentences* into chunks of at most *max_words* words each.

    A sentence is appended to the current chunk when it still fits; otherwise
    it starts a new chunk (a single over-long sentence becomes its own chunk).
    Returns the chunks re-joined into strings.
    """
    word_chunks = []  # list of word-lists while packing
    for sentence in sentences:
        words = sentence.split(" ")
        if word_chunks and len(word_chunks[-1]) + len(words) <= max_words:
            word_chunks[-1].extend(words)
        else:
            word_chunks.append(words)
    return [" ".join(words) for words in word_chunks]


def fetch_article_text(url: str):
    """Download *url* and extract its article text from <h1> and <p> tags.

    Returns a tuple ``(article, chunks)`` where *article* is the full extracted
    text and *chunks* is a list of strings of at most 500 words each, split on
    sentence boundaries.

    Raises ``requests.HTTPError`` on a non-2xx response and
    ``requests.Timeout`` if the server does not respond within 30 seconds.
    """
    # timeout prevents an unresponsive server from hanging the caller forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    tags = soup.find_all(["h1", "p"])
    article = " ".join(tag.text for tag in tags)
    chunks = _chunk_sentences(_split_sentences(article))
    return article, chunks


def count_tokens(text: str) -> int:
    """Approximate the token count of *text* as its space-separated word count."""
    return len(text.split(" "))