# Spaces:
# Runtime error
# Runtime error
import string

import requests
from bs4 import BeautifulSoup
def _split_into_chunks(article: str, max_words: int = 500):
    """Greedily pack the ``<eos>``-delimited sentences of *article* into chunks.

    Each sentence is split on single spaces and appended to the current
    chunk while the chunk stays at or below *max_words* pieces; otherwise a
    new chunk is started. A single sentence longer than *max_words* is kept
    whole in a chunk of its own rather than being cut.

    Returns a list of chunk strings (pieces re-joined with single spaces).
    """
    chunks = []
    for sentence in article.split("<eos>"):
        # Split on a literal space (not arbitrary whitespace) so that joining
        # the pieces back with " " reproduces the sentence text exactly.
        words = sentence.split(" ")
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            chunks[-1].extend(words)
        else:
            chunks.append(words)
    return [" ".join(words) for words in chunks]


def fetch_article_text(url: str, *, max_words: int = 500, timeout: float = 30.0):
    """Download *url* and return its text plus word-bounded chunks of it.

    The text is taken from every ``<h1>`` and ``<p>`` element of the page,
    joined with single spaces. An ``<eos>`` marker is inserted after every
    ``.``, ``!`` and ``?`` to delimit sentences.

    Args:
        url: Address of the page to fetch.
        max_words: Maximum number of space-separated pieces per chunk.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the caller forever.

    Returns:
        A ``(article, chunks)`` tuple. ``article`` is the extracted text
        *including* the inserted ``<eos>`` markers; ``chunks`` is a list of
        strings, each built from whole sentences (markers removed).

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))

    # Tag sentence boundaries; the marker is what the chunker splits on.
    for terminator in (".", "!", "?"):
        article = article.replace(terminator, terminator + "<eos>")

    return article, _split_into_chunks(article, max_words)
def count_tokens(text: str) -> int:
    """Return the number of pieces *text* yields when split on single spaces.

    The split is on a literal ``" "`` rather than on arbitrary whitespace, so
    an empty string counts as one piece and each extra consecutive space adds
    an empty piece to the count.
    """
    return len(text.split(" "))