Spaces:
Runtime error
Runtime error
File size: 1,101 Bytes
a4f4f24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import requests
from bs4 import BeautifulSoup
import string
def _chunk_sentences(sentences, max_words):
    """Greedily pack sentences into chunks of at most ``max_words`` words.

    Words are the pieces produced by ``sentence.split(" ")``, so the empty
    strings that come from leading or repeated spaces count toward the limit.
    A single sentence longer than ``max_words`` is kept whole in its own chunk.
    """
    chunks = []
    for sentence in sentences:
        words = sentence.split(" ")  # split once; reused for the size check and the append
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            chunks[-1].extend(words)
        else:
            # First sentence, or the current chunk is full: start a new chunk.
            chunks.append(words)
    return [" ".join(chunk) for chunk in chunks]


def fetch_article_text(url: str, *, max_chunk_words: int = 500, timeout: float = 10.0):
    """Download a web page and return its text, whole and split into chunks.

    The text of every ``<h1>`` and ``<p>`` element is joined with single
    spaces. An ``<eos>`` marker is then inserted after every ``.``, ``!`` and
    ``?`` and the text is split on that marker to obtain sentences, which are
    packed in order into chunks of at most ``max_chunk_words`` words.

    Args:
        url: Address of the page to download.
        max_chunk_words: Upper bound on the number of space-separated words
            per chunk. Defaults to 500.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server blocks the caller forever.

    Returns:
        A tuple ``(article, chunks)``. ``article`` is the joined text with the
        ``<eos>`` markers still embedded; ``chunks`` is a list of strings. A
        page with no matching elements yields ``("", [""])``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out. HTTP error status codes are not checked, so the body of an
            error page is parsed like any other page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))

    # Mark sentence ends without dropping the punctuation itself.
    for mark in (".", "!", "?"):
        article = article.replace(mark, mark + "<eos>")

    chunks = _chunk_sentences(article.split("<eos>"), max_chunk_words)
    return article, chunks
def count_tokens(text: str):
    """Return how many pieces ``text`` breaks into when cut at each space.

    Only the space character separates pieces; tabs and newlines do not.
    Consecutive, leading or trailing spaces produce empty pieces that are
    counted too, so the empty string yields 1.
    """
    # Cutting at k separators always leaves k + 1 pieces.
    separators = text.count(" ")
    return separators + 1
|