content_summarizer / Utils.py
KevlarVK's picture
simple code to summarize using bart-large-cnn
a4f4f24
raw
history blame
No virus
1.1 kB
import requests
from bs4 import BeautifulSoup
import string
def fetch_article_text(url: str):
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find_all(["h1", "p"])
text = [result.text for result in results]
ARTICLE = " ".join(text)
ARTICLE = ARTICLE.replace(".", ".<eos>")
ARTICLE = ARTICLE.replace("!", "!<eos>")
ARTICLE = ARTICLE.replace("?", "?<eos>")
sentences = ARTICLE.split("<eos>")
current_chunk = 0
chunks = []
for sentence in sentences:
if len(chunks) == current_chunk + 1:
if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
chunks[current_chunk].extend(sentence.split(" "))
else:
current_chunk += 1
chunks.append(sentence.split(" "))
else:
print(current_chunk)
chunks.append(sentence.split(" "))
for chunk_id in range(len(chunks)):
chunks[chunk_id] = " ".join(chunks[chunk_id])
return ARTICLE, chunks
def count_tokens(text: str):
return len(text.split(" "))