|
import re |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from nltk.tokenize import sent_tokenize |
|
|
|
# Unicode ranges that together form the character class matched as "emoji".
# NOTE(review): the last range runs from U+24C2 all the way to U+1F251, so it
# also spans a large number of non-emoji code points in between -- confirm
# that this breadth is intended.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # regional indicator symbols (flags)
    "\U00002702-\U000027B0",  # dingbats
    "\U000024C2-\U0001F251",  # enclosed characters and everything up to U+1F251
)

# Matches one or more consecutive characters from any of the ranges above.
emoji_pattern = re.compile(
    "[" + "".join(_EMOJI_RANGES) + "]+",
    flags=re.UNICODE,
)
|
|
|
|
|
def clean_text(x):
    """Return ``x`` reduced to ASCII letters, digits and basic punctuation.

    The steps run in this order:

    1. Drop every non-ASCII character.
    2. Replace tokens starting with ``http``/``https``, ``@`` or ``#``
       (up to the next whitespace) with a single space.
    3. Collapse runs of two or more whitespace characters into one space.
    4. Remove anything matched by the module-level ``emoji_pattern``.
    5. Replace every run of characters other than ``. , ! ?``, ASCII letters
       and digits with a single space.

    The result is not stripped, so it may start or end with a space.
    """
    text = x.encode("ascii", "ignore").decode()

    # URLs, @mentions and #hashtags are each blanked out up to the next space.
    for token_pattern in (r"https*\S+", r"@\S+", r"#\S+"):
        text = re.sub(token_pattern, " ", text)

    text = re.sub(r"\s{2,}", " ", text)
    text = emoji_pattern.sub(r"", text)

    # Whatever is left that is not a letter, digit or . , ! ? becomes a space.
    return re.sub("[^.,!?A-Za-z0-9]+", " ", text)
|
|
|
|
|
def fetch_article_text(url: str, *, timeout: float = 30.0, max_words: int = 500):
    """Download a web page and split its headline/paragraph text into chunks.

    The page at ``url`` is fetched with ``requests`` and parsed with
    BeautifulSoup; the text of every ``<h1>`` and ``<p>`` element is joined
    with single spaces. An ``<eos>`` marker is inserted after each ``.``,
    ``!`` and ``?`` and the text is split on that marker into sentences.
    Sentences are then packed greedily, in order, into chunks of at most
    ``max_words`` space-separated words. A single sentence longer than the
    limit still becomes its own chunk.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot block the caller indefinitely.
        max_words: Maximum number of space-separated words per chunk.

    Returns:
        A tuple ``(article, chunks)``. ``article`` is the joined page text
        *including* the inserted ``<eos>`` markers; ``chunks`` is a list of
        strings, each the words of one chunk joined by single spaces.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(element.text for element in soup.find_all(["h1", "p"]))

    # Mark sentence boundaries after terminal punctuation, then split on them.
    for terminator in (".", "!", "?"):
        article = article.replace(terminator, terminator + "<eos>")
    sentences = article.split("<eos>")

    # Each chunk is kept as a list of words until the final join.
    chunks = []
    for sentence in sentences:
        words = sentence.split(" ")
        if chunks and len(chunks[-1]) + len(words) <= max_words:
            chunks[-1].extend(words)
        else:
            # Either the very first sentence or the current chunk is full.
            chunks.append(words)

    return article, [" ".join(words) for words in chunks]
|
|
|
|
|
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split ``text`` into sentence-aligned chunks that fit the tokenizer.

    ``text`` is split into sentences with ``sent_tokenize``. Sentences are
    packed in order into chunks so that the summed token count of a chunk
    (as measured by ``len(tokenizer.tokenize(sentence))``) does not exceed
    ``tokenizer.max_len_single_sentence``. A sentence that is longer than
    the limit on its own is emitted as a chunk by itself rather than being
    split or dropped.

    Args:
        tokenizer: Object providing ``tokenize(str)`` (returning a sized
            sequence) and a ``max_len_single_sentence`` attribute.
        text: The text to split.

    Returns:
        A list of chunk strings; each is its sentences joined by single
        spaces with surrounding whitespace stripped. Empty when ``text``
        yields no sentences. No empty chunk is ever produced.
    """
    max_tokens = tokenizer.max_len_single_sentence

    chunks = []
    current_sentences = []  # sentences collected for the chunk being built
    current_length = 0      # token count of ``current_sentences``

    for sentence in sent_tokenize(text):
        # Tokenize once per sentence; the count is reused on overflow.
        sentence_length = len(tokenizer.tokenize(sentence))

        # Flush only when something is pending, so an oversized first
        # sentence does not produce an empty chunk.
        if current_sentences and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_length = 0

        current_sentences.append(sentence)
        current_length += sentence_length

    # Always emit what is left, including a final sentence that overflowed
    # the previous chunk and started a new one.
    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())

    return chunks
|
|
|
|
|
def read_text_from_file(file, encoding=None):
    """Read a text file and return its entire content as a string.

    Args:
        file: Path of the file to read (anything accepted by ``open``).
        encoding: Optional text encoding passed to ``open``. ``None`` (the
            default) uses the platform's default encoding.

    Returns:
        The full text content of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if ``read`` raises.
    with open(file, "r", encoding=encoding) as txt_file:
        return txt_file.read()
|
|