# TSmarizer / utils.py — last change: "Update utils.py" by cagataydag (commit c8b3ee6)
import re
import requests
import docx2txt
from io import StringIO
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
# Pre-compiled pattern covering the common emoji / pictograph Unicode ranges;
# used by clean_text() below. NOTE(review): clean_text() re-encodes to ASCII
# before applying this pattern, which already strips every code point matched
# here — confirm whether this pass is still needed.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
def clean_text(x):
    """Normalize a text string for summarization.

    Strips non-ASCII characters (which removes emojis), URLs, @-mentions,
    #-hashtags, and every character outside ``.,!?A-Za-z0-9``; each removed
    span collapses to a single space.

    Args:
        x: Input string.

    Returns:
        The cleaned string. A single leading/trailing space may remain
        where removed content sat at the edges.
    """
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII (incl. emojis)
    x = re.sub(r"https*\S+", " ", x)  # URLs
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = re.sub(r"\s{2,}", " ", x)  # collapse runs of whitespace
    # The emoji_pattern pass that used to run here was dead code: after the
    # ASCII re-encode above, no emoji code points (all > U+007F) can remain.
    x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?
    return x
def fetch_article_text(url: str):
    """Download an article and split it into chunks of at most ~500 words.

    Fetches the page at *url*, concatenates the text of all <h1> and <p>
    elements, splits on sentence enders (., !, ?), and greedily packs the
    sentences into word-bounded chunks.

    Args:
        url: Address of the article to fetch.

    Returns:
        Tuple ``(article, chunks)`` where *article* is the full text with
        "<eos>" markers appended after each '.', '!' and '?', and *chunks*
        is a list of strings, each at most ~500 words.
    """
    # Timeout added so a dead host cannot hang the app indefinitely.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    ARTICLE = " ".join(text)
    ARTICLE = ARTICLE.replace(".", ".<eos>")
    ARTICLE = ARTICLE.replace("!", "!<eos>")
    ARTICLE = ARTICLE.replace("?", "?<eos>")
    sentences = ARTICLE.split("<eos>")
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            # Chunk in progress: extend it if the sentence fits the
            # 500-word budget, otherwise start a fresh chunk.
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            # First sentence: start the initial chunk.
            # (Leftover debug print(current_chunk) removed.)
            chunks.append(sentence.split(" "))
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])
    return ARTICLE, chunks
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split *text* into sentence-aligned chunks that fit the model's budget.

    Greedily accumulates NLTK-tokenized sentences into a chunk until adding
    the next sentence would exceed ``tokenizer.max_len_single_sentence``,
    then flushes the chunk and starts a new one with the overflow sentence.

    Args:
        tokenizer: Object exposing ``tokenize(str) -> list`` and the
            ``max_len_single_sentence`` attribute (HF-tokenizer style).
        text: The document to split.

    Returns:
        List of non-empty chunk strings covering every sentence of *text*.

    Fixes over the original:
      * The final chunk was silently dropped when the LAST sentence
        triggered the overflow branch — it is now always flushed.
      * An empty-string chunk was appended when the very first sentence
        alone exceeded the budget — empty chunks are no longer emitted.
    """
    sentences = sent_tokenize(text)
    length = 0  # token count of the chunk under construction
    chunk = ""
    chunks = []
    for sentence in sentences:
        sentence_len = len(tokenizer.tokenize(sentence))
        if length + sentence_len <= tokenizer.max_len_single_sentence:
            # Sentence fits: keep accumulating.
            chunk += sentence + " "
            length += sentence_len
        else:
            # Chunk is full: flush it (skip empty first-sentence case),
            # then start the next chunk with the overflow sentence.
            if chunk:
                chunks.append(chunk.strip())
            chunk = sentence + " "
            length = sentence_len
    # Always flush the trailing chunk — the original lost it when the
    # final sentence overflowed.
    if chunk:
        chunks.append(chunk.strip())
    return chunks
def read_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: Path or binary file-like object accepted by PyPDF2's PdfReader.

    Returns:
        All page texts joined into one string, in page order.
    """
    reader = PdfReader(file)
    return "".join(page.extract_text() for page in reader.pages)
def read_text_from_file(file):
    """Extract the text content of an uploaded file, dispatching on MIME type.

    Plain text is decoded as UTF-8, PDFs go through ``read_pdf``, and
    .docx files through ``docx2txt``.

    Args:
        file: File-like object exposing a ``type`` attribute (MIME string)
            and, for text files, ``getvalue()`` returning bytes
            (Streamlit UploadedFile-style — TODO confirm against caller).

    Returns:
        The file's text content as a single string.

    Raises:
        ValueError: If ``file.type`` is none of the three supported kinds
            (the original fell through to an UnboundLocalError instead).
    """
    if file.type == "text/plain":
        # Decode raw bytes and wrap them so we can .read() a string.
        stringio = StringIO(file.getvalue().decode("utf-8"))
        file_content = stringio.read()
    elif file.type == "application/pdf":
        file_content = read_pdf(file)
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_content = docx2txt.process(file)
    else:
        raise ValueError(f"Unsupported file type: {file.type}")
    return file_content