Gladiator's picture
fix bug
015350f
raw
history blame
3.96 kB
import re
import requests
import docx2txt
from io import StringIO
from PyPDF2 import PdfFileReader
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
# Compiled pattern matching runs of emoji characters.
#
# The previous catch-all range U+24C2..U+1F251 covered most of the Basic
# Multilingual Plane (CJK ideographs, Hangul, kana, ...), so substituting with
# this pattern also deleted ordinary non-Latin text. The ranges below are all
# subsets of that old span, restricted to blocks that actually hold emoji, so
# the pattern never removes anything the old one kept.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"  # circled latin capital letter M
    "\U000025AA-\U000025FE"  # geometric shapes used as emoji
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002934-\U00002935"  # curved arrows
    "\U00002B05-\U00002B55"  # arrows, squares, star, heavy circle
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, alternation mark, circled ideographs
    "\U0000FE0F"  # variation selector-16 (emoji presentation)
    "\U0001F004\U0001F0CF"  # mahjong red dragon, playing card joker
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplement
    "]+",
    flags=re.UNICODE,
)
def clean_text(x):
    """Normalize raw text before summarization.

    Steps, in order:
      1. Drop every non-ASCII character (this also removes emoji, so no
         separate emoji pass is needed afterwards).
      2. Remove URLs, ``@mentions`` and ``#hashtags``.
      3. Replace every run of characters other than letters, digits and
         ``. , ! ?`` (whitespace included) with a single space.

    Args:
        x: The text to clean.

    Returns:
        The cleaned ASCII string. It may start or end with a single space.
    """
    # Non-ASCII characters are discarded rather than transliterated.
    x = x.encode("ascii", "ignore").decode()
    # Require the "://" separator so ordinary words that merely start with
    # "http" (e.g. "httpd") are not mistaken for URLs.
    x = re.sub(r"https?://\S+", " ", x)
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    # Whitespace is outside the allowed set, so this single pass both strips
    # special characters and collapses repeated spaces.
    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)
    return x
def fetch_article_text(url: str, max_chunk_words: int = 500, timeout: float = 30.0):
    """Download a web page and split its headline/paragraph text into chunks.

    The text of every ``<h1>`` and ``<p>`` element is joined with spaces, an
    ``<eos>`` marker is inserted after each ``.``, ``!`` and ``?``, and the
    resulting sentences are greedily packed into chunks.

    Args:
        url: Address of the page to fetch.
        max_chunk_words: Upper bound on the number of space-separated words a
            chunk may hold before a new chunk is started. A single sentence
            longer than this still becomes one (oversized) chunk.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so an unresponsive host cannot block the caller forever.

    Returns:
        A ``(article, chunks)`` tuple. ``article`` is the joined page text
        *with* the ``<eos>`` markers still in place; ``chunks`` is a list of
        strings, each the words of one chunk re-joined with single spaces.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))

    # Mark sentence boundaries after terminal punctuation, then split on them.
    for mark in (".", "!", "?"):
        article = article.replace(mark, mark + "<eos>")

    chunks = []
    for sentence in article.split("<eos>"):
        words = sentence.split(" ")
        # Extend the open chunk while it stays within the word budget;
        # otherwise (or for the very first sentence) start a new chunk.
        if chunks and len(chunks[-1]) + len(words) <= max_chunk_words:
            chunks[-1].extend(words)
        else:
            chunks.append(words)

    return article, [" ".join(words) for words in chunks]
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Group the sentences of ``text`` into chunks that fit the tokenizer.

    Sentences (as produced by ``sent_tokenize``) are appended to the current
    chunk while the running token count stays within
    ``tokenizer.max_len_single_sentence``. A sentence that would overflow the
    budget closes the current chunk and opens a new one.

    Args:
        tokenizer: Object exposing ``tokenize(str)`` (returning a sized
            sequence of tokens) and a ``max_len_single_sentence`` attribute.
        text: The text to split.

    Returns:
        A list of chunk strings, each made of whole sentences joined by single
        spaces. A single sentence longer than the budget is emitted as its own
        chunk. Empty input yields an empty list.
    """
    token_budget = tokenizer.max_len_single_sentence
    chunks = []
    pending = []  # sentences of the chunk currently being built
    pending_tokens = 0

    for sentence in sent_tokenize(text):
        sentence_tokens = len(tokenizer.tokenize(sentence))
        # Close the open chunk when this sentence would not fit. The `pending`
        # guard avoids emitting an empty chunk when the very first sentence is
        # already over budget.
        if pending and pending_tokens + sentence_tokens > token_budget:
            chunks.append(" ".join(pending).strip())
            pending = []
            pending_tokens = 0
        pending.append(sentence)
        pending_tokens += sentence_tokens

    # Flush whatever is left. Doing this after the loop guarantees the final
    # sentence is kept even when it was the one that triggered an overflow.
    if pending:
        chunks.append(" ".join(pending).strip())
    return chunks
def read_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Whatever ``PdfFileReader`` accepts as its source argument.

    Returns:
        One string holding each page's extracted text back to back, with no
        separator inserted between pages.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_index).extractText()
        for page_index in range(reader.numPages)
    ]
    return "".join(page_texts)
def read_text_from_file(file):
    """Extract the text content of an uploaded file based on its MIME type.

    Supported values of ``file.type``:
      * ``text/plain`` - bytes from ``file.getvalue()`` decoded as UTF-8
      * ``application/pdf`` - delegated to ``read_pdf``
      * ``application/vnd.openxmlformats-officedocument.wordprocessingml.document``
        - delegated to ``docx2txt.process``

    Args:
        file: Upload object exposing a ``type`` attribute (MIME type string)
            and, for plain text, a ``getvalue()`` method returning bytes.
            NOTE(review): presumably a Streamlit ``UploadedFile`` - confirm
            against the caller.

    Returns:
        The extracted text as a string.

    Raises:
        ValueError: If ``file.type`` is not one of the supported MIME types
            (previously this surfaced as an opaque ``UnboundLocalError``).
    """
    mime_type = file.type

    if mime_type == "text/plain":
        return file.getvalue().decode("utf-8")

    if mime_type == "application/pdf":
        return read_pdf(file)

    if (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return docx2txt.process(file)

    raise ValueError(f"Unsupported file type: {mime_type!r}")