# TSmarizer / utils.py — last change: "Update utils.py" by cagataydag (commit c8b3ee6)
import re
import requests
import docx2txt
from io import StringIO
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
# Pre-compiled pattern covering the common emoji / pictograph Unicode ranges;
# used by clean_text() below. NOTE(review): clean_text() re-encodes to ASCII
# before applying this pattern, which already strips every code point matched
# here — confirm whether this pass is still needed.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)
def clean_text(x):
    """Normalize a text string for summarization.

    Strips non-ASCII characters (which removes emojis), URLs, @-mentions,
    #-hashtags, and every character outside ``.,!?A-Za-z0-9``; each removed
    span collapses to a single space.

    Args:
        x: Input string.

    Returns:
        The cleaned string. A single leading/trailing space may remain
        where removed content sat at the edges.
    """
    x = x.encode("ascii", "ignore").decode()  # drop non-ASCII (incl. emojis)
    x = re.sub(r"https*\S+", " ", x)  # URLs
    x = re.sub(r"@\S+", " ", x)  # mentions
    x = re.sub(r"#\S+", " ", x)  # hashtags
    x = re.sub(r"\s{2,}", " ", x)  # collapse runs of whitespace
    # The emoji_pattern pass that used to run here was dead code: after the
    # ASCII re-encode above, no emoji code points (all > U+007F) can remain.
    x = re.sub("[^.,!?A-Za-z0-9]+", " ", x)  # special characters except .,!?
    return x
def fetch_article_text(url: str):
    """Download an article and split it into chunks of at most ~500 words.

    Fetches the page at *url*, concatenates the text of all <h1> and <p>
    elements, splits on sentence enders (., !, ?), and greedily packs the
    sentences into word-bounded chunks.

    Args:
        url: Address of the article to fetch.

    Returns:
        Tuple ``(article, chunks)`` where *article* is the full text with
        "<eos>" markers appended after each '.', '!' and '?', and *chunks*
        is a list of strings, each at most ~500 words.
    """
    # Timeout added so a dead host cannot hang the app indefinitely.
    r = requests.get(url, timeout=10)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    ARTICLE = " ".join(text)
    ARTICLE = ARTICLE.replace(".", ".<eos>")
    ARTICLE = ARTICLE.replace("!", "!<eos>")
    ARTICLE = ARTICLE.replace("?", "?<eos>")
    sentences = ARTICLE.split("<eos>")
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            # Chunk in progress: extend it if the sentence fits the
            # 500-word budget, otherwise start a fresh chunk.
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            # First sentence: start the initial chunk.
            # (Leftover debug print(current_chunk) removed.)
            chunks.append(sentence.split(" "))
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])
    return ARTICLE, chunks
def preprocess_text_for_abstractive_summarization(tokenizer, text):
    """Split *text* into sentence-aligned chunks that fit the model's budget.

    Greedily accumulates NLTK-tokenized sentences into a chunk until adding
    the next sentence would exceed ``tokenizer.max_len_single_sentence``,
    then flushes the chunk and starts a new one with the overflow sentence.

    Args:
        tokenizer: Object exposing ``tokenize(str) -> list`` and the
            ``max_len_single_sentence`` attribute (HF-tokenizer style).
        text: The document to split.

    Returns:
        List of non-empty chunk strings covering every sentence of *text*.

    Fixes over the original:
      * The final chunk was silently dropped when the LAST sentence
        triggered the overflow branch — it is now always flushed.
      * An empty-string chunk was appended when the very first sentence
        alone exceeded the budget — empty chunks are no longer emitted.
    """
    sentences = sent_tokenize(text)
    length = 0  # token count of the chunk under construction
    chunk = ""
    chunks = []
    for sentence in sentences:
        sentence_len = len(tokenizer.tokenize(sentence))
        if length + sentence_len <= tokenizer.max_len_single_sentence:
            # Sentence fits: keep accumulating.
            chunk += sentence + " "
            length += sentence_len
        else:
            # Chunk is full: flush it (skip empty first-sentence case),
            # then start the next chunk with the overflow sentence.
            if chunk:
                chunks.append(chunk.strip())
            chunk = sentence + " "
            length = sentence_len
    # Always flush the trailing chunk — the original lost it when the
    # final sentence overflowed.
    if chunk:
        chunks.append(chunk.strip())
    return chunks
def read_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: Path or binary file-like object accepted by PyPDF2's PdfReader.

    Returns:
        All page texts joined into one string, in page order.
    """
    reader = PdfReader(file)
    return "".join(page.extract_text() for page in reader.pages)
def read_text_from_file(file):
    """Extract the text content of an uploaded file, dispatching on MIME type.

    Plain text is decoded as UTF-8, PDFs go through ``read_pdf``, and
    .docx files through ``docx2txt``.

    Args:
        file: File-like object exposing a ``type`` attribute (MIME string)
            and, for text files, ``getvalue()`` returning bytes
            (Streamlit UploadedFile-style — TODO confirm against caller).

    Returns:
        The file's text content as a single string.

    Raises:
        ValueError: If ``file.type`` is none of the three supported kinds
            (the original fell through to an UnboundLocalError instead).
    """
    if file.type == "text/plain":
        # Decode raw bytes and wrap them so we can .read() a string.
        stringio = StringIO(file.getvalue().decode("utf-8"))
        file_content = stringio.read()
    elif file.type == "application/pdf":
        file_content = read_pdf(file)
    elif (
        file.type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        file_content = docx2txt.process(file)
    else:
        raise ValueError(f"Unsupported file type: {file.type}")
    return file_content