Spaces:

Gladiator
/

Text-Summarizer

Runtime error

App Files Files Community

Text-Summarizer / utils.py

Gladiator

fix bug

015350f over 2 years ago

raw history blame

No virus

3.96 kB

	import re
	import requests
	import docx2txt
	from io import StringIO
	from PyPDF2 import PdfFileReader

	from bs4 import BeautifulSoup
	from nltk.tokenize import sent_tokenize

	# Compiled character class matching one or more consecutive characters from
	# the emoji code-point ranges listed below.
	# NOTE: the final range runs from U+24C2 all the way to U+1F251, so it also
	# matches every non-emoji code point that lies between those two bounds.
	emoji_pattern = re.compile(
	    "[{}]+".format(
	        "".join(
	            (
	                "\U0001F600-\U0001F64F",  # emoticons
	                "\U0001F300-\U0001F5FF",  # symbols & pictographs
	                "\U0001F680-\U0001F6FF",  # transport & map symbols
	                "\U0001F1E0-\U0001F1FF",  # flags (iOS)
	                "\U00002702-\U000027B0",
	                "\U000024C2-\U0001F251",
	            )
	        )
	    ),
	    flags=re.UNICODE,
	)


	def clean_text(x):
	    """Normalize free text before it is summarized.

	    Steps, in order:
	      1. Drop every non-ASCII character (this includes emoji, so no separate
	         emoji pass is needed afterwards).
	      2. Replace URLs, ``@mentions`` and ``#hashtags`` with a space.
	      3. Replace every run of characters other than letters, digits and
	         ``. , ! ?`` (whitespace included) with a single space.

	    Args:
	        x: The text to clean (``str``).

	    Returns:
	        The cleaned text. Leading/trailing runs of removed characters are
	        left as a single space; the result is not stripped.
	    """
	    # Non-ASCII characters are discarded outright rather than transliterated.
	    x = x.encode("ascii", "ignore").decode()

	    # Require the "://" separator so ordinary words that merely start with
	    # "http" (e.g. "httpd", "https") are not mistaken for URLs.
	    x = re.sub(r"https?://\S+", " ", x)
	    x = re.sub(r"@\S+", " ", x)  # mentions
	    x = re.sub(r"#\S+", " ", x)  # hashtags

	    # Whitespace is not in the allowed set, so this single pass both removes
	    # special characters and collapses repeated whitespace.
	    x = re.sub(r"[^.,!?A-Za-z0-9]+", " ", x)

	    return x


	def fetch_article_text(
	    url: str, timeout: float = 30.0, max_chunk_words: int = 500
	):
	    """Download a web page and split its text into word-bounded chunks.

	    The text of every ``<h1>`` and ``<p>`` element is joined with spaces,
	    an ``<eos>`` marker is inserted after each ``.``, ``!`` and ``?``, and
	    the resulting sentences are packed greedily into chunks.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up, so an
	            unresponsive host cannot block the caller indefinitely.
	        max_chunk_words: Upper bound on the number of space-separated
	            tokens per chunk. A single sentence longer than this still
	            becomes its own (oversized) chunk.

	    Returns:
	        A ``(article, chunks)`` tuple. ``article`` is the joined page text
	        *including* the inserted ``<eos>`` markers; ``chunks`` is a list of
	        strings, each holding one or more whole sentences.

	    Raises:
	        requests.exceptions.RequestException: If the request fails or times
	            out. Non-2xx responses are not treated as errors; their body
	            is parsed like any other page.
	    """
	    response = requests.get(url, timeout=timeout)
	    soup = BeautifulSoup(response.text, "html.parser")
	    article = " ".join(tag.text for tag in soup.find_all(["h1", "p"]))

	    # Tag each sentence terminator so the text can be split right after it
	    # while keeping the punctuation attached to its sentence.
	    for mark in (".", "!", "?"):
	        article = article.replace(mark, mark + "<eos>")

	    chunks = []
	    for sentence in article.split("<eos>"):
	        # Splitting on a single space keeps empty tokens, so joining the
	        # words back together reproduces the original spacing exactly.
	        words = sentence.split(" ")
	        if chunks and len(chunks[-1]) + len(words) <= max_chunk_words:
	            chunks[-1].extend(words)
	        else:
	            chunks.append(words)

	    return article, [" ".join(words) for words in chunks]


	def preprocess_text_for_abstractive_summarization(tokenizer, text):
	    """Group the sentences of ``text`` into chunks that fit the tokenizer.

	    Sentences (as produced by ``sent_tokenize``) are packed greedily: a
	    sentence is added to the current chunk while the running token count
	    stays within ``tokenizer.max_len_single_sentence``; otherwise the
	    current chunk is closed and the sentence starts a new one.

	    Args:
	        tokenizer: Object exposing ``tokenize(sentence)`` (returning a sized
	            sequence of tokens) and a ``max_len_single_sentence`` attribute.
	        text: The text to split.

	    Returns:
	        A list of chunk strings, each made of whole sentences joined by a
	        single space. Every sentence appears in exactly one chunk. A
	        sentence that alone exceeds the limit becomes its own chunk; it is
	        not truncated here. Empty input yields an empty list.
	    """
	    max_tokens = tokenizer.max_len_single_sentence

	    chunks = []
	    current_sentences = []
	    current_length = 0
	    for sentence in sent_tokenize(text):
	        sentence_length = len(tokenizer.tokenize(sentence))

	        # Close the running chunk when this sentence would overflow it. The
	        # emptiness check avoids emitting a blank chunk when the very first
	        # sentence is already longer than the limit.
	        if current_sentences and current_length + sentence_length > max_tokens:
	            chunks.append(" ".join(current_sentences).strip())
	            current_sentences = []
	            current_length = 0

	        current_sentences.append(sentence)
	        current_length += sentence_length

	    # Flush whatever is left, including a final sentence that overflowed the
	    # previous chunk and therefore started a new one.
	    if current_sentences:
	        chunks.append(" ".join(current_sentences).strip())

	    return chunks


	def read_pdf(file):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        file: Whatever ``PdfFileReader`` accepts as its source.

	    Returns:
	        One string holding the extracted text of all pages back to back,
	        with no separator inserted between pages.
	    """
	    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
	    # legacy PyPDF2 names; confirm they exist in the pinned PyPDF2 version.
	    reader = PdfFileReader(file)
	    page_texts = [
	        reader.getPage(index).extractText() for index in range(reader.numPages)
	    ]
	    return "".join(page_texts)


	def read_text_from_file(file):
	    """Extract the text content of an uploaded file based on its MIME type.

	    Supported types are plain text (decoded as UTF-8), PDF (via
	    ``read_pdf``) and DOCX (via ``docx2txt.process``).

	    Args:
	        file: Upload object exposing a ``type`` attribute with the MIME type
	            and, for plain text, a ``getvalue()`` method returning bytes.
	            Presumably a Streamlit ``UploadedFile`` -- confirm with caller.

	    Returns:
	        The file's text as a string.

	    Raises:
	        ValueError: If the MIME type is not one of the supported types.
	        UnicodeDecodeError: If a plain-text file is not valid UTF-8.
	    """
	    docx_mime = (
	        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	    )
	    mime_type = file.type

	    if mime_type == "text/plain":
	        return file.getvalue().decode("utf-8")
	    if mime_type == "application/pdf":
	        return read_pdf(file)
	    if mime_type == docx_mime:
	        return docx2txt.process(file)

	    # Fail with a clear message instead of an UnboundLocalError at return.
	    raise ValueError("Unsupported file type: {!r}".format(mime_type))