Spaces:

Davidsamuel101
/

PPTGenerator

Runtime error

App Files Files Community

PPTGenerator / src /text_extractor.py

Davidsamuel101

Tidy Up Code

9f2dd14 over 1 year ago

raw

history blame

5.94 kB

	from operator import itemgetter
	from collections import OrderedDict
	from typing import Dict, List, Iterator, Union, Tuple

	import re

	class TextExtractor:
	    """Turn the text spans of a PDF into tagged strings and slide sections.

	    The static methods are designed to be chained:
	    ``get_font_info`` -> ``get_font_tags`` -> ``assign_tags`` -> ``get_slides``.
	    """

	    def __init__(self) -> None:
	        pass

	    @staticmethod
	    def get_font_info(doc: Iterator, granularity=False) -> Tuple[List[Tuple[str, int]], Dict[str, Dict]]:
	        """
	        Collect the font styles used in the document and count how often each occurs.

	        Only spans that belong to blocks of type 0 and whose text is not blank are counted.

	        Args:
	            doc (<class 'fitz.fitz.Document'>): A fitz type document of the pdf file.
	            granularity (bool, optional): When True a style is identified by
	                "size_flags_font" and its 'color' is recorded as well; when False
	                it is identified by the size alone. Defaults to False.

	        Raises:
	            ValueError: Raises Value Error if there are no font detected

	        Returns:
	            Tuple[List[Tuple[str, int]], Dict[str, Dict]]: ``(font_counts, styles)``.
	            ``font_counts`` is a list of ``(identifier, count)`` pairs sorted by
	            count, most frequent first. ``styles`` maps each identifier to the
	            attributes of the last span seen with that identifier.
	        """
	        styles = {}
	        font_counts = {}

	        for page in doc:
	            for block in page.get_text('dict')['blocks']:
	                if block['type'] != 0:
	                    continue
	                for line in block['lines']:
	                    for span in line['spans']:
	                        if not span['text'].strip():
	                            continue
	                        if granularity:
	                            identifier = "{0}_{1}_{2}".format(span['size'], span['flags'], span['font'])
	                            styles[identifier] = {
	                                'size': span['size'],
	                                'flags': span['flags'],
	                                'font': span['font'],
	                                'color': span['color'],
	                            }
	                        else:
	                            identifier = "{0}".format(span['size'])
	                            styles[identifier] = {'size': span['size'], 'font': span['font']}
	                        font_counts[identifier] = font_counts.get(identifier, 0) + 1

	        if not font_counts:
	            raise ValueError("Zero discriminating fonts found!")

	        # sorted() is stable, so styles with equal counts keep first-seen order.
	        return sorted(font_counts.items(), key=itemgetter(1), reverse=True), styles

	    @staticmethod
	    def get_font_tags(font_counts, styles):
	        """
	        Map every font size to an element tag relative to the most common size.

	        The size of the most frequent style becomes "<p>". Larger sizes become
	        "<hN>" and smaller sizes "<sN>", where N is the 1-based rank of the size
	        among all distinct sizes sorted from largest to smallest.

	        Args:
	            font_counts (list): ``(identifier, count)`` pairs, most frequent first,
	                as returned by ``get_font_info``.
	            styles (dict): Style attributes per identifier; each must have a 'size'.

	        Returns:
	            dict: Font size -> tag string.
	        """
	        p_size = styles[font_counts[0][0]]['size']
	        # Read the size from the style record, not from the identifier: with
	        # granularity the identifier is "size_flags_font" and is not a number.
	        font_sizes = sorted(
	            {float(styles[identifier]['size']) for identifier, _ in font_counts},
	            reverse=True,
	        )
	        size_tag = {p_size: "<p>"}
	        for rank, size in enumerate(font_sizes, start=1):
	            if size > p_size:
	                size_tag[size] = f"<h{rank}>"
	            elif size < p_size:
	                size_tag[size] = f"<s{rank}>"
	        return size_tag

	    @staticmethod
	    def assign_tags(doc, size_tag):
	        """
	        Scrapes headers & paragraphs from PDF and return texts with element tags.

	        Consecutive spans of the same size inside a block are joined with a
	        space; a "|" is appended after every line that produced text. A change
	        of font size flushes the text collected so far and starts a new string.
	        Spans containing nothing but punctuation/whitespace are ignored.

	        Args:
	            doc (<class 'fitz.fitz.Document'>): PDF document to iterate through.
	            size_tag (dict): Textual element tags for each size.
	        Returns:
	            list: Texts with pre-pended element tags. Entries may be empty
	            strings (e.g. for a block without usable text).
	        """
	        texts = []
	        previous_size = None  # size of the last span that carried text
	        for page in doc:
	            for block in page.get_text("dict")["blocks"]:
	                if block['type'] != 0:
	                    continue
	                block_string = ""
	                for line in block["lines"]:
	                    for span in line["spans"]:
	                        if not re.sub(r"[^\w\s]", '', span["text"]).strip():
	                            continue
	                        tagged = size_tag[span['size']] + span['text']
	                        if previous_size is None:  # first span of the document
	                            block_string = tagged
	                        elif span['size'] == previous_size:
	                            if not block_string:  # new block
	                                block_string = tagged
	                            else:  # in the same block, so concatenate strings
	                                block_string += f" {span['text']}"
	                        else:
	                            # Size changed: flush what was collected and start over.
	                            texts.append(block_string)
	                            block_string = tagged
	                        previous_size = span['size']
	                    if block_string:
	                        block_string += "|"  # marks the end of a line
	                texts.append(block_string)
	        return texts

	    @staticmethod
	    def get_slides(texts):
	        """
	        Group tagged texts into slides, one slide per "<h1>" header.

	        Each slide is a list that starts with ``('h1', title)``. Further headers
	        are added as ``(tag, text)`` tuples and paragraphs as ``[tag, text]``
	        lists. A "<p>" text is split into paragraphs on runs of two or more
	        pipes; a paragraph that starts with a lowercase letter is merged into
	        the preceding paragraph entry when there is one. Texts without a tag and
	        texts with other tags (e.g. "<s3>") are skipped. Anything that appears
	        before the first "<h1>" is not part of any slide.

	        Args:
	            texts (list): Tagged strings as produced by ``assign_tags``.

	        Returns:
	            dict: "Page N" -> list of section entries, N counting from 1.
	        """
	        slides = {}
	        section = []
	        page = 1

	        for text in texts:
	            tag_match = re.search(r'(?<=<)(.*?)(?=>)', text)
	            if tag_match is None:
	                continue
	            tag = tag_match.group()

	            if tag.startswith('h'):
	                if tag == 'h1':
	                    # A title opens a new slide; later entries are appended to
	                    # this same list object, so they show up in `slides` too.
	                    section = []
	                    slides[f"Page {page}"] = section
	                    page += 1
	                # Remove tag and pipes from the text
	                section.append((tag, re.sub(r'<.*?>|\|', '', text).strip()))
	            elif tag.startswith('p'):
	                # Two or more pipes in a row separate paragraphs.
	                for paragraph in re.split(r'\|{2,}', text):
	                    paragraph = re.sub(r'<.*?>|\|', '', paragraph).strip()
	                    paragraph = re.sub(' +', ' ', paragraph)
	                    if not paragraph:
	                        continue
	                    last = section[-1] if section else None
	                    # Only paragraph entries are lists; header tuples are
	                    # immutable and must not be extended.
	                    if paragraph[0].islower() and isinstance(last, list):
	                        last[1] += f" {paragraph}"
	                    else:
	                        section.append([tag, paragraph])
	        return slides