Spaces:

Propelis
/

QC_Rules

Sleeping

App Files Files Community

QC_Rules / src / extract_text / google_document_api.py

Jakecole1

Upload 18 files

863cb78 verified 8 months ago

raw

history blame contribute delete

9.93 kB

	import os
	from typing import Optional, List, Dict, Any
	from google.api_core.client_options import ClientOptions
	from google.cloud import documentai # type: ignore
	from PIL import Image, ImageChops
	from io import BytesIO
	import fitz # PyMuPDF
	import base64

	class GoogleDocumentAPI:
	    """Wrapper around a single Google Document AI processor.

	    Sends a local file to the configured processor and offers helpers that
	    turn the returned ``documentai.Document`` into plain text, paragraph
	    records with bounding boxes, a markdown table, and per-line heights.
	    """

	    # Points -> millimeters factor used for every height computed here.
	    _PT_TO_MM = 0.3528

	    def __init__(
	        self,
	        credentials_path: str,
	        *,
	        project_id: str = "649829115993",
	        location: str = "us",
	        processor_id: str = "7f9fd758484d83fe",
	        mime_type: str = "application/pdf",
	    ):
	        """Store the processor configuration and register the credentials.

	        Args:
	            credentials_path: Path to the credentials file. It is exported
	                through the ``GOOGLE_APPLICATION_CREDENTIALS`` environment
	                variable, which affects the whole process.
	            project_id: Google Cloud project that owns the processor.
	            location: Processor location; format is "us" or "eu".
	            processor_id: Identifier of the processor to call.
	            mime_type: MIME type of the files passed to ``process_document``.
	                Refer to https://cloud.google.com/document-ai/docs/file-types
	                for supported file types.
	        """
	        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

	        self.project_id = project_id
	        self.location = location
	        self.processor_id = processor_id
	        self.mime_type = mime_type

	    def process_document(
	        self,
	        file_path: str,
	        field_mask: Optional[str] = None,
	        processor_version_id: Optional[str] = None,
	        pages: Optional[List[int]] = None,
	    ) -> documentai.Document:
	        """Send a local file to the processor and return the parsed document.

	        Args:
	            file_path: Path of the file to upload; it is read as raw bytes.
	            field_mask: Optional field mask forwarded to the request.
	            processor_version_id: When given, that processor version is
	                addressed instead of the processor itself.
	            pages: Page numbers handed to the individual page selector.
	                Defaults to ``[1]``, i.e. only the first page is processed.

	        Returns:
	            The ``document`` attribute of the service response.
	        """
	        opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
	        client = documentai.DocumentProcessorServiceClient(client_options=opts)

	        if processor_version_id:
	            name = client.processor_version_path(
	                self.project_id, self.location, self.processor_id, processor_version_id
	            )
	        else:
	            name = client.processor_path(self.project_id, self.location, self.processor_id)

	        with open(file_path, "rb") as source_file:
	            file_content = source_file.read()

	        raw_document = documentai.RawDocument(content=file_content, mime_type=self.mime_type)

	        process_options = documentai.ProcessOptions(
	            individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
	                pages=list(pages) if pages else [1]
	            )
	        )

	        request = documentai.ProcessRequest(
	            name=name,
	            raw_document=raw_document,
	            field_mask=field_mask,
	            process_options=process_options,
	        )

	        result = client.process_document(request=request)
	        return result.document

	    def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
	        """Return the text of one page of the document.

	        Args:
	            document: The processed document.
	            page_number: Index into ``document.pages``. Note that
	                ``document.pages`` is 0-indexed: if you request page 1, it
	                will be in ``document.pages[0]``.

	        Returns:
	            The slice(s) of ``document.text`` referenced by the page layout's
	            text anchor.
	        """
	        # A page carries no text of its own; its layout's text anchor points
	        # into the document-level text.
	        page = document.pages[page_number]
	        return self._get_text(page.layout.text_anchor, document.text)

	    @staticmethod
	    def _format_style(style: Any) -> str:
	        """Render one text style as a space-separated description.

	        Returns an empty string when the style has nothing worth reporting
	        (no positive font size, 'normal' weight and style, no font family).
	        """
	        parts = []
	        font_size = style.font_size
	        if font_size and font_size.size > 0:
	            unit = font_size.unit if font_size.unit else 'pt'
	            parts.append(f"font size: {round(font_size.size)}{unit}")
	        if style.font_weight and style.font_weight.lower() != 'normal':
	            parts.append(f"font weight: {style.font_weight}")
	        if style.text_style and style.text_style.lower() != 'normal':
	            parts.append(f"text style: {style.text_style}")
	        if style.font_family:
	            parts.append(f'font family: {style.font_family}')
	        return " ".join(parts)

	    @staticmethod
	    def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
	        """Helper function to extract style information for a text anchor.

	        Returns:
	            "N/A" when the document has no ``text_styles``; "default" when no
	            describable style overlaps the anchor; otherwise the unique style
	            descriptions, in first-seen order, joined with ", ".
	        """
	        if not hasattr(document, 'text_styles') or not document.text_styles:
	            return "N/A"

	        found = []
	        # A text anchor can have multiple non-contiguous segments.
	        for segment in text_anchor.text_segments:
	            seg_start = int(segment.start_index)
	            seg_end = int(segment.end_index)

	            for style in document.text_styles:
	                # Two ranges overlap when the later start precedes the
	                # earlier end.
	                overlaps = any(
	                    max(seg_start, int(s.start_index)) < min(seg_end, int(s.end_index))
	                    for s in style.text_anchor.text_segments
	                )
	                if not overlaps:
	                    continue
	                description = GoogleDocumentAPI._format_style(style)
	                if description:
	                    found.append(description)

	        if found:
	            # dict.fromkeys preserves order while dropping duplicates.
	            return ", ".join(dict.fromkeys(found))

	        return "default"

	    @staticmethod
	    def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
	        """Helper function to extract text from text_anchor.

	        Concatenates, in order, the slice of ``text`` for every segment of
	        the anchor; an anchor without segments yields an empty string.
	        """
	        return "".join(
	            text[int(segment.start_index):int(segment.end_index)]
	            for segment in text_anchor.text_segments
	        )

	    def _height_mm(self, vertices: Any, page_height: float) -> float:
	        """Convert the vertical extent of normalized vertices to millimeters.

	        Returns 0.0 when there are no vertices, otherwise the extent scaled
	        by ``page_height`` and the points-to-mm factor, rounded to 2 places.
	        """
	        y_coords = [vertex.y for vertex in vertices]
	        if not y_coords:
	            return 0.0
	        height_pt = (max(y_coords) - min(y_coords)) * page_height
	        return round(height_pt * self._PT_TO_MM, 2)

	    def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
	        """
	        Extracts text and bounding box for each paragraph in the document.

	        Args:
	            document: The processed documentai.Document object.

	        Returns:
	            A list of dictionaries, where each dictionary contains:
	            - 'page_number': The page's ``page_number`` value.
	            - 'text': The text of the paragraph, stripped of surrounding
	              whitespace.
	            - 'bounding_box': A list of normalized vertices for the bounding
	              box (empty when the paragraph has none).
	            - 'style': Style information for the text.
	            - 'height': The height of the text block in millimeters (mm),
	              or 0.0 when the paragraph has no vertices.
	        """
	        all_paragraphs = []
	        full_text = document.text

	        for page in document.pages:
	            # NOTE(review): treated as points for the mm conversion -- confirm
	            # against page.dimension.unit.
	            page_height = page.dimension.height

	            for paragraph in page.paragraphs:
	                layout = paragraph.layout
	                corners = layout.bounding_poly.normalized_vertices
	                all_paragraphs.append({
	                    "page_number": page.page_number,
	                    "text": self._get_text(layout.text_anchor, full_text).strip(),
	                    "bounding_box": [{"x": vertex.x, "y": vertex.y} for vertex in corners],
	                    "style": self._get_style_info(layout.text_anchor, document),
	                    "height": self._height_mm(corners, page_height),
	                })
	        return all_paragraphs

	    def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
	        """Return the document's paragraphs rendered as a markdown table."""
	        data = self.extract_text_with_bounding_boxes(document)
	        return self._create_markdown_table(data)

	    def _quantize_coord(self, val, grid_size=1000) -> int:
	        """Converts a float (0-1) to an integer on a grid."""
	        return int(val * grid_size)

	    def _create_markdown_table(self, data) -> str:
	        """Render paragraph records as a markdown table.

	        Args:
	            data: Records shaped like the output of
	                ``extract_text_with_bounding_boxes``.

	        Returns:
	            A header line, a separator line and one row per record, each
	            terminated by a newline. X and Y come from the first vertex of
	            the bounding box, quantized to a 0-1000 grid (0 when the box is
	            empty). Newlines in the text become spaces and pipes are escaped
	            so a cell cannot break the table.
	        """
	        rows = [
	            "| Text ID | X | Y | Text Height (mm) | Style | Text |",
	            "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|",
	        ]
	        for text_id, item in enumerate(data, start=1):
	            box = item['bounding_box']
	            # The first vertex is used as the paragraph's anchor point.
	            top_left = box[0] if box else {"x": 0.0, "y": 0.0}
	            x = self._quantize_coord(top_left['x'])
	            y = self._quantize_coord(top_left['y'])
	            height = round(item.get('height', 0), 2)
	            style = item.get('style', 'N/A')
	            text = item['text'].replace('\n', ' ').replace('|', '\\|').strip()
	            rows.append(f"| {text_id} | {x} | {y} | {height} | {style} | {text} |")
	        return "\n".join(rows) + "\n"

	    def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> list[documentai.BoundingPoly]:
	        """
	        Extracts bounding boxes for tokens on a specific page.

	        Args:
	            document: The processed document.
	            page_number: Index into ``document.pages``.

	        Returns:
	            The ``layout.bounding_poly`` of every token on that page.
	        """
	        page = document.pages[page_number]
	        return [token.layout.bounding_poly for token in page.tokens]

	    def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
	        """
	        Extracts the height of each line of text from a Google Document AI parsed document
	        and returns a list of heights in millimeters (mm).

	        Lines without bounding-box vertices are skipped. The line text is
	        assembled from every segment of the line's text anchor, so a line
	        with no segments yields an empty string.

	        Parameters:
	            document (google.cloud.documentai.Document): Parsed Document AI response object

	        Returns:
	            List of tuples: [(page_num, line_text, height_mm), ...] where
	            page_num counts the pages of ``document.pages`` from 1.
	        """
	        heights = []

	        for page_num, page in enumerate(document.pages, start=1):
	            # NOTE(review): treated as points (e.g., 792 for US Letter) --
	            # confirm against page.dimension.unit.
	            page_height_pt = page.dimension.height

	            for line in page.lines:
	                layout = line.layout
	                vertices = layout.bounding_poly.normalized_vertices
	                if not vertices:
	                    continue

	                line_text = self._get_text(layout.text_anchor, document.text).strip()
	                heights.append((page_num, line_text, self._height_mm(vertices, page_height_pt)))

	        return heights