# Source: QC_Rules/src/extract_text/google_document_api.py
# Uploaded by Jakecole1 ("Upload 18 files"), commit 863cb78 (verified).
import os
from typing import Optional, List, Dict, Any
from google.api_core.client_options import ClientOptions
from google.cloud import documentai # type: ignore
from PIL import Image, ImageChops
from io import BytesIO
import fitz # PyMuPDF
import base64
class GoogleDocumentAPI:
def __init__(self, credentials_path: str):
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
self.project_id = "649829115993"
self.location = "us" # Format is "us" or "eu"
self.processor_id = "7f9fd758484d83fe" # Only use this
self.mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
def process_document(self, file_path: str, field_mask: Optional[str] = None, processor_version_id: Optional[str] = None) -> documentai.Document:
opts = ClientOptions(api_endpoint=f"{self.location}-documentai.googleapis.com")
client = documentai.DocumentProcessorServiceClient(client_options=opts)
if processor_version_id:
name = client.processor_version_path(
self.project_id, self.location, self.processor_id, processor_version_id
)
else:
name = client.processor_path(self.project_id, self.location, self.processor_id)
with open(file_path, "rb") as image:
image_content = image.read()
raw_document = documentai.RawDocument(content=image_content, mime_type=self.mime_type)
process_options = documentai.ProcessOptions(
individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
pages=[1]
)
)
request = documentai.ProcessRequest(
name=name,
raw_document=raw_document,
field_mask=field_mask,
process_options=process_options,
)
result = client.process_document(request=request)
return result.document
def get_document_text(self, document: documentai.Document, page_number: int = 0) -> str:
# Note: document.pages is 0-indexed. If you request page 1, it will be in document.pages[0]
return document.pages[page_number].text
@staticmethod
def _get_style_info(text_anchor: documentai.Document.TextAnchor, document: documentai.Document) -> str:
"""Helper function to extract style information for a text anchor."""
if not hasattr(document, 'text_styles') or not document.text_styles:
return "N/A"
styles = []
# A text anchor can have multiple non-contiguous segments.
for para_segment in text_anchor.text_segments:
para_start = int(para_segment.start_index)
para_end = int(para_segment.end_index)
for style in document.text_styles:
for style_segment in style.text_anchor.text_segments:
style_start = int(style_segment.start_index)
style_end = int(style_segment.end_index)
# Check for overlap between the paragraph segment and the style segment
if max(para_start, style_start) < min(para_end, style_end):
style_str_parts = []
if style.font_size and style.font_size.size > 0:
unit = style.font_size.unit if style.font_size.unit else 'pt'
style_str_parts.append(f"font size: {round(style.font_size.size)}{unit}")
if style.font_weight and style.font_weight.lower() != 'normal':
style_str_parts.append(f"font weight: {style.font_weight}")
if style.text_style and style.text_style.lower() != 'normal':
style_str_parts.append(f"text style: {style.text_style}")
if style.font_family:
style_str_parts.append(f'font family: {style.font_family}')
if style_str_parts:
styles.append(" ".join(style_str_parts))
if styles:
# Using dict.fromkeys to preserve order and get unique styles
unique_styles = list(dict.fromkeys(styles))
return ", ".join(unique_styles)
return "default"
@staticmethod
def _get_text(text_anchor: documentai.Document.TextAnchor, text: str) -> str:
"""Helper function to extract text from text_anchor."""
if not text_anchor.text_segments:
return ""
return "".join(
text[int(segment.start_index) : int(segment.end_index)]
for segment in text_anchor.text_segments
)
def extract_text_with_bounding_boxes(self, document: documentai.Document) -> List[Dict[str, Any]]:
"""
Extracts text and bounding box for each paragraph in the document.
Args:
document: The processed documentai.Document object.
Returns:
A list of dictionaries, where each dictionary contains:
- 'page_number': The page number (1-based).
- 'text': The text of the paragraph.
- 'bounding_box': A list of normalized vertices for the bounding box.
- 'style': Style information for the text.
- 'height': The height of the text block in millimeters (mm).
"""
all_paragraphs = []
full_text = document.text
pt_to_mm = 0.3528 # Conversion factor from points to millimeters
for page in document.pages:
# Get page height in points for height calculation
page_pts = page.dimension.height
for paragraph in page.paragraphs:
p_text = self._get_text(paragraph.layout.text_anchor, full_text)
style_info = self._get_style_info(paragraph.layout.text_anchor, document)
# Get the normalized vertices for the bounding box
vertices = [
{"x": vertex.x, "y": vertex.y}
for vertex in paragraph.layout.bounding_poly.normalized_vertices
]
# Calculate height in millimeters
y_coords = [vertex.y for vertex in paragraph.layout.bounding_poly.normalized_vertices]
height_ratio = max(y_coords) - min(y_coords)
height_pt = height_ratio * page_pts
height_mm = height_pt * pt_to_mm
all_paragraphs.append({
"page_number": page.page_number,
"text": p_text.strip(), # Use .strip() to remove leading/trailing whitespace
"bounding_box": vertices,
"style": style_info,
"height": round(height_mm, 2)
})
return all_paragraphs
def extract_text_with_markdown_table(self, document: documentai.Document) -> str:
data = self.extract_text_with_bounding_boxes(document)
return self._create_markdown_table(data)
def _quantize_coord(self, val, grid_size=1000) -> int:
"""Converts a float (0-1) to an integer on a grid."""
return int(val * grid_size)
def _create_markdown_table(self, data) -> str:
table = "| Text ID | X | Y | Text Height (mm) | Style | Text |\\n"
table += "|----|-----|-----|--------|-------|-------------------------------------------------------------------------|\\n"
for i, item in enumerate(data):
top_left = item['bounding_box'][0]
x = self._quantize_coord(top_left['x'])
y = self._quantize_coord(top_left['y'])
height = round(item.get('height', 0), 2)
style = item.get('style', 'N/A')
text = item['text'].replace('\\n', ' ').replace('|', '\\\\|').strip()
table += f"| {i+1} | {x} | {y} | {height} | {style} | {text} |\\n"
return table
def get_bounding_boxes(self, document: documentai.Document, page_number: int = 0) -> list[documentai.BoundingPoly]:
"""
Extracts bounding boxes for tokens on a specific page.
"""
page = document.pages[page_number]
return [token.layout.bounding_poly for token in page.tokens]
def extract_text_heights_mm(self, document: documentai.Document) -> List[tuple]:
"""
Extracts the height of each line of text from a Google Document AI parsed document
and returns a list of heights in millimeters (mm).
Parameters:
document (google.cloud.documentai.Document): Parsed Document AI response object
Returns:
List of tuples: [(page_num, line_text, height_mm), ...]
"""
heights = []
pt_to_mm = 0.3528
for page_num, page in enumerate(document.pages, start=1):
page_height_pt = page.dimension.height # e.g., 792 for US Letter
for line in page.lines:
layout = line.layout
vertices = layout.bounding_poly.normalized_vertices
y_coords = [v.y for v in vertices]
if not y_coords:
continue
height_ratio = max(y_coords) - min(y_coords)
height_pt = height_ratio * page_height_pt
height_mm = height_pt * pt_to_mm
# Extract visible text (optional — may require mapping segments)
text_segment = layout.text_anchor.text_segments[0]
start = int(text_segment.start_index)
end = int(text_segment.end_index)
line_text = document.text[start:end].strip()
heights.append((page_num, line_text, round(height_mm, 2)))
return heights