document-part-classification / optimized_feature_engineering.py
sandyyuan's picture
Upload optimized_feature_engineering.py with huggingface_hub
83d04a1 verified
"""
Optimized feature extractor for document classification.
Contains the 20 most effective features, including contextual patterns from neighboring lines.
"""
import numpy as np
import pandas as pd
import re
class OptimizedFeatureExtractor:
    """Extract 20 optimized features for document line classification with contextual information.

    Every line is described by text statistics of the line itself, its
    relative position in the document, and the same statistics computed on the
    previous and next lines.  Only names listed in ``selected_features`` are
    ever emitted; any selected feature that cannot be computed is set to 0.

    NOTE: the contextual pattern features (``follows_label_pattern``,
    ``precedes_input_pattern``, ``surrounded_by_form_pattern``) read basic
    features of the neighbouring lines, and basic features are only computed
    when they are themselves selected.  Selecting a pattern feature without
    its base feature (e.g. ``ends_with_colon``) therefore always yields 0.
    """

    # Compiled once; both patterns are applied to every processed line.
    _PHONE_PATTERN = re.compile(r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}')
    _COLUMN_SEPARATOR = re.compile(r'\s{2,}|\t')

    def __init__(self):
        # Keywords that suggest different document types
        self.form_keywords = [
            'name', 'date', 'address', 'phone', 'email', 'signature',
            'number', 'ssn', 'dob', 'zip', ':', '_____'
        ]
        self.table_keywords = [
            'total', 'qty', 'quantity', 'price', 'amount', 'item',
            'cost', 'subtotal', 'tax', '%', '$'
        ]
        # Selected features (in order of importance)
        self.selected_features = ['word_count', 'line_position_ratio', 'line_length', 'avg_word_length', 'column_count', 'prev_line_length', 'digit_ratio', 'next_line_length', 'uppercase_ratio', 'next_line_digit_ratio', 'next_line_word_count', 'surrounded_by_form_pattern', 'prev_line_word_count', 'prev_line_digit_ratio', 'form_keyword_count', 'special_char_count', 'next_line_form_keyword_count', 'next_line_special_char_count', 'prev_line_form_keyword_count', 'prev_line_special_char_count']

    def _extract_basic_features(self, line):
        """Extract core text features for a single line.

        Args:
            line: The line to analyse.  ``None`` and scalar missing values
                (NaN, ``pd.NA``) are treated as an empty line; any other
                non-string value is converted with ``str()``.

        Returns:
            dict mapping feature name to value, containing only the basic
            features that appear in ``self.selected_features``.
        """
        if isinstance(line, str):
            text = line
        elif line is None or (pd.api.types.is_scalar(line) and pd.isna(line)):
            text = ""
        else:
            # Falsy-but-present values such as 0 are real content, not missing data.
            text = str(line)

        selected = self.selected_features
        words = text.split()
        text_lower = text.lower()

        # Only compute features that are in our selected set
        basic_features = {}
        if 'line_length' in selected:
            basic_features['line_length'] = len(text)
        if 'word_count' in selected:
            basic_features['word_count'] = len(words)
        if 'avg_word_length' in selected:
            # Characters (whitespace included) per word; max() guards empty lines.
            basic_features['avg_word_length'] = len(text) / max(len(words), 1)
        if 'starts_with_whitespace' in selected:
            basic_features['starts_with_whitespace'] = 1 if text.startswith(' ') else 0
        if 'digit_ratio' in selected:
            basic_features['digit_ratio'] = sum(c.isdigit() for c in text) / max(len(text), 1)
        if 'uppercase_ratio' in selected:
            # Ratio is taken over alphabetic characters only.
            basic_features['uppercase_ratio'] = sum(c.isupper() for c in text) / max(sum(c.isalpha() for c in text), 1)
        if 'special_char_count' in selected:
            basic_features['special_char_count'] = sum(not c.isalnum() and not c.isspace() for c in text)
        if 'ends_with_colon' in selected:
            basic_features['ends_with_colon'] = 1 if text.strip().endswith(':') else 0
        if 'has_underscore_field' in selected:
            basic_features['has_underscore_field'] = 1 if '___' in text else 0
        if 'is_all_caps' in selected:
            basic_features['is_all_caps'] = 1 if text.isupper() and len(text.strip()) > 1 else 0
        if 'has_currency' in selected:
            basic_features['has_currency'] = 1 if '$' in text else 0
        if 'has_percentage' in selected:
            basic_features['has_percentage'] = 1 if '%' in text else 0
        if 'has_email_pattern' in selected:
            basic_features['has_email_pattern'] = 1 if '@' in text and '.' in text else 0
        if 'has_phone_pattern' in selected:
            basic_features['has_phone_pattern'] = 1 if self._PHONE_PATTERN.search(text) else 0
        if 'column_count' in selected:
            # Columns are runs of text separated by 2+ whitespace characters or a tab.
            basic_features['column_count'] = len(self._COLUMN_SEPARATOR.split(text.strip()))
        if 'form_keyword_count' in selected:
            # Substring match: counts how many distinct keywords occur in the line.
            basic_features['form_keyword_count'] = sum(1 for word in self.form_keywords if word in text_lower)
        if 'table_keyword_count' in selected:
            basic_features['table_keyword_count'] = sum(1 for word in self.table_keywords if word in text_lower)
        return basic_features

    def _assemble_features(self, current, prev_features, next_features, line_index, n_lines):
        """Combine a line's basic features with positional and neighbour context.

        Args:
            current: Basic features of the line itself (not modified).
            prev_features: Basic features of the previous line, or ``None``
                when there is no previous line.
            next_features: Basic features of the next line, or ``None`` when
                there is no next line.
            line_index: Zero-based index of the line in the document.
            n_lines: Number of lines in the document (0 when unknown).

        Returns:
            dict with exactly one entry per name in ``self.selected_features``.
        """
        selected = self.selected_features
        # Copy so cached neighbour dictionaries are never mutated.
        features = dict(current)

        # Add positional features if selected
        if 'line_position_ratio' in selected:
            features['line_position_ratio'] = line_index / max(n_lines, 1) if n_lines else 0
        if 'is_near_start' in selected:
            features['is_near_start'] = 1 if n_lines and (line_index / max(n_lines, 1)) < 0.1 else 0
        if 'is_near_end' in selected:
            features['is_near_end'] = 1 if n_lines and (line_index / max(n_lines, 1)) > 0.9 else 0

        # Add contextual features if selected and available
        if n_lines > 1:
            has_prev = prev_features is not None
            has_next = next_features is not None

            for side, neighbor in (('prev', prev_features), ('next', next_features)):
                if neighbor is None:
                    continue
                for feat_name, feat_value in neighbor.items():
                    # The selected names use both spellings: 'prev_line_length'
                    # is 'prev_' + 'line_length', while 'prev_line_word_count'
                    # is 'prev_line_' + 'word_count'.  Accept either so that
                    # every selected neighbour feature is actually populated.
                    for context_name in (f'{side}_{feat_name}', f'{side}_line_{feat_name}'):
                        if context_name in selected:
                            features[context_name] = feat_value

            # Contextual pattern features
            if 'follows_label_pattern' in selected:
                features['follows_label_pattern'] = 1 if (
                    has_prev
                    and prev_features.get('ends_with_colon', 0)
                    and features.get('line_length', 0) < 50
                ) else 0
            if 'precedes_input_pattern' in selected:
                features['precedes_input_pattern'] = 1 if (
                    has_next
                    and features.get('ends_with_colon', 0)
                    and next_features.get('has_underscore_field', 0)
                ) else 0
            if 'surrounded_by_form_pattern' in selected:
                # Requires both neighbours to exist; either one may carry the keywords.
                features['surrounded_by_form_pattern'] = 1 if (
                    has_prev and has_next
                    and (prev_features.get('form_keyword_count', 0) > 0
                         or next_features.get('form_keyword_count', 0) > 0)
                ) else 0

        # Fill missing features with 0
        for feat_name in selected:
            features.setdefault(feat_name, 0)
        return features

    def extract_features_for_line(self, line, all_lines=None, line_index=0):
        """Extract features for a line including previous/next line context.

        Args:
            line: The line to describe.
            all_lines: Optional indexable sequence of all document lines; used
                for the positional and neighbour features.  Without it those
                features are 0.
            line_index: Zero-based position of ``line`` within ``all_lines``.

        Returns:
            dict with one entry per name in ``self.selected_features``.
        """
        # Get basic features for current line
        current = self._extract_basic_features(line)

        # len() instead of truthiness so array-like inputs do not raise.
        n_lines = len(all_lines) if all_lines is not None else 0

        prev_features = None
        next_features = None
        if n_lines > 1:
            if line_index > 0:
                prev_features = self._extract_basic_features(all_lines[line_index - 1])
            if line_index < n_lines - 1:
                next_features = self._extract_basic_features(all_lines[line_index + 1])

        return self._assemble_features(current, prev_features, next_features, line_index, n_lines)

    def extract_features_for_document(self, lines):
        """Extract feature matrix for all lines in a document.

        Args:
            lines: Iterable of document lines (list, tuple, NumPy array,
                pandas Series, ...).

        Returns:
            Tuple ``(matrix, feature_names)``.  ``matrix`` has one row per
            line and one column per selected feature, with columns in
            alphabetical order as listed in ``feature_names``.  For an empty
            document the result is ``(np.array([]), [])``.
        """
        if lines is None:
            return np.array([]), []
        # Materialise once: gives positional indexing and a reliable emptiness test.
        lines = list(lines)
        if not lines:
            return np.array([]), []

        # Consistent column order, computed once for the whole document.
        feature_names = sorted(self.selected_features)

        # Each line's basic features are computed once and shared with its neighbours.
        basics = [self._extract_basic_features(line) for line in lines]
        n_lines = len(basics)
        last_index = n_lines - 1

        all_features = []
        for i, current in enumerate(basics):
            prev_features = basics[i - 1] if i > 0 else None
            next_features = basics[i + 1] if i < last_index else None
            features = self._assemble_features(current, prev_features, next_features, i, n_lines)
            all_features.append([features[key] for key in feature_names])
        return np.array(all_features), feature_names