|
|
""" |
|
|
Document type validation utility.

Helps identify whether uploaded documents are actually patents.
|
|
""" |
|
|
import re |
|
|
from pathlib import Path |
|
|
from typing import Tuple, List |
|
|
from loguru import logger |
|
|
|
|
|
|
|
|
class DocumentValidator:
    """Heuristically decide whether extracted document text is a patent.

    The checks are purely textual: they look for patent vocabulary and for
    numbered claim references such as "claim 1". Nothing here parses the
    document structure, so results are a best-effort signal, not proof.
    """

    # Vocabulary that suggests patent text. Kept as-is for callers that read
    # it; singular/plural duplicates ('claim'/'claims') are collapsed when
    # counting so a single word cannot score twice.
    PATENT_KEYWORDS = [
        'patent', 'claim', 'claims', 'invention', 'abstract',
        'field of invention', 'background', 'detailed description',
        'inventor', 'assignee', 'filing date', 'application'
    ]

    # Sections reported as missing when absent. Advisory only: a missing
    # section is listed in the issues but does not by itself reject a document.
    REQUIRED_SECTIONS = ['abstract', 'claim']

    _MIN_LENGTH = 500
    _MIN_KEYWORDS = 3

    # "claim 1" / "claims 1" as whole words, so "disclaim 5" is not counted.
    _CLAIM_REFERENCE = re.compile(r'\bclaims?\s+\d+')

    # (description, combiner, terms) tried in order for text that is NOT a
    # patent. `all` means every term must occur, `any` means at least one.
    _NON_PATENT_TYPES = (
        ("Microsoft Windows documentation", all,
         ('microsoft', 'windows')),
        ("Press release or news article", any,
         ('press release', 'news', 'announcement')),
        ("Technical whitepaper or report", any,
         ('whitepaper', 'white paper', 'technical report')),
        ("Legal agreement or policy document", any,
         ('terms of service', 'privacy policy', 'license agreement')),
        ("Academic research paper", any,
         ('research paper', 'ieee', 'conference')),
    )

    @staticmethod
    def _has_term(text_lower: str, term: str) -> bool:
        """Return True if `term` (or its plural) occurs as whole words."""
        # Extracted text often breaks phrases across lines, so any run of
        # whitespace is accepted between the words of a multi-word term.
        words = r'\s+'.join(re.escape(word) for word in term.split())
        return re.search(rf'\b{words}s?\b', text_lower) is not None

    @staticmethod
    def _count_keywords(text_lower: str) -> int:
        """Count distinct patent keywords present in the lower-cased text."""
        listed = set(DocumentValidator.PATENT_KEYWORDS)
        # Drop a keyword that is merely the plural of another listed keyword;
        # `_has_term` already accepts the plural form of the singular.
        distinct = {
            kw for kw in listed
            if not (kw.endswith('s') and kw[:-1] in listed)
        }
        return sum(
            1 for kw in distinct
            if DocumentValidator._has_term(text_lower, kw)
        )

    @staticmethod
    def _assess(text: str) -> Tuple[bool, List[str]]:
        """Run every check without logging; return (is_valid, issues)."""
        text = text or ""  # None/empty input is reported, not raised
        text_lower = text.lower()
        issues = []

        if len(text) < DocumentValidator._MIN_LENGTH:
            issues.append("Document too short (< 500 characters)")

        keyword_matches = DocumentValidator._count_keywords(text_lower)
        if keyword_matches < DocumentValidator._MIN_KEYWORDS:
            issues.append(
                f"Only {keyword_matches} patent keywords found (expected at least 3)"
            )

        missing_sections = [
            section for section in DocumentValidator.REQUIRED_SECTIONS
            if not DocumentValidator._has_term(text_lower, section)
        ]
        if missing_sections:
            issues.append(f"Missing required sections: {', '.join(missing_sections)}")

        claims_found = len(DocumentValidator._CLAIM_REFERENCE.findall(text_lower))
        if claims_found == 0:
            issues.append("No numbered claims found")

        # Only the keyword count and the numbered claims are decisive; the
        # length and section findings are reported but tolerated.
        is_valid = (
            keyword_matches >= DocumentValidator._MIN_KEYWORDS
            and claims_found > 0
        )
        return is_valid, issues

    @staticmethod
    def validate_patent_document(text: str) -> Tuple[bool, List[str]]:
        """
        Validate if document text appears to be a patent

        A document is accepted when it contains at least 3 distinct patent
        keywords and at least one numbered claim reference. A short text or a
        missing "abstract" is still listed in the issues of an accepted
        document. Rejections are logged as a warning.

        Args:
            text: Extracted document text (None is treated as empty)

        Returns:
            Tuple of (is_valid, issues_found)
        """
        is_valid, issues = DocumentValidator._assess(text)

        if not is_valid:
            logger.warning(f"Document validation failed: {issues}")

        return is_valid, issues

    @staticmethod
    def identify_document_type(text: str) -> str:
        """
        Try to identify what type of document this is

        The patent check runs first, so a patent that merely mentions e.g.
        Microsoft Windows or an IEEE conference is still reported as a patent.
        Only text that fails the patent check is matched against the
        non-patent phrase lists, in their listed order. This method does not
        log.

        Args:
            text: Extracted document text (None is treated as empty)

        Returns:
            Document type description
        """
        is_patent, _ = DocumentValidator._assess(text)
        if is_patent:
            return "Patent document"

        text_lower = (text or "").lower()
        for description, combine, terms in DocumentValidator._NON_PATENT_TYPES:
            if combine(term in text_lower for term in terms):
                return description

        return "Unknown document type (not a patent)"
|
|
|
|
|
|
|
|
def validate_and_log(text: str, document_name: str = "document") -> bool:
    """
    Validate a document and report the outcome through the logger

    On success a single success message is logged. On failure the detected
    document type and the list of issues are logged as errors.

    Args:
        text: Document text
        document_name: Name of document for logging

    Returns:
        True if valid patent, False otherwise
    """
    passed, problems = DocumentValidator.validate_patent_document(text)

    if passed:
        logger.success(f"✅ {document_name} appears to be a valid patent")
        return True

    # Classify before logging so the three error lines are emitted together.
    detected = DocumentValidator.identify_document_type(text)
    logger.error(f"❌ {document_name} is NOT a valid patent")
    logger.error(f" Detected type: {detected}")
    logger.error(f" Issues: {', '.join(problems)}")
    return False
|
|
|