# geo-platform / src/audit.py
# Uploader: 3v324v23
# Commit: 5c429d4 — initial: geo-platform full stack
import spacy
from collections import Counter
_nlp = None
def load_nlp():
global _nlp
if _nlp is None:
try:
_nlp = spacy.load("en_core_web_sm")
except Exception:
# fall back: prompt user to download model
raise RuntimeError("spaCy model 'en_core_web_sm' not found. Run: python -m spacy download en_core_web_sm")
return _nlp
def heading_hierarchy_ok(headings):
# Check for skipped heading levels (simple heuristic)
levels = [int(h['tag'][1]) for h in headings if h['tag'].startswith('h')]
if not levels:
return False
prev = levels[0]
for lv in levels[1:]:
if lv - prev > 1:
return False
prev = lv
return True
def paragraph_density(paragraphs):
# words per paragraph and average
counts = [len(p.split()) for p in paragraphs]
if not counts:
return { 'avg_words': 0, 'paras': 0 }
return { 'avg_words': sum(counts)/len(counts), 'paras': len(counts) }
def extract_entities(text):
nlp = load_nlp()
doc = nlp(text)
ents = [ { 'text': e.text, 'label': e.label_ } for e in doc.ents ]
freq = Counter([e['label'] for e in ents])
return { 'entities': ents, 'summary': dict(freq) }
def audit_page(page):
headings_ok = heading_hierarchy_ok(page.get('headings', []))
density = paragraph_density(page.get('paragraphs', []))
text_blob = "\n\n".join(page.get('paragraphs', []))[:20000]
entities = extract_entities(text_blob) if text_blob else { 'entities': [], 'summary': {} }
return {
'url': page['url'],
'title': page.get('title',''),
'headings_ok': headings_ok,
'density': density,
'entities': entities
}