Spaces:
Running
Running
| import spacy | |
| from collections import Counter | |
# Cached spaCy pipeline; stays None until load_nlp() succeeds once.
_nlp = None


def load_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    After a successful load the pipeline is stored in the module-level
    ``_nlp`` variable and returned directly by every later call. A failed
    load leaves ``_nlp`` as ``None``, so the next call tries again.

    Returns:
        The object returned by ``spacy.load("en_core_web_sm")``.

    Raises:
        RuntimeError: If ``spacy.load`` raises anything. The original
            exception is attached as ``__cause__``.
    """
    global _nlp
    if _nlp is not None:
        return _nlp
    try:
        _nlp = spacy.load("en_core_web_sm")
    except Exception as exc:
        # Keep the RuntimeError contract callers rely on, but chain the
        # underlying error so the real failure shows up in the traceback.
        raise RuntimeError("spaCy model 'en_core_web_sm' not found. Run: python -m spacy download en_core_web_sm") from exc
    return _nlp
def _heading_level(tag):
    """Return the level (1-6) for an ``h1``-``h6`` tag name, else ``None``."""
    name = tag.lower()
    # Exactly two characters: 'h' plus one digit 1-6. This rejects 'hr',
    # 'header', 'html', 'head', a bare 'h', and things like 'h10'.
    if len(name) == 2 and name[0] == 'h' and name[1] in '123456':
        return int(name[1])
    return None


def heading_hierarchy_ok(headings):
    """Check that heading levels never jump down by more than one step.

    Simple heuristic: walking the headings in order, each heading may be at
    most one level deeper than the previous one (h2 -> h3 is fine, h2 -> h4
    is a skip). Moving back up by any amount is allowed, and the first
    heading may be at any level.

    Args:
        headings: Iterable of mappings, each with a ``'tag'`` key holding
            the element name (e.g. ``'h2'``). Entries whose tag is not
            ``h1``-``h6`` (case-insensitive) are ignored.

    Returns:
        ``True`` if at least one heading is present and no level is
        skipped; ``False`` if a level is skipped or there are no headings.

    Raises:
        KeyError: If an entry has no ``'tag'`` key.
    """
    levels = [
        level
        for level in (_heading_level(h['tag']) for h in headings)
        if level is not None
    ]
    if not levels:
        return False
    # Compare each heading with its predecessor.
    return all(cur - prev <= 1 for prev, cur in zip(levels, levels[1:]))
def paragraph_density(paragraphs):
    """Summarise how wordy the paragraphs are.

    Words are whitespace-separated tokens, as produced by ``str.split()``.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A dict with ``'avg_words'`` (mean words per paragraph) and
        ``'paras'`` (number of paragraphs). Both are ``0`` when there are
        no paragraphs.
    """
    total_words = 0
    para_count = 0
    for para in paragraphs:
        total_words += len(para.split())
        para_count += 1

    # Guard the division: no paragraphs means no meaningful average.
    if para_count == 0:
        return {'avg_words': 0, 'paras': 0}
    return {'avg_words': total_words / para_count, 'paras': para_count}
def extract_entities(text):
    """Run the shared spaCy pipeline over *text* and collect its entities.

    Args:
        text: The text to analyse.

    Returns:
        A dict with ``'entities'`` (one ``{'text', 'label'}`` dict per
        entry of ``doc.ents``, in document order) and ``'summary'`` (a
        plain dict mapping each label to how many times it occurred).
    """
    pipeline = load_nlp()
    parsed = pipeline(text)

    found = []
    for span in parsed.ents:
        found.append({'text': span.text, 'label': span.label_})

    # Tally labels from the collected records, in first-seen order.
    label_counts = Counter()
    for record in found:
        label_counts[record['label']] += 1

    return {'entities': found, 'summary': dict(label_counts)}
def audit_page(page, *, max_chars=20000):
    """Build the audit record for a single page.

    Args:
        page: Mapping describing the page. ``'url'`` is required;
            ``'title'`` (default ``''``), ``'headings'`` and
            ``'paragraphs'`` (both default ``[]``) are optional.
        max_chars: Maximum number of characters of joined paragraph text
            passed to entity extraction. Defaults to 20000; ``None``
            disables truncation.

    Returns:
        A dict with the keys ``'url'``, ``'title'``, ``'headings_ok'``,
        ``'density'`` and ``'entities'``.

    Raises:
        KeyError: If *page* has no ``'url'`` key.
    """
    paragraphs = page.get('paragraphs', [])
    headings_ok = heading_hierarchy_ok(page.get('headings', []))
    density = paragraph_density(paragraphs)

    # Cap the text handed to the NLP pipeline so very long pages stay cheap.
    text_blob = "\n\n".join(paragraphs)[:max_chars]

    # Joining empty paragraphs yields separator-only text such as "\n\n",
    # which is truthy; strip() avoids invoking the model when there is
    # nothing to tag.
    if text_blob.strip():
        entities = extract_entities(text_blob)
    else:
        entities = {'entities': [], 'summary': {}}

    return {
        'url': page['url'],
        'title': page.get('title', ''),
        'headings_ok': headings_ok,
        'density': density,
        'entities': entities,
    }