| | from transformers import pipeline |
| | import spacy |
| |
|
class ClinicalNERProcessor:
    """Clinical text analysis: clinical NER, anatomy NER, and POS tagging.

    Wraps a HuggingFace clinical NER pipeline, an optional HuggingFace
    anatomy NER pipeline, and an optional spaCy POS tagger.  Results are
    available either as Python dicts/lists or rendered as Prolog facts.
    """

    def __init__(self, use_pos=True, use_anatomy=True):
        """Load the underlying models.

        Args:
            use_pos: if True, load the spaCy ``en_core_web_sm`` model
                for POS tagging (``self.nlp`` stays ``None`` otherwise
                or on load failure).
            use_anatomy: if True, load the anatomy NER model
                (``self.anatomy_pipeline`` stays ``None`` otherwise or
                on load failure).

        The clinical NER model is mandatory; failures loading it
        propagate.  Failures loading the optional models are reported
        as printed warnings and leave the attribute as ``None``.
        """
        # Mandatory clinical NER model.
        self.ner_pipeline = pipeline(
            "ner",
            model="samrawal/bert-base-uncased_clinical-ner",
            aggregation_strategy="simple"
        )

        # Optional anatomy NER model (best-effort).
        self.anatomy_pipeline = None
        if use_anatomy:
            try:
                self.anatomy_pipeline = pipeline(
                    "ner",
                    model="OpenMed/OpenMed-NER-AnatomyDetect-BioPatient-108M",
                    aggregation_strategy="simple"
                )
            except Exception as e:
                print(f"Warning: Could not load anatomy model: {e}")

        # Optional spaCy POS tagger (best-effort).
        self.nlp = None
        if use_pos:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                print("Warning: spaCy model 'en_core_web_sm' not found.")
                print("Install it with: python -m spacy download en_core_web_sm")

    def _merge_subwords(self, entities):
        """Merge WordPiece ``##`` continuation tokens into their head entity.

        Consecutive entities whose ``word`` starts with ``##`` and whose
        ``entity_group`` matches the head entity are folded into it: the
        ``##`` prefix is stripped and the text concatenated, and ``end``
        is extended to the last merged subword.  A ``##`` token whose
        group differs from the current head breaks the run and is
        emitted unchanged as its own entity.  The merged entity keeps
        the head token's ``start`` and ``score``.  Input dicts are not
        mutated (the head is shallow-copied).

        Args:
            entities: list of pipeline entity dicts with at least the
                keys ``word``, ``entity_group``, ``start``, ``end``.

        Returns:
            A new list of merged entity dicts (empty list for empty or
            falsy input).
        """
        if not entities:
            return []

        merged = []
        i = 0
        while i < len(entities):
            head = entities[i].copy()
            word = head['word']
            end = head['end']

            # Absorb the run of matching '##' continuations after the head.
            j = i + 1
            while j < len(entities):
                nxt = entities[j]
                if (nxt['word'].startswith('##') and
                        nxt['entity_group'] == head['entity_group']):
                    word += nxt['word'][2:]  # drop the '##' marker
                    end = nxt['end']
                    j += 1
                else:
                    break

            head['word'] = word
            head['end'] = end
            merged.append(head)
            i = j  # resume after the last merged subword

        return merged

    @staticmethod
    def _to_prolog_facts(entities, functor):
        """Render entity dicts as Prolog facts with the given functor.

        One fact per entity, of the form
        ``functor(Index, 'Group', 'Word', Start, End, Score).`` where the
        score is formatted to four decimals and single quotes inside the
        word are escaped as ``\\'`` so the quoted atom stays valid.

        Args:
            entities: list of entity dicts (``entity_group``, ``word``,
                ``start``, ``end``, ``score``).
            functor: Prolog functor name, e.g. ``"entity"`` or
                ``"anatomy"``.

        Returns:
            The facts joined with newlines (empty string for no entities).
        """
        facts = []
        for i, entity in enumerate(entities):
            word = entity['word'].replace("'", "\\'")
            facts.append(
                f"{functor}({i}, '{entity['entity_group']}', "
                f"'{word}', {entity['start']}, "
                f"{entity['end']}, {entity['score']:.4f})."
            )
        return "\n".join(facts)

    def basic_ner(self, text):
        """Run clinical NER on *text* and return merged entity dicts."""
        return self._merge_subwords(self.ner_pipeline(text))

    def prolog_ner(self, text):
        """Run clinical NER on *text* and return ``entity/6`` Prolog facts."""
        return self._to_prolog_facts(self.basic_ner(text), "entity")

    def anatomy_ner(self, text):
        """Run anatomy NER on *text* and return merged entity dicts.

        Raises:
            RuntimeError: if the anatomy pipeline was not loaded.
        """
        if self.anatomy_pipeline is None:
            raise RuntimeError("Anatomy NER pipeline not initialized.")
        return self._merge_subwords(self.anatomy_pipeline(text))

    def prolog_anatomy(self, text):
        """Run anatomy NER on *text* and return ``anatomy/6`` Prolog facts.

        Raises:
            RuntimeError: if the anatomy pipeline was not loaded.
        """
        return self._to_prolog_facts(self.anatomy_ner(text), "anatomy")

    def pos_tagging(self, text):
        """POS-tag *text* with spaCy.

        Returns:
            A list of dicts per token with ``token``, ``lemma``, ``pos``
            (coarse tag), ``tag`` (fine tag), ``dep`` (dependency
            relation), and character offsets ``start``/``end``.

        Raises:
            RuntimeError: if the spaCy model was not loaded.
        """
        if self.nlp is None:
            raise RuntimeError(
                "POS tagger not initialized. Install spaCy model: "
                "python -m spacy download en_core_web_sm"
            )

        doc = self.nlp(text)
        return [
            {
                'token': token.text,
                'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'dep': token.dep_,
                'start': token.idx,
                'end': token.idx + len(token.text)
            }
            for token in doc
        ]

    def prolog_pos(self, text):
        """POS-tag *text* and return ``pos/8`` Prolog facts.

        Raises:
            RuntimeError: if the spaCy model was not loaded (raised by
            :meth:`pos_tagging`).
        """
        prolog_facts = []
        for i, token_info in enumerate(self.pos_tagging(text)):
            # Escape single quotes so quoted Prolog atoms stay valid.
            token = token_info['token'].replace("'", "\\'")
            lemma = token_info['lemma'].replace("'", "\\'")
            prolog_facts.append(
                f"pos({i}, '{token}', '{lemma}', '{token_info['pos']}', "
                f"'{token_info['tag']}', '{token_info['dep']}', "
                f"{token_info['start']}, {token_info['end']})."
            )
        return "\n".join(prolog_facts)

    def combined_analysis(self, text):
        """Run all available analyses on *text*.

        Returns:
            A dict with ``clinical_entities`` (always populated),
            ``anatomy_entities`` and ``pos_tags`` (empty lists when the
            corresponding model is not loaded).
        """
        result = {
            'clinical_entities': self.basic_ner(text),
            'anatomy_entities': [],
            'pos_tags': []
        }

        if self.anatomy_pipeline is not None:
            result['anatomy_entities'] = self.anatomy_ner(text)

        if self.nlp is not None:
            result['pos_tags'] = self.pos_tagging(text)

        return result

    def prolog_combined(self, text):
        """Run all available analyses and return one Prolog program.

        Sections (clinical entities, anatomy entities, POS tags) are
        each headed by a ``%`` comment, included only when non-empty,
        and joined by blank lines.
        """
        sections = []

        clinical_facts = self.prolog_ner(text)
        if clinical_facts:
            sections.append(f"% Clinical Entities\n{clinical_facts}")

        if self.anatomy_pipeline is not None:
            anatomy_facts = self.prolog_anatomy(text)
            if anatomy_facts:
                sections.append(f"% Anatomy Entities\n{anatomy_facts}")

        if self.nlp is not None:
            pos_facts = self.prolog_pos(text)
            if pos_facts:
                sections.append(f"% POS Tags\n{pos_facts}")

        return "\n\n".join(sections)