|
import spacy |
|
from spacy.tokens import DocBin |
|
import json |
|
|
|
def compare_ner_pipelines(binary_file_path, pipeline_names, output_file_path):
    """Compare named-entity predictions of several spaCy pipelines on one corpus.

    Loads the documents stored in *binary_file_path* (a serialized ``DocBin``),
    runs every pipeline listed in *pipeline_names* over the document texts, and
    writes a JSON report to *output_file_path*. For each document the report
    records the entities all pipelines agree on (``common_entities``) and, per
    pipeline index, the entities only some pipelines predicted
    (``unique_entities_per_pipeline``).

    Args:
        binary_file_path: Path to a ``.spacy`` DocBin file with the documents.
        pipeline_names: Names/paths of trained spaCy pipelines to load.
        output_file_path: Destination path for the JSON comparison report.

    Raises:
        ValueError: If *pipeline_names* is empty (nothing to compare).
    """
    if not pipeline_names:
        # set.intersection(*[]) below would raise an opaque TypeError otherwise.
        raise ValueError("pipeline_names must contain at least one pipeline")

    nlp_pipelines = [spacy.load(name) for name in pipeline_names]

    # Any vocab works for deserialization; texts are re-tagged by each pipeline.
    doc_bin = DocBin().from_disk(binary_file_path)
    docs = list(doc_bin.get_docs(nlp_pipelines[0].vocab))
    texts = [doc.text for doc in docs]

    def extract_entities(doc):
        # (text, start, end, label) tuples are hashable, enabling set algebra.
        return {(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

    # Batch all texts through each pipeline once: nlp.pipe() is significantly
    # faster than calling nlp(text) per document inside the comparison loop.
    predictions = [list(nlp.pipe(texts)) for nlp in nlp_pipelines]

    all_entities_comparison = []
    for doc_idx, text in enumerate(texts):
        entities_per_pipeline = [extract_entities(preds[doc_idx]) for preds in predictions]

        common_entities = set.intersection(*entities_per_pipeline)
        unique_entities = [ents - common_entities for ents in entities_per_pipeline]

        all_entities_comparison.append({
            "document_text": text,
            "common_entities": list(common_entities),
            "unique_entities_per_pipeline": {i: list(ents) for i, ents in enumerate(unique_entities)},
        })

    # JSON interchange is UTF-8 (RFC 8259); utf-16 output breaks most consumers.
    with open(output_file_path, 'w', encoding="utf-8") as f:
        json.dump(all_entities_comparison, f, indent=4, ensure_ascii=False)

    print(f"Comparison results saved to {output_file_path}")
|
|
|
|
|
def main():
    """Entry point: compare two trained NER models on a single test corpus."""
    base_path = r"E:\ICIST-2024-models\spacy-tr\spacy-tr"
    model_paths = [
        base_path + r"\output20\model-best",
        base_path + r"\output17\model-best",
    ]
    compare_ner_pipelines("SRP19101_1.spacy", model_paths, "compare_results_nk.json")
|
|
|
# Run the comparison only when executed as a script, not on import.
if __name__ == "__main__":

    main()