import json

import spacy
from spacy.tokens import DocBin


def compare_ner_pipelines(binary_file_path, pipeline_names, output_file_path):
    """Compare named-entity predictions of several spaCy pipelines on the same docs.

    Loads documents from a serialized ``DocBin``, runs every pipeline in
    *pipeline_names* over each document's text, and writes a JSON report to
    *output_file_path* containing, per document:

    - ``document_text``: the raw text,
    - ``gold_entities``: the entities already annotated on the stored doc
      (presumably manual/gold annotations — verify against how the DocBin
      was produced),
    - ``common_entities``: entities predicted identically by ALL pipelines,
    - ``unique_entities_per_pipeline``: per-pipeline entities not shared by all.

    Entities are represented as ``(text, start_char, end_char, label)`` tuples
    (serialized as JSON arrays).

    Parameters
    ----------
    binary_file_path : str
        Path to the ``.spacy`` DocBin file holding the documents.
    pipeline_names : list[str]
        Names/paths of spaCy models to load and compare. Must be non-empty.
    output_file_path : str
        Destination path for the JSON comparison report.

    Raises
    ------
    ValueError
        If *pipeline_names* is empty.
    """
    if not pipeline_names:
        raise ValueError("pipeline_names must contain at least one model name/path")

    # Load each spaCy model to be compared.
    nlp_pipelines = [spacy.load(name) for name in pipeline_names]

    # Deserialize the stored documents. NOTE(review): the first model's vocab
    # is used for all docs — assumes all models share a compatible vocab,
    # as the original code did; confirm if models were trained separately.
    doc_bin = DocBin().from_disk(binary_file_path)
    docs = list(doc_bin.get_docs(nlp_pipelines[0].vocab))

    def extract_entities(doc):
        """Return the doc's entities as a set of (text, start, end, label) tuples."""
        return {(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

    all_entities_comparison = []
    for doc in docs:
        # Re-run every pipeline on the raw text so each produces its own spans.
        entities_per_pipeline = [extract_entities(nlp(doc.text)) for nlp in nlp_pipelines]

        # An entity is "common" only if every pipeline predicted it identically
        # (same surface text, offsets, and label).
        common_entities = set.intersection(*entities_per_pipeline)
        unique_entities = [ents - common_entities for ents in entities_per_pipeline]

        all_entities_comparison.append({
            "document_text": doc.text,
            # Entities stored on the doc itself (e.g. manual annotation);
            # the original code loaded but never reported these.
            "gold_entities": list(extract_entities(doc)),
            "common_entities": list(common_entities),
            "unique_entities_per_pipeline": {
                i: list(ents) for i, ents in enumerate(unique_entities)
            },
        })

    # JSON interchange must be UTF-8 (RFC 8259); the original utf-16 output
    # was unreadable for most JSON consumers.
    with open(output_file_path, "w", encoding="utf-8") as f:
        json.dump(all_entities_comparison, f, indent=4, ensure_ascii=False)
    print(f"Comparison results saved to {output_file_path}")


def main():
    """Example usage: compare two trained Serbian NER models on one DocBin."""
    base_path = r"E:\ICIST-2024-models\spacy-tr\spacy-tr"
    compare_ner_pipelines(
        "SRP19101_1.spacy",
        [base_path + r"\output20\model-best", base_path + r"\output17\model-best"],
        "compare_results_nk.json",
    )


if __name__ == "__main__":
    main()