# sr_ner_tesla_bcx/compare.py
# Compare NER output of multiple spaCy pipelines on a shared corpus.
# (Originally published as part of an "Update spaCy pipeline" commit, 286a29c.)
import spacy
from spacy.tokens import DocBin
import json
def compare_ner_pipelines(binary_file_path, pipeline_names, output_file_path):
    """Compare NER predictions of several spaCy pipelines on the same corpus.

    Each pipeline named in *pipeline_names* is run over the texts stored in
    the DocBin at *binary_file_path*.  For every document, the entities all
    pipelines agree on and the ones unique to each pipeline are collected,
    and the full comparison is written as JSON to *output_file_path*.

    Args:
        binary_file_path: Path to a serialized spaCy ``DocBin`` (.spacy file).
        pipeline_names: Non-empty list of model names/paths for ``spacy.load``.
        output_file_path: Destination path for the JSON comparison report.

    Raises:
        ValueError: If *pipeline_names* is empty — there is nothing to
            compare, and ``set.intersection()`` with no arguments would
            otherwise raise a confusing ``TypeError`` deep inside the loop.
    """
    if not pipeline_names:
        raise ValueError("pipeline_names must contain at least one pipeline")

    nlp_pipelines = [spacy.load(name) for name in pipeline_names]

    # Deserialize the corpus; the first model's vocab is used to rebuild the
    # Docs (assumes all models share a compatible vocab — TODO confirm).
    doc_bin = DocBin().from_disk(binary_file_path)
    docs = list(doc_bin.get_docs(nlp_pipelines[0].vocab))

    def extract_entities(doc):
        # Hashable tuples so set algebra can be used for the comparison.
        return {(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

    all_entities_comparison = []
    for doc in docs:
        # Re-run each pipeline on the raw text, so predictions do not depend
        # on any annotations already stored on the deserialized Doc.
        entities_per_pipeline = [extract_entities(nlp(doc.text)) for nlp in nlp_pipelines]

        # Entities every pipeline predicted identically ...
        common_entities = set.intersection(*entities_per_pipeline)
        # ... and, per pipeline, those that are not shared by all.
        unique_entities = [ents - common_entities for ents in entities_per_pipeline]

        all_entities_comparison.append({
            "document_text": doc.text,
            "common_entities": list(common_entities),
            # json.dump converts the int indices to string keys automatically.
            "unique_entities_per_pipeline": {i: list(ents) for i, ents in enumerate(unique_entities)},
        })

    # NOTE(review): output is UTF-16, which is unusual for JSON — kept as-is
    # for compatibility with existing readers; they must use the same encoding.
    with open(output_file_path, 'w', encoding="utf-16") as f:
        json.dump(all_entities_comparison, f, indent=4, ensure_ascii=False)
    print(f"Comparison results saved to {output_file_path}")
# Example usage
def main():
    """Run the example comparison between two locally trained models."""
    model_root = r"E:\ICIST-2024-models\spacy-tr\spacy-tr"
    candidate_models = [
        model_root + r"\output20\model-best",
        model_root + r"\output17\model-best",
    ]
    compare_ner_pipelines("SRP19101_1.spacy", candidate_models, "compare_results_nk.json")
# Script entry point: run the example comparison when executed directly.
if __name__ == "__main__":
    main()