import json
import spacy
from spacy.tokens import DocBin
|
|
def read_in_chunks(file_path, chunk_size=1024):
    """Yield successive chunks of a UTF-8 text file.

    Args:
        file_path: Path of the file to read.
        chunk_size: Maximum number of characters per chunk (default 1024).

    Yields:
        str: The next chunk of the file, at most ``chunk_size`` characters;
        iteration stops at end of file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # read() returns '' at EOF, which is falsy and ends the loop.
        while chunk := handle.read(chunk_size):
            yield chunk
|
|
def convert_json_to_spacy(json_file_path, spacy_file_path):
    """Convert NER annotations from a JSON file into a serialized spaCy DocBin.

    The JSON file is expected to hold a list of items shaped like
    ``[text, {"entities": [[start, end, label], ...]}]`` with character
    offsets. Spans that are out of bounds, overlap an already-accepted
    span, or do not align with token boundaries are skipped with a
    diagnostic message printed to stdout.

    Args:
        json_file_path: Path to the input JSON annotation file.
        spacy_file_path: Destination path for the ``.spacy`` output file.
    """
    # join() over the chunk generator avoids the quadratic cost of
    # repeated string concatenation on large files.
    file_content = "".join(read_in_chunks(json_file_path))
    data = json.loads(file_content)

    # Blank pipeline: we only need tokenization to build Docs.
    nlp = spacy.blank("en")
    doc_bin = DocBin()

    for item in data:
        text = item[0]
        doc = nlp.make_doc(text)

        entities = []
        seen_positions = set()
        for start, end, label in item[1]['entities']:
            # Reject spans outside the text or with empty/reversed extent.
            if start < 0 or end > len(doc.text) or start >= end:
                print(f"Invalid span: start={start}, end={end}, label={label}")
                continue

            # doc.ents forbids overlapping entities, so reject any span
            # that intersects one already accepted.
            if any(start < e_end and end > e_start
                   for e_start, e_end, _ in seen_positions):
                print(f"Overlapping span: start={start}, end={end}, label={label}")
                continue

            span = doc.char_span(start, end, label=label)
            if span is None:
                # char_span returns None when the character offsets do not
                # line up with token boundaries; report instead of silently
                # dropping the annotation (the original code said nothing).
                print(f"Misaligned span: start={start}, end={end}, label={label}")
                continue

            entities.append(span)
            seen_positions.add((start, end, label))

        doc.ents = entities
        doc_bin.add(doc)

    doc_bin.to_disk(spacy_file_path)
    print(f"Data has been successfully saved to {spacy_file_path}!")
|
|