| from fastapi import FastAPI |
| from pydantic import BaseModel |
| import spacy |
| from contextlib import asynccontextmanager |
|
|
| |
# Module-level handle to the spaCy pipeline. It stays None until load_model()
# assigns the loaded model, and cleanup() resets it to None again.
nlp = None
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan handler passed to ``FastAPI(lifespan=...)``.

    Awaits ``load_model()`` before yielding control to the application and
    awaits ``cleanup()`` once the context is exited.

    Args:
        app: The application instance the lifespan is attached to (unused).
    """
    await load_model()
    try:
        yield
    finally:
        # Without try/finally, an exception thrown into the generator at the
        # yield point would skip the cleanup step entirely.
        await cleanup()
|
|
# Application instance; startup and shutdown work is delegated to the
# lifespan context manager defined above.
app = FastAPI(lifespan=lifespan)
|
|
async def load_model():
    """
    Startup hook: load the spaCy ``en_core_web_sm`` pipeline and store it in
    the module-level ``nlp`` variable.
    """
    global nlp
    pipeline = spacy.load("en_core_web_sm")
    # Publish the pipeline only after it has loaded without raising.
    nlp = pipeline
    print("NLP model loaded successfully.")
|
|
|
|
async def cleanup():
    """
    Shutdown hook: reset the module-level ``nlp`` variable to ``None``.
    """
    global nlp
    # Drop this module's reference to the loaded pipeline.
    nlp = None
    print("NLP model unloaded.")
|
|
|
|
class NERRequest(BaseModel):
    # Request body for POST /process.
    # chunks: the pieces of text to scan for named entities.
    chunks: list[str]
|
|
|
|
@app.post("/process")
async def process_text(request: NERRequest):
    """
    Run named-entity recognition over the submitted text chunks.

    Each distinct entity text is reported once, the first time it is seen
    across all chunks, together with its spaCy label and a placeholder of
    the form ``REDACTED_<label>``.

    Args:
        request: Request body carrying the list of text chunks.

    Returns:
        ``{"metadata": [...]}`` where every record has the keys
        ``personal_info``, ``redaction_type`` and ``redacted_word``.

    Raises:
        HTTPException: 503 if the NLP model has not been loaded.
    """
    # Local import: only ``FastAPI`` itself is imported at module level.
    from fastapi import HTTPException

    # Reading a module-level name needs no ``global`` declaration. Take a
    # local snapshot so the None check and the use below see the same object.
    model = nlp
    if model is None:
        # Fail with a clear status instead of "'NoneType' is not callable".
        raise HTTPException(status_code=503, detail="NLP model is not loaded.")

    metadata_records = []
    # Entity texts already reported; only membership matters, so a set.
    seen_entities = set()

    # nlp.pipe processes the chunks in batches and yields docs in input order.
    for doc in model.pipe(request.chunks):
        for ent in doc.ents:
            # NOTE(review): this writes detected personal data to stdout;
            # confirm that is acceptable for a redaction service.
            print(f"{ent.text} - {ent.label_}")

            # De-duplicate on the entity text alone: a repeat is skipped
            # even if it carries a different label this time.
            if ent.text in seen_entities:
                continue
            seen_entities.add(ent.text)

            metadata_records.append(
                {
                    "personal_info": ent.text,
                    "redaction_type": ent.label_,
                    "redacted_word": f"REDACTED_{ent.label_}",
                }
            )

    return {"metadata": metadata_records}
|
|