import os
import re

import docx
import requests
import spacy
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI, File, Request, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from huggingface_hub import login
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from pydantic import BaseModel
from transformers import pipeline

# import google.generativeai as genai

load_dotenv()

app = FastAPI(root_path=os.environ.get("ROOT_PATH"))

# genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
# model = genai.GenerativeModel('gemini-pro')

HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
# login(HUGGINGFACE_KEY)

# Local fine-tuned fill-mask model used to suggest replacements for masked PII spans
pipe = pipeline("fill-mask", model="./ksp-mask-model")

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

class TextItem(BaseModel):
    text: str
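
# Example request body for the POST endpoints below (illustrative):
#   {"text": "John Doe lives in Bangalore and his email is john@example.com"}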

def create_nlp_engine_with_spacy(
    model_path: str = "en_core_web_lg",
):
    """
    Instantiate an NlpEngine with a spaCy model.

    :param model_path: path to model / model name.
    """
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": model_path}],
        "ner_model_configuration": {
            "model_to_presidio_entity_mapping": {
                "PER": "PERSON",
                "PERSON": "PERSON",
                "NORP": "NRP",
                "FAC": "FACILITY",
                "LOC": "LOCATION",
                "GPE": "LOCATION",
                "LOCATION": "LOCATION",
                "ORG": "ORGANIZATION",
                "ORGANIZATION": "ORGANIZATION",
                "DATE": "DATE_TIME",
                "TIME": "DATE_TIME",
            },
            "low_confidence_score_multiplier": 0.4,
            "low_score_entity_names": ["ORG", "ORGANIZATION"],
        },
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)
    return nlp_engine, registry


# Shared Presidio engines, built once at startup
nlp_engine, registry = create_nlp_engine_with_spacy()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
anonymizer = AnonymizerEngine()
@app.get("/")
async def read_root():
return {"message": "Hello World"}
@app.get("/vocab_thresh_masking")
async def vocab_thresh_masking(text, threshold):
ner_model = spacy.load("en_core_web_sm")
doc = ner_model(text)
word_counts = dict()
for token in doc:
word_counts[token.text] = word_counts.get(str(token.text), 0) + 1
threshold = int(threshold)
frequent_words = [word for word, count in word_counts.items() if count >= threshold]
masked_text = []
pii_locations = [] # List to store (start index, end index, type) tuples
for i, token in enumerate(doc):
if str(token.text) in frequent_words:
masked_text.append(str(token.text))
else:
masked_text.append("[MASK]")
# Potentially masked PII, record location and tentative type (UNKNOWN)
pii_locations.append((token.idx, token.idx + len(token.text), "UNKNOWN"))
return " ".join(masked_text), pii_locations
@app.get("/entity_tagger_masking")
async def entity_tagger_masking(text):
ner_model = spacy.load("en_core_web_sm")
doc = ner_model(text)
masked_text = []
pii_locations = []
for token in doc:
if token.ent_type_ == "PERSON":
masked_text.append("[MASK]")
pii_locations.append((token.idx, token.idx + len(token.text), "PERSON"))
elif token.ent_type_ == "LOC":
masked_text.append("[MASK]")
pii_locations.append((token.idx, token.idx + len(token.text), "LOCATION"))
elif token.ent_type_ == "ORG":
masked_text.append("[MASK]")
pii_locations.append((token.idx, token.idx + len(token.text), "ORGANIZATION"))
elif token.ent_type_ == "DATE":
masked_text.append("[MASK]")
pii_locations.append((token.idx, token.idx + len(token.text), "DATE"))
else:
masked_text.append(token.text)
return " ".join(masked_text), pii_locations
@app.get("/email_and_phone")
async def identify_email_and_phone(text):
# use regex to identify emails and phone numbers and mask them
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
phone_pattern = r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b"
# find the location of emails and phone numbers
pii_locations = []
for match in re.finditer(email_pattern, text):
pii_locations.append((match.start(), match.end(), "EMAIL"))
for match in re.finditer(phone_pattern, text):
pii_locations.append((match.start(), match.end(), "PHONE NUMBER"))
# mask the emails and phone numbers
text = re.sub(email_pattern, "[MASK]", text)
text = re.sub(phone_pattern, "[MASK]", text)
return text, pii_locations
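
# Example matches (illustrative): "jane.doe@example.com", "555-123-4567", "555.123.4567",
# and "5551234567" would all be located and replaced with "[MASK]".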
@app.get("/anonymize_masked_text")
async def anonymize_masked_text(masked_text):
# prompt = f"The following text contains Personal Information Identifiers marked with [MASK]: \n```\n{masked_text}\n```\n Please anonymize these Personal Identity Identifiers by replacing the '[MASK]' with random placeholders while preserving the context so that the text can be used for analysis."
# print(prompt)
# response = model.generate_content(prompt)
# return response.text
API_URL = "https://api-inference.huggingface.co/models/pranavraj1103/ksp-mask-model"
headers = {"Authorization": f"Bearer {HUGGINGFACE_KEY}"}
def query(payload):
response = requests.post(API_URL, headers=headers, json=payload)
return response.json()
output = query({
"inputs": "The <mask> to the universe is <mask>.",
})
return output
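
# Note (assumption): the hosted model is assumed to follow the RoBERTa-style "<mask>"
# placeholder used by /anonymize_text below, so text produced by the "[MASK]"-based
# endpoints above may need its placeholders converted before calling this endpoint.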
@app.post("/parse_doc")
async def parse_doc(file: UploadFile):
if file.filename.endswith(".txt"):
return file.file.read()
doc = docx.Document(file.file)
full_text = []
for para in doc.paragraphs:
full_text.append(para.text)
return "\n".join(full_text)
@app.post("/presidio_mask")
async def presidio_mask(text: TextItem):
text = text.text
results = analyzer.analyze(text=text, language='en')
# for rec in results:
# print(rec.start)
# print(*[text[res.start : res.end] for res in results])
# anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results)
# return anonymized_text, results
return_list = []
seen_set = set()
for rec in results:
if (rec.score < 0.1) or (rec.start, rec.end) in seen_set:
continue
return_list.append({
"start": rec.start,
"end": rec.end,
"entity_type": rec.entity_type,
"text": text[rec.start:rec.end],
"score": rec.score,
})
seen_set.add((rec.start, rec.end))
return return_list
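
# Example response shape of /presidio_mask (illustrative; scores depend on the model):
#   [{"start": 11, "end": 15, "entity_type": "PERSON", "text": "John", "score": 0.85}, ...]
#   for the input {"text": "My name is John and my number is 212-555-1234"}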
@app.post("/anonymize_text")
async def anonymize_text(text: TextItem):
off_set = 0
mask_list = await presidio_mask(text)
mask_list = sorted(mask_list, key=lambda x: x["start"])
new_mask_list = []
text = text.text
anonymized_text = text
final_text = text
for mask in mask_list:
mask_text = anonymized_text[:mask["start"]] + "<mask>" + anonymized_text[mask["end"]:]
options = pipe(mask_text)
final_text = final_text[:mask["start"] + off_set] + options[0]["token_str"] + final_text[mask["end"] + off_set:]
new_mask_list.append({
"start": mask["start"] + off_set,
"end": mask["start"] + off_set + len(options[0]["token_str"]),
"entity_type": mask["entity_type"],
"options": options,
"original_text": mask["text"],
})
off_set += len(options[0]["token_str"]) - len(mask["text"])
return {"anonymized_text": final_text, "mask_list": new_mask_list} |