Spaces:

pranavraj1103
/

ksp-vanguard_squad

Runtime error

File size: 8,198 Bytes

import os
import re
import spacy
import uvicorn
import docx
import requests
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import (
    NlpEngine,
    NlpEngineProvider,
)
# import google.generativeai as genai
from dotenv import load_dotenv
from transformers import pipeline
from fastapi.middleware.cors import CORSMiddleware
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from huggingface_hub import login

from pydantic import BaseModel
from fastapi import FastAPI, Request, UploadFile, File
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

# Load variables from a .env file (if present) before the os.environ lookups
# below.
load_dotenv()
# ROOT_PATH is optional: os.environ.get returns None when it is not set.
app = FastAPI(root_path=os.environ.get("ROOT_PATH"))
# genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
# model = genai.GenerativeModel('gemini-pro')
# Sent as the Bearer token by anonymize_masked_text; None when not set.
HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
# login(HUGGINGFACE_KEY)

# Fill-mask pipeline built at import time from a local model directory;
# anonymize_text uses it to propose replacement tokens.
pipe = pipeline("fill-mask", model="./ksp-mask-model")

# NOTE(review): all origins, methods and headers are allowed while
# credentials are enabled as well -- confirm this is intended before the
# service is exposed beyond development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Request body for the POST endpoints that take raw text (presidio_mask and
# anonymize_text).
class TextItem(BaseModel):
    # The text to analyse.
    text: str

def create_nlp_engine_with_spacy(
    model_path: str = "en_core_web_lg",
):
    """
    Build a spaCy-backed NlpEngine together with a recognizer registry.

    :param model_path: path to model / model name.
    :return: tuple ``(nlp_engine, registry)``; the registry has the
        predefined recognizers loaded against the returned engine.
    """
    # spaCy NER label -> Presidio entity name.
    label_to_entity = {
        "PER": "PERSON",
        "PERSON": "PERSON",
        "NORP": "NRP",
        "FAC": "FACILITY",
        "LOC": "LOCATION",
        "GPE": "LOCATION",
        "LOCATION": "LOCATION",
        "ORG": "ORGANIZATION",
        "ORGANIZATION": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "TIME": "DATE_TIME",
    }
    ner_settings = {
        "model_to_presidio_entity_mapping": label_to_entity,
        "low_confidence_score_multiplier": 0.4,
        "low_score_entity_names": ["ORG", "ORGANIZATION"],
    }
    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": model_path}],
            "ner_model_configuration": ner_settings,
        }
    )
    engine = provider.create_engine()

    recognizers = RecognizerRegistry()
    recognizers.load_predefined_recognizers(nlp_engine=engine)

    return engine, recognizers

# Build the NLP engine and recognizer registry once, at import time, with the
# default spaCy model name.
nlp_engine, registry = create_nlp_engine_with_spacy()

# Analyzer used by presidio_mask to detect entities in request text.
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
# NOTE(review): no endpoint visible in this file calls `anonymizer`; its only
# use is in a commented-out line inside presidio_mask.
anonymizer = AnonymizerEngine()

@app.get("/")
async def read_root():
    """Return a fixed greeting payload for the root path."""
    greeting = {"message": "Hello World"}
    return greeting


# Lazily created spaCy pipeline shared by the token-level masking endpoints,
# so the model is loaded once per process instead of once per request.
_NER_MODEL = None

# spaCy entity label -> PII type reported by entity_tagger_masking.
# NOTE(review): "GPE" is not masked here although the Presidio mapping in
# create_nlp_engine_with_spacy treats it as LOCATION -- confirm intent.
_MASKED_ENTITY_TYPES = {
    "PERSON": "PERSON",
    "LOC": "LOCATION",
    "ORG": "ORGANIZATION",
    "DATE": "DATE",
}


def _get_ner_model():
    """Return the shared ``en_core_web_sm`` pipeline, loading it on first use."""
    global _NER_MODEL
    if _NER_MODEL is None:
        _NER_MODEL = spacy.load("en_core_web_sm")
    return _NER_MODEL


@app.get("/vocab_thresh_masking")
async def vocab_thresh_masking(text, threshold):
    """
    Mask every token that occurs fewer than ``threshold`` times in ``text``.

    :param text: text to tokenize and mask.
    :param threshold: minimum occurrence count for a token to be kept;
        converted with ``int()``, so a non-integer value raises ValueError.
    :return: ``(masked_text, pii_locations)``. ``masked_text`` is the tokens
        (or ``[MASK]``) joined by single spaces; ``pii_locations`` holds
        ``(start, end, "UNKNOWN")`` character offsets into the original
        ``text`` for each masked token.
    """
    # Parse the threshold first so bad input fails before the NLP pass.
    threshold = int(threshold)
    doc = _get_ner_model()(text)

    word_counts = {}
    for token in doc:
        word_counts[token.text] = word_counts.get(token.text, 0) + 1

    # A set keeps the per-token membership test below O(1).
    frequent_words = {
        word for word, count in word_counts.items() if count >= threshold
    }

    masked_text = []
    pii_locations = []  # (start index, end index, type) tuples
    for token in doc:
        if token.text in frequent_words:
            masked_text.append(token.text)
        else:
            masked_text.append("[MASK]")
            # Rare token: potentially PII, type not known at this stage.
            pii_locations.append(
                (token.idx, token.idx + len(token.text), "UNKNOWN")
            )
    return " ".join(masked_text), pii_locations


@app.get("/entity_tagger_masking")
async def entity_tagger_masking(text):
    """
    Mask tokens that spaCy tags as PERSON, LOC, ORG or DATE entities.

    :param text: text to tokenize and mask.
    :return: ``(masked_text, pii_locations)``. ``masked_text`` is the tokens
        (or ``[MASK]``) joined by single spaces; ``pii_locations`` holds
        ``(start, end, type)`` character offsets into the original ``text``
        for each masked token.
    """
    doc = _get_ner_model()(text)
    masked_text = []
    pii_locations = []
    for token in doc:
        pii_type = _MASKED_ENTITY_TYPES.get(token.ent_type_)
        if pii_type is None:
            masked_text.append(token.text)
            continue
        masked_text.append("[MASK]")
        pii_locations.append(
            (token.idx, token.idx + len(token.text), pii_type)
        )
    return " ".join(masked_text), pii_locations


@app.get("/email_and_phone")
async def identify_email_and_phone(text):
    """
    Mask e-mail addresses and phone numbers found by regular expressions.

    Both the masked text and the reported locations are derived from the same
    set of non-overlapping spans in the original ``text``, so they always
    agree. A phone-like digit run that overlaps an e-mail match is treated as
    part of the e-mail only.

    :param text: text to scan.
    :return: ``(masked_text, pii_locations)`` where ``pii_locations`` lists
        ``(start, end, "EMAIL")`` entries followed by
        ``(start, end, "PHONE NUMBER")`` entries, as character offsets into
        the original ``text``.
    """
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b"

    email_spans = [match.span() for match in re.finditer(email_pattern, text)]
    # Drop phone matches that overlap an e-mail (e.g. a digit-only local part)
    # so the same characters are never reported twice.
    phone_spans = [
        match.span()
        for match in re.finditer(phone_pattern, text)
        if not any(
            match.start() < email_end and email_start < match.end()
            for email_start, email_end in email_spans
        )
    ]

    pii_locations = [(start, end, "EMAIL") for start, end in email_spans]
    pii_locations.extend(
        (start, end, "PHONE NUMBER") for start, end in phone_spans
    )

    # Rebuild the text left to right, replacing each span with the mask token.
    pieces = []
    cursor = 0
    for start, end in sorted(email_spans + phone_spans):
        pieces.append(text[cursor:start])
        pieces.append("[MASK]")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces), pii_locations


@app.get("/anonymize_masked_text")
async def anonymize_masked_text(masked_text):
    """
    Ask the hosted fill-mask model for replacements for the masks in a text.

    :param masked_text: text whose PII has been replaced by ``[MASK]`` (as
        produced by the masking endpoints in this file) or by ``<mask>``.
    :return: the decoded JSON body of the inference API response.
    """
    API_URL = "https://api-inference.huggingface.co/models/pranavraj1103/ksp-mask-model"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_KEY}"}

    # The other endpoints emit "[MASK]" while the fill-mask calls in this file
    # use "<mask>", so translate before sending the caller's text.
    payload = {"inputs": masked_text.replace("[MASK]", "<mask>")}

    # NOTE(review): requests is synchronous, so this call blocks the event
    # loop while waiting; the timeout bounds how long that can last.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=30)
    return response.json()


@app.post("/parse_doc")
async def parse_doc(file: UploadFile):
    """
    Extract the text of an uploaded ``.txt`` or Word document.

    :param file: the uploaded file. A name ending in ``.txt`` (any letter
        case) is returned as its raw content; anything else, including an
        upload without a filename, is parsed as a docx document.
    :return: the raw bytes of a ``.txt`` upload, or the document's paragraph
        texts joined by newlines.
    """
    # filename may be missing on an upload; treat that as "not a .txt file".
    filename = (file.filename or "").lower()
    if filename.endswith(".txt"):
        return file.file.read()
    document = docx.Document(file.file)
    return "\n".join(para.text for para in document.paragraphs)


@app.post("/presidio_mask")
async def presidio_mask(text: TextItem):
    """
    Run the Presidio analyzer over the text and list the entities it finds.

    Results scoring below 0.1 are dropped, and only the first result for any
    given ``(start, end)`` span is kept.

    :param text: request body carrying the text to analyse.
    :return: list of dicts with ``start``, ``end``, ``entity_type``, ``text``
        and ``score`` keys, in the order the analyzer returned them.
    """
    raw_text = text.text
    findings = analyzer.analyze(text=raw_text, language='en')

    emitted_spans = set()
    entities = []
    for finding in findings:
        if finding.score < 0.1:
            continue
        span = (finding.start, finding.end)
        if span in emitted_spans:
            continue
        emitted_spans.add(span)
        entities.append({
            "start": finding.start,
            "end": finding.end,
            "entity_type": finding.entity_type,
            "text": raw_text[finding.start:finding.end],
            "score": finding.score,
        })
    return entities


@app.post("/anonymize_text")
async def anonymize_text(text: TextItem):
    """
    Replace each detected entity with the fill-mask model's top suggestion.

    Entities come from ``presidio_mask``. Each one is masked on its own in
    the original text, the pipeline's first option is spliced into the
    output, and a running offset keeps later positions aligned. An entity
    that overlaps one already replaced is skipped, because applying both
    with a single offset would corrupt the output.

    :param text: request body carrying the text to anonymize.
    :return: dict with ``anonymized_text`` and ``mask_list``; each mask entry
        holds the replacement's ``start``/``end`` in the anonymized text, the
        ``entity_type``, the pipeline ``options`` and the ``original_text``.
    """
    detected = await presidio_mask(text)
    # Left to right; on equal starts the widest span comes first so it is the
    # one kept by the overlap check below.
    detected = sorted(detected, key=lambda item: (item["start"], -item["end"]))

    original_text = text.text
    final_text = original_text
    new_mask_list = []
    off_set = 0   # length change accumulated by earlier replacements
    last_end = 0  # end (in original_text) of the last replaced span

    for mask in detected:
        if mask["start"] < last_end:
            continue  # overlaps a span that has already been replaced
        last_end = mask["end"]

        # Mask only this entity so the model sees the rest of the context.
        mask_text = (
            original_text[:mask["start"]] + "<mask>" + original_text[mask["end"]:]
        )
        options = pipe(mask_text)
        replacement = options[0]["token_str"]

        new_start = mask["start"] + off_set
        final_text = (
            final_text[:new_start] + replacement + final_text[mask["end"] + off_set:]
        )
        new_mask_list.append({
            "start": new_start,
            "end": new_start + len(replacement),
            "entity_type": mask["entity_type"],
            "options": options,
            "original_text": mask["text"],
        })
        off_set += len(replacement) - len(mask["text"])

    return {"anonymized_text": final_text, "mask_list": new_mask_list}