pranavraj1103 committed
Commit
dae4805
1 Parent(s): d54a6ab

chore: Add Dockerfile and requirements.txt for containerization

Files changed (6)
  1. .gitignore +160 -0
  2. Dockerfile +13 -0
  3. app.py +196 -0
  4. note.txt +23 -0
  5. requirements.txt +77 -0
  6. run.py +9 -0
.gitignore ADDED
@@ -0,0 +1,160 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.11-slim-bullseye
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ RUN python -m spacy download en_core_web_lg
+
+ COPY . .
+
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,196 @@
+ import os
+ import re
+ import spacy
+ import uvicorn
+ import docx
+ import requests
+ import spacy
+ from presidio_analyzer import RecognizerRegistry
+ from presidio_analyzer.nlp_engine import (
+     NlpEngine,
+     NlpEngineProvider,
+ )
+ # import google.generativeai as genai
+ from dotenv import load_dotenv
+ from transformers import pipeline
+ from presidio_analyzer import AnalyzerEngine
+ from presidio_anonymizer import AnonymizerEngine
+
+ from fastapi import FastAPI, Request, UploadFile, File
+ from fastapi import FastAPI, Request
+ from fastapi.responses import JSONResponse
+
+ load_dotenv()
+ app = FastAPI(root_path=os.environ.get("ROOT_PATH"))
+ # genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
+ # model = genai.GenerativeModel('gemini-pro')
+ HUGGINGFACE_KEY = os.environ.get("HUGGINGFACE_KEY")
+ # pipe = pipeline("fill-mask", model="pranavraj1103/ksp-mask-model")
+
+
+ def create_nlp_engine_with_spacy(
+     model_path: str = "en_core_web_sm",
+ ):
+     """
+     Instantiate an NlpEngine with a spaCy model
+     :param model_path: path to model / model name.
+     """
+     nlp_configuration = {
+         "nlp_engine_name": "spacy",
+         "models": [{"lang_code": "en", "model_name": model_path}],
+         "ner_model_configuration": {
+             "model_to_presidio_entity_mapping": {
+                 "PER": "PERSON",
+                 "PERSON": "PERSON",
+                 "NORP": "NRP",
+                 "FAC": "FACILITY",
+                 "LOC": "LOCATION",
+                 "GPE": "LOCATION",
+                 "LOCATION": "LOCATION",
+                 "ORG": "ORGANIZATION",
+                 "ORGANIZATION": "ORGANIZATION",
+                 "DATE": "DATE_TIME",
+                 "TIME": "DATE_TIME",
+             },
+             "low_confidence_score_multiplier": 0.4,
+             "low_score_entity_names": ["ORG", "ORGANIZATION"],
+         },
+     }
+
+     nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
+
+     registry = RecognizerRegistry()
+     registry.load_predefined_recognizers(nlp_engine=nlp_engine)
+
+     return nlp_engine, registry
+
+ nlp_engine, registry = create_nlp_engine_with_spacy()
+
+ analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
+ anonymizer = AnonymizerEngine()
+
+ @app.get("/")
+ async def read_root():
+     return {"message": "Hello World"}
+
+
+ @app.get("/vocab_thresh_masking")
+ async def vocab_thresh_masking(text, threshold):
+     ner_model = spacy.load("en_core_web_sm")
+     doc = ner_model(text)
+     word_counts = dict()
+     for token in doc:
+         word_counts[token.text] = word_counts.get(str(token.text), 0) + 1
+
+     threshold = int(threshold)
+     frequent_words = [word for word, count in word_counts.items() if count >= threshold]
+     masked_text = []
+     pii_locations = []  # List to store (start index, end index, type) tuples
+     for i, token in enumerate(doc):
+         if str(token.text) in frequent_words:
+             masked_text.append(str(token.text))
+         else:
+             masked_text.append("[MASK]")
+             # Potentially masked PII, record location and tentative type (UNKNOWN)
+             pii_locations.append((token.idx, token.idx + len(token.text), "UNKNOWN"))
+     return " ".join(masked_text), pii_locations
+
+
+ @app.get("/entity_tagger_masking")
+ async def entity_tagger_masking(text):
+     ner_model = spacy.load("en_core_web_sm")
+     doc = ner_model(text)
+     masked_text = []
+     pii_locations = []
+     for token in doc:
+         if token.ent_type_ == "PERSON":
+             masked_text.append("[MASK]")
+             pii_locations.append((token.idx, token.idx + len(token.text), "PERSON"))
+         elif token.ent_type_ == "LOC":
+             masked_text.append("[MASK]")
+             pii_locations.append((token.idx, token.idx + len(token.text), "LOCATION"))
+         elif token.ent_type_ == "ORG":
+             masked_text.append("[MASK]")
+             pii_locations.append((token.idx, token.idx + len(token.text), "ORGANIZATION"))
+         elif token.ent_type_ == "DATE":
+             masked_text.append("[MASK]")
+             pii_locations.append((token.idx, token.idx + len(token.text), "DATE"))
+         else:
+             masked_text.append(token.text)
+     return " ".join(masked_text), pii_locations
+
+
+ @app.get("/email_and_phone")
+ async def identify_email_and_phone(text):
+     # use regex to identify emails and phone numbers and mask them
+     email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+     phone_pattern = r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b"
+
+     # find the location of emails and phone numbers
+     pii_locations = []
+     for match in re.finditer(email_pattern, text):
+         pii_locations.append((match.start(), match.end(), "EMAIL"))
+     for match in re.finditer(phone_pattern, text):
+         pii_locations.append((match.start(), match.end(), "PHONE NUMBER"))
+
+     # mask the emails and phone numbers
+     text = re.sub(email_pattern, "[MASK]", text)
+     text = re.sub(phone_pattern, "[MASK]", text)
+     return text, pii_locations
+
+
+ @app.get("/anonymize_masked_text")
+ async def anonymize_masked_text(masked_text):
+     # prompt = f"The following text contains Personal Information Identifiers marked with [MASK]: \n```\n{masked_text}\n```\n Please anonymize these Personal Identity Identifiers by replacing the '[MASK]' with random placeholders while preserving the context so that the text can be used for analysis."
+     # print(prompt)
+     # response = model.generate_content(prompt)
+     # return response.text
+     API_URL = "https://api-inference.huggingface.co/models/pranavraj1103/ksp-mask-model"
+     headers = {"Authorization": f"Bearer {HUGGINGFACE_KEY}"}
+
+     def query(payload):
+         response = requests.post(API_URL, headers=headers, json=payload)
+         return response.json()
+
+     output = query({
+         "inputs": "The <mask> to the universe is <mask>.",
+     })
+
+     return output
+
+
+ @app.post("/parse_doc")
+ async def parse_doc(file: UploadFile):
+     if file.filename.endswith(".txt"):
+         return file.file.read()
+     doc = docx.Document(file.file)
+     full_text = []
+     for para in doc.paragraphs:
+         full_text.append(para.text)
+     return "\n".join(full_text)
+
+
+ @app.post("/presidio_mask")
+ async def presidio_mask(text):
+     results = analyzer.analyze(text=text, language='en')
+     # for rec in results:
+     #     print(rec.start)
+     # print(*[text[res.start : res.end] for res in results])
+     # anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results)
+     # return anonymized_text, results
+
+     return_list = []
+     seen_set = set()
+     for rec in results:
+         if (rec.score < 0.1) or (rec.start, rec.end) in seen_set:
+             continue
+         return_list.append({
+             "start": rec.start,
+             "end": rec.end,
+             "entity_type": rec.entity_type,
+             "text": text[rec.start:rec.end],
+             "score": rec.score,
+         })
+         seen_set.add((rec.start, rec.end))
+     return return_list
+
note.txt ADDED
@@ -0,0 +1,23 @@
+ #downloading spacy model
+ python -m spacy download en_core_web_lg
+
+ sample_text = "My phone number is 212-555-5555, and my friend number is 9876543210"
+ sample_text_2 = """The text in the image is a police report from the Amengad Police Station in Bagalkot, Karnataka, India. The report is dated 10-11-2022 and is about a man named Ramasawamy. The report states that Ramasawamy is a "rowdy" and a "habitual offender" who "disturbs public peace in public places." The report also states that Ramasawamy is "under surveillance."
+
+ The report is signed by a police officer named SOMAPPA. The report is also stamped with the seal of the Amengad Police Station.
+
+ Police Report Police Station:
+
+ Amengad PS Case Number: 2022000003
+
+ Date: 10-11-2022
+
+ Subject: Ramasawamy
+
+ Details: The accused is a rowdy and a habitual offender. He disturbs public peace in public places. He is under surveillance.
+
+ Action Taken: The accused has been warned. He has been told to stop disturbing public peace.
+
+ Signature: SOMAPPA Police
+
+ Officer Seal: Amengad Police Station"""
requirements.txt ADDED
@@ -0,0 +1,77 @@
+ annotated-types==0.6.0
+ anyio==4.3.0
+ blis==0.7.11
+ catalogue==2.0.10
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ colorama==0.4.6
+ confection==0.1.4
+ cymem==2.0.8
+ dnspython==2.6.1
+ email_validator==2.1.1
+ en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl#sha256=ab70aeb6172cde82508f7739f35ebc9918a3d07debeed637403c8f794ba3d3dc
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+ fastapi==0.111.0
+ fastapi-cli==0.0.3
+ filelock==3.14.0
+ fsspec==2024.5.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httptools==0.6.1
+ httpx==0.27.0
+ huggingface-hub==0.23.0
+ idna==3.7
+ Jinja2==3.1.4
+ langcodes==3.4.0
+ language_data==1.2.0
+ lxml==5.2.2
+ marisa-trie==1.1.1
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ mdurl==0.1.2
+ murmurhash==1.0.10
+ numpy==1.26.4
+ orjson==3.10.3
+ packaging==24.0
+ phonenumbers==8.13.37
+ pillow==10.3.0
+ preshed==3.0.9
+ presidio-analyzer==2.2.354
+ presidio-anonymizer==2.2.354
+ pycryptodome==3.20.0
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ Pygments==2.18.0
+ python-docx==1.1.2
+ python-dotenv==1.0.1
+ python-multipart==0.0.9
+ PyYAML==6.0.1
+ regex==2024.5.15
+ requests==2.31.0
+ requests-file==2.0.0
+ rich==13.7.1
+ safetensors==0.4.3
+ shellingham==1.5.4
+ smart-open==6.4.0
+ sniffio==1.3.1
+ spacy==3.7.4
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ srsly==2.4.8
+ starlette==0.37.2
+ thinc==8.2.3
+ tldextract==5.1.2
+ tokenizers==0.19.1
+ tqdm==4.66.4
+ transformers==4.40.2
+ typer==0.9.4
+ typing_extensions==4.11.0
+ ujson==5.10.0
+ urllib3==2.2.1
+ uvicorn==0.29.0
+ wasabi==1.1.2
+ watchfiles==0.21.0
+ weasel==0.3.4
+ websockets==12.0
run.py ADDED
@@ -0,0 +1,9 @@
+ import os
+ import sys
+ import argparse
+ from typing import List, Optional, Union
+
+ import uvicorn
+
+ if __name__ == "__main__":
+     uvicorn.run("app:app", host="127.0.0.1", port=8000, reload=True)
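
run.py starts the same FastAPI app locally on 127.0.0.1:8000 with auto-reload. The snippet below is a quick smoke test of the regex-based /email_and_phone route, assuming the server has already been started with `python run.py`; the contact details in the sample are made up.

import requests

# Assumes `python run.py` is already serving the app on 127.0.0.1:8000.
resp = requests.get(
    "http://127.0.0.1:8000/email_and_phone",
    params={"text": "Reach me at jane.doe@example.com or 987-654-3210."},
)
masked_text, pii_locations = resp.json()
print(masked_text)      # emails and phone numbers replaced with [MASK]
print(pii_locations)    # [[start, end, "EMAIL"], [start, end, "PHONE NUMBER"], ...]
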