# Spaces: Sleeping
| import chardet | |
| import spacy | |
| from spacy.cli import download | |
# ------------------------
# CONFIGURATION
# ------------------------
# Settings handed to the "gliner_spacy" pipeline component via
# nlp.add_pipe(..., config=custom_spacy_config).
custom_spacy_config = dict(
    gliner_model="urchade/gliner_multi_pii-v1",
    labels=[
        "person",
        "organization",
        "company",
        "country",
        "medical condition",
        "credit card brand",
    ],
    threshold=0.39,
    style="ent",
)
# Build the shared spaCy pipeline: load the large English model, downloading
# it first if loading raises OSError, then add the GLiNER component.
_SPACY_MODEL_NAME = "en_core_web_lg"

try:
    nlp = spacy.load(_SPACY_MODEL_NAME)
except OSError:
    # Loading failed; fetch the model package and retry once.
    download(_SPACY_MODEL_NAME)
    nlp = spacy.load(_SPACY_MODEL_NAME)

nlp.add_pipe("gliner_spacy", config=custom_spacy_config)
def detect_encoding(file_bytes):
    """Guess the text encoding of a raw byte string.

    Args:
        file_bytes: Raw bytes to inspect.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``"utf-8"``
        when chardet reports no encoding.
    """
    result = chardet.detect(file_bytes)
    # chardet includes the "encoding" key even when it cannot make a guess
    # (e.g. for empty input) and sets it to None, so a dict.get() default
    # would never apply. Fall back explicitly instead.
    return result.get("encoding") or "utf-8"
def extract_entities_from_file(file_path):
    """Read a file, decode it, and return the entities found in its text.

    The file is read as bytes, decoded with the encoding reported by
    ``detect_encoding`` (undecodable bytes are dropped), and passed through
    the module-level ``nlp`` pipeline.

    Args:
        file_path: Path of the file to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one for each
        entity in ``doc.ents``.
    """
    with open(file_path, "rb") as f:
        file_bytes = f.read()

    # Guard against a missing/None encoding: bytes.decode(None) raises
    # TypeError, so fall back to UTF-8 when detection yields nothing.
    encoding = detect_encoding(file_bytes) or "utf-8"
    try:
        text = file_bytes.decode(encoding, errors="ignore")
    except LookupError:
        # The detector named a codec this Python build does not provide.
        text = file_bytes.decode("utf-8", errors="ignore")

    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]