Commit 34f1e83: Added example of how to run function from command line. Updated packages. Embedding model default now smaller and at fp16.
from spacy.cli import download
import spacy
from spacy.pipeline import Sentencizer
from funcs.presidio_analyzer_custom import analyze_dict

spacy.prefer_gpu()

def spacy_model_installed(model_name):
    """Load the requested spaCy model, downloading it first if it is not already installed."""
    try:
        nlp = spacy.load(model_name)
        print("Successfully loaded spaCy model")
        #print(nlp._path)
    except OSError:
        download(model_name)
        nlp = spacy.load(model_name)
        print("Successfully downloaded and loaded spaCy model")
        #print(nlp._path)
    return nlp

#if not is_model_installed(model_name):
#    os.system(f"python -m spacy download {model_name}")

model_name = "en_core_web_sm"
nlp = spacy_model_installed(model_name)

import re
import secrets
import base64
import time

from gradio import Progress
import pandas as pd
from faker import Faker
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from typing import List

# Function to split text into sentences and create a DataFrame using spaCy
def expand_sentences_spacy(df:pd.DataFrame, colname:str, custom_delimiters:List[str]=[], nlp=nlp, progress=Progress(track_tqdm=True)):
    '''
    Expand passages into sentences using spaCy's built-in sentence segmentation.
    '''
    expanded_data = []
    df = df.drop('index', axis=1, errors="ignore").reset_index(names='index')

    for index, row in progress.tqdm(df.iterrows(), unit="rows", desc="Splitting sentences"):
        doc = nlp(row[colname])
        for sent in doc.sents:
            expanded_data.append({'original_index': row['original_index'], 'document_index': row['index'], colname: sent.text})

    return pd.DataFrame(expanded_data)
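
# A minimal usage sketch (hypothetical data, for illustration only). The input DataFrame
# is assumed to already carry an 'original_index' column, since the function reads it:
# example_df = pd.DataFrame({'original_index': [0], 'text': ["First sentence. Second sentence."]})
# sentences_df = expand_sentences_spacy(example_df, 'text')
# -> one row per sentence, each keeping its original_index and document_index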

def anon_consistent_names(df:pd.DataFrame):
    # Pick out common names and replace each occurrence with the same fake person value
    df_dict = df.to_dict(orient="list")

    analyzer = AnalyzerEngine()
    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

    analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    analyzer_results = list(analyzer_results)

    text = analyzer_results[3].value
    recognizer_result = str(analyzer_results[3].recognizer_results)

    # Split the string representation of the recognizer results into individual list strings
    data_str = recognizer_result
    list_strs = data_str[1:-1].split('], [')
    def parse_dict(s):
        s = s.strip('[]')  # Remove any surrounding brackets
        items = s.split(', ')
        d = {}
        for item in items:
            key, value = item.split(': ')
            if key == 'score':
                d[key] = float(value)
            elif key in ['start', 'end']:
                d[key] = int(value)
            else:
                d[key] = value
        return d

    # Parse each list string back into a list of result dictionaries
    result = []
    for lst_str in list_strs:
        # Split each list string into individual dictionary strings
        dict_strs = lst_str.split(', type: ')
        dict_strs = [dict_strs[0]] + ['type: ' + s for s in dict_strs[1:]]  # Prepend "type: " back onto the split strings
        # Parse each dictionary string
        dicts = [parse_dict(d) for d in dict_strs]
        result.append(dicts)
    # Collect the PERSON spans found in each paragraph
    names = []
    for idx, paragraph in enumerate(text):
        paragraph_texts = []
        for dictionary in result[idx]:
            if dictionary['type'] == 'PERSON':
                paragraph_texts.append(paragraph[dictionary['start']:dictionary['end']])
        names.append(paragraph_texts)

    # Flatten the list of lists and extract unique names
    unique_names = list(set(name for sublist in names for name in sublist))

    # Generate one fake first name per unique real name so that replacements stay consistent
    fake = Faker("en_GB")

    def fake_first_name(x):
        return fake.first_name()

    fake_names = pd.Series(unique_names).apply(fake_first_name)

    mapping_df = pd.DataFrame(data={"Unique names": unique_names,
                                    "Fake names": fake_names})

    # Convert mapping dataframe to dictionary, adding word boundaries for full-word match
    name_map = {r'\b' + k + r'\b': v for k, v in zip(mapping_df['Unique names'], mapping_df['Fake names'])}

    scrubbed_df_consistent_names = df.replace(name_map, regex=True)

    return scrubbed_df_consistent_names

def detect_file_type(filename):
    """Detect the file type based on its extension."""
    if filename.endswith('.csv') or filename.endswith('.csv.gz') or filename.endswith('.zip'):
        return 'csv'
    elif filename.endswith('.xlsx'):
        return 'xlsx'
    elif filename.endswith('.parquet'):
        return 'parquet'
    else:
        raise ValueError("Unsupported file type.")

def read_file(filename):
    """Read the file based on its detected type."""
    file_type = detect_file_type(filename)

    if file_type == 'csv':
        return pd.read_csv(filename, low_memory=False)
    elif file_type == 'xlsx':
        return pd.read_excel(filename)
    elif file_type == 'parquet':
        return pd.read_parquet(filename)

def anonymise_script(df, chosen_col, anon_strat):
    #print(df.shape)
    #df_chosen_col_mask = (df[chosen_col].isnull()) | (df[chosen_col].str.strip() == "")
    #print("Length of input series blank at start is: ", df_chosen_col_mask.value_counts())

    # DataFrame to dict
    df_dict = pd.DataFrame(data={chosen_col: df[chosen_col].astype(str)}).to_dict(orient="list")

    analyzer = AnalyzerEngine()

    # Add a titles recognizer to the analyzer registry
    titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                          deny_list=["Mr", "Mrs", "Miss", "Ms", "mr", "mrs", "miss", "ms"])
    analyzer.registry.add_recognizer(titles_recognizer)

    batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
    anonymizer = AnonymizerEngine()
    batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=anonymizer)

    print("Identifying personal data")
    analyse_tic = time.perf_counter()
    #analyzer_results = batch_analyzer.analyze_dict(df_dict, language="en")
    analyzer_results = analyze_dict(batch_analyzer, df_dict, language="en")
    analyzer_results = list(analyzer_results)
    analyse_toc = time.perf_counter()
    analyse_time_out = f"Analysing the text took {analyse_toc - analyse_tic:0.1f} seconds."
    print(analyse_time_out)
    # Generate a 128-bit AES key, then base64-encode it to get a string representation
    key = secrets.token_bytes(16)  # 128 bits = 16 bytes
    key_string = base64.b64encode(key).decode('utf-8')

    # Create faker function (note that it has to receive a value)
    fake = Faker("en_GB")

    def fake_first_name(x):
        return fake.first_name()

    # Set up the anonymisation configurations (no special handling for DATE_TIME)
    replace_config = {"DEFAULT": OperatorConfig("replace")}
    redact_config = {"DEFAULT": OperatorConfig("redact")}
    hash_config = {"DEFAULT": OperatorConfig("hash")}
    mask_config = {"DEFAULT": OperatorConfig("mask", {"masking_char": "*", "chars_to_mask": 100, "from_end": True})}
    # The encryption uses an AES cipher in CBC mode and requires a cryptographic key as input for both the encryption and the decryption
    people_encrypt_config = {"PERSON": OperatorConfig("encrypt", {"key": key_string})}
    fake_first_name_config = {"PERSON": OperatorConfig("custom", {"lambda": fake_first_name})}

    if anon_strat == "replace":
        chosen_mask_config = replace_config
    elif anon_strat == "redact":
        chosen_mask_config = redact_config
    elif anon_strat == "hash":
        chosen_mask_config = hash_config
    elif anon_strat == "mask":
        chosen_mask_config = mask_config
    elif anon_strat == "encrypt":
        chosen_mask_config = people_encrypt_config
    elif anon_strat == "fake_first_name":
        chosen_mask_config = fake_first_name_config
    else:
        raise ValueError(f"Unknown anonymisation strategy: {anon_strat}")

    # I think in general people will want to keep date / times - NOT FOR TOPIC MODELLING
    #keep_date_config = eval('{"DATE_TIME": OperatorConfig("keep")}')
    combined_config = {**chosen_mask_config}  #, **keep_date_config}
| print("Anonymising personal data") | |
| anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results, operators=combined_config) | |
| #print(anonymizer_results) | |
| scrubbed_df = pd.DataFrame(data={chosen_col:anonymizer_results[chosen_col]}) | |
| scrubbed_series = scrubbed_df[chosen_col] | |
| #print(scrubbed_series[0:6]) | |
| #print("Length of output series is: ", len(scrubbed_series)) | |
| #print("Length of input series at end is: ", len(df[chosen_col])) | |
| #scrubbed_values_mask = (scrubbed_series.isnull()) | (scrubbed_series.str.strip() == "") | |
| #df_chosen_col_mask = (df[chosen_col].isnull()) | (df[chosen_col].str.strip() == "") | |
| #print("Length of input series blank at end is: ", df_chosen_col_mask.value_counts()) | |
| #print("Length of output series blank is: ", scrubbed_values_mask.value_counts()) | |
| # Create reporting message | |
| out_message = "Successfully anonymised" | |
| if anon_strat == "encrypt": | |
| out_message = out_message + ". Your decryption key is " + key_string + "." | |
| return scrubbed_series, out_message | |
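
# A minimal usage sketch (hypothetical data and column name, for illustration only):
# example_df = pd.DataFrame({"text": ["Please contact John Smith about the meeting."]})
# scrubbed_series, message = anonymise_script(example_df, chosen_col="text", anon_strat="replace")
# With the "replace" strategy, detected entities are swapped for placeholders such as <PERSON>.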

def do_anonymise(in_file, anon_strat, chosen_cols):
    # Load file(s)
    anon_df = pd.DataFrame()

    if in_file:
        for match_file in in_file:
            match_temp_file = pd.read_csv(match_file.name, delimiter=",", low_memory=False)  #, encoding='cp1252')
            anon_df = pd.concat([anon_df, match_temp_file])

    # Split dataframe to keep only selected columns
    all_cols_original_order = list(anon_df.columns)
    anon_df_part = anon_df[chosen_cols]
    anon_df_remain = anon_df.drop(chosen_cols, axis=1)

    # Anonymise the selected columns one at a time (anonymise_script works on a single column)
    anon_df_part_out = anon_df_part.copy()
    for col in chosen_cols:
        scrubbed_series, out_message = anonymise_script(anon_df_part, col, anon_strat)
        anon_df_part_out[col] = scrubbed_series.values

    # Rejoin the dataframe together
    anon_df_out = pd.concat([anon_df_part_out, anon_df_remain], axis=1)
    anon_df_out = anon_df_out[all_cols_original_order]

    # Export file
    out_file_part = re.sub(r'\.csv', '', match_file.name)
    anon_export_file_name = out_file_part + "_anon_" + anon_strat + ".csv"
    anon_df_out.to_csv(anon_export_file_name, index=None)

    return out_message, anon_export_file_name
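
# A minimal sketch of running the anonymisation from the command line. The argument
# names below are illustrative (this file does not define an official CLI), and the
# small wrapper class exists only because do_anonymise expects Gradio-style file
# objects that expose the file path via a .name attribute.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Anonymise selected columns of a CSV file.")
    parser.add_argument("--file", required=True, help="Path to the input CSV file")
    parser.add_argument("--columns", nargs="+", required=True, help="Names of the columns to anonymise")
    parser.add_argument("--strategy", default="replace",
                        choices=["replace", "redact", "hash", "mask", "encrypt", "fake_first_name"],
                        help="Anonymisation strategy to apply")
    args = parser.parse_args()

    class _PathWrapper:
        """Stand-in for the Gradio file object, which exposes the path via .name."""
        def __init__(self, name):
            self.name = name

    message, output_path = do_anonymise([_PathWrapper(args.file)], args.strategy, args.columns)
    print(message)
    print("Anonymised file written to:", output_path)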