File size: 27,457 Bytes
aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 f957846 aa5c211 44d987c bafcf39 aa5c211 bafcf39 b1f183d aa5c211 bafcf39 b1f183d bafcf39 b1f183d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d aa5c211 d60759d bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 d60759d bafcf39 aa5c211 bafcf39 d60759d 826ed50 3d18b9d 826ed50 d60759d bafcf39 d60759d bafcf39 d60759d 826ed50 3d18b9d 826ed50 bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 d60759d aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d aa5c211 d60759d aa5c211 bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 d60759d bafcf39 aa5c211 bafcf39 d60759d bafcf39 f957846 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 f957846 bafcf39 d60759d f957846 bafcf39 d60759d bafcf39 d60759d f957846 bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 d60759d aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 f957846 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d aa5c211 bafcf39 aa5c211 d60759d aa5c211 bafcf39 aa5c211 
d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d aa5c211 bafcf39 d60759d aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 826ed50 d60759d bafcf39 d60759d bafcf39 d60759d bafcf39 d60759d 826ed50 aa5c211 d60759d aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 bafcf39 aa5c211 d60759d bafcf39 aa5c211 bafcf39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 |
import os
import time
from pathlib import Path
from typing import Dict, List, Tuple
import gradio as gr
import pandas as pd
from gradio import Progress
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tools.config import (
DO_INITIAL_TABULAR_DATA_CLEAN,
MAX_SIMULTANEOUS_FILES,
MAX_TABLE_ROWS,
REMOVE_DUPLICATE_ROWS,
)
from tools.data_anonymise import initial_clean
from tools.helper_functions import OUTPUT_FOLDER, read_file
from tools.load_spacy_model_custom_recognisers import nlp
from tools.secure_path_utils import secure_join
def clean_and_stem_text_series(
    df: pd.DataFrame,
    column: str,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
):
    """
    Normalise a text column for duplicate detection.

    Adds/overwrites a ``text_clean`` column on ``df`` holding the
    lemmatised, stopword-free, lower-cased version of ``column``.
    Returns the same DataFrame (mutated in place).
    """

    def _lemmatise(raw_text):
        # Keep alphabetic, non-stopword tokens only, reduced to their lemmas.
        tokens = nlp(raw_text)
        kept = (tok.lemma_ for tok in tokens if tok.is_alpha and not tok.is_stop)
        return " ".join(kept)

    # Seed text_clean from the (optionally pre-cleaned) source column...
    seed = initial_clean(df[column]) if do_initial_clean_dup else df[column]
    df["text_clean"] = seed
    # ...then lemmatise and lower-case it.
    df["text_clean"] = df["text_clean"].apply(_lemmatise).str.lower()
    return df
def convert_tabular_data_to_analysis_format(
    df: pd.DataFrame, file_name: str, text_columns: List[str] = None
) -> List[Tuple[str, pd.DataFrame]]:
    """
    Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.

    Args:
        df (pd.DataFrame): The input DataFrame
        file_name (str): Name of the file
        text_columns (List[str], optional): Columns to analyze for duplicates.
            If None, all object/string-typed columns are used.

    Returns:
        List[Tuple[str, pd.DataFrame]]: List containing a single
        (file_name, processed_df) tuple, or an empty list when no usable
        text columns exist.
    """
    # BUG FIX: the previous code iterated over text_columns unconditionally,
    # so the documented default of None raised a TypeError. Fall back to
    # auto-detecting string-like columns, as the docstring promises.
    if text_columns is None:
        text_columns = df.select_dtypes(include=["object", "string"]).columns.tolist()
    # Keep only the requested columns that actually exist in this frame
    text_columns = [col for col in text_columns if col in df.columns]
    if not text_columns:
        print(f"No text columns found in {file_name}")
        return list()
    # Work on a copy to avoid mutating the caller's DataFrame
    df_copy = df.copy()
    # Join all text columns into one space-separated string per row
    df_copy["combined_text"] = (
        df_copy[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
    )
    # Row identifier used downstream as the stable row number
    df_copy["row_id"] = df_copy.index
    # Shape expected by the duplicate detection pipeline:
    # 'row_number', 'text' (combined text) and the source 'file' name
    processed_df = pd.DataFrame(
        {
            "row_number": df_copy["row_id"],
            "text": df_copy["combined_text"],
            "file": file_name,
        }
    )
    # Preserve the original column values for later reference/preview
    for col in text_columns:
        processed_df[f"original_{col}"] = df_copy[col]
    return [(file_name, processed_df)]
def find_duplicate_cells_in_tabular_data(
    input_files: List[str],
    similarity_threshold: float = 0.95,
    min_word_count: int = 3,
    text_columns: List[str] = [],
    output_folder: str = OUTPUT_FOLDER,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
    in_excel_tabular_sheets: str = "",
    progress: Progress = Progress(track_tqdm=True),
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
    """
    Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).

    Rows from every input file (and, for Excel files, every requested sheet)
    are combined into one frame, cleaned and lemmatised, vectorised with
    TF-IDF, and compared pairwise via cosine similarity. Pairs scoring at or
    above ``similarity_threshold`` are reported and per-file deduplication
    outputs are written to ``output_folder``.

    Args:
        input_files (List[str]): List of file paths to analyze
        similarity_threshold (float): Minimum similarity score to consider duplicates
        min_word_count (int): Minimum word count for text to be considered
        text_columns (List[str], optional): Specific columns to analyze
        output_folder (str, optional): Output folder for results
        do_initial_clean_dup (bool, optional): Whether to do initial clean of text
        remove_duplicate_rows (bool, optional): Whether duplicate rows are
            dropped from the saved deduplicated output files
        in_excel_tabular_sheets (str): Excel sheet names to search. NOTE(review):
            annotated as str but iterated below, so callers appear to pass a
            list of sheet names — confirm and fix the annotation upstream.
        progress (Progress): Progress tracking object
    Returns:
        Tuple containing:
        - results_df: DataFrame with duplicate matches
        - output_paths: List of output file paths
        - full_data_by_file: Dictionary of processed data by file
    """
    if not input_files:
        raise gr.Error("Please upload files to analyze.")
    progress(0.1, desc="Loading and processing files...")
    all_data_to_process = list()
    full_data_by_file = dict()
    file_paths = list()
    # Process each file
    for file_path in input_files:
        try:
            if file_path.endswith(".xlsx") or file_path.endswith(".xls"):
                temp_df = pd.DataFrame()
                # Try finding each sheet in the given list until a match is found
                for sheet_name in in_excel_tabular_sheets:
                    temp_df = read_file(file_path, excel_sheet_name=sheet_name)
                    # If sheet was successfully_loaded
                    if not temp_df.empty:
                        # Guard against oversized sheets before any heavy work
                        if temp_df.shape[0] > MAX_TABLE_ROWS:
                            out_message = f"Number of rows in {file_path} for sheet {sheet_name} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
                            print(out_message)
                            raise Exception(out_message)
                        # Sheet name is appended so each sheet gets its own key
                        file_name = os.path.basename(file_path) + "_" + sheet_name
                        file_paths.append(file_path)
                        # Convert to analysis format
                        processed_data = convert_tabular_data_to_analysis_format(
                            temp_df, file_name, text_columns
                        )
                        if processed_data:
                            all_data_to_process.extend(processed_data)
                            full_data_by_file[file_name] = processed_data[0][1]
                        # Reset so the emptiness check works for the next sheet
                        temp_df = pd.DataFrame()
            else:
                # Non-Excel path: load the whole file in one go
                temp_df = read_file(file_path)
                if temp_df.shape[0] > MAX_TABLE_ROWS:
                    out_message = f"Number of rows in {file_path} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
                    print(out_message)
                    raise Exception(out_message)
                file_name = os.path.basename(file_path)
                file_paths.append(file_path)
                # Convert to analysis format
                processed_data = convert_tabular_data_to_analysis_format(
                    temp_df, file_name, text_columns
                )
                if processed_data:
                    all_data_to_process.extend(processed_data)
                    full_data_by_file[file_name] = processed_data[0][1]
        except Exception as e:
            # Best-effort: a bad file is skipped rather than aborting the batch
            print(f"Error processing {file_path}: {e}")
            continue
    if not all_data_to_process:
        raise gr.Error("No valid data found in uploaded files.")
    progress(0.2, desc="Combining data...")
    # Combine all data into a single frame; row identity is (row_number, file)
    combined_df = pd.concat(
        [data[1] for data in all_data_to_process], ignore_index=True
    )
    combined_df = combined_df.drop_duplicates(subset=["row_number", "file"])
    progress(0.3, desc="Cleaning and preparing text...")
    # Clean and prepare text (adds the 'text_clean' column)
    combined_df = clean_and_stem_text_series(
        combined_df, "text", do_initial_clean_dup=do_initial_clean_dup
    )
    # Filter by minimum word count (short rows produce noisy matches)
    combined_df["word_count"] = (
        combined_df["text_clean"].str.split().str.len().fillna(0)
    )
    combined_df = combined_df[combined_df["word_count"] >= min_word_count].copy()
    # Fewer than two rows means no pair can exist
    if len(combined_df) < 2:
        return pd.DataFrame(), [], full_data_by_file
    progress(0.4, desc="Calculating similarities...")
    # Calculate pairwise cosine similarity on TF-IDF vectors; sparse output
    # keeps memory bounded for large row counts
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_df["text_clean"])
    similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
    # Find similar pairs; r < c keeps only the upper triangle so each pair
    # is reported once and self-matches are excluded
    coo_matrix = similarity_matrix.tocoo()
    similar_pairs = [
        (r, c, v)
        for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
        if r < c and v >= similarity_threshold
    ]
    if not similar_pairs:
        gr.Info("No duplicate cells found.")
        return pd.DataFrame(), [], full_data_by_file
    progress(0.7, desc="Processing results...")
    # Create results DataFrame; r/c are positional indices into combined_df,
    # which aligns with the TF-IDF matrix because it was built after filtering
    results_data = []
    for row1, row2, similarity in similar_pairs:
        row1_data = combined_df.iloc[row1]
        row2_data = combined_df.iloc[row2]
        results_data.append(
            {
                "File1": row1_data["file"],
                "Row1": int(row1_data["row_number"]),
                "File2": row2_data["file"],
                "Row2": int(row2_data["row_number"]),
                "Similarity_Score": round(similarity, 3),
                # Preview text is truncated to 200 chars for display
                "Text1": (
                    row1_data["text"][:200] + "..."
                    if len(row1_data["text"]) > 200
                    else row1_data["text"]
                ),
                "Text2": (
                    row2_data["text"][:200] + "..."
                    if len(row2_data["text"]) > 200
                    else row2_data["text"]
                ),
                "Original_Index1": row1,
                "Original_Index2": row2,
            }
        )
    results_df = pd.DataFrame(results_data)
    results_df = results_df.sort_values(["File1", "Row1", "File2", "Row2"])
    progress(0.9, desc="Saving results...")
    # Save results (CSV report plus per-file deduplicated outputs)
    output_paths = save_tabular_duplicate_results(
        results_df,
        output_folder,
        file_paths,
        remove_duplicate_rows=remove_duplicate_rows,
        in_excel_tabular_sheets=in_excel_tabular_sheets,
    )
    gr.Info(f"Found {len(results_df)} duplicate cell matches")
    return results_df, output_paths, full_data_by_file
def save_tabular_duplicate_results(
    results_df: pd.DataFrame,
    output_folder: str,
    file_paths: List[str],
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
    in_excel_tabular_sheets: List[str] = [],
) -> List[str]:
    """
    Save tabular duplicate detection results to files.

    Writes a master CSV of matches, a per-file/per-sheet list of rows to
    remove, and a "*_deduplicated" copy of each affected file (all sheets
    for Excel workbooks, the processed sheets replaced).

    Args:
        results_df (pd.DataFrame): Results DataFrame (File1/Row1/File2/Row2 ...)
        output_folder (str): Output folder path
        file_paths (List[str]): Original input file paths, used to match the
            composite names in the "File2" column back to real files
        remove_duplicate_rows (bool): Whether to drop duplicate rows from the
            saved deduplicated files (otherwise they are only flagged in a
            'duplicated' column)
        in_excel_tabular_sheets (List[str]): currently unused in this
            function; kept for interface compatibility with callers
    Returns:
        List[str]: List of output file paths
    """
    output_paths = list()
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(exist_ok=True)
    if results_df.empty:
        print("No duplicate matches to save.")
        return list()
    # Save main results
    results_file = output_folder_path / "tabular_duplicate_results.csv"
    results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
    output_paths.append(str(results_file))
    # Group results by original file to handle Excel files properly
    excel_files_processed = dict()  # Track which Excel files have been processed
    # Save per-file duplicate lists. "File2" holds the composite name
    # (basename, plus "_<sheet>" for Excel), so match by substring below.
    for file_name, group in results_df.groupby("File2"):
        # Check for matches with original file names
        for original_file in file_paths:
            original_file_name = os.path.basename(original_file)
            # NOTE(review): substring matching could mis-pair files whose
            # basenames are prefixes of one another — verify inputs upstream.
            if original_file_name in file_name:
                original_file_extension = os.path.splitext(original_file)[-1]
                if original_file_extension in [".xlsx", ".xls"]:
                    # Split the string using secure regex to handle both .xlsx_ and .xls_ delimiters
                    from tools.secure_regex_utils import safe_split_filename

                    parts = safe_split_filename(
                        os.path.basename(file_name), [".xlsx_", ".xls_"]
                    )
                    # The sheet name is the last part after splitting
                    file_sheet_name = parts[-1]
                    file_path = original_file
                    # Initialize Excel file tracking if not already done
                    if file_path not in excel_files_processed:
                        excel_files_processed[file_path] = {
                            "sheets_data": dict(),
                            "all_sheets": list(),
                            "processed_sheets": set(),
                        }
                    # Read the original Excel file to get all sheet names
                    # (done once per workbook, lazily)
                    if not excel_files_processed[file_path]["all_sheets"]:
                        try:
                            excel_file = pd.ExcelFile(file_path)
                            excel_files_processed[file_path][
                                "all_sheets"
                            ] = excel_file.sheet_names
                        except Exception as e:
                            print(f"Error reading Excel file {file_path}: {e}")
                            continue
                    # Read the current sheet
                    df = read_file(file_path, excel_sheet_name=file_sheet_name)
                    # Create duplicate rows file for this sheet
                    file_stem = Path(file_name).stem
                    duplicate_rows_file = (
                        output_folder_path
                        / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
                    )
                    # Get unique row numbers to remove
                    rows_to_remove = sorted(group["Row2"].unique())
                    duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
                    duplicate_df.to_csv(duplicate_rows_file, index=False)
                    output_paths.append(str(duplicate_rows_file))
                    # Process the sheet data: flag duplicates, optionally drop.
                    # NOTE(review): .loc/.drop index by label here — assumes the
                    # sheet has a default RangeIndex matching Row2; confirm.
                    df_cleaned = df.copy()
                    df_cleaned["duplicated"] = False
                    df_cleaned.loc[rows_to_remove, "duplicated"] = True
                    if remove_duplicate_rows:
                        df_cleaned = df_cleaned.drop(index=rows_to_remove)
                    # Store the processed sheet data for the workbook rewrite
                    excel_files_processed[file_path]["sheets_data"][
                        file_sheet_name
                    ] = df_cleaned
                    excel_files_processed[file_path]["processed_sheets"].add(
                        file_sheet_name
                    )
                else:
                    # Non-Excel file: write the deduplicated copy immediately
                    file_sheet_name = ""
                    file_path = original_file
                    print("file_path after match:", file_path)
                    file_base_name = os.path.basename(file_path)
                    df = read_file(file_path)
                    file_stem = Path(file_name).stem
                    duplicate_rows_file = (
                        output_folder_path / f"{file_stem}_duplicate_rows.csv"
                    )
                    # Get unique row numbers to remove
                    rows_to_remove = sorted(group["Row2"].unique())
                    duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
                    duplicate_df.to_csv(duplicate_rows_file, index=False)
                    output_paths.append(str(duplicate_rows_file))
                    # Flag duplicates, optionally drop them
                    df_cleaned = df.copy()
                    df_cleaned["duplicated"] = False
                    df_cleaned.loc[rows_to_remove, "duplicated"] = True
                    if remove_duplicate_rows:
                        df_cleaned = df_cleaned.drop(index=rows_to_remove)
                    # Preserve parquet format; everything else is saved as CSV
                    file_ext = os.path.splitext(file_name)[-1]
                    if file_ext in [".parquet"]:
                        output_path = secure_join(
                            output_folder, f"{file_base_name}_deduplicated.parquet"
                        )
                        df_cleaned.to_parquet(output_path, index=False)
                    else:
                        output_path = secure_join(
                            output_folder, f"{file_base_name}_deduplicated.csv"
                        )
                        df_cleaned.to_csv(
                            output_path, index=False, encoding="utf-8-sig"
                        )
                    output_paths.append(str(output_path))
                # Stop scanning file_paths once this group is matched
                break
    # Process Excel files to create complete deduplicated files
    for file_path, file_data in excel_files_processed.items():
        try:
            # Create output filename
            file_base_name = os.path.splitext(os.path.basename(file_path))[0]
            file_ext = os.path.splitext(file_path)[-1]
            output_path = secure_join(
                output_folder, f"{file_base_name}_deduplicated{file_ext}"
            )
            # Create Excel writer and rebuild the whole workbook
            with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
                # Write all sheets
                for sheet_name in file_data["all_sheets"]:
                    if sheet_name in file_data["processed_sheets"]:
                        # Use the processed (deduplicated) version
                        file_data["sheets_data"][sheet_name].to_excel(
                            writer, sheet_name=sheet_name, index=False
                        )
                    else:
                        # Use the original sheet (no duplicates found)
                        original_df = read_file(file_path, excel_sheet_name=sheet_name)
                        original_df.to_excel(writer, sheet_name=sheet_name, index=False)
            output_paths.append(str(output_path))
            print(f"Created deduplicated Excel file: {output_path}")
        except Exception as e:
            # A failed workbook rewrite skips that file but keeps the rest
            print(f"Error creating deduplicated Excel file for {file_path}: {e}")
            continue
    return output_paths
def remove_duplicate_rows_from_tabular_data(
    file_path: str,
    duplicate_rows: List[int],
    output_folder: str = OUTPUT_FOLDER,
    in_excel_tabular_sheets: List[str] = [],
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
) -> str:
    """
    Remove duplicate rows from a tabular data file and save a cleaned copy.

    Args:
        file_path (str): Path to the input file
        duplicate_rows (List[int]): 0-indexed row positions to remove
        output_folder (str): Output folder for the cleaned file
        in_excel_tabular_sheets (List[str] | str): Excel sheet to read/write.
            A list uses its first entry; empty falls back to the reader's
            default and "Sheet1" on write.
        remove_duplicate_rows (bool): unused here; kept for interface
            compatibility with callers that pass it through

    Returns:
        str: Path to the cleaned file

    Raises:
        Exception: re-raises any underlying error after logging it
    """
    try:
        # Normalise the sheet argument: callers pass a string, a list of
        # sheet names, or nothing at all.
        if isinstance(in_excel_tabular_sheets, list):
            sheet = in_excel_tabular_sheets[0] if in_excel_tabular_sheets else ""
        else:
            sheet = in_excel_tabular_sheets or ""
        # Load the file (sheet name is ignored for non-Excel formats)
        df = read_file(file_path, excel_sheet_name=sheet)
        # Remove duplicate rows (0-indexed) and rebuild a clean index
        df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)
        # Save cleaned file next to a *_deduplicated suffix
        file_name = os.path.basename(file_path)
        file_stem = os.path.splitext(file_name)[0]
        file_ext = os.path.splitext(file_name)[-1]
        output_path = secure_join(output_folder, f"{file_stem}_deduplicated{file_ext}")
        if file_ext in [".xlsx", ".xls"]:
            # BUG FIX: to_excel requires a single sheet-name string; the
            # previous code passed an empty list when no sheet was supplied,
            # which raises. Fall back to pandas' default name instead.
            df_cleaned.to_excel(
                output_path,
                index=False,
                sheet_name=sheet if sheet else "Sheet1",
            )
        elif file_ext in [".parquet"]:
            df_cleaned.to_parquet(output_path, index=False)
        else:
            df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")
        return output_path
    except Exception as e:
        print(f"Error removing duplicates from {file_path}: {e}")
        raise
def run_tabular_duplicate_analysis(
    files: List[str],
    threshold: float,
    min_words: int,
    text_columns: List[str] = [],
    output_folder: str = OUTPUT_FOLDER,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
    in_excel_tabular_sheets: List[str] = [],
    progress: Progress = Progress(track_tqdm=True),
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
    """
    Main entry point for tabular duplicate analysis.

    Thin wrapper that normalises optional arguments and delegates all work
    to find_duplicate_cells_in_tabular_data.

    Args:
        files (List[str]): List of file paths
        threshold (float): Similarity threshold
        min_words (int): Minimum word count
        text_columns (List[str], optional): Specific columns to analyze
        output_folder (str, optional): Output folder for results
        do_initial_clean_dup (bool, optional): Whether to run the initial clean
        remove_duplicate_rows (bool, optional): Drop duplicates in saved files
        in_excel_tabular_sheets (List[str], optional): Excel sheets to search
        progress (Progress): Progress tracking
    Returns:
        Tuple containing results DataFrame, output paths, and full data by file
    """
    # Guard against falsy values so the callee always receives real lists.
    columns_arg = text_columns if text_columns else []
    sheets_arg = in_excel_tabular_sheets if in_excel_tabular_sheets else []
    return find_duplicate_cells_in_tabular_data(
        input_files=files,
        similarity_threshold=threshold,
        min_word_count=min_words,
        text_columns=columns_arg,
        output_folder=output_folder,
        do_initial_clean_dup=do_initial_clean_dup,
        in_excel_tabular_sheets=sheets_arg,
        remove_duplicate_rows=remove_duplicate_rows,
    )
# Function to update column choices when files are uploaded
def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
    """
    Build the dropdown choices of candidate text columns across uploaded files.

    Scans every uploaded file (every requested sheet for Excel files) and
    collects the union of object/string-typed column names.

    Args:
        files: list of uploaded file objects (each with a .name path)
        in_excel_tabular_sheets (List[str]): Excel sheet names to scan

    Returns:
        A Gradio Dropdown update with the sorted column names as choices.
    """
    if not files:
        return gr.update(choices=[])
    all_columns = set()
    for file in files:
        try:
            file_extension = os.path.splitext(file.name)[-1]
            if file_extension in [".xlsx", ".xls"]:
                # Collect string-typed columns from every requested sheet
                for sheet_name in in_excel_tabular_sheets:
                    df = read_file(file.name, excel_sheet_name=sheet_name)
                    text_cols = df.select_dtypes(
                        include=["object", "string"]
                    ).columns.tolist()
                    all_columns.update(text_cols)
            else:
                df = read_file(file.name)
                text_cols = df.select_dtypes(
                    include=["object", "string"]
                ).columns.tolist()
                all_columns.update(text_cols)
            # BUG FIX: removed a redundant post-branch recomputation of
            # text_cols that referenced `df` — undefined (NameError) when the
            # Excel sheet list was empty, silently swallowed by the except.
        except Exception as e:
            print(f"Error reading {file.name}: {e}")
            continue
    return gr.Dropdown(choices=sorted(list(all_columns)))
# Function to handle tabular duplicate detection
def run_tabular_duplicate_detection(
    files,
    threshold,
    min_words,
    text_columns,
    output_folder: str = OUTPUT_FOLDER,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
    in_excel_tabular_sheets: List[str] = [],
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
):
    """
    Gradio event handler for tabular duplicate detection.

    Normalises the 'files' input (string path, file object, or a list of
    either) into plain paths, enforces the simultaneous-file limit, runs the
    duplicate analysis and returns the UI outputs.

    Args:
        files: str path, file-like object with .name, or a list of either
        threshold: similarity threshold passed to the analysis
        min_words: minimum word count per row for comparison
        text_columns: columns to analyse (falsy means auto/default handling)
        output_folder (str): destination folder for output files
        do_initial_clean_dup (bool): whether to run the initial text clean
        in_excel_tabular_sheets (List[str]): Excel sheet names to search
        remove_duplicate_rows (bool): drop duplicate rows in saved outputs

    Returns:
        Tuple of (results_df, output_paths, file-choice Dropdown update,
        processing time in seconds, task name string "deduplicate")

    Raises:
        TypeError: if 'files' is not a supported type
        Exception: if more than MAX_SIMULTANEOUS_FILES files are supplied
    """
    if not files:
        print("No files uploaded")
        return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"
    start_time = time.time()
    task_textbox = "deduplicate"
    # If output folder doesn't end with a forward slash, add one
    if not output_folder.endswith("/"):
        output_folder = output_folder + "/"
    file_paths = list()
    if isinstance(files, str):
        # A single string is a direct file path
        file_paths.append(files)
    elif isinstance(files, list):
        # A list may mix plain paths and file objects
        for f_item in files:
            if isinstance(f_item, str):
                file_paths.append(f_item)
            elif hasattr(f_item, "name"):
                # e.g. a Gradio File object — use its underlying path
                file_paths.append(f_item.name)
            else:
                print(
                    f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}"
                )
    elif hasattr(files, "name"):
        # A single file object passed directly, not in a list
        file_paths.append(files.name)
    else:
        raise TypeError(
            f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object."
        )
    if len(file_paths) > MAX_SIMULTANEOUS_FILES:
        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
        print(out_message)
        raise Exception(out_message)
    results_df, output_paths, full_data = run_tabular_duplicate_analysis(
        files=file_paths,
        threshold=threshold,
        min_words=min_words,
        text_columns=text_columns if text_columns else [],
        output_folder=output_folder,
        do_initial_clean_dup=do_initial_clean_dup,
        # Consistency fix: pass [] (the callee's own default) instead of None
        in_excel_tabular_sheets=(
            in_excel_tabular_sheets if in_excel_tabular_sheets else []
        ),
        remove_duplicate_rows=remove_duplicate_rows,
    )
    # BUG FIX: list(set(...)) produced non-deterministic dropdown ordering;
    # dict.fromkeys de-duplicates while preserving first-seen order.
    file_choices = list(dict.fromkeys(file_paths))
    end_time = time.time()
    processing_time = round(end_time - start_time, 2)
    return (
        results_df,
        output_paths,
        gr.Dropdown(choices=file_choices),
        processing_time,
        task_textbox,
    )
# Function to handle row selection for preview
def handle_tabular_row_selection(results_df, evt: gr.SelectData):
    """Return the selected row's index and its two text snippets for preview."""
    no_selection = (None, "", "")
    # Bail out on a missing event or an unusable/empty results frame.
    if not evt or not isinstance(results_df, pd.DataFrame) or results_df.empty:
        return no_selection
    chosen = evt.index[0]
    # Guard against a stale selection pointing past the current frame.
    if chosen >= len(results_df):
        return no_selection
    match = results_df.iloc[chosen]
    return chosen, match["Text1"], match["Text2"]
# Function to clean duplicates from selected file
def clean_tabular_duplicates(
    file_name,
    results_df,
    output_folder,
    in_excel_tabular_sheets: str = "",
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
):
    """
    Remove the duplicate rows reported for ``file_name`` and save a cleaned copy.

    Returns the path of the cleaned file, or None when there is nothing to do
    or cleaning fails.
    """
    if not file_name or results_df.empty:
        return None
    # Rows flagged as duplicates for this file (the File2/Row2 side of matches).
    duplicate_row_ids = results_df.loc[
        results_df["File2"] == file_name, "Row2"
    ].tolist()
    if not duplicate_row_ids:
        return None
    try:
        # NOTE: file_name is used directly as the path here — a simplified
        # approach; in practice the original file paths could be stored.
        return remove_duplicate_rows_from_tabular_data(
            file_path=file_name,
            duplicate_rows=duplicate_row_ids,
            output_folder=output_folder,
            in_excel_tabular_sheets=in_excel_tabular_sheets,
            remove_duplicate_rows=remove_duplicate_rows,
        )
    except Exception as e:
        print(f"Error cleaning duplicates: {e}")
        return None
|