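"""Tabular duplicate detection.

Finds near-duplicate rows/cells across CSV, XLSX/XLS and Parquet files by
combining the selected text columns, cleaning and lemmatising the text, and
comparing TF-IDF vectors with cosine similarity. Also contains the helper
functions used to drive this analysis from the Gradio UI.
"""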
import os
import time
from pathlib import Path
from typing import Dict, List, Tuple

import gradio as gr
import pandas as pd
from gradio import Progress
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tools.config import (
    DO_INITIAL_TABULAR_DATA_CLEAN,
    MAX_SIMULTANEOUS_FILES,
    MAX_TABLE_ROWS,
    REMOVE_DUPLICATE_ROWS,
)
from tools.data_anonymise import initial_clean
from tools.helper_functions import OUTPUT_FOLDER, read_file
from tools.load_spacy_model_custom_recognisers import nlp
from tools.secure_path_utils import secure_join


def clean_and_stem_text_series(
    df: pd.DataFrame,
    column: str,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
):
    """
    Clean and stem text columns in a data frame for tabular data
    """

    # Function to apply lemmatisation and remove stopwords
    def _apply_lemmatization(text):
        doc = nlp(text)
        # Keep only alphabetic tokens and remove stopwords
        lemmatized_words = [
            token.lemma_ for token in doc if token.is_alpha and not token.is_stop
        ]
        return " ".join(lemmatized_words)

    # Always create text_clean column first
    if do_initial_clean_dup:
        df["text_clean"] = initial_clean(df[column])
    else:
        df["text_clean"] = df[column]

    df["text_clean"] = df["text_clean"].apply(_apply_lemmatization)
    df["text_clean"] = df[
        "text_clean"
    ].str.lower()  # .str.replace(r'[^\w\s]', '', regex=True)

    return df


def convert_tabular_data_to_analysis_format(
    df: pd.DataFrame, file_name: str, text_columns: List[str] = None
) -> List[Tuple[str, pd.DataFrame]]:
    """
    Convert tabular data (CSV/XLSX) to the format needed for duplicate analysis.

    Args:
        df (pd.DataFrame): The input DataFrame
        file_name (str): Name of the file
        text_columns (List[str], optional): Columns to analyze for duplicates.
                                          If None, uses all string columns.

    Returns:
        List[Tuple[str, pd.DataFrame]]: List containing (file_name, processed_df) tuple
    """
    if text_columns is None:
        # Auto-detect text columns (string-typed columns), per the docstring above
        text_columns = df.select_dtypes(include=["object", "string"]).columns.tolist()

    text_columns = [col for col in text_columns if col in df.columns]

    if not text_columns:
        print(f"No text columns found in {file_name}")
        return list()

    # Create a copy to avoid modifying original
    df_copy = df.copy()

    # Create a combined text column from all text columns
    df_copy["combined_text"] = (
        df_copy[text_columns].fillna("").astype(str).agg(" ".join, axis=1)
    )

    # Add row identifier
    df_copy["row_id"] = df_copy.index

    # Create the format expected by the duplicate detection system
    # Using 'row_number' as row number and 'text' as the combined text
    processed_df = pd.DataFrame(
        {
            "row_number": df_copy["row_id"],
            "text": df_copy["combined_text"],
            "file": file_name,
        }
    )

    # Add original row data for reference
    for col in text_columns:
        processed_df[f"original_{col}"] = df_copy[col]

    return [(file_name, processed_df)]
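
# Example of the conversion (illustrative only; the file and column names are hypothetical):
#     df = pd.DataFrame({"comment": ["first note", "second note"]})
#     convert_tabular_data_to_analysis_format(df, "notes.csv", ["comment"])
#     # -> [("notes.csv", DataFrame with columns row_number, text, file, original_comment)]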


def find_duplicate_cells_in_tabular_data(
    input_files: List[str],
    similarity_threshold: float = 0.95,
    min_word_count: int = 3,
    text_columns: List[str] = [],
    output_folder: str = OUTPUT_FOLDER,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
    in_excel_tabular_sheets: List[str] = [],
    progress: Progress = Progress(track_tqdm=True),
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
    """
    Find duplicate cells/text in tabular data files (CSV, XLSX, Parquet).

    Args:
        input_files (List[str]): List of file paths to analyze
        similarity_threshold (float): Minimum similarity score to consider duplicates
        min_word_count (int): Minimum word count for text to be considered
        text_columns (List[str], optional): Specific columns to analyze
        output_folder (str, optional): Output folder for results
        do_initial_clean_dup (bool, optional): Whether to do an initial clean of the text
        remove_duplicate_rows (bool, optional): Whether to drop duplicate rows from the output files
        in_excel_tabular_sheets (List[str], optional): Excel sheet names to analyse
        progress (Progress): Progress tracking object

    Returns:
        Tuple containing:
        - results_df: DataFrame with duplicate matches
        - output_paths: List of output file paths
        - full_data_by_file: Dictionary of processed data by file
    """

    if not input_files:
        raise gr.Error("Please upload files to analyze.")

    progress(0.1, desc="Loading and processing files...")

    all_data_to_process = list()
    full_data_by_file = dict()
    file_paths = list()

    # Process each file
    for file_path in input_files:
        try:
            if file_path.endswith(".xlsx") or file_path.endswith(".xls"):
                temp_df = pd.DataFrame()

                # Try finding each sheet in the given list until a match is found
                for sheet_name in in_excel_tabular_sheets:
                    temp_df = read_file(file_path, excel_sheet_name=sheet_name)

                    # If the sheet was successfully loaded
                    if not temp_df.empty:
                        if temp_df.shape[0] > MAX_TABLE_ROWS:
                            out_message = f"Number of rows in {file_path} for sheet {sheet_name} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
                            print(out_message)
                            raise Exception(out_message)

                        file_name = os.path.basename(file_path) + "_" + sheet_name
                        file_paths.append(file_path)

                        # Convert to analysis format
                        processed_data = convert_tabular_data_to_analysis_format(
                            temp_df, file_name, text_columns
                        )

                        if processed_data:
                            all_data_to_process.extend(processed_data)
                            full_data_by_file[file_name] = processed_data[0][1]

                    temp_df = pd.DataFrame()
            else:
                temp_df = read_file(file_path)

                if temp_df.shape[0] > MAX_TABLE_ROWS:
                    out_message = f"Number of rows in {file_path} is greater than {MAX_TABLE_ROWS}. Please submit a smaller file."
                    print(out_message)
                    raise Exception(out_message)

                file_name = os.path.basename(file_path)
                file_paths.append(file_path)

                # Convert to analysis format
                processed_data = convert_tabular_data_to_analysis_format(
                    temp_df, file_name, text_columns
                )

                if processed_data:
                    all_data_to_process.extend(processed_data)
                    full_data_by_file[file_name] = processed_data[0][1]

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

    if not all_data_to_process:
        raise gr.Error("No valid data found in uploaded files.")

    progress(0.2, desc="Combining data...")

    # Combine all data
    combined_df = pd.concat(
        [data[1] for data in all_data_to_process], ignore_index=True
    )

    combined_df = combined_df.drop_duplicates(subset=["row_number", "file"])

    progress(0.3, desc="Cleaning and preparing text...")

    # Clean and prepare text
    combined_df = clean_and_stem_text_series(
        combined_df, "text", do_initial_clean_dup=do_initial_clean_dup
    )

    # Filter by minimum word count
    combined_df["word_count"] = (
        combined_df["text_clean"].str.split().str.len().fillna(0)
    )
    combined_df = combined_df[combined_df["word_count"] >= min_word_count].copy()

    if len(combined_df) < 2:
        return pd.DataFrame(), [], full_data_by_file

    progress(0.4, desc="Calculating similarities...")

    # Calculate similarities
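    # TF-IDF vectors are compared with cosine similarity; dense_output=False keeps
    # the result as a sparse matrix to limit memory use on larger tables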
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(combined_df["text_clean"])
    similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)

    # Find similar pairs
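    # The similarity matrix is symmetric, so keep only the upper triangle (r < c)
    # to report each pair once, and apply the similarity threshold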
    coo_matrix = similarity_matrix.tocoo()
    similar_pairs = [
        (r, c, v)
        for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
        if r < c and v >= similarity_threshold
    ]

    if not similar_pairs:
        gr.Info("No duplicate cells found.")
        return pd.DataFrame(), [], full_data_by_file

    progress(0.7, desc="Processing results...")

    # Create results DataFrame
    results_data = []
    for row1, row2, similarity in similar_pairs:
        row1_data = combined_df.iloc[row1]
        row2_data = combined_df.iloc[row2]

        results_data.append(
            {
                "File1": row1_data["file"],
                "Row1": int(row1_data["row_number"]),
                "File2": row2_data["file"],
                "Row2": int(row2_data["row_number"]),
                "Similarity_Score": round(similarity, 3),
                "Text1": (
                    row1_data["text"][:200] + "..."
                    if len(row1_data["text"]) > 200
                    else row1_data["text"]
                ),
                "Text2": (
                    row2_data["text"][:200] + "..."
                    if len(row2_data["text"]) > 200
                    else row2_data["text"]
                ),
                "Original_Index1": row1,
                "Original_Index2": row2,
            }
        )

    results_df = pd.DataFrame(results_data)
    results_df = results_df.sort_values(["File1", "Row1", "File2", "Row2"])

    progress(0.9, desc="Saving results...")

    # Save results
    output_paths = save_tabular_duplicate_results(
        results_df,
        output_folder,
        file_paths,
        remove_duplicate_rows=remove_duplicate_rows,
        in_excel_tabular_sheets=in_excel_tabular_sheets,
    )

    gr.Info(f"Found {len(results_df)} duplicate cell matches")

    return results_df, output_paths, full_data_by_file


def save_tabular_duplicate_results(
    results_df: pd.DataFrame,
    output_folder: str,
    file_paths: List[str],
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
    in_excel_tabular_sheets: List[str] = [],
) -> List[str]:
    """
    Save tabular duplicate detection results to files.

    Args:
        results_df (pd.DataFrame): Results DataFrame
        output_folder (str): Output folder path
        file_paths (List[str]): List of file paths
        remove_duplicate_rows (bool): Whether to remove duplicate rows
        in_excel_tabular_sheets (List[str]): Excel sheet names associated with the input files
    Returns:
        List[str]: List of output file paths
    """
    output_paths = list()
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(exist_ok=True)

    if results_df.empty:
        print("No duplicate matches to save.")
        return list()

    # Save main results
    results_file = output_folder_path / "tabular_duplicate_results.csv"
    results_df.to_csv(results_file, index=False, encoding="utf-8-sig")
    output_paths.append(str(results_file))

    # Group results by original file to handle Excel files properly
    excel_files_processed = dict()  # Track which Excel files have been processed

    # Save per-file duplicate lists
    for file_name, group in results_df.groupby("File2"):
        # Check for matches with original file names
        for original_file in file_paths:
            original_file_name = os.path.basename(original_file)

            if original_file_name in file_name:
                original_file_extension = os.path.splitext(original_file)[-1]
                if original_file_extension in [".xlsx", ".xls"]:

                    # Split the string using secure regex to handle both .xlsx_ and .xls_ delimiters
                    from tools.secure_regex_utils import safe_split_filename

                    parts = safe_split_filename(
                        os.path.basename(file_name), [".xlsx_", ".xls_"]
                    )
                    # The sheet name is the last part after splitting
                    file_sheet_name = parts[-1]

                    file_path = original_file

                    # Initialize Excel file tracking if not already done
                    if file_path not in excel_files_processed:
                        excel_files_processed[file_path] = {
                            "sheets_data": dict(),
                            "all_sheets": list(),
                            "processed_sheets": set(),
                        }

                    # Read the original Excel file to get all sheet names
                    if not excel_files_processed[file_path]["all_sheets"]:
                        try:
                            excel_file = pd.ExcelFile(file_path)
                            excel_files_processed[file_path][
                                "all_sheets"
                            ] = excel_file.sheet_names
                        except Exception as e:
                            print(f"Error reading Excel file {file_path}: {e}")
                            continue

                    # Read the current sheet
                    df = read_file(file_path, excel_sheet_name=file_sheet_name)

                    # Create duplicate rows file for this sheet
                    file_stem = Path(file_name).stem
                    duplicate_rows_file = (
                        output_folder_path
                        / f"{file_stem}_{file_sheet_name}_duplicate_rows.csv"
                    )

                    # Get unique row numbers to remove
                    rows_to_remove = sorted(group["Row2"].unique())
                    duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
                    duplicate_df.to_csv(duplicate_rows_file, index=False)
                    output_paths.append(str(duplicate_rows_file))

                    # Process the sheet data
                    df_cleaned = df.copy()
                    df_cleaned["duplicated"] = False
                    df_cleaned.loc[rows_to_remove, "duplicated"] = True
                    if remove_duplicate_rows:
                        df_cleaned = df_cleaned.drop(index=rows_to_remove)

                    # Store the processed sheet data
                    excel_files_processed[file_path]["sheets_data"][
                        file_sheet_name
                    ] = df_cleaned
                    excel_files_processed[file_path]["processed_sheets"].add(
                        file_sheet_name
                    )

                else:
                    file_sheet_name = ""
                    file_path = original_file
                    print("file_path after match:", file_path)
                    file_base_name = os.path.basename(file_path)
                    df = read_file(file_path)

                    file_stem = Path(file_name).stem
                    duplicate_rows_file = (
                        output_folder_path / f"{file_stem}_duplicate_rows.csv"
                    )

                    # Get unique row numbers to remove
                    rows_to_remove = sorted(group["Row2"].unique())
                    duplicate_df = pd.DataFrame({"Row_to_Remove": rows_to_remove})
                    duplicate_df.to_csv(duplicate_rows_file, index=False)
                    output_paths.append(str(duplicate_rows_file))

                    df_cleaned = df.copy()
                    df_cleaned["duplicated"] = False
                    df_cleaned.loc[rows_to_remove, "duplicated"] = True
                    if remove_duplicate_rows:
                        df_cleaned = df_cleaned.drop(index=rows_to_remove)

                    file_ext = os.path.splitext(file_name)[-1]

                    if file_ext in [".parquet"]:
                        output_path = secure_join(
                            output_folder, f"{file_base_name}_deduplicated.parquet"
                        )
                        df_cleaned.to_parquet(output_path, index=False)
                    else:
                        output_path = secure_join(
                            output_folder, f"{file_base_name}_deduplicated.csv"
                        )
                        df_cleaned.to_csv(
                            output_path, index=False, encoding="utf-8-sig"
                        )

                    output_paths.append(str(output_path))
                break

    # Process Excel files to create complete deduplicated files
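    # Sheets where duplicates were found are written from their processed copies;
    # all other sheets are copied over unchanged from the original workbook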
    for file_path, file_data in excel_files_processed.items():
        try:
            # Create output filename
            file_base_name = os.path.splitext(os.path.basename(file_path))[0]
            file_ext = os.path.splitext(file_path)[-1]
            output_path = secure_join(
                output_folder, f"{file_base_name}_deduplicated{file_ext}"
            )

            # Create Excel writer
            with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
                # Write all sheets
                for sheet_name in file_data["all_sheets"]:
                    if sheet_name in file_data["processed_sheets"]:
                        # Use the processed (deduplicated) version
                        file_data["sheets_data"][sheet_name].to_excel(
                            writer, sheet_name=sheet_name, index=False
                        )
                    else:
                        # Use the original sheet (no duplicates found)
                        original_df = read_file(file_path, excel_sheet_name=sheet_name)
                        original_df.to_excel(writer, sheet_name=sheet_name, index=False)

            output_paths.append(str(output_path))
            print(f"Created deduplicated Excel file: {output_path}")

        except Exception as e:
            print(f"Error creating deduplicated Excel file for {file_path}: {e}")
            continue

    return output_paths


def remove_duplicate_rows_from_tabular_data(
    file_path: str,
    duplicate_rows: List[int],
    output_folder: str = OUTPUT_FOLDER,
    in_excel_tabular_sheets: List[str] = [],
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
) -> str:
    """
    Remove duplicate rows from a tabular data file.

    Args:
        file_path (str): Path to the input file
        duplicate_rows (List[int]): List of row indices to remove
        output_folder (str): Output folder for cleaned file
        in_excel_tabular_sheets (str or List[str]): Excel sheet name to read/write (if a list is given, the first name is used)
        remove_duplicate_rows (bool): Whether to remove duplicate rows
    Returns:
        str: Path to the cleaned file
    """
    try:
        # Accept either a single sheet name or a list of sheet names; downstream
        # calls expect a single sheet, so use the first one if a list is given
        if isinstance(in_excel_tabular_sheets, list):
            sheet_name = in_excel_tabular_sheets[0] if in_excel_tabular_sheets else ""
        else:
            sheet_name = in_excel_tabular_sheets or ""

        # Load the file
        df = read_file(file_path, excel_sheet_name=sheet_name)

        # Remove duplicate rows (0-indexed)
        df_cleaned = df.drop(index=duplicate_rows).reset_index(drop=True)

        # Save cleaned file
        file_name = os.path.basename(file_path)
        file_stem = os.path.splitext(file_name)[0]
        file_ext = os.path.splitext(file_name)[-1]

        output_path = secure_join(output_folder, f"{file_stem}_deduplicated{file_ext}")

        if file_ext in [".xlsx", ".xls"]:
            df_cleaned.to_excel(
                output_path,
                index=False,
                sheet_name=sheet_name if sheet_name else "Sheet1",
            )
        elif file_ext in [".parquet"]:
            df_cleaned.to_parquet(output_path, index=False)
        else:
            df_cleaned.to_csv(output_path, index=False, encoding="utf-8-sig")

        return output_path

    except Exception as e:
        print(f"Error removing duplicates from {file_path}: {e}")
        raise


def run_tabular_duplicate_analysis(
    files: List[str],
    threshold: float,
    min_words: int,
    text_columns: List[str] = [],
    output_folder: str = OUTPUT_FOLDER,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
    in_excel_tabular_sheets: List[str] = [],
    progress: Progress = Progress(track_tqdm=True),
) -> Tuple[pd.DataFrame, List[str], Dict[str, pd.DataFrame]]:
    """
    Main function to run tabular duplicate analysis.

    Args:
        files (List[str]): List of file paths
        threshold (float): Similarity threshold
        min_words (int): Minimum word count
        text_columns (List[str], optional): Specific columns to analyze
        output_folder (str, optional): Output folder for results
        do_initial_clean_dup (bool, optional): Whether to do an initial clean of the text
        remove_duplicate_rows (bool, optional): Whether to drop duplicate rows from the output files
        in_excel_tabular_sheets (List[str], optional): Excel sheet names to analyse
        progress (Progress): Progress tracking

    Returns:
        Tuple containing results DataFrame, output paths, and full data by file
    """
    return find_duplicate_cells_in_tabular_data(
        input_files=files,
        similarity_threshold=threshold,
        min_word_count=min_words,
        text_columns=text_columns if text_columns else [],
        output_folder=output_folder,
        do_initial_clean_dup=do_initial_clean_dup,
        in_excel_tabular_sheets=(
            in_excel_tabular_sheets if in_excel_tabular_sheets else []
        ),
        remove_duplicate_rows=remove_duplicate_rows,
    )
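
# Example (illustrative only; the file and column names below are hypothetical):
#     results_df, output_paths, full_data = run_tabular_duplicate_analysis(
#         files=["responses.csv"],
#         threshold=0.95,
#         min_words=3,
#         text_columns=["free_text"],
#     )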


# Function to update column choices when files are uploaded
def update_tabular_column_choices(files, in_excel_tabular_sheets: List[str] = []):
    if not files:
        return gr.update(choices=[])

    all_columns = set()
    for file in files:
        try:
            file_extension = os.path.splitext(file.name)[-1]
            if file_extension in [".xlsx", ".xls"]:
                for sheet_name in in_excel_tabular_sheets:
                    df = read_file(file.name, excel_sheet_name=sheet_name)
                    text_cols = df.select_dtypes(
                        include=["object", "string"]
                    ).columns.tolist()
                    all_columns.update(text_cols)
            else:
                df = read_file(file.name)
                text_cols = df.select_dtypes(
                    include=["object", "string"]
                ).columns.tolist()
                all_columns.update(text_cols)

        except Exception as e:
            print(f"Error reading {file.name}: {e}")
            continue

    return gr.Dropdown(choices=sorted(list(all_columns)))


# Function to handle tabular duplicate detection
def run_tabular_duplicate_detection(
    files,
    threshold,
    min_words,
    text_columns,
    output_folder: str = OUTPUT_FOLDER,
    do_initial_clean_dup: bool = DO_INITIAL_TABULAR_DATA_CLEAN,
    in_excel_tabular_sheets: List[str] = [],
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
):
    if not files:
        print("No files uploaded")
        return pd.DataFrame(), [], gr.Dropdown(choices=[]), 0, "deduplicate"

    start_time = time.time()

    task_textbox = "deduplicate"

    # If output folder doesn't end with a forward slash, add one
    if not output_folder.endswith("/"):
        output_folder = output_folder + "/"

    file_paths = list()
    if isinstance(files, str):
        # If 'files' is a single string, treat it as a list with one element
        file_paths.append(files)
    elif isinstance(files, list):
        # If 'files' is a list, iterate through its elements
        for f_item in files:
            if isinstance(f_item, str):
                # If an element is a string, it's a direct file path
                file_paths.append(f_item)
            elif hasattr(f_item, "name"):
                # If an element has a '.name' attribute (e.g., a Gradio File object), use its name
                file_paths.append(f_item.name)
            else:
                # Log a warning for unexpected element types within the list
                print(
                    f"Warning: Skipping an element in 'files' list that is neither a string nor has a '.name' attribute: {type(f_item)}"
                )
    elif hasattr(files, "name"):
        # Handle the case where a single file object (e.g., gr.File) is passed directly, not in a list
        file_paths.append(files.name)
    else:
        # Raise an error for any other unexpected type of the 'files' argument itself
        raise TypeError(
            f"Unexpected type for 'files' argument: {type(files)}. Expected str, list of str/file objects, or a single file object."
        )

    if len(file_paths) > MAX_SIMULTANEOUS_FILES:
        out_message = f"Number of files to deduplicate is greater than {MAX_SIMULTANEOUS_FILES}. Please submit a smaller number of files."
        print(out_message)
        raise Exception(out_message)

    results_df, output_paths, full_data = run_tabular_duplicate_analysis(
        files=file_paths,
        threshold=threshold,
        min_words=min_words,
        text_columns=text_columns if text_columns else [],
        output_folder=output_folder,
        do_initial_clean_dup=do_initial_clean_dup,
        in_excel_tabular_sheets=(
            in_excel_tabular_sheets if in_excel_tabular_sheets else []
        ),
        remove_duplicate_rows=remove_duplicate_rows,
    )

    # Update file choices for cleaning
    file_choices = sorted(set(file_paths))

    end_time = time.time()
    processing_time = round(end_time - start_time, 2)

    return (
        results_df,
        output_paths,
        gr.Dropdown(choices=file_choices),
        processing_time,
        task_textbox,
    )


# Function to handle row selection for preview
def handle_tabular_row_selection(results_df, evt: gr.SelectData):

    if not evt:
        return None, "", ""

    if not isinstance(results_df, pd.DataFrame) or results_df.empty:
        return None, "", ""

    selected_index = evt.index[0]
    if selected_index >= len(results_df):
        return None, "", ""

    row = results_df.iloc[selected_index]
    return selected_index, row["Text1"], row["Text2"]


# Function to clean duplicates from selected file
def clean_tabular_duplicates(
    file_name,
    results_df,
    output_folder,
    in_excel_tabular_sheets: str = "",
    remove_duplicate_rows: bool = REMOVE_DUPLICATE_ROWS,
):
    if not file_name or results_df.empty:
        return None

    # Get duplicate rows for this file
    file_duplicates = results_df[results_df["File2"] == file_name]["Row2"].tolist()

    if not file_duplicates:
        return None

    try:
        # Find the original file path
        # This is a simplified approach - in practice you might want to store file paths
        cleaned_file = remove_duplicate_rows_from_tabular_data(
            file_path=file_name,
            duplicate_rows=file_duplicates,
            output_folder=output_folder,
            in_excel_tabular_sheets=in_excel_tabular_sheets,
            remove_duplicate_rows=remove_duplicate_rows,
        )
        return cleaned_file
    except Exception as e:
        print(f"Error cleaning duplicates: {e}")
        return None