Spaces:
Running
Running
Upload 8 files
Browse files- app.py +117 -0
- duplicate.yaml +46 -0
- output/output.csv +0 -0
- static/script.js +28 -0
- static/styles.css +54 -0
- templates/index.html +29 -0
- uploads/Vendor Master File.csv +0 -0
- uploads/Vendor Master File_input.csv +0 -0
app.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request, redirect, url_for
|
| 2 |
+
import pandas as pd
|
| 3 |
+
from fuzzywuzzy import fuzz
|
| 4 |
+
import os
|
| 5 |
+
from flask import send_file
|
| 6 |
+
|
| 7 |
+
# Flask application object shared by every route in this module.
app = Flask(__name__)

# Folders (relative paths) used for uploaded inputs and generated results.
app.config.update(
    UPLOAD_FOLDER='uploads',
    OUTPUT_FOLDER='output',
)

# Most recently produced result; None until an upload has been processed.
output_file = None
|
| 11 |
+
|
| 12 |
+
def _adjacent_ratios(values):
    """Return the fuzzy ratio of each value against the value before it.

    The first value has no predecessor, so its entry is fixed at 100.
    An empty input yields an empty list.
    """
    if not values:
        return []
    return [100] + [fuzz.ratio(prev, cur) for prev, cur in zip(values, values[1:])]


def process_csv(input_path, name_threshold=80, address_threshold=70):
    """Label likely-duplicate vendor rows in a CSV and write the result to disk.

    Rows are sorted by lower-cased vendor name; a row whose name ratio against
    the previous row exceeds ``name_threshold`` joins that row's name group.
    Rows are then sorted by (name group, address); inside each name group a
    row whose address ratio against the previous row exceeds
    ``address_threshold`` joins that row's address group. Address group
    numbering restarts at 1 for every name group. The final label is
    ``Group_<name group>_<address group>``.

    Args:
        input_path: Path of the CSV to read. It must contain the columns
            'Vendor Name', 'Address (street)', 'Postal code', 'City',
            'Country' and 'Region'.
        name_threshold: A name ratio strictly above this keeps a row in the
            previous row's name group.
        address_threshold: An address ratio strictly above this keeps a row
            in the previous row's address group.

    Returns:
        The path of the written file, ``<OUTPUT_FOLDER>/output.csv``.

    Raises:
        KeyError: If one of the required columns is missing.

    Side effects:
        Creates the output folder if needed, overwrites ``output.csv`` and
        sets the module-level ``output_file`` to ``'output.csv'``.
    """
    global output_file
    df = pd.read_csv(input_path)

    # NOTE(review): Country and Region are joined without a '-' separator,
    # unlike the other parts. Kept as-is so similarity scores do not shift.
    df['Address'] = (
        df['Address (street)'].astype(str) + '-'
        + df['Postal code'].astype(str) + '-'
        + df['City'].astype(str) + '-'
        + df['Country'].astype(str)
        + df['Region'].astype(str)
    ).str.lower()
    df['Name'] = df['Vendor Name'].astype(str).str.lower()

    # Pass 1: similar names end up adjacent after sorting, so comparing each
    # row with its predecessor is enough to chain them into groups.
    df = df.sort_values('Name').reset_index(drop=True)
    name_groups = []
    name_counter = 0
    for position, ratio in enumerate(_adjacent_ratios(df['Name'].tolist())):
        if position == 0 or ratio <= name_threshold:
            name_counter += 1
        name_groups.append(name_counter)
    df['name_based_group'] = name_groups

    # Pass 2: the ratios are computed only after this sort, so every row
    # (including the first and last) is compared with its actual neighbour.
    df = df.sort_values(['name_based_group', 'Address']).reset_index(drop=True)
    sorted_name_groups = df['name_based_group'].tolist()
    address_groups = []
    address_counter = 0
    previous_name_group = None
    for name_group, ratio in zip(sorted_name_groups,
                                 _adjacent_ratios(df['Address'].tolist())):
        if name_group != previous_name_group:
            # A new name group always restarts address numbering, even when
            # its first address resembles the previous group's last one;
            # otherwise two clusters could end up with the same label.
            address_counter = 1
            previous_name_group = name_group
        elif ratio <= address_threshold:
            address_counter += 1
        address_groups.append(str(address_counter))
    df['address_based_group'] = address_groups

    # Concatenate for unique group name
    df['Group'] = [
        'Group_{}_{}'.format(name_group, address_group)
        for name_group, address_group in zip(sorted_name_groups, address_groups)
    ]

    # The helper columns were only needed for matching.
    df = df.drop(columns=['Address', 'Name'])

    output_dir = app.config['OUTPUT_FOLDER']
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'output.csv')
    df.to_csv(output_path, index=False)
    output_file = 'output.csv'
    return output_path
|
| 94 |
+
|
| 95 |
+
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Serve the upload page and handle CSV uploads.

    GET renders ``index.html`` with the current ``output_file`` value.
    POST saves the file sent in the ``file`` form field to the upload folder,
    runs ``process_csv`` on it, stores the returned path in the module-level
    ``output_file`` and redirects back to this page. A POST with no usable
    file falls through to rendering the page.

    Note: ``output_file`` is a module-level variable, so it is shared by
    every client of this process.
    """
    global output_file
    if request.method == 'POST':
        file = request.files.get('file')
        if file:
            # The filename is chosen by the client. Keep only its final path
            # component (treating '\\' as a separator too) so it cannot
            # escape the upload folder.
            safe_name = os.path.basename(file.filename.replace('\\', '/'))
            if safe_name not in ('', '.', '..'):
                upload_dir = app.config['UPLOAD_FOLDER']
                os.makedirs(upload_dir, exist_ok=True)
                file_path = os.path.join(upload_dir, safe_name)
                file.save(file_path)
                output_file = process_csv(file_path)
                return redirect(url_for('upload_file'))

    return render_template('index.html', output_file=output_file)
|
| 107 |
+
|
| 108 |
+
@app.route('/downloads/output.csv')
def download_file():
    """Send the processed CSV as an attachment.

    Responds with 404 when no processed file exists yet.
    """
    from flask import abort  # local import: only this handler needs it

    # process_csv writes relative to the working directory. Passing an
    # absolute path makes send_file read the same file regardless of how it
    # resolves relative paths.
    result_path = os.path.abspath(
        os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    )
    if not os.path.isfile(result_path):
        abort(404)
    return send_file(result_path, as_attachment=True)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which allows code
    # execution from the browser. Keep it off unless FLASK_DEBUG=1 is set.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
|
| 116 |
+
|
| 117 |
+
|
duplicate.yaml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: duplicate_removal
|
| 2 |
+
channels:
|
| 3 |
+
- anaconda
|
| 4 |
+
- defaults
|
| 5 |
+
dependencies:
|
| 6 |
+
- blas=1.0=mkl
|
| 7 |
+
- bottleneck=1.3.7=py310h9128911_0
|
| 8 |
+
- bzip2=1.0.8=h2bbff1b_5
|
| 9 |
+
- ca-certificates=2023.12.12=haa95532_0
|
| 10 |
+
- click=8.1.7=py310haa95532_0
|
| 11 |
+
- colorama=0.4.6=py310haa95532_0
|
| 12 |
+
- flask=2.2.2=py310haa95532_0
|
| 13 |
+
- fuzzywuzzy=0.18.0=py310haa95532_0
|
| 14 |
+
- intel-openmp=2023.1.0=h59b6b97_46320
|
| 15 |
+
- itsdangerous=2.0.1=pyhd3eb1b0_0
|
| 16 |
+
- jinja2=3.1.3=py310haa95532_0
|
| 17 |
+
- libffi=3.4.4=hd77b12b_0
|
| 18 |
+
- markupsafe=2.1.3=py310h2bbff1b_0
|
| 19 |
+
- mkl=2023.1.0=h6b88ed4_46358
|
| 20 |
+
- mkl-service=2.4.0=py310h2bbff1b_1
|
| 21 |
+
- mkl_fft=1.3.8=py310h2bbff1b_0
|
| 22 |
+
- mkl_random=1.2.4=py310h59b6b97_0
|
| 23 |
+
- numexpr=2.8.7=py310h2cd9be0_0
|
| 24 |
+
- numpy=1.26.4=py310h055cbcc_0
|
| 25 |
+
- numpy-base=1.26.4=py310h65a83cf_0
|
| 26 |
+
- openssl=3.0.13=h2bbff1b_0
|
| 27 |
+
- pandas=2.2.1=py310h5da7b33_0
|
| 28 |
+
- pip=23.3.1=py310haa95532_0
|
| 29 |
+
- python=3.10.13=he1021f5_0
|
| 30 |
+
- python-dateutil=2.8.2=pyhd3eb1b0_0
|
| 31 |
+
- python-levenshtein=0.12.2=py310h2bbff1b_0
|
| 32 |
+
- python-tzdata=2023.3=pyhd3eb1b0_0
|
| 33 |
+
- pytz=2023.3.post1=py310haa95532_0
|
| 34 |
+
- setuptools=68.2.2=py310haa95532_0
|
| 35 |
+
- six=1.16.0=pyhd3eb1b0_1
|
| 36 |
+
- sqlite=3.41.2=h2bbff1b_0
|
| 37 |
+
- tbb=2021.8.0=h59b6b97_0
|
| 38 |
+
- tk=8.6.12=h2bbff1b_0
|
| 39 |
+
- tzdata=2024a=h04d1e81_0
|
| 40 |
+
- vc=14.2=h21ff451_1
|
| 41 |
+
- vs2015_runtime=14.27.29016=h5e58377_2
|
| 42 |
+
- werkzeug=2.3.8=py310haa95532_0
|
| 43 |
+
- wheel=0.41.2=py310haa95532_0
|
| 44 |
+
- xz=5.4.6=h8cc25b3_0
|
| 45 |
+
- zlib=1.2.13=h8cc25b3_0
|
| 46 |
+
prefix: C:\Users\snigd\.conda\envs\duplicate_removal
|
output/output.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
static/script.js
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Upload the selected CSV to the form's action URL and report the outcome.
 *
 * Hides the upload form and shows the processing message while the request
 * is in flight. On success the message is replaced by a download link; on
 * failure an error message is shown and the form is made visible again.
 */
function submitForm() {
    var fileInput = document.getElementById('csvFile');
    var uploadForm = document.getElementById('uploadForm');
    var processingMsg = document.getElementById('processingMsg');

    if (fileInput.files.length === 0) {
        alert('Please select a CSV file.');
        return;
    }

    var formData = new FormData();
    // The field name must match the file input's name attribute ("file"),
    // which is what the server reads the upload from.
    formData.append('file', fileInput.files[0]);

    // Show processing message
    uploadForm.classList.add('hidden');
    processingMsg.classList.remove('hidden');

    fetch(uploadForm.action, { method: 'POST', body: formData })
        .then(function (response) {
            if (!response.ok) {
                throw new Error('Upload failed with status ' + response.status);
            }
            processingMsg.innerHTML = '<p>File processed successfully. <a href="/downloads/output.csv">Download processed file</a></p>';
        })
        .catch(function (error) {
            console.error(error);
            processingMsg.innerHTML = '<p>Processing failed. Please try again.</p>';
            uploadForm.classList.remove('hidden');
        });
}
|
| 23 |
+
|
| 24 |
+
/**
 * Start a browser download of the processed CSV.
 *
 * Uses a temporary anchor with the `download` attribute so the download
 * starts without navigating away from the current page.
 */
function downloadProcessedFile() {
    var link = document.createElement('a');
    link.href = '/downloads/output.csv';
    link.download = 'output.csv';
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
}
|
static/styles.css
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body {
|
| 2 |
+
font-family: Arial, sans-serif;
|
| 3 |
+
background-color: #f0f0f0;
|
| 4 |
+
margin: 0;
|
| 5 |
+
padding: 200px;
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
.container {
|
| 9 |
+
max-width: 600px;
|
| 10 |
+
margin: 0 auto;
|
| 11 |
+
align-items: center;
|
| 12 |
+
background-color: #fff;
|
| 13 |
+
padding: 20px;
|
| 14 |
+
border-radius: 5px;
|
| 15 |
+
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
+
.btn {
|
| 19 |
+
padding: 10px 200px;
|
| 20 |
+
}
|
| 21 |
+
|
| 22 |
+
h1 {
|
| 23 |
+
text-align: center;
|
| 24 |
+
color: #333;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
form {
|
| 28 |
+
display: flex;
|
| 29 |
+
flex-direction: column;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
input[type="file"] {
|
| 33 |
+
margin-bottom: 10px;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
button {
|
| 37 |
+
padding: 10px 20px;
|
| 38 |
+
background-color: #007bff;
|
| 39 |
+
color: #fff;
|
| 40 |
+
border: none;
|
| 41 |
+
cursor: pointer;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
button:hover {
|
| 45 |
+
background-color: #0056b3;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
.hidden {
|
| 49 |
+
display: none;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
#processingMsg {
|
| 53 |
+
text-align: center;
|
| 54 |
+
}
|
templates/index.html
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Duplicate Removal</title>
|
| 7 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
|
| 8 |
+
</head>
|
| 9 |
+
<body>
|
| 10 |
+
<div class="container">
|
| 11 |
+
<h1>Duplicate Removal</h1>
|
| 12 |
+
<form id="uploadForm" enctype="multipart/form-data" action="/" method="POST">
|
| 13 |
+
<input type="file" name="file" id="csvFile" accept=".csv">
|
| 14 |
+
<button type="submit" id="submitBtn">Submit</button>
|
| 15 |
+
</form>
|
| 16 |
+
<div id="processingMsg" class="hidden">
|
| 17 |
+
<p>Processing...</p>
|
| 18 |
+
</div>
|
| 19 |
+
{% if output_file %}
|
| 20 |
+
<div class="btn">
|
| 21 |
+
<a href="{{ url_for('download_file', filename=output_file) }}" download>
|
| 22 |
+
<button>Download Processed CSV</button>
|
| 23 |
+
</a>
|
| 24 |
+
</div>
|
| 25 |
+
{% endif %}
|
| 26 |
+
</div>
|
| 27 |
+
<script src="{{ url_for('static', filename='script.js') }}"></script>
|
| 28 |
+
</body>
|
| 29 |
+
</html>
|
uploads/Vendor Master File.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
uploads/Vendor Master File_input.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|