Sambit20030731 committed on
Commit
f218e1e
·
verified ·
1 Parent(s): 158e939

Upload 8 files

Browse files
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, request, redirect, url_for
2
+ import pandas as pd
3
+ from fuzzywuzzy import fuzz
4
+ import os
5
+ from flask import send_file
6
+
7
# Flask application object shared by every route in this module.
app = Flask(__name__)

# Folder names used for uploaded input files and for the generated output CSV.
app.config.update(
    UPLOAD_FOLDER='uploads',
    OUTPUT_FOLDER='output',
)

# Module-level record of the most recently produced output; None until a file
# has been processed. Rebound by process_csv() and upload_file() via `global`.
output_file = None
11
+
12
def process_csv(input_path, name_threshold=80, address_threshold=70):
    """Group likely-duplicate vendor rows of a CSV and write the labelled result.

    Rows are sorted by lower-cased vendor name, and each row is compared with
    the row before it using ``fuzz.ratio``. A row whose name score is greater
    than ``name_threshold`` joins the previous row's name group; otherwise it
    starts a new one. Rows are then re-sorted by name group and address, and
    the same neighbour comparison (against ``address_threshold``) assigns an
    address sub-group that restarts at 1 inside every name group.

    The written file keeps the input columns and adds ``name_based_group``,
    ``address_based_group`` and ``Group`` (``Group_<name>_<address>``).

    Args:
        input_path: Path of the CSV to read. It must contain the columns
            'Vendor Name', 'Address (street)', 'Postal code', 'City',
            'Country' and 'Region'.
        name_threshold: A name score strictly greater than this keeps a row
            in the previous row's name group.
        address_threshold: An address score strictly greater than this keeps
            a row in the previous row's address sub-group.

    Returns:
        The path of the written file: 'output.csv' inside
        ``app.config['OUTPUT_FOLDER']``.

    Raises:
        KeyError: If a required column is missing from the input.

    Side effects:
        Creates the output folder if needed, overwrites 'output.csv' in it,
        and sets the module-level ``output_file`` to 'output.csv'.
    """
    global output_file
    df = pd.read_csv(input_path)

    required = ['Vendor Name', 'Address (street)', 'Postal code', 'City',
                'Country', 'Region']
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(
            'Input CSV is missing required column(s): {}'.format(missing))

    # Lower-cased comparison keys; both helper columns are dropped before the
    # result is written.
    # NOTE(review): Country and Region are joined without a '-' separator,
    # unlike the other address parts. Kept as-is so existing scores do not
    # shift -- confirm whether that was intended.
    df['Address'] = (
        df['Address (street)'].astype(str) + '-'
        + df['Postal code'].astype(str) + '-'
        + df['City'].astype(str) + '-'
        + df['Country'].astype(str) + df['Region'].astype(str)
    ).str.lower()
    df['Name'] = df['Vendor Name'].astype(str).str.lower()

    df.sort_values(['Name'], inplace=True)
    df = df.reset_index(drop=True)

    # Pass 1: name groups. Every row, including the last one, is compared
    # with its predecessor; the first row always opens group 1.
    names = df['Name'].tolist()
    name_groups = []
    group_counter = 0
    for i, current_name in enumerate(names):
        if i == 0 or not fuzz.ratio(names[i - 1], current_name) > name_threshold:
            group_counter += 1
        name_groups.append(group_counter)
    df['name_based_group'] = name_groups

    df.sort_values(['name_based_group', 'Address'], inplace=True)
    df = df.reset_index(drop=True)

    # Pass 2: address sub-groups. A change of name group always restarts the
    # counter, so a sub-group label is never inherited from a different
    # name group even when the neighbouring addresses look alike.
    addresses = df['Address'].tolist()
    sorted_name_groups = df['name_based_group'].tolist()
    address_groups = []
    address_group_counter = 0
    for i, current_address in enumerate(addresses):
        if i == 0 or sorted_name_groups[i] != sorted_name_groups[i - 1]:
            address_group_counter = 1
        elif not fuzz.ratio(addresses[i - 1], current_address) > address_threshold:
            address_group_counter += 1
        address_groups.append(str(address_group_counter))
    df['address_based_group'] = address_groups

    # Concatenate both labels into one unique group name per row.
    df['Group'] = (
        'Group_' + df['name_based_group'].astype(str)
        + '_' + df['address_based_group'].astype(str)
    )

    df.drop(columns=['Address', 'Name'], inplace=True)

    os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
    output_path = os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    df.to_csv(output_path, index=False)
    output_file = 'output.csv'
    return output_path
94
+
95
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Render the upload page and process a posted CSV.

    GET renders 'index.html', passing the module-level ``output_file`` to
    the template. POST expects a multipart field named 'file'; when a file
    is supplied it is saved under ``app.config['UPLOAD_FOLDER']``, passed to
    ``process_csv``, and the client is redirected back to this page. A POST
    without a usable file falls through and re-renders the page.

    Returns:
        A redirect response after a successful upload, otherwise the
        rendered 'index.html' template.
    """
    global output_file
    if request.method == 'POST':
        file = request.files.get('file')
        filename = ''
        if file and file.filename:
            # The filename is client-controlled: keep only its final path
            # component so the upload cannot be written outside UPLOAD_FOLDER.
            filename = os.path.basename(file.filename.replace('\\', '/'))
        if filename and filename not in ('.', '..'):
            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(file_path)
            output_file = process_csv(file_path)
            return redirect(url_for('upload_file'))

    return render_template('index.html', output_file=output_file)
107
+
108
@app.route('/downloads/output.csv')
def download_file():
    """Send the generated 'output.csv' as a file attachment.

    Returns:
        The file response for 'output.csv' in ``app.config['OUTPUT_FOLDER']``.
        Responds with 404 when no output file exists yet.
    """
    from flask import abort

    # Flask's send_file resolves a relative path against the application
    # root, while the file is written relative to the working directory.
    # Passing an absolute path makes the download read the written file.
    output_path = os.path.abspath(
        os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv'))
    if not os.path.isfile(output_path):
        abort(404)
    return send_file(output_path, as_attachment=True)
112
+
113
+
114
if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which must not be exposed
    # on a reachable server, so it is opt-in through the FLASK_DEBUG
    # environment variable instead of being always on.
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes'))
116
+
117
+
duplicate.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: duplicate_removal
2
+ channels:
3
+ - anaconda
4
+ - defaults
5
+ dependencies:
6
+ - blas=1.0=mkl
7
+ - bottleneck=1.3.7=py310h9128911_0
8
+ - bzip2=1.0.8=h2bbff1b_5
9
+ - ca-certificates=2023.12.12=haa95532_0
10
+ - click=8.1.7=py310haa95532_0
11
+ - colorama=0.4.6=py310haa95532_0
12
+ - flask=2.2.2=py310haa95532_0
13
+ - fuzzywuzzy=0.18.0=py310haa95532_0
14
+ - intel-openmp=2023.1.0=h59b6b97_46320
15
+ - itsdangerous=2.0.1=pyhd3eb1b0_0
16
+ - jinja2=3.1.3=py310haa95532_0
17
+ - libffi=3.4.4=hd77b12b_0
18
+ - markupsafe=2.1.3=py310h2bbff1b_0
19
+ - mkl=2023.1.0=h6b88ed4_46358
20
+ - mkl-service=2.4.0=py310h2bbff1b_1
21
+ - mkl_fft=1.3.8=py310h2bbff1b_0
22
+ - mkl_random=1.2.4=py310h59b6b97_0
23
+ - numexpr=2.8.7=py310h2cd9be0_0
24
+ - numpy=1.26.4=py310h055cbcc_0
25
+ - numpy-base=1.26.4=py310h65a83cf_0
26
+ - openssl=3.0.13=h2bbff1b_0
27
+ - pandas=2.2.1=py310h5da7b33_0
28
+ - pip=23.3.1=py310haa95532_0
29
+ - python=3.10.13=he1021f5_0
30
+ - python-dateutil=2.8.2=pyhd3eb1b0_0
31
+ - python-levenshtein=0.12.2=py310h2bbff1b_0
32
+ - python-tzdata=2023.3=pyhd3eb1b0_0
33
+ - pytz=2023.3.post1=py310haa95532_0
34
+ - setuptools=68.2.2=py310haa95532_0
35
+ - six=1.16.0=pyhd3eb1b0_1
36
+ - sqlite=3.41.2=h2bbff1b_0
37
+ - tbb=2021.8.0=h59b6b97_0
38
+ - tk=8.6.12=h2bbff1b_0
39
+ - tzdata=2024a=h04d1e81_0
40
+ - vc=14.2=h21ff451_1
41
+ - vs2015_runtime=14.27.29016=h5e58377_2
42
+ - werkzeug=2.3.8=py310haa95532_0
43
+ - wheel=0.41.2=py310haa95532_0
44
+ - xz=5.4.6=h8cc25b3_0
45
+ - zlib=1.2.13=h8cc25b3_0
46
+ prefix: C:\Users\snigd\.conda\envs\duplicate_removal
output/output.csv ADDED
The diff for this file is too large to render. See raw diff
 
static/script.js ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Upload the selected CSV to the server and report the outcome.
 *
 * Reads the file chosen in #csvFile, hides #uploadForm, shows
 * #processingMsg, and POSTs the file to the form's action URL. On success
 * the message is replaced by a link to the processed file; on failure an
 * error message is shown and the form is made visible again.
 */
function submitForm() {
    var fileInput = document.getElementById('csvFile');
    var processingMsg = document.getElementById('processingMsg');
    var uploadForm = document.getElementById('uploadForm');

    if (fileInput.files.length === 0) {
        alert('Please select a CSV file.');
        return;
    }

    var formData = new FormData();
    // The upload route reads the multipart field named 'file', so the
    // field name must match or the server never sees the upload.
    formData.append('file', fileInput.files[0]);

    // Show processing message
    uploadForm.classList.add('hidden');
    processingMsg.classList.remove('hidden');

    fetch(uploadForm.action, { method: 'POST', body: formData })
        .then(function (response) {
            if (!response.ok) {
                throw new Error('Upload failed with status ' + response.status);
            }
            processingMsg.innerHTML = '<p>File processed successfully. <a href="/downloads/output.csv" download>Download processed file</a></p>';
        })
        .catch(function () {
            // Let the user retry instead of leaving the form hidden.
            processingMsg.innerHTML = '<p>Processing failed. Please try again.</p>';
            uploadForm.classList.remove('hidden');
        });
}
23
+
24
/**
 * Start the download of the processed CSV by navigating to the server's
 * download route.
 */
function downloadProcessedFile() {
    window.location.href = '/downloads/output.csv';
}
static/styles.css ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ font-family: Arial, sans-serif;
3
+ background-color: #f0f0f0;
4
+ margin: 0;
5
+ padding: 200px;
6
+ }
7
+
8
+ .container {
9
+ max-width: 600px;
10
+ margin: 0 auto;
11
+ align-items: center;
12
+ background-color: #fff;
13
+ padding: 20px;
14
+ border-radius: 5px;
15
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
16
+ }
17
+
18
+ .btn {
19
+ padding: 10px 200px;
20
+ }
21
+
22
+ h1 {
23
+ text-align: center;
24
+ color: #333;
25
+ }
26
+
27
+ form {
28
+ display: flex;
29
+ flex-direction: column;
30
+ }
31
+
32
+ input[type="file"] {
33
+ margin-bottom: 10px;
34
+ }
35
+
36
+ button {
37
+ padding: 10px 20px;
38
+ background-color: #007bff;
39
+ color: #fff;
40
+ border: none;
41
+ cursor: pointer;
42
+ }
43
+
44
+ button:hover {
45
+ background-color: #0056b3;
46
+ }
47
+
48
+ .hidden {
49
+ display: none;
50
+ }
51
+
52
+ #processingMsg {
53
+ text-align: center;
54
+ }
templates/index.html ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Duplicate Removal</title>
7
+ <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <h1>Duplicate Removal</h1>
12
+ <form id="uploadForm" enctype="multipart/form-data" action="/" method="POST">
13
+ <input type="file" name="file" id="csvFile" accept=".csv">
14
+ <button type="submit" id="submitBtn">Submit</button>
15
+ </form>
16
+ <div id="processingMsg" class="hidden">
17
+ <p>Processing...</p>
18
+ </div>
19
+ {% if output_file %}
20
+ <div class="btn">
21
+ <a href="{{ url_for('download_file', filename=output_file) }}" download>
22
+ <button>Download Processed CSV</button>
23
+ </a>
24
+ </div>
25
+ {% endif %}
26
+ </div>
27
+ <script src="{{ url_for('static', filename='script.js') }}"></script>
28
+ </body>
29
+ </html>
uploads/Vendor Master File.csv ADDED
The diff for this file is too large to render. See raw diff
 
uploads/Vendor Master File_input.csv ADDED
The diff for this file is too large to render. See raw diff