Spaces:

mike23415
/

Data-analytics

Sleeping

App Files Files Community

Data-analytics / app.py

mike23415

Create app.py

77bf716 verified about 2 months ago

raw

history blame

5.55 kB

	from flask import Flask, request, jsonify, send_file
	from flask_cors import CORS
	import pandas as pd
	import os
	import threading
	import time
	import re

	# Application object; CORS is switched on with flask_cors' default settings.
	app = Flask(__name__)
	CORS(app)

	# Directory where processed files are written and later served from.
	UPLOAD_FOLDER = "/tmp"
	# NOTE(review): not referenced anywhere in the visible code — confirm usage.
	SESSION_KEY_PREFIX = "data_tool_session_id"

	app.config.update(
	    UPLOAD_FOLDER=UPLOAD_FOLDER,
	    MAX_CONTENT_LENGTH=512 * 1024 * 1024,  # 512 MB
	)

	# Cleanup function runs every 10 mins and deletes files older than 60 mins
	def clean_old_files(folder=UPLOAD_FOLDER, max_age=60, interval=600):
	    """Delete stale regular files from *folder* forever; never returns.

	    Meant to be the target of a daemon thread. Each sweep looks only at the
	    direct children of *folder* (no recursion) and removes every regular
	    file whose modification time is older than *max_age* minutes. Note that
	    this is not limited to files created by this application.

	    Args:
	        folder: Directory to sweep.
	        max_age: Age threshold in minutes, based on the file's mtime.
	        interval: Seconds to sleep between sweeps.
	    """
	    max_age_seconds = max_age * 60
	    while True:
	        now = time.time()
	        try:
	            names = os.listdir(folder)
	        except OSError as e:
	            # A missing/unreadable folder must not kill the cleanup thread;
	            # report it and try again on the next sweep.
	            print(f"[Cleanup Error] {e}")
	            names = []

	        for name in names:
	            path = os.path.join(folder, name)
	            try:
	                # The file may vanish between listing and stat/remove (for
	                # example another process deleted it), so the whole
	                # per-file sequence is guarded, not only os.remove.
	                if not os.path.isfile(path):
	                    continue
	                if now - os.path.getmtime(path) > max_age_seconds:
	                    os.remove(path)
	                    print(f"[Cleanup] Deleted: {path}")
	            except OSError as e:
	                print(f"[Cleanup Error] {e}")

	        time.sleep(interval)  # 600 s by default: one sweep every 10 minutes

	# Start the cleanup loop at import time on a daemon thread, so it never
	# blocks interpreter shutdown. clean_old_files runs with its defaults.
	threading.Thread(target=clean_old_files, daemon=True).start()

	def _resolve_column(df, name):
	    """Return the column label of *df* that corresponds to *name*.

	    An exact match wins. Otherwise the first column whose string form equals
	    *name* ignoring case is returned. When nothing matches, *name* is handed
	    back unchanged so the following pandas call raises its usual error.
	    """
	    if name in df.columns:
	        return name
	    wanted = name.lower()
	    for label in df.columns:
	        if str(label).lower() == wanted:
	            return label
	    return name


	def apply_instruction(df, instruction):
	    """Apply every cleaning command recognised in *instruction* to *df*.

	    Commands are detected by keyword or regex (case-insensitively) and run
	    in this fixed order: drop column, remove duplicates, drop missing, fill
	    missing, sort, rename, filter, group-and-sum, add column, normalize,
	    standardize, split by comma, remove special characters. Several commands
	    may appear in one instruction.

	    Column names are matched exactly first and case-insensitively second,
	    so ``drop column Age`` works for a column named ``Age`` or ``age``.

	    Args:
	        df: pandas DataFrame to transform. Column-assigning commands (add,
	            normalize, standardize, split, remove special characters) write
	            into the frame they receive, which is the caller's object when
	            no earlier command produced a new frame.
	        instruction: Free-text instruction, e.g. ``"sort by price desc"``.

	    Returns:
	        Tuple ``(df, status)``. ``status`` is ``"success"`` or
	        ``"Error: <message>"``; on error the returned frame reflects the
	        commands that completed before the failure.
	    """
	    # Lower-cased copy is used only for plain keyword checks; regexes run on
	    # the original text so column names and fill values keep their case.
	    lowered = instruction.lower()

	    def find(pattern):
	        return re.search(pattern, instruction, re.IGNORECASE)

	    try:
	        # Drop column
	        match = find(r"drop column (\w+)")
	        if match:
	            df = df.drop(columns=[_resolve_column(df, match.group(1))])

	        # Remove duplicates
	        if "remove duplicates" in lowered:
	            df = df.drop_duplicates()

	        # Drop missing values
	        if "drop missing" in lowered or "remove null" in lowered:
	            df = df.dropna()

	        # Fill missing values; numeric-looking values are used as floats
	        match = find(r"fill missing.*with ([\w\.]+)")
	        if match:
	            val = match.group(1)
	            try:
	                val = float(val)
	            except ValueError:
	                pass  # not a number: fill with the text as typed
	            df = df.fillna(val)

	        # Sort; an optional trailing "desc"/"descending" reverses the order
	        match = find(r"sort by (\w+)( descending| desc)?")
	        if match:
	            col = _resolve_column(df, match.group(1))
	            ascending = not match.group(2)
	            df = df.sort_values(by=col, ascending=ascending)

	        # Rename
	        match = find(r"rename column (\w+) to (\w+)")
	        if match:
	            old, new = _resolve_column(df, match.group(1)), match.group(2)
	            df = df.rename(columns={old: new})

	        # Filter where col > val (integers, decimals and negatives accepted)
	        match = find(r"filter where (\w+) > (-?\d+(?:\.\d+)?)")
	        if match:
	            col = _resolve_column(df, match.group(1))
	            threshold = float(match.group(2))
	            df = df[df[col] > threshold]

	        # Group by and sum
	        match = find(r"group by (\w+) and sum (\w+)")
	        if match:
	            group_col = _resolve_column(df, match.group(1))
	            sum_col = _resolve_column(df, match.group(2))
	            df = df.groupby(group_col)[sum_col].sum().reset_index()

	        # Add column as sum
	        match = find(r"add column (\w+) as (\w+) \+ (\w+)")
	        if match:
	            new_col = match.group(1)
	            col1 = _resolve_column(df, match.group(2))
	            col2 = _resolve_column(df, match.group(3))
	            df[new_col] = df[col1] + df[col2]

	        # Normalize column to [0, 1]; a constant column becomes NaN (0 / 0)
	        match = find(r"normalize column (\w+)")
	        if match:
	            col = _resolve_column(df, match.group(1))
	            low, high = df[col].min(), df[col].max()
	            df[col] = (df[col] - low) / (high - low)

	        # Standardize column (zero mean, unit sample standard deviation)
	        match = find(r"standardize column (\w+)")
	        if match:
	            col = _resolve_column(df, match.group(1))
	            df[col] = (df[col] - df[col].mean()) / df[col].std()

	        # Split column by comma into <col>_1, <col>_2, ... (one per part)
	        match = find(r"split column (\w+) by comma")
	        if match:
	            col = _resolve_column(df, match.group(1))
	            parts = df[col].str.split(",", expand=True)
	            for idx in parts.columns:
	                df[f"{col}_{idx + 1}"] = parts[idx]

	        # Remove special characters
	        match = find(r"remove special characters from (\w+)")
	        if match:
	            col = _resolve_column(df, match.group(1))
	            df[col] = df[col].astype(str).str.replace(r"[^a-zA-Z0-9]", "", regex=True)

	    except Exception as e:
	        # Boundary for user-supplied instructions: report instead of raising.
	        return df, f"Error: {e}"

	    return df, "success"

	@app.route("/process", methods=["POST"])
	def process_file():
	    """Read an uploaded table, apply the instruction, and store the result.

	    Expects multipart form data with a ``file`` part plus ``instruction``
	    and ``session_id`` fields. Files whose name ends in ``.csv`` (any case)
	    are read as CSV; everything else is handed to ``pd.read_excel``. The
	    transformed frame is always written as CSV to
	    ``UPLOAD_FOLDER/cleaned_<session_id>_<name>.csv``.

	    Returns:
	        JSON with ``preview`` (first 10 rows, missing values as null),
	        ``download_url`` and ``status`` (as reported by apply_instruction),
	        or a JSON ``error`` with HTTP 400 for missing/invalid input or an
	        unreadable file.
	    """
	    if "file" not in request.files or "instruction" not in request.form or "session_id" not in request.form:
	        return jsonify({"error": "Missing file, instruction, or session_id"}), 400

	    file = request.files["file"]
	    instruction = request.form["instruction"]
	    session_id = request.form["session_id"]

	    # session_id becomes part of a file name under UPLOAD_FOLDER, so refuse
	    # anything that could contain a path separator or other odd characters.
	    if not re.fullmatch(r"[\w.-]+", session_id):
	        return jsonify({"error": "Invalid session_id"}), 400

	    original_name = file.filename or ""

	    try:
	        if original_name.lower().endswith(".csv"):
	            df = pd.read_csv(file)
	        else:
	            df = pd.read_excel(file)
	    except Exception as e:
	        return jsonify({"error": f"Failed to read file: {str(e)}"}), 400

	    df, status = apply_instruction(df, instruction)

	    # The client-supplied name is untrusted: keep only its final component,
	    # drop the extension (the output is always CSV) and neutralise any
	    # character that is not a word character, dot or hyphen.
	    base = os.path.basename(original_name.replace("\\", "/"))
	    stem = re.sub(r"[^\w.-]", "_", os.path.splitext(base)[0]) or "upload"
	    filename = f"cleaned_{session_id}_{stem}.csv"
	    output_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
	    df.to_csv(output_path, index=False)

	    # NaN would be serialised as the bare token NaN, which is not valid
	    # JSON; convert missing values to None so they are emitted as null.
	    head = df.head(10)
	    preview = head.astype(object).where(head.notna(), None).to_dict(orient="records")
	    return jsonify({
	        "preview": preview,
	        "download_url": f"/download/{filename}",
	        "status": status
	    })

	@app.route("/download/<filename>", methods=["GET"])
	def download_file(filename):
	    """Send a previously processed file back to the session that made it.

	    The caller must pass its ``session_id`` as a query parameter, and the
	    requested name must start with ``cleaned_<session_id>_``, the prefix
	    used when processed files are written.

	    Returns:
	        The file as an attachment, JSON ``error`` with HTTP 403 when the
	        session check fails, or JSON ``error`` with HTTP 404 when the file
	        does not exist (for example after it has been cleaned up).
	    """
	    session_id = request.args.get("session_id")
	    # Anchor the check to the generated prefix: a plain substring test would
	    # let a short session_id such as "1" match files of other sessions.
	    authorised = (
	        bool(session_id)
	        and filename == os.path.basename(filename)
	        and filename.startswith(f"cleaned_{session_id}_")
	    )
	    if not authorised:
	        return jsonify({"error": "Unauthorized download attempt"}), 403

	    path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
	    if os.path.isfile(path):
	        try:
	            return send_file(path, as_attachment=True)
	        except FileNotFoundError:
	            # The file was removed between the check and the send.
	            pass
	    return jsonify({"error": "File not found"}), 404

	if __name__ == "__main__":
	    # Direct execution only: serve on every network interface, port 7860.
	    app.run(port=7860, host="0.0.0.0")