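"""Flask backend for the PledgeTracker demo.

Suggests similar reference pledges via TF-IDF, runs the pledge-tracking
pipeline with live status updates, and logs events plus user feedback to
the Hugging Face dataset PledgeTracker/demo_feedback. Route paths marked
"assumed" below are illustrative reconstructions, not confirmed by the
source.
"""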
from flask import Flask, jsonify, send_file, request, send_from_directory
from flask_cors import CORS
import os, json, uuid, time
import sys
import threading
import traceback
from datetime import datetime, timedelta

import pandas as pd
import spacy
from huggingface_hub import HfApi, hf_hub_download
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from system.pledge_tracking import run_pipeline
nlp = spacy.load("en_core_web_sm")

app = Flask(__name__, static_folder='.')
CORS(app)

HF_DATASET_REPO = "PledgeTracker/demo_feedback"
HF_TOKEN = os.environ.get("HF_TOKEN")
TMP_DIR = "tmp"
FEEDBACK_DIR = "feedback_logs"

os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(FEEDBACK_DIR, exist_ok=True)
# Load the reference pledge list used for TF-IDF suggestions.
REFERENCE_PLEDGES = []
REFERENCE_PLEDGE_PATH = hf_hub_download(
    repo_id=HF_DATASET_REPO,
    filename="existing_pledges.txt",
    repo_type="dataset",
    token=HF_TOKEN,  # same optional token as elsewhere; os.environ["HF_TOKEN"] would raise if unset
)
if os.path.exists(REFERENCE_PLEDGE_PATH):
    with open(REFERENCE_PLEDGE_PATH, "r") as f:
        REFERENCE_PLEDGES = [line.strip() for line in f if line.strip()]
else:
    print(f"Missing reference pledge file: {REFERENCE_PLEDGE_PATH}")
def lemmatize(text):
    # Lemmatise with spaCy, dropping punctuation and whitespace tokens.
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc if not token.is_punct and not token.is_space)
@app.route("/similar_pledges", methods=["POST"])  # route path assumed
def similar_pledges():
    data = request.get_json()
    claim = data.get("claim", "").strip()
    if not claim or not REFERENCE_PLEDGES:
        return jsonify({"suggestions": []})
    # Rank reference pledges by TF-IDF cosine similarity to the submitted claim.
    all_pledges = [claim] + REFERENCE_PLEDGES
    lemmatized_pledges = [lemmatize(p) for p in all_pledges]
    vectors = TfidfVectorizer().fit_transform(lemmatized_pledges)
    similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()
    filtered = [(i, score) for i, score in enumerate(similarities) if score > 0.3]
    top_filtered = sorted(filtered, key=lambda x: x[1], reverse=True)[:5]
    suggestions = [
        {"text": REFERENCE_PLEDGES[i], "index": int(i)}
        for i, score in top_filtered
    ]
    return jsonify({"suggestions": suggestions})
def calculate_time_range(option: str, pledge_date: str = None):
    today = datetime.today()
    if isinstance(pledge_date, str):
        # Validate before parsing; strptime("") would raise a cryptic error.
        if not pledge_date:
            raise ValueError("Pledge date is required")
        pledge_date = datetime.strptime(pledge_date, "%Y-%m-%d")
    elif not isinstance(pledge_date, datetime):
        raise ValueError("pledge_date must be a str or datetime")
    if option == "week":
        start = max(today - timedelta(days=7), pledge_date)
    elif option == "month":
        # "month" is approximated as the last 30 days.
        start = max(today - timedelta(days=30), pledge_date)
    elif option == "since_pledge_date":
        start = pledge_date
    else:
        raise ValueError("Invalid time range option")
    return start.strftime("%Y%m%d"), today.strftime("%Y%m%d")
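# Examples (with today = 2024-06-15):
#   calculate_time_range("week", "2024-06-12")              -> ("20240612", "20240615")
#   calculate_time_range("month", "2024-01-01")             -> ("20240516", "20240615")
#   calculate_time_range("since_pledge_date", "2024-01-01") -> ("20240101", "20240615")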
@app.route("/")  # route path assumed; serves the demo front end
def serve_html():
    return send_from_directory('.', 'test.html')
@app.route("/status")  # route path assumed
def check_status():
    user_id = request.args.get("user_id")
    timestamp = request.args.get("timestamp")
    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
    # Before the pipeline writes its first update, report an empty status.
    if not os.path.exists(log_file_path):
        return jsonify({"status": {}}), 200
    try:
        with open(log_file_path, "r") as f:
            status = json.load(f)
    except Exception:
        status = {}
    return jsonify({"status": status})
@app.route("/run", methods=["POST"])  # route path assumed
def run_model():
    data = request.get_json()
    claim = data.get("claim", "no input")
    time_range_option = data.get("time_range", "month")
    system_start_time = datetime.now()
    suggestion_meta = data.get("suggestion_meta")
    pledge_date = data.get("pledge_date", "")
    pledge_author = data.get("pledge_author", "")
    timestamp = data.get("timestamp") or time.strftime("%Y-%m-%d_%H-%M-%S")
    user_id = data.get("user_id") or str(uuid.uuid4())[:8]
    log_file_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}_status.log")
    status_lock = threading.Lock()

    def update_status(step_id, msg):
        # Merge this step's message into the shared status log under a lock,
        # so concurrent pipeline callbacks do not clobber each other's writes.
        print(f"[STATUS] Step {step_id}: {msg}")
        with status_lock:
            if os.path.exists(log_file_path):
                try:
                    with open(log_file_path, "r") as f:
                        current = json.load(f)
                except Exception:
                    current = {}
            else:
                current = {}
            current[str(step_id)] = str(msg)
            with open(log_file_path, "w") as f:
                json.dump(current, f, indent=2)
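    # The status log is a flat JSON object mapping step ids to messages, e.g.
    #   {"0": "📌 Starting the system ...", "1": "..."}
    # check_status() serves it verbatim so the front end can poll progress.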
    try:
        time_start, time_end = calculate_time_range(time_range_option, pledge_date=pledge_date)
        print(f"[DEMO] Received claim: {claim}")
        print(f"[DEMO] Time range: {time_start} ~ {time_end}")
        print(f"[DEMO] Pledge date: {pledge_date}")
        update_status(0, "📌 Starting the system ...")
        print(suggestion_meta)
        outputs = run_pipeline(
            claim, pledge_date, pledge_author, time_start, timestamp, user_id,
            update_fn=update_status, suggestion_meta=suggestion_meta
        )
        # Convert the pipeline's sorted-events spreadsheet to JSON for the front end.
        df = pd.read_excel(outputs["sorted_events"])
        json_path = os.path.join(TMP_DIR, f"{timestamp}_{user_id}.json")
        df.to_json(json_path, orient="records", indent=2)
        system_end_time = datetime.now()
        runtime = system_end_time - system_start_time
        events = df.to_dict(orient="records")
        log_entry = {
            "requested_time": timestamp,
            "user_id": user_id,
            "pledge": claim,
            "suggestion_meta": suggestion_meta,
            "time_start": time_start,
            "time_end": time_end,
            "runtime": runtime.total_seconds(),
            "pledge_author": pledge_author,
            "pledge_date": pledge_date,
            "events": events
        }
        # Write a default log immediately so the run is captured even if the
        # user never submits feedback; receive_feedback() later overwrites it.
        default_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(default_log_path, "w") as f:
            f.write(json.dumps(log_entry, indent=1))
        tsv_path = outputs["augmented_tsv_file"]
        try:
            api = HfApi()
            api.upload_file(
                path_or_fileobj=default_log_path,
                path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
            api.upload_file(
                path_or_fileobj=tsv_path,
                path_in_repo=f"logs/augmented_{timestamp}_{user_id}.tsv",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                token=HF_TOKEN
            )
        except Exception as e:
            traceback.print_exc()
            print(f"[Default Feedback Upload Error] {e}")
        return jsonify({
            "status": "success",
            "file": f"{timestamp}_{user_id}.json",
            "user_id": user_id,
            "timestamp": timestamp
        })
    except Exception as e:
        traceback.print_exc()
        return jsonify({"status": "error", "detail": str(e)}), 500
@app.route("/events")  # route path assumed
def get_events():
    filename = request.args.get("file")
    file_path = os.path.join(TMP_DIR, filename)
    if not os.path.exists(file_path):
        return jsonify({"error": "File not found"}), 404
    with open(file_path, "r") as f:
        events = json.load(f)
    return jsonify(events)
@app.route("/feedback", methods=["POST"])  # route path assumed
def receive_feedback():
    data = request.get_json()
    pledge = data.get("pledge", "no_pledge_text")
    feedback_list = data.get("feedback", [])
    filename = data.get("file")
    timestamp = data.get("timestamp")
    user_id = data.get("user_id")
    if not user_id or not timestamp:
        return jsonify({'status': 'error', 'detail': 'Missing user_id or timestamp'}), 400
    file_path = os.path.join(TMP_DIR, filename)
    if not os.path.exists(file_path):
        return jsonify({"error": "Event file not found"}), 400
    with open(file_path, "r") as f:
        events = json.load(f)
    # Recover run metadata from the default log written by run_model(); every
    # field defaults to None so a missing or unreadable log cannot raise later.
    suggestion_meta = None
    time_start = None
    time_end = None
    pledge_author = None
    pledge_date = None
    runtime = None
    try:
        prev_log_path = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
        with open(prev_log_path, "r") as f:
            previous_log = json.load(f)
        suggestion_meta = previous_log.get("suggestion_meta")
        time_start = previous_log.get("time_start")
        time_end = previous_log.get("time_end")
        pledge_author = previous_log.get("pledge_author")
        pledge_date = previous_log.get("pledge_date")
        runtime = previous_log.get("runtime")
    except Exception:
        pass
    # Attach each answer to its event by index.
    feedback_dict = {int(item['eventIndex']): item['answer'] for item in feedback_list}
    for idx, event in enumerate(events):
        event["user_feedback"] = feedback_dict.get(idx)
    log_entry = {
        "requested_time": timestamp,
        "user_id": user_id,
        "pledge": pledge,
        "suggestion_meta": suggestion_meta,
        "time_start": time_start,
        "time_end": time_end,
        "runtime": runtime,
        "pledge_author": pledge_author,
        "pledge_date": pledge_date,
        "events": events
    }
    local_filename = f"{FEEDBACK_DIR}/feedback_{timestamp}_{user_id}.jsonl"
    with open(local_filename, "w") as f:
        f.write(json.dumps(log_entry, indent=1))
    try:
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_filename,
            path_in_repo=f"logs/feedback_{timestamp}_{user_id}.jsonl",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=HF_TOKEN
        )
    except Exception as e:
        # Local save succeeded but the dataset upload failed.
        return jsonify({'status': 'partial_success', 'error': str(e)}), 500
    return jsonify({'status': 'success'})
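# Illustrative feedback payload for the endpoint above (field values are examples):
#   POST /feedback  {"user_id": "ab12cd34", "timestamp": "2024-06-15_10-00-00",
#                    "file": "2024-06-15_10-00-00_ab12cd34.json", "pledge": "...",
#                    "feedback": [{"eventIndex": 0, "answer": "..."}]}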
@app.route("/feedback_logs/<path:filename>")  # route path assumed
def download_feedback_file(filename):
    return send_from_directory(FEEDBACK_DIR, filename, as_attachment=True)

@app.route("/feedback_files")  # route path assumed
def list_feedback_files():
    files = os.listdir(FEEDBACK_DIR)
    return jsonify(sorted(files))
@app.route("/download_excel")  # route path assumed
def download_excel():
    file = request.args.get("file")
    if not file:
        return "Missing file param", 400
    json_path = os.path.join(TMP_DIR, file)
    if not os.path.exists(json_path):
        return "Event file not found", 404
    with open(json_path, "r") as f:
        data = json.load(f)
    # Rebuild the spreadsheet from the stored JSON rather than keeping the
    # pipeline's original .xlsx around.
    df = pd.DataFrame(data)
    xlsx_path = os.path.join(TMP_DIR, file.replace(".json", ".xlsx"))
    df.to_excel(xlsx_path, index=False)
    return send_file(xlsx_path, as_attachment=True)
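# Illustrative usage (route path as assumed above):
#   GET /download_excel?file=2024-06-15_10-00-00_ab12cd34.json
#   returns the same events as an .xlsx attachment.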
if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)