Spaces:

Clivora
/

data-upload

Sleeping

App Files Files Community

data-upload / app.py

DevClivora

Update app.py

a57c24b verified 6 months ago

raw

history blame contribute delete

5.27 kB

	# app.py
	# A Flask API updated to use the proper Flask logger instead of print().

	import os
	import pandas as pd
	import io
	from flask import Flask, request, jsonify
	from flask_cors import CORS
	from huggingface_hub import HfApi
	from datetime import datetime
	import logging

	# --- Initialization ---
	# Create the Flask application object and wrap it with flask_cors.
	app = Flask(__name__)
	CORS(app)

	# Configure logging
	# Lower the threshold of Flask's own logger to INFO so the progress
	# messages emitted throughout this module are not filtered out.
	app.logger.setLevel(level=logging.INFO)
	app.logger.info("--- Flask app.py is starting up! ---")

	# --- Configuration ---
	# All three secrets are required. If any one of them is absent the module
	# still finishes importing, but every setting is cleared to None so the
	# endpoints can detect the unconfigured state.
	app.logger.info("Loading environment variables...")
	try:
	    # Lookups run left to right, so the first missing name raises KeyError.
	    HF_TOKEN, AUDIO_DATASET_REPO_ID, TEXT_DATASET_REPO_ID = (
	        os.environ["HF_TOKEN"],
	        os.environ["AUDIO_DATASET_REPO_ID"],
	        os.environ["TEXT_DATASET_REPO_ID"],
	    )
	except KeyError as missing_var:
	    app.logger.error(f"FATAL ERROR: Missing secret environment variable: {missing_var}")
	    HF_TOKEN = AUDIO_DATASET_REPO_ID = TEXT_DATASET_REPO_ID = None
	else:
	    app.logger.info("Successfully loaded all required environment variables.")

	# --- Hugging Face API Client ---
	# `api` is the module-wide client used by the upload endpoints. It is left
	# as None when no token is available, which the endpoints check for.
	if not HF_TOKEN:
	    api = None
	    app.logger.warning("Warning: HfApi not initialized because HF_TOKEN is not set.")
	else:
	    app.logger.info("Initializing HfApi client...")
	    api = HfApi(token=HF_TOKEN)
	    app.logger.info("HfApi client initialized.")


	# --- Helper Function ---
	def get_unique_filename():
	    """Return a Parquet filename stamped with the current UTC time.

	    The name has the form ``data_YYYYMMDD_HHMMSS_ffffff.parquet``. The
	    microsecond field makes collisions unlikely, but two calls that land in
	    the same microsecond would still produce the same name.

	    Returns:
	        str: The generated filename.
	    """
	    # Local import: the module header only imports the ``datetime`` class.
	    from datetime import timezone

	    # datetime.utcnow() is deprecated since Python 3.12. A timezone-aware
	    # UTC "now" formats to exactly the same string with this pattern.
	    now_utc = datetime.now(timezone.utc)
	    return f"data_{now_utc.strftime('%Y%m%d_%H%M%S_%f')}.parquet"


	# --- API Endpoints ---

	@app.route('/')
	def index():
	    """Log the request and reply with a fixed plain-text status banner."""
	    banner = "Hugging Face Data Uploader API is running."
	    app.logger.info("Request received for / route.")
	    return banner

	@app.route('/add-audio', methods=['POST'])
	def add_audio_data():
	    """Store one audio/transcription pair as a Parquet file in the audio dataset.

	    Expects a JSON object body containing at least the ``audio`` and
	    ``transcription`` keys. The whole object becomes a single-row DataFrame,
	    is serialized to Parquet in memory, and is uploaded to the dataset repo
	    named by ``AUDIO_DATASET_REPO_ID`` under a timestamp-based filename.

	    Returns:
	        A ``(response, status)`` pair: 201 on success; 400 when the body is
	        missing, is not valid JSON, is not a JSON object, or lacks a
	        required key; 500 when the Hugging Face client is not configured or
	        the conversion/upload raises.
	    """
	    app.logger.info("Request received for /add-audio route.")
	    if not api:
	        app.logger.error("API client not available for /add-audio.")
	        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500
	    try:
	        app.logger.info("Attempting to process /add-audio data...")
	        # silent=True returns None for an unparsable or non-JSON body instead
	        # of raising an HTTP error, which the broad handler below would
	        # otherwise misreport as a 500.
	        data = request.get_json(silent=True)
	        # Require a JSON object: on a JSON string the `in` checks would be
	        # substring tests, and on a JSON array element tests.
	        if not isinstance(data, dict) or 'audio' not in data or 'transcription' not in data:
	            app.logger.warning("Invalid payload received for /add-audio.")
	            return jsonify({"error": "Invalid payload. 'audio' and 'transcription' fields are required."}), 400

	        app.logger.info("Data validated. Creating DataFrame.")
	        df = pd.DataFrame([data])
	        buffer = io.BytesIO()
	        df.to_parquet(buffer, index=False, engine='pyarrow')
	        # Rewind so the upload reads the Parquet bytes from the beginning.
	        buffer.seek(0)

	        app.logger.info(f"Uploading file to audio dataset: {AUDIO_DATASET_REPO_ID}")
	        api.upload_file(
	            path_or_fileobj=buffer,
	            path_in_repo=get_unique_filename(),
	            repo_id=AUDIO_DATASET_REPO_ID,
	            repo_type="dataset",
	            commit_message="Add new audio-transcription pair"
	        )
	        app.logger.info("File successfully uploaded to audio dataset.")
	        return jsonify({"message": "Audio data added successfully."}), 201

	    except Exception as e:
	        # Request-boundary catch-all: log the traceback and answer with JSON.
	        # NOTE(review): "details" exposes the raw exception text to clients.
	        app.logger.error(f"---! UNEXPECTED ERROR in /add-audio !---: {e}", exc_info=True)
	        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500


	@app.route('/add-text', methods=['POST'])
	def add_text_data():
	    """Store one transcription/summary pair as a Parquet file in the text dataset.

	    Expects a JSON object body containing at least the ``transcription`` and
	    ``summary`` keys. The whole object becomes a single-row DataFrame, is
	    serialized to Parquet in memory, and is uploaded to the dataset repo
	    named by ``TEXT_DATASET_REPO_ID`` under a timestamp-based filename.

	    Returns:
	        A ``(response, status)`` pair: 201 on success; 400 when the body is
	        missing, is not valid JSON, is not a JSON object, or lacks a
	        required key; 500 when the Hugging Face client is not configured or
	        the conversion/upload raises.
	    """
	    app.logger.info("Request received for /add-text route.")
	    if not api:
	        app.logger.error("API client not available for /add-text.")
	        return jsonify({"error": "Server is not configured with Hugging Face credentials."}), 500
	    try:
	        app.logger.info("Attempting to process /add-text data...")
	        # silent=True returns None for an unparsable or non-JSON body instead
	        # of raising an HTTP error, which the broad handler below would
	        # otherwise misreport as a 500.
	        data = request.get_json(silent=True)
	        # Require a JSON object: on a JSON string the `in` checks would be
	        # substring tests, and on a JSON array element tests.
	        if not isinstance(data, dict) or 'transcription' not in data or 'summary' not in data:
	            app.logger.warning("Invalid payload received for /add-text.")
	            return jsonify({"error": "Invalid payload. 'transcription' and 'summary' fields are required."}), 400

	        app.logger.info("Data validated. Creating DataFrame.")
	        df = pd.DataFrame([data])
	        buffer = io.BytesIO()
	        df.to_parquet(buffer, index=False, engine='pyarrow')
	        # Rewind so the upload reads the Parquet bytes from the beginning.
	        buffer.seek(0)

	        app.logger.info(f"Uploading file to text dataset: {TEXT_DATASET_REPO_ID}")
	        api.upload_file(
	            path_or_fileobj=buffer,
	            path_in_repo=get_unique_filename(),
	            repo_id=TEXT_DATASET_REPO_ID,
	            repo_type="dataset",
	            commit_message="Add new transcription-summary pair"
	        )
	        app.logger.info("File successfully uploaded to text dataset.")
	        return jsonify({"message": "Text data added successfully."}), 201

	    except Exception as e:
	        # Request-boundary catch-all: log the traceback and answer with JSON.
	        # NOTE(review): "details" exposes the raw exception text to clients.
	        app.logger.error(f"---! UNEXPECTED ERROR in /add-text !---: {e}", exc_info=True)
	        return jsonify({"error": "An internal error occurred.", "details": str(e)}), 500

	# To run on Hugging Face Spaces
	if __name__ == "__main__":
	    # Only when executed directly as a script: start Flask's built-in
	    # server, listening on every interface at port 7860.
	    app.logger.info("Starting Flask development server...")
	    app.run(port=7860, host="0.0.0.0")