Spaces:

mgyigit
/

misinfo

Sleeping

App Files Files Community

misinfo/src/data_loader/download_data_mocheg.py

gyigit

update

54e8a79 about 1 month ago

raw

history blame

2.4 kB

	import os
	import requests
	import tarfile
	from tqdm import tqdm

	# --- Download configuration -------------------------------------------------
	# Local layout: the archive is stored (and later unpacked) under RAW_DATA_DIR.
	RAW_DATA_DIR: str = "data/raw"
	ARCHIVE_NAME: str = "mocheg_with_tweet_2023_03.tar.gz"

	# Remote location of the archive. NOTE: served over plain HTTP.
	DATA_URL: str = (
	    "http://nlplab1.cs.vt.edu/~menglong/project/multimodal/fact_checking/MOCHEG/dataset/latest_dataset/mocheg_with_tweet_2023_03.tar.gz"
	)

	# Number of bytes requested per streamed chunk while downloading.
	CHUNK_SIZE: int = 16 * 1024**2  # 16 MB

	# Full path of the archive on disk; the directory is created at import time
	# so later code can rely on it existing.
	archive_path: str = os.path.join(RAW_DATA_DIR, ARCHIVE_NAME)
	os.makedirs(RAW_DATA_DIR, exist_ok=True)


	def check_disk_space(required_space_gb: float, path: "str | None" = None) -> bool:
	    """Return whether the filesystem holding ``path`` has enough free space.

	    Args:
	        required_space_gb: Minimum amount of free space needed, expressed in
	            units of 1024**3 bytes.
	        path: Existing file or directory on the filesystem to inspect.
	            Defaults to ``RAW_DATA_DIR`` when omitted.

	    Returns:
	        True if at least ``required_space_gb`` is free, False otherwise.

	    Raises:
	        OSError: If ``path`` does not exist or cannot be queried.
	    """
	    # Imported here so this helper does not depend on a module-level import.
	    import shutil

	    target = RAW_DATA_DIR if path is None else path
	    # shutil.disk_usage is available on both POSIX and Windows, whereas
	    # os.statvfs only exists on Unix.
	    free_space_gb: float = shutil.disk_usage(target).free / (1024**3)
	    # ">=" so that having exactly the required amount counts as enough,
	    # matching the "At least N GB required" message shown to the user.
	    return free_space_gb >= required_space_gb


	def download_data() -> None:
	    """Download the archive at ``DATA_URL`` to ``archive_path`` and extract it.

	    If ``archive_path`` already exists the function returns immediately:
	    nothing is downloaded and nothing is extracted. If the free-space check
	    fails, a message is printed and the function returns without downloading.

	    The stream is written to a temporary ``.part`` file next to the final
	    location and only renamed to ``archive_path`` once fully received, so an
	    interrupted run never leaves a truncated file that a later run would
	    mistake for a finished download.

	    Raises:
	        requests.exceptions.RequestException: On connection problems, timeouts
	            or a non-success HTTP status.
	        OSError: If the archive cannot be written to disk.
	    """
	    # Check if the data file already exists
	    if os.path.exists(archive_path):
	        print(f"Data already downloaded at {archive_path}. Skipping download.")
	        return

	    # Ensure enough disk space (approximate)
	    required_space_gb: int = 80  # Adjust based on expected file size + extraction space
	    if not check_disk_space(required_space_gb):
	        print(f"Not enough disk space. At least {required_space_gb} GB required.")
	        return

	    # (connect, read) timeouts in seconds; without them a stalled server
	    # would block the script forever.
	    request_timeout = (10, 60)
	    partial_path: str = archive_path + ".part"

	    # Download the data in larger chunks
	    print(f"Downloading data from {DATA_URL}...")
	    try:
	        # The context manager releases the connection even if streaming fails.
	        with requests.get(DATA_URL, stream=True, timeout=request_timeout) as response:
	            response.raise_for_status()  # Ensure the URL is accessible

	            total_size: int = int(response.headers.get("content-length", 0))
	            with open(partial_path, "wb") as file, tqdm(
	                desc=ARCHIVE_NAME,
	                # Unknown size: pass None so tqdm shows a plain counter
	                # instead of a 0-byte progress bar.
	                total=total_size or None,
	                unit="B",
	                unit_scale=True,
	                unit_divisor=1024,
	            ) as progress_bar:
	                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
	                    if chunk:  # skip empty chunks
	                        file.write(chunk)
	                        progress_bar.update(len(chunk))

	        # Publish the finished file under its final name in a single step.
	        os.replace(partial_path, archive_path)
	    except BaseException:
	        # Also covers Ctrl-C: drop the incomplete file, then re-raise unchanged.
	        try:
	            os.remove(partial_path)
	        except FileNotFoundError:
	            pass
	        raise

	    print(f"Download completed: {archive_path}")

	    # Extract the tar.gz file
	    extract_data(archive_path)


	def _iter_safe_members(tar: tarfile.TarFile, dest_dir: str):
	    """Yield members of ``tar`` lazily, rejecting any that would escape ``dest_dir``."""
	    root = os.path.realpath(dest_dir)
	    for member in tar:
	        target = os.path.realpath(os.path.join(root, member.name))
	        candidates = [target]
	        if member.issym():
	            # Symlink targets are resolved relative to the link's own directory.
	            candidates.append(
	                os.path.realpath(
	                    os.path.join(os.path.dirname(target), member.linkname)
	                )
	            )
	        elif member.islnk():
	            # Hard-link targets are named relative to the archive root.
	            candidates.append(
	                os.path.realpath(os.path.join(root, member.linkname))
	            )
	        for candidate in candidates:
	            try:
	                inside = os.path.commonpath([root, candidate]) == root
	            except ValueError:  # e.g. paths on different drives
	                inside = False
	            if not inside:
	                raise tarfile.TarError(
	                    f"Refusing to extract unsafe archive member: {member.name!r}"
	                )
	        yield member


	def extract_data(archive_path: str, dest_dir: "str | None" = None) -> None:
	    """Extract a gzip-compressed tar archive.

	    Members that would be written outside the destination directory are
	    rejected, because the archive comes from a remote server and must not be
	    able to overwrite arbitrary files.

	    Args:
	        archive_path: Path of the ``.tar.gz`` file to unpack.
	        dest_dir: Directory to extract into. Defaults to ``RAW_DATA_DIR``.

	    Raises:
	        tarfile.TarError: If the archive is invalid or contains an unsafe member.
	        OSError: If the archive cannot be read or files cannot be written.
	    """
	    target_dir = RAW_DATA_DIR if dest_dir is None else dest_dir
	    print(f"Extracting data from {archive_path}...")
	    with tarfile.open(archive_path, "r:gz") as tar:
	        if hasattr(tarfile, "data_filter"):
	            # Interpreters with extraction filters: use the built-in "data" filter.
	            tar.extractall(path=target_dir, filter="data")
	        else:
	            # Older interpreters: validate each member just before it is
	            # extracted, keeping the archive a single sequential pass.
	            tar.extractall(
	                path=target_dir,
	                members=_iter_safe_members(tar, target_dir),
	            )
	    print(f"Data extracted to {target_dir}")


	# Script entry point: run the download only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	    download_data()