Sadashiv's picture
Upload 146 files
17c5137 verified
raw
history blame contribute delete
No virus
2.68 kB
import os
import shutil
import sys
import zipfile

from huggingface_hub import hf_hub_download
from six.moves import urllib
from tqdm import tqdm

from src.entity.artifact_entity import DataIngestionArtifact
from src.entity.config_entity import DataIngestionConfig
from src.exception import PlantException
from src.logger import logging
class DataIngestion:
    """Fetch the dataset archive from the Hugging Face Hub and unpack it.

    ``initiate_data_ingestion`` drives two steps: ``download_dataset`` copies
    the archive from the hub cache into the feature-store directory, and
    ``extract_and_move_zip`` unpacks it into the dataset location. Every path
    and hub identifier is read from the config object given at construction.
    """

    def __init__(self, data_ingestion_config: "DataIngestionConfig | None" = None):
        """Store the ingestion configuration.

        Args:
            data_ingestion_config: Configuration to use. When omitted (or
                ``None``), a fresh ``DataIngestionConfig()`` is created for
                this instance.

        Raises:
            PlantException: If building or storing the configuration fails.
        """
        try:
            # Build the default here rather than in the signature: a default
            # argument is evaluated once at definition time, which would share
            # one config object across instances and run outside this `try`.
            if data_ingestion_config is None:
                data_ingestion_config = DataIngestionConfig()
            self.data_ingestion_config = data_ingestion_config
        except Exception as e:
            raise PlantException(e, sys) from e

    def download_dataset(self):
        """Download the configured dataset file and copy it to the feature store.

        Reads ``huggingface_repo_id``, ``huggingface_file_name`` and
        ``feature_store_file_path`` from the config.

        Returns:
            Path of the copied file inside the feature-store directory.
        """
        config = self.data_ingestion_config

        print("Commencing the dataset download from the hub...")
        logging.info("Commencing the dataset download from the hub...")

        # Path of the downloaded file inside the local hub cache.
        cached_path = hf_hub_download(
            repo_id=config.huggingface_repo_id,
            filename=config.huggingface_file_name,
            repo_type="dataset",
        )

        # Create the destination directory if it doesn't exist.
        os.makedirs(config.feature_store_file_path, exist_ok=True)
        destination_path = os.path.join(
            config.feature_store_file_path, config.huggingface_file_name
        )

        # Stream the copy instead of reading the whole archive into memory;
        # copyfile also refuses to copy a file onto itself rather than
        # truncating it.
        shutil.copyfile(cached_path, destination_path)
        return destination_path

    def extract_and_move_zip(self, zip_file_path):
        """Extract every member of a zip archive into the dataset location.

        Args:
            zip_file_path: Path of the zip archive to unpack.

        The target directory is ``dataset_location`` from the config.
        """
        logging.info("Zip file extraction has begun.")
        destination_dir = self.data_ingestion_config.dataset_location

        with zipfile.ZipFile(zip_file_path, "r") as archive:
            archive.extractall(destination_dir)

        logging.info("Zip file extraction has complete.")

    def initiate_data_ingestion(self) -> "DataIngestionArtifact":
        """Run the download and extraction steps and describe their outputs.

        Returns:
            A ``DataIngestionArtifact`` whose ``dataset_path`` is the config's
            ``dataset_location`` and whose ``feature_store_path`` is the path
            of the downloaded archive.

        Raises:
            PlantException: If any step fails; the original exception is
                attached as the cause.
        """
        logging.info(
            "Entered the initiate_data_ingestion method of the Data_Ingestion class."
        )
        try:
            zip_file_path = self.download_dataset()
            self.extract_and_move_zip(zip_file_path=zip_file_path)

            data_ingestion_artifact = DataIngestionArtifact(
                dataset_path=self.data_ingestion_config.dataset_location,
                feature_store_path=zip_file_path,
            )
            logging.info("Data Ingestion Artifacts Genereated")
            return data_ingestion_artifact
        except Exception as e:
            raise PlantException(e, sys) from e