|
import os |
|
from typing import List |
|
from urllib.request import urlretrieve |
|
from zipfile import ZipFile |
|
|
|
import pandas as pd |
|
|
|
from src.utils.logging_utils import get_pylogger |
|
|
|
log = get_pylogger(__name__) |
|
|
|
|
|
def filter_dataframe_and_get_column(
    dataframe: pd.DataFrame, filter_column: str, filter_value: str, select_column: str
) -> List[str]:
    """Select one column's values from the rows matching an equality filter.

    Args:
        dataframe: The table to filter.
        filter_column: Name of the column compared against ``filter_value``.
        filter_value: Rows are kept where ``filter_column`` equals this value.
        select_column: Name of the column whose values are returned.

    Returns:
        The ``select_column`` entries of the matching rows, in row order,
        as a plain Python list (empty if no row matches).
    """
    # Boolean mask marking the rows whose filter column equals the value.
    row_mask = dataframe[filter_column] == filter_value
    matching_rows = dataframe[row_mask]
    selected_values = matching_rows[select_column]
    return selected_values.tolist()
|
|
|
|
|
def download_and_unzip(
    url: str, target_path: str, force_download: bool = False, remove_tmp_file: bool = False
) -> None:
    """Download a zip archive over HTTP(S) and extract it into ``target_path``.

    The archive is stored in ``target_path`` under the last path component of
    ``url``. If that file is already present it is reused unless
    ``force_download`` is set.

    Args:
        url: Address of the zip file; must start with ``http://`` or ``https://``.
        target_path: Directory that receives both the downloaded archive and
            its extracted content. It is created if it does not exist.
        force_download: Download again even if the archive file already exists.
        remove_tmp_file: Delete the downloaded archive after extraction.

    Raises:
        ValueError: If ``url`` is not an http(s) address.
    """
    # Validate first so nothing is logged or touched on disk for a bad url.
    if not url.startswith(("http://", "https://")):
        raise ValueError(f"url needs to point to a http(s) address, but it is: {url}")

    log.warning(f"download zip file from {url} to {target_path} ...")

    # urlretrieve does not create missing directories. An empty target_path
    # means the current directory, which os.makedirs would reject.
    if target_path:
        os.makedirs(target_path, exist_ok=True)

    tmp_file = os.path.join(target_path, os.path.basename(url))
    if os.path.exists(tmp_file) and not force_download:
        log.warning(f"tmp file {tmp_file} already exists, skip downloading {url}")
    else:
        # Download to a side file and move it into place only on success, so an
        # interrupted transfer never leaves a truncated archive that a later
        # call would mistake for a finished download.
        partial_file = tmp_file + ".part"
        try:
            urlretrieve(url, partial_file)
            os.replace(partial_file, tmp_file)
        finally:
            # Only still present if the download or the move failed.
            if os.path.exists(partial_file):
                os.remove(partial_file)

    with ZipFile(tmp_file, "r") as zfile:
        zfile.extractall(target_path)

    if remove_tmp_file:
        os.remove(tmp_file)
|
|