Spaces:
Running
Running
File size: 2,748 Bytes
c4c7cee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
# Build the geotagged YFCC100M train/test CSV splits.
#
# Pipeline: read the metadata TSV, keep rows that have both coordinates and
# media_type == 0, attach the per-photo hash, then split on membership in the
# 4k ID list and write both splits as tab-separated CSV.

# Target number of rows per output file for the training split.
ROWS_PER_PARTITION = 100_000

# Everything runs inside ProgressBar() so that the dask computations triggered
# below (len(...) and the two to_csv(...) calls) report their progress.
with ProgressBar():
    # Column names are supplied explicitly via `names`, so the file is read
    # without treating any row as a header.
    ddf = dd.read_csv(
        "../datasets/YFCC100M/yfcc100m_dataset",
        names=[
            "photo_id",
            "user_nsid",
            "user_nickname",
            "date_taken",
            "date_uploaded",
            "capture_device",
            "title",
            "description",
            "user_tags",
            "machine_tags",
            "longitude",
            "latitude",
            "accuracy",
            "page_url",
            "download_url",
            "license_name",
            "license_url",
            "server_id",
            "farm_id",
            "secret",
            "secret_original",
            "extension",
            "media_type",
        ],
        # Identifier-like columns are forced to str; the numeric columns are
        # read as float (presumably so that missing values can be represented
        # as NaN -- confirm against the dataset).
        dtype={
            "photo_id": str,
            "user_nsid": str,
            "user_nickname": str,
            "user_tags": str,
            "machine_tags": str,
            "longitude": float,
            "latitude": float,
            "accuracy": float,
            "server_id": str,
            "farm_id": str,
            "secret": str,
            "secret_original": str,
            "extension": str,
            "media_type": float,
        },
        sep="\t",
    )

    # Keep only the columns needed downstream (media_type is needed for the
    # filter below and dropped right after).
    ddf = ddf[
        [
            "photo_id",
            "longitude",
            "latitude",
            "accuracy",
            "extension",
            "download_url",
            "media_type",
        ]
    ]

    # Keep rows that have both coordinates and media_type == 0.
    # NOTE(review): 0 presumably denotes photos (as opposed to videos) --
    # confirm against the dataset documentation.
    filtered_ddf = ddf[
        ddf["longitude"].notnull()
        & ddf["latitude"].notnull()
        & (ddf["media_type"] == 0)
    ]

    # Drop the now-constant media_type column from the *filtered* frame.
    # The original `del ddf["media_type"]` ran after filtered_ddf had already
    # been derived from ddf, so it never removed the column from the output.
    filtered_ddf = filtered_ddf.drop(columns=["media_type"])

    hash_ddf = dd.read_csv(
        "../datasets/YFCC100M/yfcc100m_hash",
        names=["photo_id", "hash"],
        dtype={"photo_id": str, "hash": str},
        sep="\t",
    )
    # Left join: rows without a matching hash entry are kept.
    filtered_ddf = filtered_ddf.merge(hash_ddf, on="photo_id", how="left")

    # Read the 4k photo IDs (one per line); blank lines are ignored and
    # surrounding whitespace is stripped so stray spaces cannot hide a match.
    with open(
        "../datasets/YFCC100M/yfcc_4k_ids.txt", "r", encoding="utf-8"
    ) as f:
        test_photo_ids = {line.strip() for line in f if line.strip()}

    # Split the dataframe based on whether photo_id is in the test set.
    # (Named `is_test` rather than `filter` to avoid shadowing the builtin.)
    is_test = filtered_ddf["photo_id"].isin(test_photo_ids)
    test_ddf = filtered_ddf[is_test]
    train_ddf = filtered_ddf[~is_test]

    # Only the training split is restricted to rows with accuracy >= 12;
    # the test split is written regardless of accuracy.
    train_ddf = train_ddf[train_ddf["accuracy"] >= 12]

    # Save the split dataframes.
    test_ddf.to_csv(
        "../datasets/YFCC100M/yfcc_4k_dataset_with_gps.csv",
        sep="\t",
        index=False,
        single_file=True,
    )

    # NOTE: len(train_ddf) triggers a full computation of the pipeline just to
    # size the partitions, so the input is processed once here and once more
    # by to_csv below.
    train_ddf = train_ddf.repartition(
        npartitions=len(train_ddf) // ROWS_PER_PARTITION + 1
    )
    # The "*" in the path is replaced per partition, giving one file each.
    train_ddf.to_csv(
        "../datasets/YFCC100M/yfcc100m_dataset_with_gps_train/*.csv",
        sep="\t",
        index=False,
        single_file=False,
    )
|