Galuh Sahid
Add download_logs and scripts
ba7a003
raw
history blame
1.81 kB
import sys
import os
from datetime import datetime
import pandas as pd
import contexttimer
from urllib.request import urlopen
import requests
from PIL import Image
import torch
from torchvision.transforms import functional as TF
from multiprocessing import Pool
from tqdm import tqdm
import logging
import sys
import numpy as np
from nltk.tag import CRFTagger
# CRF tagger instance used by the caption filter (drop_no) defined below.
# NOTE(review): the model file name suggests an Indonesian tag corpus - confirm.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

# HTTP request headers; the User-Agent pretends to be googlebot.
headers = {}
headers["User-Agent"] = "Googlebot-Image/1.0"
headers["X-Forwarded-For"] = "64.18.15.200"

# Setup: INFO-level logging goes to download.log, which is overwritten on
# every run (filemode='w'); urllib3's InsecureRequestWarning is silenced.
logging.basicConfig(
    level=logging.INFO,
    filemode='w',
    filename='download.log',
)
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)
# Fail fast with a usage message when the input/output paths are missing,
# instead of hitting a bare IndexError on sys.argv further down.
# Extra arguments are tolerated, as before. sys.exit(str) prints the message
# to stderr and exits with status 1.
if len(sys.argv) < 3:
    sys.exit(
        f"Usage: python {sys.argv[0]} <input.tsv> <output.tsv>\n"
        "Provide the input .tsv file name and the output .tsv file name."
    )

# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')
with contexttimer.Timer(prefix="Loading from tsv"):
    # sys.argv[1] is a tab-separated file whose first row is the header.
    df = pd.read_csv(sys.argv[1], delimiter='\t')

# Keep only the caption and url columns (raises KeyError if either is absent).
df = df[["caption", "url"]]
def drop_no(text):
    """Return True when a caption should be dropped from the dataset.

    A caption is dropped when it is:
      * not a string (e.g. a missing value, which pandas stores as NaN),
      * empty, longer than 96 characters, or whitespace-only,
      * tagged "NNP" by the module-level ``ct`` tagger on at least 80% of
        its whitespace-separated tokens, or
      * impossible to tag (the error is printed and the row is dropped).

    Args:
        text: Caption value taken from the "caption" column.

    Returns:
        bool: True to drop the row, False to keep it.
    """
    # Non-string values used to reach len() and were dropped only through
    # the catch-all handler (printing an error per row); reject them
    # explicitly and quietly instead.
    if not isinstance(text, str):
        return True
    if not text or len(text) > 96:
        return True

    tokens = text.split()
    # A whitespace-only caption has no tokens; without this guard the ratio
    # below would divide by zero.
    if not tokens:
        return True

    try:
        # tag_sents takes a list of token lists; only one sentence is passed.
        tagged = ct.tag_sents([tokens])[0]
    except Exception as e:
        # Best-effort filter: a tagger failure drops the row rather than
        # aborting the whole run.
        print(e)
        return True

    if not tagged:
        return True

    # Each tagged item is indexed as (token, tag); count the "NNP" tags.
    nnp_cnt = sum(1 for tagged_token in tagged if tagged_token[1] == "NNP")
    return nnp_cnt / len(tagged) >= 0.8
# Evaluate the caption filter once per row. astype(bool) guarantees a boolean
# mask even when the frame is empty, so `~` and boolean indexing stay valid.
to_drop = df["caption"].apply(drop_no).astype(bool)

# Keep the rows that were not flagged. The explicit copy makes the filtered
# frame independent, so adding a column below does not write into a slice.
df = df[~to_drop].copy()

# Record each surviving row's index label from the loaded frame as a column,
# since the index itself is not written out (index=False).
df["index_row"] = df.index
df.to_csv(sys.argv[2], sep='\t', index=False)