|
import sys |
|
import os |
|
from datetime import datetime |
|
import pandas as pd |
|
import contexttimer |
|
from urllib.request import urlopen |
|
import requests |
|
from PIL import Image |
|
import torch |
|
from torchvision.transforms import functional as TF |
|
from multiprocessing import Pool |
|
from tqdm import tqdm |
|
import logging |
|
import sys |
|
import numpy as np |
|
|
|
|
|
|
|
from nltk.tag import CRFTagger |
|
# File name of the pre-trained CRF part-of-speech model, resolved relative to
# the current working directory.
_TAGGER_MODEL_FILE = 'all_indo_man_tag_corpus_model.crf.tagger'

# Module-wide tagger instance; drop_no() uses it to tag caption tokens.
ct = CRFTagger()
ct.set_model_file(_TAGGER_MODEL_FILE)

# HTTP request headers. Nothing in the visible part of this script reads them
# -- presumably kept for an image-download step; confirm before removing.
headers = {"User-Agent": "Googlebot-Image/1.0", "X-Forwarded-For": "64.18.15.200"}

# Log INFO and above to download.log; filemode='w' truncates it on every run.
logging.basicConfig(level=logging.INFO, filename='download.log', filemode='w')

# Suppress urllib3's InsecureRequestWarning.
_urllib3 = requests.packages.urllib3
_urllib3.disable_warnings(_urllib3.exceptions.InsecureRequestWarning)
|
|
|
# Command-line contract: argv[1] is the input .tsv (read below) and argv[2] is
# the output .tsv (written at the end of the script). Fail early with a usage
# message instead of an IndexError when either one is missing. Extra trailing
# arguments are tolerated, as they were before this check existed.
if len(sys.argv) < 3:
    print(
        "Provide input .tsv file name & output .tsv file name. "
        f"e.g. python {sys.argv[0]} Train-GCC-training.tsv Train-GCC-filtered.tsv",
        file=sys.stderr,
    )
    sys.exit(1)

print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

# Read the tab-separated input and keep only the two columns the rest of the
# script works with. A missing "caption" or "url" column raises KeyError here.
with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption", "url"]]
|
|
|
def drop_no(text):
    """Return True when a caption should be dropped from the dataset.

    A caption is dropped when any of the following holds:

    * it is not a string (e.g. the NaN pandas uses for a missing cell);
    * it is empty or contains only whitespace;
    * it is longer than 96 characters;
    * at least 80% of its whitespace-separated tokens are tagged "NNP"
      by the module-level tagger ``ct``;
    * the tagger raises (the error is printed and the caption is dropped).

    Args:
        text: Caption value taken from the "caption" column.

    Returns:
        True to drop the caption, False to keep it.
    """
    # Missing values reach this function as float NaN / None; reject them
    # explicitly rather than relying on len() raising TypeError.
    if not isinstance(text, str):
        return True

    if not text or len(text) > 96:
        return True

    tokens = text.split()
    # A whitespace-only caption has no tokens; without this guard the ratio
    # below would divide by zero.
    if not tokens:
        return True

    try:
        tagged = ct.tag_sents([tokens])[0]
    except Exception as e:
        # Best effort: a caption the tagger cannot handle is dropped, not fatal.
        print(e)
        return True

    if not tagged:
        return True

    # Each tagged item is indexed as (token, tag); count the "NNP" tags.
    nnp_cnt = sum(1 for item in tagged if item[1] == "NNP")
    return (nnp_cnt / len(tagged)) >= 0.8
|
|
|
# Flag every caption with drop_no() and keep only the rows flagged False.
_flagged = df.assign(to_drop=df["caption"].apply(drop_no))
_kept = _flagged.loc[_flagged["to_drop"].eq(False)].drop(columns="to_drop")

# Store each surviving row's index label in an "index_row" column, since the
# index itself is not written to the output file.
df = _kept.assign(index_row=_kept.index)

# Write the filtered table as tab-separated text to the path given in argv[2].
df.to_csv(sys.argv[2], sep='\t', index=False)
|
|
|
|