import os
import pickle
import random
from collections import defaultdict
from urllib.request import urlretrieve

import requests

user_agents_url = 'https://raw.githubusercontent.com/danielmiessler/SecLists/master/Fuzzing/User-Agents/UserAgents-IE.txt'
os.makedirs('./tmp', exist_ok=True)
download_path = './tmp/user_agents.txt'

# Download the user-agent list once; later imports reuse the cached copy.
if not os.path.exists(download_path):
    urlretrieve(user_agents_url, download_path)

# Read the cached list. Strip all surrounding whitespace (not only '\n') so
# that a stray '\r' or padding never ends up inside an HTTP header value,
# and drop blank lines so an empty User-Agent can never be picked.
with open(download_path, 'r') as f:
    user_agents = [line.strip() for line in f]
user_agents = [ua for ua in user_agents if ua]


def return_user_agent():
    """Return request headers carrying a randomly chosen User-Agent.

    Returns
    -------
    dict
        ``{'User-Agent': <one entry of the module-level user_agents list>}``.

    Raises
    ------
    IndexError
        If the downloaded list contained no usable entries.
    """
    ua = random.choice(user_agents)
    # Sent as a header to look like a regular browser so that the target
    # website does not block the request.
    headers = {'User-Agent': ua}
    return headers


def return2():
    """Return the constant 2.

    Used as the ``default_factory`` of ``Tokenizer.val2idx``. It is a named
    module-level function (rather than a lambda) so the resulting
    ``defaultdict`` can be pickled by ``Tokenizer.pickle_tokenizer``.
    """
    return 2


class Tokenizer:
    """Tokenizer class for tokenizing captions in the Flickr8k dataset.

    Builds a word <-> index vocabulary from whitespace-separated,
    lower-cased caption text.

    Parameters
    ----------
    root : str
        root directory where dataset is stored
    """

    def __init__(self, root):
        # NOTE(review): the four reserved entries are all empty strings here,
        # so the val2idx literal below collapses to the single entry {'': 3}.
        # These look like special tokens whose text was lost -- confirm the
        # intended values. They are kept exactly as found.
        self.vocab = ['', '', '', '']
        # Index of the most recently assigned word; always len(vocab) - 1.
        self.count = 3
        self.idx2val = {}
        self.val2idx = {'': 0, '': 1, '': 2, '': 3}
        self.root = root

    def add(self, text):
        """Add every unseen word of *text* to the vocabulary.

        Parameters
        ----------
        text : str
            Caption text; it is lower-cased and split on whitespace.
        """
        for word in text.lower().strip().split():
            # Membership tests never trigger the defaultdict factory, so this
            # stays correct after complete() has wrapped val2idx.
            if word not in self.val2idx:
                self.count += 1
                self.vocab.append(word)
                self.val2idx[word] = self.count

    def tokenize(self, fname):
        """Build the vocabulary from the captions of the ids listed in *fname*.

        Parameters
        ----------
        fname : str
            File name, relative to ``self.root``, passed to ``read_file``.
        """
        # Imported here so the rest of the module works without pandas.
        import pandas as pd

        print(f'tokenizing file {fname}...')
        # NOTE(review): read_file is not defined in the visible source;
        # it must be provided elsewhere -- confirm.
        temp = read_file(os.path.join(self.root, fname))
        df = pd.DataFrame(temp, columns=['id'])
        for image_id in df['id']:
            # NOTE(review): self.caption_df is never assigned in this class;
            # presumably a DataFrame with 'id' and 'caption' columns set by
            # the caller -- confirm.
            matching = self.caption_df[self.caption_df['id'] == image_id]
            captions = matching.reset_index(drop=True)['caption']
            for caption in captions:
                self.add(caption)
        self.complete()

    def complete(self):
        """Finalize the vocabulary after all words have been added.

        Builds the reverse index -> word mapping and wraps ``val2idx`` in a
        ``defaultdict`` so that looking up an unknown word yields index 2.
        """
        self.idx2val = {idx: word for word, idx in self.val2idx.items()}
        self.val2idx = defaultdict(return2, self.val2idx)

    def pickle_tokenizer(self, fname):
        """Save ``idx2val``, ``val2idx`` and ``vocab`` to *fname* with pickle.

        Parameters
        ----------
        fname : str
            Path of the file to write.
        """
        print(f"saving to file {fname}")
        state_dict = {
            'idx2val': self.idx2val,
            'val2idx': self.val2idx,
            'vocab': self.vocab,
        }
        with open(fname, 'wb') as f:
            pickle.dump(state_dict, f)

    def load_tokenizer(self, fname):
        """Restore the state written by ``pickle_tokenizer`` from *fname*.

        Parameters
        ----------
        fname : str
            Path of the pickle file to read.
        """
        print(f"loading from file {fname}...")
        with open(fname, 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # files that come from a trusted source.
            state_dict = pickle.load(f)
        self.vocab = state_dict['vocab']
        self.val2idx = state_dict['val2idx']
        self.idx2val = state_dict['idx2val']
        # Re-establish the count == len(vocab) - 1 invariant so that words
        # added after loading get fresh indices instead of reusing old ones.
        self.count = len(self.vocab) - 1

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)