#!/usr/bin/env python
#
# File Name : ptbtokenizer.py
#
# Description : Do the PTB Tokenization and remove punctuations.
#
# Creation Date : 29-12-2014
# Last Modified : Thu Mar 19 09:53:35 2015
# Authors : Hao Fang and Tsung-Yi Lin

import os
import subprocess
import tempfile

# path to the stanford corenlp jar
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'

# punctuation tokens to be removed from the tokenized sentences
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
                ".", "?", "!", ",", ":", "-", "--", "...", ";"]


class PTBTokenizer:
    """Python wrapper of Stanford PTBTokenizer"""

    def tokenize(self, captions_for_image):
        cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
               'edu.stanford.nlp.process.PTBTokenizer',
               '-preserveLines', '-lowerCase']

        # ======================================================
        # prepare data for PTB Tokenizer
        # ======================================================
        final_tokenized_captions_for_image = {}
        # one image id per caption, in the same order the sentences are written
        image_ids = [k for k, v in captions_for_image.items() for _ in range(len(v))]
        sentences = '\n'.join([c.replace('\n', ' ')
                               for k, v in captions_for_image.items() for c in v])

        # ======================================================
        # save sentences to temporary file
        # ======================================================
        path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
        tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
        tmp_file.write(sentences.encode())
        tmp_file.close()

        # ======================================================
        # tokenize sentences
        # ======================================================
        cmd.append(os.path.basename(tmp_file.name))
        p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname,
                                       stdout=subprocess.PIPE)
        # the tokenizer reads the temp file named on the command line;
        # communicate() just collects its stdout (no stdin pipe is needed)
        token_lines = p_tokenizer.communicate()[0].decode()
        lines = token_lines.split('\n')
        # remove temp file
        os.remove(tmp_file.name)

        # ======================================================
        # create dictionary for tokenized captions
        # ======================================================
        for k, line in zip(image_ids, lines):
            if k not in final_tokenized_captions_for_image:
                final_tokenized_captions_for_image[k] = []
            tokenized_caption = ' '.join([w for w in line.rstrip().split(' ')
                                          if w not in PUNCTUATIONS])
            final_tokenized_captions_for_image[k].append(tokenized_caption)

        return final_tokenized_captions_for_image
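

# ------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): assumes
# stanford-corenlp-3.4.1.jar sits next to this file and that `java`
# is on the PATH. The input dict maps an image id to a list of raw
# caption strings; the output maps the same ids to lowercased,
# tokenized, punctuation-stripped strings. The image ids and
# captions below are hypothetical, for illustration only.
# ------------------------------------------------------------------
if __name__ == '__main__':
    captions = {
        'img_1': ['A man is riding a horse.', 'Someone rides a brown horse!'],
        'img_2': ['Two dogs play in the grass...'],
    }
    tokenizer = PTBTokenizer()
    tokenized = tokenizer.tokenize(captions)
    # expected shape: {'img_1': ['a man is riding a horse', ...], 'img_2': [...]}
    for image_id, caps in tokenized.items():
        print(image_id, caps)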