File size: 2,850 Bytes
3eb682b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
#!/usr/bin/env python
#
# File Name : ptbtokenizer.py
#
# Description : Do the PTB Tokenization and remove punctuations.
#
# Creation Date : 29-12-2014
# Last Modified : Thu Mar 19 09:53:35 2015
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>
import os
import sys
import subprocess
import tempfile
import itertools
# path to the stanford corenlp jar
STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
# punctuations to be removed from the sentences
PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
".", "?", "!", ",", ":", "-", "--", "...", ";"]
class PTBTokenizer:
"""Python wrapper of Stanford PTBTokenizer"""
def tokenize(self, captions_for_image):
cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \
'edu.stanford.nlp.process.PTBTokenizer', \
'-preserveLines', '-lowerCase']
# ======================================================
# prepare data for PTB Tokenizer
# ======================================================
final_tokenized_captions_for_image = {}
image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
sentences = '\n'.join([c.replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
# ======================================================
# save sentences to temporary file
# ======================================================
path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
tmp_file.write(sentences.encode())
tmp_file.close()
# ======================================================
# tokenize sentence
# ======================================================
cmd.append(os.path.basename(tmp_file.name))
p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
stdout=subprocess.PIPE)
token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
token_lines = token_lines.decode()
lines = token_lines.split('\n')
# remove temp file
os.remove(tmp_file.name)
# ======================================================
# create dictionary for tokenized captions
# ======================================================
for k, line in zip(image_id, lines):
if not k in final_tokenized_captions_for_image:
final_tokenized_captions_for_image[k] = []
tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
if w not in PUNCTUATIONS])
final_tokenized_captions_for_image[k].append(tokenized_caption)
return final_tokenized_captions_for_image
|