Spaces:
Paused
Paused
File size: 1,768 Bytes
3bbe5bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import random
import nltk
nltk.data.path.append('/mnt/data/nltk_data')
import numpy as np
from utils.constants import IMAGENET_DEFAULT_TEMPLATES
def get_tag(tokenized, tags):
    """Collect the words of a tokenized sentence whose POS tag is requested.

    Args:
        tokenized: Word tokens; handed unchanged to ``nltk.pos_tag``.
        tags: One tag, or a list/tuple of tags, to keep. Any value that is
            not a list or tuple is treated as a single tag.

    Returns:
        list: The matching words in sentence order. A word is emitted once
        for every entry of ``tags`` that equals its POS tag.
    """
    wanted = tags if isinstance(tags, (list, tuple)) else [tags]
    return [
        word
        for word, pos in nltk.pos_tag(tokenized)
        for candidate in wanted
        if pos == candidate
    ]
def get_noun_phrase(tokenized):
    """Extract the distinct noun phrases from a tokenized sentence.

    The tokens are POS-tagged with ``nltk.pos_tag`` and chunked with a small
    regular-expression grammar. Runs of directly adjacent chunk subtrees are
    joined with a single space into one phrase.

    Args:
        tokenized: Word tokens; handed unchanged to ``nltk.pos_tag``.

    Returns:
        list: Noun-phrase strings in order of first appearance, without
        duplicates. A phrase that ends the sentence is included.
    """
    # Taken from Su Nam Kim Paper...
    grammar = r"""
        NBAR:
            {<NN.*|JJ>*<NN.*>} # Nouns and Adjectives, terminated with Nouns
        NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>} # Above, connected with in/of/etc...
    """
    chunker = nltk.RegexpParser(grammar)
    chunked = chunker.parse(nltk.pos_tag(tokenized))

    phrases = []
    pending = []
    # The trailing ``None`` is a sentinel that is never a Tree: it forces a
    # final flush so a noun phrase at the very end of the sentence is kept.
    for subtree in [*chunked, None]:
        if isinstance(subtree, nltk.Tree):
            pending.append(' '.join(token for token, _pos in subtree.leaves()))
            continue
        if not pending:
            continue
        phrase = ' '.join(pending)
        if phrase not in phrases:
            phrases.append(phrase)
        # Always start afresh, even when the phrase was a duplicate; otherwise
        # its text would be prepended to the next phrase.
        pending = []
    return phrases
def text_noun_with_prompt_all(text, phrase_prob=0.0, append_text=True):
    """Build templated prompts for the nouns (or noun phrases) found in ``text``.

    Args:
        text: Raw sentence; tokenized with ``nltk.word_tokenize``.
        phrase_prob: Threshold compared against a single ``random.random()``
            draw. When the draw is ``>= phrase_prob`` the words tagged NN,
            NNS or NNP are used; otherwise noun phrases from
            ``get_noun_phrase`` are used.
        append_text: When true, ``text`` itself is appended to both returned
            lists.

    Returns:
        tuple: ``(prompt_texts, nouns)``. ``prompt_texts`` holds one entry per
        noun, made by formatting a template picked with ``np.random.choice``
        from ``IMAGENET_DEFAULT_TEMPLATES``; ``nouns`` holds the extracted
        words or phrases.
    """
    tokens = nltk.word_tokenize(text)

    use_single_nouns = random.random() >= phrase_prob
    nouns = (
        get_tag(tokens, ['NN', 'NNS', 'NNP'])
        if use_single_nouns
        else get_noun_phrase(tokens)
    )

    prompt_texts = []
    for noun in nouns:
        # A template is drawn independently for every noun.
        template = np.random.choice(IMAGENET_DEFAULT_TEMPLATES)
        prompt_texts.append(template.format(noun))

    if append_text:
        prompt_texts.append(text)
        nouns.append(text)

    return prompt_texts, nouns