# Uploaded by akhaliq (HF staff); commit c80917c: "add files"
from random import uniform
import numpy as np
from collections import OrderedDict, defaultdict
from itertools import tee
import time
# -----------------------------------------------
def find_ngrams(input_list, n):
    """Return an iterator over the n-grams of *input_list*.

    Each n-gram is a tuple of ``n`` consecutive items. The iterator is
    empty when ``n`` is not positive or when *input_list* holds fewer
    than ``n`` items.
    """
    # One view of the sequence per offset 0..n-1; zipping them in lockstep
    # lines up every run of n consecutive items and stops at the shortest.
    shifted_views = []
    for offset in range(n):
        shifted_views.append(input_list[offset:])
    return zip(*shifted_views)
def compute_div_n(caps, n=1):
    """Compute the per-key distinct n-gram ratio of a caption collection.

    *caps* is indexed by key, and each ``caps[key]`` is iterated as a
    collection of caption strings. For every key the score is the number of
    distinct n-grams found across its captions divided by the total number
    of whitespace-separated tokens in those captions.

    Returns a tuple ``(mean_score, scores)`` where ``scores`` is a NumPy
    array with one entry per key, in iteration order of *caps*.
    """
    per_key_scores = []
    for key in caps:
        token_lists = [caption.split() for caption in caps[key]]

        token_total = 0.
        distinct_ngrams = set()
        for tokens in token_lists:
            token_total += len(tokens)
            distinct_ngrams.update(find_ngrams(tokens, n))

        # The 1e-6 term keeps the division defined when a key has no tokens.
        denominator = 1e-6 + float(token_total)
        per_key_scores.append(float(len(distinct_ngrams)) / denominator)

    scores = np.array(per_key_scores)
    return scores.mean(), scores
def compute_global_div_n(caps, n=1):
    """Compute one distinct n-gram score over all captions of all keys.

    *caps* is indexed by key, and each ``caps[key]`` is iterated as a
    collection of caption strings. N-grams and token counts are pooled
    across every key. For ``n == 1`` the score is the raw number of distinct
    unigrams; for any other ``n`` it is the number of distinct n-grams
    divided by the total token count.

    Returns a tuple ``(score, scores)`` where ``scores`` is a NumPy array
    holding ``score`` repeated ``len(caps)`` times.
    """
    distinct_ngrams = set()
    token_total = 0.
    every_caption = (caption for key in caps for caption in caps[key])
    for caption in every_caption:
        tokens = caption.split()
        token_total += len(tokens)
        distinct_ngrams.update(find_ngrams(tokens, n))

    score = float(len(distinct_ngrams))
    if n != 1:
        # The 1e-6 term keeps the division defined when there are no tokens.
        score = score / (1e-6 + float(token_total))

    return score, np.repeat(np.array([score]), len(caps))