akhaliq's picture
akhaliq HF staff
add files
c80917c
raw history blame
No virus
1.08 kB
from random import uniform
import numpy as np
from collections import OrderedDict, defaultdict
from itertools import tee
import time
# -----------------------------------------------
def find_ngrams(input_list, n):
return zip(*[input_list[i:] for i in range(n)])
def compute_div_n(caps,n=1):
aggr_div = []
for k in caps:
all_ngrams = set()
lenT = 0.
for c in caps[k]:
tkns = c.split()
lenT += len(tkns)
ng = find_ngrams(tkns, n)
all_ngrams.update(ng)
aggr_div.append(float(len(all_ngrams))/ (1e-6 + float(lenT)))
return np.array(aggr_div).mean(), np.array(aggr_div)
def compute_global_div_n(caps,n=1):
aggr_div = []
all_ngrams = set()
lenT = 0.
for k in caps:
for c in caps[k]:
tkns = c.split()
lenT += len(tkns)
ng = find_ngrams(tkns, n)
all_ngrams.update(ng)
if n == 1:
aggr_div.append(float(len(all_ngrams)))
else:
aggr_div.append(float(len(all_ngrams))/ (1e-6 + float(lenT)))
return aggr_div[0], np.repeat(np.array(aggr_div),len(caps))