#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich

"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
of a text to a configurable number of symbols, with only a small increase in the number of tokens.

This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets,
indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py).

Reference:
Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
"""

import re
import sys
import collections


def get_stats(vocab):
    """Count the frequency of each adjacent symbol pair in the vocabulary."""
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[symbols[i], symbols[i+1]] += freq
    return pairs


def merge_vocab(pair, v_in):
    """Replace all occurrences of the given symbol pair with a single merged symbol."""
    v_out = {}
    bigram_pattern = re.escape(' '.join(pair))
    # The lookbehind/lookahead ensure the pair is only matched at symbol
    # boundaries, not inside longer symbols.
    p = re.compile(r'(?<!\S)' + bigram_pattern + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out


# Toy vocabulary: each word is split into characters, with the end-of-word
# marker '</w>' attached to the final character; values are word frequencies.
vocab = {'l o w</w>': 5, 'l o w e r</w>': 2,
         'n e w e s t</w>': 6, 'w i d e s t</w>': 3}
num_merges = 15

for i in range(num_merges):
    pairs = get_stats(vocab)
    try:
        best = max(pairs, key=pairs.get)
    except ValueError:
        # No pairs left to merge: every word is already a single symbol.
        break
    if pairs[best] < 2:
        sys.stderr.write('no pair has frequency > 1. Stopping\n')
        break
    vocab = merge_vocab(best, vocab)
    print(best)
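

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): once the merge
# operations have been learned, they can be replayed, in the order they were
# learned, to segment a new word into subword units. `segment` below is a
# hypothetical helper for illustration only; the production logic lives in
# apply_bpe.py. To try it, collect the merges in the loop above, e.g. by
# appending `merges.append(best)` before `print(best)`.
# ---------------------------------------------------------------------------

def segment(word, merges):
    """Split `word` into subword units by replaying learned merges in order."""
    # Represent the word as in the toy vocabulary: one symbol per character,
    # with the end-of-word marker attached to the last one.
    symbols = list(word[:-1]) + [word[-1] + '</w>']
    for left, right in merges:
        i = 0
        while i < len(symbols) - 1:
            if symbols[i] == left and symbols[i+1] == right:
                symbols[i:i+2] = [left + right]  # merge in place, re-check at i
            else:
                i += 1
    return symbols

# With the toy vocabulary above, segment('lowest', merges) would yield
# something like ['low', 'est</w>'] (the exact merge sequence for tied
# pair counts depends on dict iteration order).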