| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifndef moses_ListCoders_h |
| | #define moses_ListCoders_h |
| |
|
| | #include <cmath> |
| | #include <cassert> |
| |
|
| | namespace Moses2 |
| | { |
| |
|
| | template<typename T = unsigned int> |
| | class VarIntType |
| | { |
| | private: |
| | template<typename IntType, typename OutIt> |
| | static void EncodeSymbol(IntType input, OutIt output) { |
| | if (input == 0) { |
| | *output = 0; |
| | output++; |
| | return; |
| | } |
| |
|
| | T msb = 1 << (sizeof(T) * 8 - 1); |
| | IntType mask = ~msb; |
| | IntType shift = (sizeof(T) * 8 - 1); |
| |
|
| | while (input) { |
| | T res = input & mask; |
| | input >>= shift; |
| | if (input) res |= msb; |
| | *output = res; |
| | output++; |
| | } |
| | } |
| | ; |
| |
|
| | template<typename InIt, typename IntType> |
| | static void DecodeSymbol(InIt &it, InIt end, IntType &output) { |
| | T msb = 1 << (sizeof(T) * 8 - 1); |
| | IntType shift = (sizeof(T) * 8 - 1); |
| |
|
| | output = 0; |
| | size_t i = 0; |
| | while (it != end && *it & msb) { |
| | IntType temp = *it & ~msb; |
| | temp <<= shift * i; |
| | output |= temp; |
| | it++; |
| | i++; |
| | } |
| | assert(it != end); |
| |
|
| | IntType temp = *it; |
| | temp <<= shift * i; |
| | output |= temp; |
| | it++; |
| | } |
| |
|
| | public: |
| |
|
| | template<typename InIt, typename OutIt> |
| | static void Encode(InIt it, InIt end, OutIt outIt) { |
| | while (it != end) { |
| | EncodeSymbol(*it, outIt); |
| | it++; |
| | } |
| | } |
| |
|
| | template<typename InIt, typename OutIt> |
| | static void Decode(InIt &it, InIt end, OutIt outIt) { |
| | while (it != end) { |
| | size_t output; |
| | DecodeSymbol(it, end, output); |
| | *outIt = output; |
| | outIt++; |
| | } |
| | } |
| |
|
| | template<typename InIt> |
| | static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { |
| | size_t sum = 0; |
| | size_t curr = 0; |
| |
|
| | while (it != end && curr < num) { |
| | size_t output; |
| | DecodeSymbol(it, end, output); |
| | sum += output; |
| | curr++; |
| | } |
| |
|
| | return sum; |
| | } |
| |
|
| | }; |
| |
|
| | typedef VarIntType<unsigned char> VarByte; |
| |
|
| | typedef VarByte VarInt8; |
| | typedef VarIntType<unsigned short> VarInt16; |
| | typedef VarIntType<unsigned int> VarInt32; |
| |
|
| | class Simple9 |
| | { |
| | private: |
| | typedef unsigned int uint; |
| |
|
| | template<typename InIt> |
| | inline static void EncodeSymbol(uint &output, InIt it, InIt end) { |
| | uint length = end - it; |
| |
|
| | uint type = 0; |
| | uint bitlength = 0; |
| |
|
| | switch (length) { |
| | case 1: |
| | type = 1; |
| | bitlength = 28; |
| | break; |
| | case 2: |
| | type = 2; |
| | bitlength = 14; |
| | break; |
| | case 3: |
| | type = 3; |
| | bitlength = 9; |
| | break; |
| | case 4: |
| | type = 4; |
| | bitlength = 7; |
| | break; |
| | case 5: |
| | type = 5; |
| | bitlength = 5; |
| | break; |
| | case 7: |
| | type = 6; |
| | bitlength = 4; |
| | break; |
| | case 9: |
| | type = 7; |
| | bitlength = 3; |
| | break; |
| | case 14: |
| | type = 8; |
| | bitlength = 2; |
| | break; |
| | case 28: |
| | type = 9; |
| | bitlength = 1; |
| | break; |
| | } |
| |
|
| | output = 0; |
| | output |= (type << 28); |
| |
|
| | uint i = 0; |
| | while (it != end) { |
| | UTIL_THROW_IF2(*it > 268435455, |
| | "You are trying to encode " << *it |
| | << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)"); |
| |
|
| | uint l = bitlength * (length - i - 1); |
| | output |= *it << l; |
| | it++; |
| | i++; |
| | } |
| | } |
| |
|
| | template<typename OutIt> |
| | static inline void DecodeSymbol(uint input, OutIt outIt) { |
| | uint type = (input >> 28); |
| |
|
| | uint bitlen = 0; |
| | uint shift = 0; |
| | uint mask = 0; |
| |
|
| | switch (type) { |
| | case 1: |
| | bitlen = 28; |
| | shift = 0; |
| | mask = 268435455; |
| | break; |
| | case 2: |
| | bitlen = 14; |
| | shift = 14; |
| | mask = 16383; |
| | break; |
| | case 3: |
| | bitlen = 9; |
| | shift = 18; |
| | mask = 511; |
| | break; |
| | case 4: |
| | bitlen = 7; |
| | shift = 21; |
| | mask = 127; |
| | break; |
| | case 5: |
| | bitlen = 5; |
| | shift = 20; |
| | mask = 31; |
| | break; |
| | case 6: |
| | bitlen = 4; |
| | shift = 24; |
| | mask = 15; |
| | break; |
| | case 7: |
| | bitlen = 3; |
| | shift = 24; |
| | mask = 7; |
| | break; |
| | case 8: |
| | bitlen = 2; |
| | shift = 26; |
| | mask = 3; |
| | break; |
| | case 9: |
| | bitlen = 1; |
| | shift = 27; |
| | mask = 1; |
| | break; |
| | } |
| |
|
| | while (shift > 0) { |
| | *outIt = (input >> shift) & mask; |
| | shift -= bitlen; |
| | outIt++; |
| | } |
| | *outIt = input & mask; |
| | outIt++; |
| | } |
| |
|
| | static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) { |
| | uint type = (input >> 28); |
| |
|
| | uint bitlen = 0; |
| | uint shift = 0; |
| | uint mask = 0; |
| |
|
| | switch (type) { |
| | case 1: |
| | bitlen = 28; |
| | shift = 0; |
| | mask = 268435455; |
| | break; |
| | case 2: |
| | bitlen = 14; |
| | shift = 14; |
| | mask = 16383; |
| | break; |
| | case 3: |
| | bitlen = 9; |
| | shift = 18; |
| | mask = 511; |
| | break; |
| | case 4: |
| | bitlen = 7; |
| | shift = 21; |
| | mask = 127; |
| | break; |
| | case 5: |
| | bitlen = 5; |
| | shift = 20; |
| | mask = 31; |
| | break; |
| | case 6: |
| | bitlen = 4; |
| | shift = 24; |
| | mask = 15; |
| | break; |
| | case 7: |
| | bitlen = 3; |
| | shift = 24; |
| | mask = 7; |
| | break; |
| | case 8: |
| | bitlen = 2; |
| | shift = 26; |
| | mask = 3; |
| | break; |
| | case 9: |
| | bitlen = 1; |
| | shift = 27; |
| | mask = 1; |
| | break; |
| | } |
| |
|
| | size_t sum = 0; |
| | while (shift > 0) { |
| | sum += (input >> shift) & mask; |
| | shift -= bitlen; |
| | if (++curr == num) return sum; |
| | } |
| | sum += input & mask; |
| | curr++; |
| | return sum; |
| | } |
| |
|
| | public: |
| | template<typename InIt, typename OutIt> |
| | static void Encode(InIt it, InIt end, OutIt outIt) { |
| | uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; |
| |
|
| | uint buffer[28]; |
| | for (InIt i = it; i < end; i++) { |
| | uint lastbit = 1; |
| | uint lastpos = 0; |
| | uint lastyes = 0; |
| | uint j = 0; |
| |
|
| | double log2 = log(2); |
| | while (j < 9 && lastpos < 28 && (i + lastpos) < end) { |
| | if (lastpos >= parts[j]) j++; |
| |
|
| | buffer[lastpos] = *(i + lastpos); |
| |
|
| | uint reqbit = ceil(log(buffer[lastpos] + 1) / log2); |
| | assert(reqbit <= 28); |
| |
|
| | uint bit = 28 / floor(28 / reqbit); |
| | if (lastbit < bit) lastbit = bit; |
| |
|
| | if (parts[j] > 28 / lastbit) break; |
| | else if (lastpos == parts[j] - 1) lastyes = lastpos; |
| |
|
| | lastpos++; |
| | } |
| | i += lastyes; |
| |
|
| | uint length = lastyes + 1; |
| | uint output; |
| | EncodeSymbol(output, buffer, buffer + length); |
| |
|
| | *outIt = output; |
| | outIt++; |
| | } |
| | } |
| |
|
| | template<typename InIt, typename OutIt> |
| | static void Decode(InIt &it, InIt end, OutIt outIt) { |
| | while (it != end) { |
| | DecodeSymbol(*it, outIt); |
| | it++; |
| | } |
| | } |
| |
|
| | template<typename InIt> |
| | static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { |
| | size_t sum = 0; |
| | size_t curr = 0; |
| | while (it != end && curr < num) { |
| | sum += DecodeAndSumSymbol(*it, num, curr); |
| | it++; |
| | } |
| | assert(curr == num); |
| | return sum; |
| | } |
| | }; |
| |
|
| | } |
| |
|
| | #endif |
| |
|