|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef NO_PYTHON |
|
#define _GNU_SOURCE |
|
#include <string.h> |
|
#include <math.h> |
|
|
|
#include <stdio.h> |
|
#else |
|
#define _LEV_STATIC_PY static |
|
#define lev_wchar Py_UNICODE |
|
#include <Python.h> |
|
#endif |
|
|
|
#if PY_MAJOR_VERSION >= 3 |
|
#define LEV_PYTHON3 |
|
#define PyString_Type PyBytes_Type |
|
#define PyString_GET_SIZE PyBytes_GET_SIZE |
|
#define PyString_AS_STRING PyBytes_AS_STRING |
|
#define PyString_Check PyBytes_Check |
|
#define PyString_FromStringAndSize PyBytes_FromStringAndSize |
|
#define PyString_InternFromString PyUnicode_InternFromString |
|
#define PyInt_AS_LONG PyLong_AsLong |
|
#define PyInt_FromLong PyLong_FromLong |
|
#define PyInt_Check PyLong_Check |
|
#define PY_INIT_MOD(module, name, doc, methods) \ |
|
static struct PyModuleDef moduledef = { \ |
|
PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ |
|
module = PyModule_Create(&moduledef); |
|
#define PY_MOD_INIT_FUNC_DEF(name) PyObject* PyInit_##name(void) |
|
#else |
|
#define PY_INIT_MOD(module, name, doc, methods) \ |
|
Py_InitModule3(name, methods, doc); |
|
#define PY_MOD_INIT_FUNC_DEF(name) void init##name(void) |
|
#endif |
|
|
|
#include <assert.h> |
|
#include "_levenshtein.h" |
|
|
|
|
|
|
|
|
|
#ifndef __GNUC__ |
|
# ifdef _MSC_VER |
|
# define inline __inline |
|
# else |
|
# define inline |
|
# endif |
|
# define __attribute__(x) |
|
#endif |
|
|
|
#define LEV_EPSILON 1e-14 |
|
#define LEV_INFINITY 1e100 |
|
|
|
|
|
|
|
|
|
#define PYARGCFIX(x) (char*)(x) |
|
|
|
|
|
static size_t* |
|
munkers_blackman(size_t n1, |
|
size_t n2, |
|
double *dists); |
|
|
|
#define TAUS113_LCG(n) ((69069UL * n) & 0xffffffffUL) |
|
#define TAUS113_MASK 0xffffffffUL |
|
|
|
typedef struct { |
|
unsigned long int z1, z2, z3, z4; |
|
} taus113_state_t; |
|
|
|
static inline unsigned long int |
|
taus113_get(taus113_state_t *state); |
|
|
|
static void |
|
taus113_set(taus113_state_t *state, |
|
unsigned long int s); |
|
|
|
#ifndef NO_PYTHON |
|
|
|
|
|
static PyObject* distance_py(PyObject *self, PyObject *args); |
|
static PyObject* ratio_py(PyObject *self, PyObject *args); |
|
static PyObject* hamming_py(PyObject *self, PyObject *args); |
|
static PyObject* jaro_py(PyObject *self, PyObject *args); |
|
static PyObject* jaro_winkler_py(PyObject *self, PyObject *args); |
|
static PyObject* median_py(PyObject *self, PyObject *args); |
|
static PyObject* median_improve_py(PyObject *self, PyObject *args); |
|
static PyObject* quickmedian_py(PyObject *self, PyObject *args); |
|
static PyObject* setmedian_py(PyObject *self, PyObject *args); |
|
static PyObject* seqratio_py(PyObject *self, PyObject *args); |
|
static PyObject* setratio_py(PyObject *self, PyObject *args); |
|
static PyObject* editops_py(PyObject *self, PyObject *args); |
|
static PyObject* opcodes_py(PyObject *self, PyObject *args); |
|
static PyObject* inverse_py(PyObject *self, PyObject *args); |
|
static PyObject* apply_edit_py(PyObject *self, PyObject *args); |
|
static PyObject* matching_blocks_py(PyObject *self, PyObject *args); |
|
static PyObject* subtract_edit_py(PyObject *self, PyObject *args); |
|
|
|
#define Levenshtein_DESC \ |
|
"A C extension module for fast computation of:\n" \ |
|
"- Levenshtein (edit) distance and edit sequence manipulation\n" \ |
|
"- string similarity\n" \ |
|
"- approximate median strings, and generally string averaging\n" \ |
|
"- string sequence and set similarity\n" \ |
|
"\n" \ |
|
"Levenshtein has a some overlap with difflib (SequenceMatcher). It\n" \ |
|
"supports only strings, not arbitrary sequence types, but on the\n" \ |
|
"other hand it's much faster.\n" \ |
|
"\n" \ |
|
"It supports both normal and Unicode strings, but can't mix them, all\n" \ |
|
"arguments to a function (method) have to be of the same type (or its\n" \ |
|
"subclasses).\n" |
|
|
|
#define distance_DESC \ |
|
"Compute absolute Levenshtein distance of two strings.\n" \ |
|
"\n" \ |
|
"distance(string1, string2)\n" \ |
|
"\n" \ |
|
"Examples (it's hard to spell Levenshtein correctly):\n" \ |
|
"\n" \ |
|
">>> distance('Levenshtein', 'Lenvinsten')\n" \ |
|
"4\n" \ |
|
">>> distance('Levenshtein', 'Levensthein')\n" \ |
|
"2\n" \ |
|
">>> distance('Levenshtein', 'Levenshten')\n" \ |
|
"1\n" \ |
|
">>> distance('Levenshtein', 'Levenshtein')\n" \ |
|
"0\n" \ |
|
"\n" \ |
|
"Yeah, we've managed it at last.\n" |
|
|
|
#define ratio_DESC \ |
|
"Compute similarity of two strings.\n" \ |
|
"\n" \ |
|
"ratio(string1, string2)\n" \ |
|
"\n" \ |
|
"The similarity is a number between 0 and 1, it's usually equal or\n" \ |
|
"somewhat higher than difflib.SequenceMatcher.ratio(), because it's\n" \ |
|
"based on real minimal edit distance.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> ratio('Hello world!', 'Holly grail!') # doctest: +ELLIPSIS\n" \ |
|
"0.583333...\n" \ |
|
">>> ratio('Brian', 'Jesus')\n" \ |
|
"0.0\n" \ |
|
"\n" \ |
|
"Really? I thought there was some similarity.\n" |
|
|
|
#define hamming_DESC \ |
|
"Compute Hamming distance of two strings.\n" \ |
|
"\n" \ |
|
"hamming(string1, string2)\n" \ |
|
"\n" \ |
|
"The Hamming distance is simply the number of differing characters.\n" \ |
|
"That means the length of the strings must be the same.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
">>> hamming('Hello world!', 'Holly grail!')\n" \ |
|
"7\n" \ |
|
">>> hamming('Brian', 'Jesus')\n" \ |
|
"5\n" |
|
|
|
#define jaro_DESC \ |
|
"Compute Jaro string similarity metric of two strings.\n" \ |
|
"\n" \ |
|
"jaro(string1, string2)\n" \ |
|
"\n" \ |
|
"The Jaro string similarity metric is intended for short strings like\n" \ |
|
"personal last names. It is 0 for completely different strings and\n" \ |
|
"1 for identical strings.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
">>> jaro('Brian', 'Jesus')\n" \ |
|
"0.0\n" \ |
|
">>> jaro('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \ |
|
"0.779761...\n" \ |
|
">>> jaro('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \ |
|
"0.708333...\n" |
|
|
|
#define jaro_winkler_DESC \ |
|
"Compute Jaro string similarity metric of two strings.\n" \ |
|
"\n" \ |
|
"jaro_winkler(string1, string2[, prefix_weight])\n" \ |
|
"\n" \ |
|
"The Jaro-Winkler string similarity metric is a modification of Jaro\n" \ |
|
"metric giving more weight to common prefix, as spelling mistakes are\n" \ |
|
"more likely to occur near ends of words.\n" \ |
|
"\n" \ |
|
"The prefix weight is inverse value of common prefix length sufficient\n" \ |
|
"to consider the strings *identical*. If no prefix weight is\n" \ |
|
"specified, 1/10 is used.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> jaro_winkler('Brian', 'Jesus')\n" \ |
|
"0.0\n" \ |
|
">>> jaro_winkler('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \ |
|
"0.867857...\n" \ |
|
">>> jaro_winkler('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \ |
|
"0.7375...\n" \ |
|
">>> jaro_winkler('Thorkel', 'Thorgier', 0.25)\n" \ |
|
"1.0\n" |
|
|
|
#define median_DESC \ |
|
"Find an approximate generalized median string using greedy algorithm.\n" \ |
|
"\n" \ |
|
"median(string_sequence[, weight_sequence])\n" \ |
|
"\n" \ |
|
"You can optionally pass a weight for each string as the second\n" \ |
|
"argument. The weights are interpreted as item multiplicities,\n" \ |
|
"although any non-negative real numbers are accepted. Use them to\n" \ |
|
"improve computation speed when strings often appear multiple times\n" \ |
|
"in the sequence.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> median(['SpSm', 'mpamm', 'Spam', 'Spa', 'Sua', 'hSam'])\n" \ |
|
"'Spam'\n" \ |
|
">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ |
|
"... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ |
|
"... 'Levenhtin', 'evenshtei']\n" \ |
|
">>> median(fixme)\n" \ |
|
"'Levenshtein'\n" \ |
|
"\n" \ |
|
"Hm. Even a computer program can spell Levenshtein better than me.\n" |
|
|
|
#define median_improve_DESC \ |
|
"Improve an approximate generalized median string by perturbations.\n" \ |
|
"\n" \ |
|
"median_improve(string, string_sequence[, weight_sequence])\n" \ |
|
"\n" \ |
|
"The first argument is the estimated generalized median string you\n" \ |
|
"want to improve, the others are the same as in median(). It returns\n" \ |
|
"a string with total distance less or equal to that of the given string.\n" \ |
|
"\n" \ |
|
"Note this is much slower than median(). Also note it performs only\n" \ |
|
"one improvement step, calling median_improve() again on the result\n" \ |
|
"may improve it further, though this is unlikely to happen unless the\n" \ |
|
"given string was not very similar to the actual generalized median.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ |
|
"... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ |
|
"... 'Levenhtin', 'evenshtei']\n" \ |
|
">>> median_improve('spam', fixme)\n" \ |
|
"'enhtein'\n" \ |
|
">>> median_improve(median_improve('spam', fixme), fixme)\n" \ |
|
"'Levenshtein'\n" \ |
|
"\n" \ |
|
"It takes some work to change spam to Levenshtein.\n" |
|
|
|
#define quickmedian_DESC \ |
|
"Find a very approximate generalized median string, but fast.\n" \ |
|
"\n" \ |
|
"quickmedian(string[, weight_sequence])\n" \ |
|
"\n" \ |
|
"See median() for argument description.\n" \ |
|
"\n" \ |
|
"This method is somewhere between setmedian() and picking a random\n" \ |
|
"string from the set; both speedwise and quality-wise.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ |
|
"... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ |
|
"... 'Levenhtin', 'evenshtei']\n" \ |
|
">>> quickmedian(fixme)\n" \ |
|
"'Levnshein'\n" |
|
|
|
#define setmedian_DESC \ |
|
"Find set median of a string set (passed as a sequence).\n" \ |
|
"\n" \ |
|
"setmedian(string[, weight_sequence])\n" \ |
|
"\n" \ |
|
"See median() for argument description.\n" \ |
|
"\n" \ |
|
"The returned string is always one of the strings in the sequence.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> setmedian(['ehee', 'cceaes', 'chees', 'chreesc',\n" \ |
|
"... 'chees', 'cheesee', 'cseese', 'chetese'])\n" \ |
|
"'chees'\n" \ |
|
"\n" \ |
|
"You haven't asked me about Limburger, sir.\n" |
|
|
|
#define seqratio_DESC \ |
|
"Compute similarity ratio of two sequences of strings.\n" \ |
|
"\n" \ |
|
"seqratio(string_sequence1, string_sequence2)\n" \ |
|
"\n" \ |
|
"This is like ratio(), but for string sequences. A kind of ratio()\n" \ |
|
"is used to to measure the cost of item change operation for the\n" \ |
|
"strings.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> seqratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \ |
|
"... ['caribou', 'sausage', 'gorn', 'woody'])\n" \ |
|
"0.21517857142857144\n" |
|
|
|
#define setratio_DESC \ |
|
"Compute similarity ratio of two strings sets (passed as sequences).\n" \ |
|
"\n" \ |
|
"setratio(string_sequence1, string_sequence2)\n" \ |
|
"\n" \ |
|
"The best match between any strings in the first set and the second\n" \ |
|
"set (passed as sequences) is attempted. I.e., the order doesn't\n" \ |
|
"matter here.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> setratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \ |
|
"... ['caribou', 'sausage', 'gorn', 'woody']) # doctest: +ELLIPSIS\n" \ |
|
"0.281845...\n" \ |
|
"\n" \ |
|
"No, even reordering doesn't help the tinny words to match the\n" \ |
|
"woody ones.\n" |
|
|
|
#define editops_DESC \ |
|
"Find sequence of edit operations transforming one string to another.\n" \ |
|
"\n" \ |
|
"editops(source_string, destination_string)\n" \ |
|
"editops(edit_operations, source_length, destination_length)\n" \ |
|
"\n" \ |
|
"The result is a list of triples (operation, spos, dpos), where\n" \ |
|
"operation is one of 'equal', 'replace', 'insert', or 'delete'; spos\n" \ |
|
"and dpos are position of characters in the first (source) and the\n" \ |
|
"second (destination) strings. These are operations on signle\n" \ |
|
"characters. In fact the returned list doesn't contain the 'equal',\n" \ |
|
"but all the related functions accept both lists with and without\n" \ |
|
"'equal's.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> editops('spam', 'park')\n" \ |
|
"[('delete', 0, 0), ('insert', 3, 2), ('replace', 3, 3)]\n" \ |
|
"\n" \ |
|
"The alternate form editops(opcodes, source_string, destination_string)\n" \ |
|
"can be used for conversion from opcodes (5-tuples) to editops (you can\n" \ |
|
"pass strings or their lengths, it doesn't matter).\n" |
|
|
|
#define opcodes_DESC \ |
|
"Find sequence of edit operations transforming one string to another.\n" \ |
|
"\n" \ |
|
"opcodes(source_string, destination_string)\n" \ |
|
"opcodes(edit_operations, source_length, destination_length)\n" \ |
|
"\n" \ |
|
"The result is a list of 5-tuples with the same meaning as in\n" \ |
|
"SequenceMatcher's get_opcodes() output. But since the algorithms\n" \ |
|
"differ, the actual sequences from Levenshtein and SequenceMatcher\n" \ |
|
"may differ too.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> for x in opcodes('spam', 'park'):\n" \ |
|
"... print(x)\n" \ |
|
"...\n" \ |
|
"('delete', 0, 1, 0, 0)\n" \ |
|
"('equal', 1, 3, 0, 2)\n" \ |
|
"('insert', 3, 3, 2, 3)\n" \ |
|
"('replace', 3, 4, 3, 4)\n" \ |
|
"\n" \ |
|
"The alternate form opcodes(editops, source_string, destination_string)\n" \ |
|
"can be used for conversion from editops (triples) to opcodes (you can\n" \ |
|
"pass strings or their lengths, it doesn't matter).\n" |
|
|
|
#define inverse_DESC \ |
|
"Invert the sense of an edit operation sequence.\n" \ |
|
"\n" \ |
|
"inverse(edit_operations)\n" \ |
|
"\n" \ |
|
"In other words, it returns a list of edit operations transforming the\n" \ |
|
"second (destination) string to the first (source). It can be used\n" \ |
|
"with both editops and opcodes.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> inverse(editops('spam', 'park'))\n" \ |
|
"[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" \ |
|
">>> editops('park', 'spam')\n" \ |
|
"[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" |
|
|
|
#define apply_edit_DESC \ |
|
"Apply a sequence of edit operations to a string.\n" \ |
|
"\n" \ |
|
"apply_edit(edit_operations, source_string, destination_string)\n" \ |
|
"\n" \ |
|
"In the case of editops, the sequence can be arbitrary ordered subset\n" \ |
|
"of the edit sequence transforming source_string to destination_string.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> e = editops('man', 'scotsman')\n" \ |
|
">>> apply_edit(e, 'man', 'scotsman')\n" \ |
|
"'scotsman'\n" \ |
|
">>> apply_edit(e[:3], 'man', 'scotsman')\n" \ |
|
"'scoman'\n" \ |
|
"\n" \ |
|
"The other form of edit operations, opcodes, is not very suitable for\n" \ |
|
"such a tricks, because it has to always span over complete strings,\n" \ |
|
"subsets can be created by carefully replacing blocks with 'equal'\n" \ |
|
"blocks, or by enlarging 'equal' block at the expense of other blocks\n" \ |
|
"and adjusting the other blocks accordingly.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
">>> a, b = 'spam and eggs', 'foo and bar'\n" \ |
|
">>> e = opcodes(a, b)\n" \ |
|
">>> apply_edit(inverse(e), b, a)\n" \ |
|
"'spam and eggs'\n" \ |
|
">>> e[4] = ('equal', 10, 13, 8, 11)\n" \ |
|
">>> e\n" \ |
|
"[('delete', 0, 1, 0, 0), ('replace', 1, 4, 0, 3), ('equal', 4, 9, 3, 8), ('delete', 9, 10, 8, 8), ('equal', 10, 13, 8, 11)]\n" \ |
|
">>> apply_edit(e, a, b)\n" \ |
|
"'foo and ggs'\n" |
|
|
|
#define matching_blocks_DESC \ |
|
"Find identical blocks in two strings.\n" \ |
|
"\n" \ |
|
"matching_blocks(edit_operations, source_length, destination_length)\n" \ |
|
"\n" \ |
|
"The result is a list of triples with the same meaning as in\n" \ |
|
"SequenceMatcher's get_matching_blocks() output. It can be used with\n" \ |
|
"both editops and opcodes. The second and third arguments don't\n" \ |
|
"have to be actually strings, their lengths are enough.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> a, b = 'spam', 'park'\n" \ |
|
">>> matching_blocks(editops(a, b), a, b)\n" \ |
|
"[(1, 0, 2), (4, 4, 0)]\n" \ |
|
">>> matching_blocks(editops(a, b), len(a), len(b))\n" \ |
|
"[(1, 0, 2), (4, 4, 0)]\n" \ |
|
"\n" \ |
|
"The last zero-length block is not an error, but it's there for\n" \ |
|
"compatibility with difflib which always emits it.\n" \ |
|
"\n" \ |
|
"One can join the matching blocks to get two identical strings:\n" \ |
|
">>> a, b = 'dog kennels', 'mattresses'\n" \ |
|
">>> mb = matching_blocks(editops(a,b), a, b)\n" \ |
|
">>> ''.join([a[x[0]:x[0]+x[2]] for x in mb])\n" \ |
|
"'ees'\n" \ |
|
">>> ''.join([b[x[1]:x[1]+x[2]] for x in mb])\n" \ |
|
"'ees'\n" |
|
|
|
#define subtract_edit_DESC \ |
|
"Subtract an edit subsequence from a sequence.\n" \ |
|
"\n" \ |
|
"subtract_edit(edit_operations, subsequence)\n" \ |
|
"\n" \ |
|
"The result is equivalent to\n" \ |
|
"editops(apply_edit(subsequence, s1, s2), s2), except that is\n" \ |
|
"constructed directly from the edit operations. That is, if you apply\n" \ |
|
"it to the result of subsequence application, you get the same final\n" \ |
|
"string as from application complete edit_operations. It may be not\n" \ |
|
"identical, though (in amibuous cases, like insertion of a character\n" \ |
|
"next to the same character).\n" \ |
|
"\n" \ |
|
"The subtracted subsequence must be an ordered subset of\n" \ |
|
"edit_operations.\n" \ |
|
"\n" \ |
|
"Note this function does not accept difflib-style opcodes as no one in\n" \ |
|
"his right mind wants to create subsequences from them.\n" \ |
|
"\n" \ |
|
"Examples:\n" \ |
|
"\n" \ |
|
">>> e = editops('man', 'scotsman')\n" \ |
|
">>> e1 = e[:3]\n" \ |
|
">>> bastard = apply_edit(e1, 'man', 'scotsman')\n" \ |
|
">>> bastard\n" \ |
|
"'scoman'\n" \ |
|
">>> apply_edit(subtract_edit(e, e1), bastard, 'scotsman')\n" \ |
|
"'scotsman'\n" \ |
|
|
|
#define METHODS_ITEM(x) { #x, x##_py, METH_VARARGS, x##_DESC } |
|
static PyMethodDef methods[] = { |
|
METHODS_ITEM(distance), |
|
METHODS_ITEM(ratio), |
|
METHODS_ITEM(hamming), |
|
METHODS_ITEM(jaro), |
|
METHODS_ITEM(jaro_winkler), |
|
METHODS_ITEM(median), |
|
METHODS_ITEM(median_improve), |
|
METHODS_ITEM(quickmedian), |
|
METHODS_ITEM(setmedian), |
|
METHODS_ITEM(seqratio), |
|
METHODS_ITEM(setratio), |
|
METHODS_ITEM(editops), |
|
METHODS_ITEM(opcodes), |
|
METHODS_ITEM(inverse), |
|
METHODS_ITEM(apply_edit), |
|
METHODS_ITEM(matching_blocks), |
|
METHODS_ITEM(subtract_edit), |
|
{ NULL, NULL, 0, NULL }, |
|
}; |
|
|
|
|
|
|
|
struct { |
|
PyObject* pystring; |
|
const char *cstring; |
|
size_t len; |
|
} |
|
static opcode_names[] = { |
|
{ NULL, "equal", 0 }, |
|
{ NULL, "replace", 0 }, |
|
{ NULL, "insert", 0 }, |
|
{ NULL, "delete", 0 }, |
|
}; |
|
#define N_OPCODE_NAMES ((sizeof(opcode_names)/sizeof(opcode_names[0]))) |
|
|
|
typedef lev_byte *(*MedianFuncString)(size_t n, |
|
const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
size_t *medlength); |
|
typedef Py_UNICODE *(*MedianFuncUnicode)(size_t n, |
|
const size_t *lengths, |
|
const Py_UNICODE *strings[], |
|
const double *weights, |
|
size_t *medlength); |
|
typedef struct { |
|
MedianFuncString s; |
|
MedianFuncUnicode u; |
|
} MedianFuncs; |
|
|
|
typedef lev_byte *(*MedianImproveFuncString)(size_t len, const lev_byte *s, |
|
size_t n, |
|
const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
size_t *medlength); |
|
typedef Py_UNICODE *(*MedianImproveFuncUnicode)(size_t len, const Py_UNICODE *s, |
|
size_t n, |
|
const size_t *lengths, |
|
const Py_UNICODE *strings[], |
|
const double *weights, |
|
size_t *medlength); |
|
typedef struct { |
|
MedianImproveFuncString s; |
|
MedianImproveFuncUnicode u; |
|
} MedianImproveFuncs; |
|
|
|
typedef double (*SetSeqFuncString)(size_t n1, |
|
const size_t *lengths1, |
|
const lev_byte *strings1[], |
|
size_t n2, |
|
const size_t *lengths2, |
|
const lev_byte *strings2[]); |
|
typedef double (*SetSeqFuncUnicode)(size_t n1, |
|
const size_t *lengths1, |
|
const Py_UNICODE *strings1[], |
|
size_t n2, |
|
const size_t *lengths2, |
|
const Py_UNICODE *strings2[]); |
|
|
|
typedef struct { |
|
SetSeqFuncString s; |
|
SetSeqFuncUnicode u; |
|
} SetSeqFuncs; |
|
|
|
static long int |
|
levenshtein_common(PyObject *args, |
|
const char *name, |
|
size_t xcost, |
|
size_t *lensum); |
|
|
|
static int |
|
extract_stringlist(PyObject *list, |
|
const char *name, |
|
size_t n, |
|
size_t **sizelist, |
|
void *strlist); |
|
|
|
static double* |
|
extract_weightlist(PyObject *wlist, |
|
const char *name, |
|
size_t n); |
|
|
|
static PyObject* |
|
median_common(PyObject *args, |
|
const char *name, |
|
MedianFuncs foo); |
|
|
|
static PyObject* |
|
median_improve_common(PyObject *args, |
|
const char *name, |
|
MedianImproveFuncs foo); |
|
|
|
static double |
|
setseq_common(PyObject *args, |
|
const char *name, |
|
SetSeqFuncs foo, |
|
size_t *lensum); |
|
|
|
static void * |
|
safe_malloc(size_t nmemb, size_t size) { |
|
|
|
if (SIZE_MAX / size <= nmemb) { |
|
return NULL; |
|
} |
|
return malloc(nmemb * size); |
|
} |
|
|
|
static void * |
|
safe_malloc_3(size_t nmemb1, size_t nmemb2, size_t size) { |
|
|
|
if (SIZE_MAX / size <= nmemb2) { |
|
return NULL; |
|
} |
|
return safe_malloc(nmemb1, nmemb2 * size); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static long int |
|
levenshtein_common(PyObject *args, const char *name, size_t xcost, |
|
size_t *lensum) |
|
{ |
|
PyObject *arg1, *arg2; |
|
size_t len1, len2; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) |
|
return -1; |
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2; |
|
|
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
*lensum = len1 + len2; |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
{ |
|
size_t d = lev_edit_distance(len1, string1, len2, string2, xcost); |
|
if (d == (size_t)(-1)) { |
|
PyErr_NoMemory(); |
|
return -1; |
|
} |
|
return d; |
|
} |
|
} |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2; |
|
|
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
*lensum = len1 + len2; |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
{ |
|
size_t d = lev_u_edit_distance(len1, string1, len2, string2, xcost); |
|
if (d == (size_t)(-1)) { |
|
PyErr_NoMemory(); |
|
return -1; |
|
} |
|
return d; |
|
} |
|
} |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s expected two Strings or two Unicodes", name); |
|
return -1; |
|
} |
|
} |
|
|
|
static PyObject* |
|
distance_py(PyObject *self, PyObject *args) |
|
{ |
|
size_t lensum; |
|
long int ldist; |
|
|
|
if ((ldist = levenshtein_common(args, "distance", 0, &lensum)) < 0) |
|
return NULL; |
|
|
|
return PyInt_FromLong((long)ldist); |
|
} |
|
|
|
static PyObject* |
|
ratio_py(PyObject *self, PyObject *args) |
|
{ |
|
size_t lensum; |
|
long int ldist; |
|
|
|
if ((ldist = levenshtein_common(args, "ratio", 1, &lensum)) < 0) |
|
return NULL; |
|
|
|
if (lensum == 0) |
|
return PyFloat_FromDouble(1.0); |
|
|
|
return PyFloat_FromDouble((double)(lensum - ldist)/(lensum)); |
|
} |
|
|
|
static PyObject* |
|
hamming_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *arg1, *arg2; |
|
const char *name = "hamming"; |
|
size_t len1, len2; |
|
long int dist; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) |
|
return NULL; |
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2; |
|
|
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
if (len1 != len2) { |
|
PyErr_Format(PyExc_ValueError, |
|
"%s expected two strings of the same length", name); |
|
return NULL; |
|
} |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
dist = lev_hamming_distance(len1, string1, string2); |
|
return PyInt_FromLong(dist); |
|
} |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2; |
|
|
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
if (len1 != len2) { |
|
PyErr_Format(PyExc_ValueError, |
|
"%s expected two unicodes of the same length", name); |
|
return NULL; |
|
} |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
dist = lev_u_hamming_distance(len1, string1, string2); |
|
return PyInt_FromLong(dist); |
|
} |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s expected two Strings or two Unicodes", name); |
|
return NULL; |
|
} |
|
} |
|
|
|
static PyObject* |
|
jaro_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *arg1, *arg2; |
|
const char *name = "jaro"; |
|
size_t len1, len2; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) |
|
return NULL; |
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2; |
|
|
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
return PyFloat_FromDouble(lev_jaro_ratio(len1, string1, len2, string2)); |
|
} |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2; |
|
|
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
return PyFloat_FromDouble(lev_u_jaro_ratio(len1, string1, len2, string2)); |
|
} |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s expected two Strings or two Unicodes", name); |
|
return NULL; |
|
} |
|
} |
|
|
|
static PyObject* |
|
jaro_winkler_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *arg1, *arg2, *arg3 = NULL; |
|
double pfweight = 0.1; |
|
const char *name = "jaro_winkler"; |
|
size_t len1, len2; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &arg2, &arg3)) |
|
return NULL; |
|
|
|
if (arg3) { |
|
if (!PyObject_TypeCheck(arg3, &PyFloat_Type)) { |
|
PyErr_Format(PyExc_TypeError, "%s third argument must be a Float", name); |
|
return NULL; |
|
} |
|
pfweight = PyFloat_AS_DOUBLE(arg3); |
|
if (pfweight < 0.0) { |
|
PyErr_Format(PyExc_ValueError, "%s negative prefix weight", name); |
|
return NULL; |
|
} |
|
} |
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2; |
|
|
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
return PyFloat_FromDouble(lev_jaro_winkler_ratio(len1, string1, |
|
len2, string2, |
|
pfweight)); |
|
} |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2; |
|
|
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
return PyFloat_FromDouble(lev_u_jaro_winkler_ratio(len1, string1, |
|
len2, string2, |
|
pfweight)); |
|
} |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s expected two Strings or two Unicodes", name); |
|
return NULL; |
|
} |
|
} |
|
|
|
static PyObject* |
|
median_py(PyObject *self, PyObject *args) |
|
{ |
|
MedianFuncs engines = { lev_greedy_median, lev_u_greedy_median }; |
|
return median_common(args, "median", engines); |
|
} |
|
|
|
static PyObject* |
|
median_improve_py(PyObject *self, PyObject *args) |
|
{ |
|
MedianImproveFuncs engines = { lev_median_improve, lev_u_median_improve }; |
|
return median_improve_common(args, "median_improve", engines); |
|
} |
|
|
|
static PyObject* |
|
quickmedian_py(PyObject *self, PyObject *args) |
|
{ |
|
MedianFuncs engines = { lev_quick_median, lev_u_quick_median }; |
|
return median_common(args, "quickmedian", engines); |
|
} |
|
|
|
static PyObject* |
|
setmedian_py(PyObject *self, PyObject *args) |
|
{ |
|
MedianFuncs engines = { lev_set_median, lev_u_set_median }; |
|
return median_common(args, "setmedian", engines); |
|
} |
|
|
|
static PyObject* |
|
median_common(PyObject *args, const char *name, MedianFuncs foo) |
|
{ |
|
size_t n, len; |
|
void *strings = NULL; |
|
size_t *sizes = NULL; |
|
PyObject *strlist = NULL; |
|
PyObject *wlist = NULL; |
|
PyObject *strseq = NULL; |
|
double *weights; |
|
int stringtype; |
|
PyObject *result = NULL; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 1, 2, &strlist, &wlist)) |
|
return NULL; |
|
|
|
if (!PySequence_Check(strlist)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s first argument must be a Sequence", name); |
|
return NULL; |
|
} |
|
strseq = PySequence_Fast(strlist, name); |
|
|
|
n = PySequence_Fast_GET_SIZE(strseq); |
|
if (n == 0) { |
|
Py_INCREF(Py_None); |
|
Py_DECREF(strseq); |
|
return Py_None; |
|
} |
|
|
|
|
|
weights = extract_weightlist(wlist, name, n); |
|
if (!weights) { |
|
Py_DECREF(strseq); |
|
return NULL; |
|
} |
|
|
|
stringtype = extract_stringlist(strseq, name, n, &sizes, &strings); |
|
Py_DECREF(strseq); |
|
if (stringtype < 0) { |
|
free(weights); |
|
return NULL; |
|
} |
|
|
|
if (stringtype == 0) { |
|
lev_byte *medstr = foo.s(n, sizes, strings, weights, &len); |
|
if (!medstr && len) |
|
result = PyErr_NoMemory(); |
|
else { |
|
result = PyString_FromStringAndSize(medstr, len); |
|
free(medstr); |
|
} |
|
} |
|
else if (stringtype == 1) { |
|
Py_UNICODE *medstr = foo.u(n, sizes, strings, weights, &len); |
|
if (!medstr && len) |
|
result = PyErr_NoMemory(); |
|
else { |
|
result = PyUnicode_FromUnicode(medstr, len); |
|
free(medstr); |
|
} |
|
} |
|
else |
|
PyErr_Format(PyExc_SystemError, "%s internal error", name); |
|
|
|
free(strings); |
|
free(weights); |
|
free(sizes); |
|
return result; |
|
} |
|
|
|
static PyObject* |
|
median_improve_common(PyObject *args, const char *name, MedianImproveFuncs foo) |
|
{ |
|
size_t n, len; |
|
void *strings = NULL; |
|
size_t *sizes = NULL; |
|
PyObject *arg1 = NULL; |
|
PyObject *strlist = NULL; |
|
PyObject *wlist = NULL; |
|
PyObject *strseq = NULL; |
|
double *weights; |
|
int stringtype; |
|
PyObject *result = NULL; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &strlist, &wlist)) |
|
return NULL; |
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type)) |
|
stringtype = 0; |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)) |
|
stringtype = 1; |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s first argument must be a String or Unicode", name); |
|
return NULL; |
|
} |
|
|
|
if (!PySequence_Check(strlist)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s second argument must be a Sequence", name); |
|
return NULL; |
|
} |
|
strseq = PySequence_Fast(strlist, name); |
|
|
|
n = PySequence_Fast_GET_SIZE(strseq); |
|
if (n == 0) { |
|
Py_INCREF(Py_None); |
|
Py_DECREF(strseq); |
|
return Py_None; |
|
} |
|
|
|
|
|
weights = extract_weightlist(wlist, name, n); |
|
if (!weights) { |
|
Py_DECREF(strseq); |
|
return NULL; |
|
} |
|
|
|
if (extract_stringlist(strseq, name, n, &sizes, &strings) != stringtype) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s argument types don't match", name); |
|
free(weights); |
|
return NULL; |
|
} |
|
|
|
Py_DECREF(strseq); |
|
if (stringtype == 0) { |
|
lev_byte *s = PyString_AS_STRING(arg1); |
|
size_t l = PyString_GET_SIZE(arg1); |
|
lev_byte *medstr = foo.s(l, s, n, sizes, strings, weights, &len); |
|
if (!medstr && len) |
|
result = PyErr_NoMemory(); |
|
else { |
|
result = PyString_FromStringAndSize(medstr, len); |
|
free(medstr); |
|
} |
|
} |
|
else if (stringtype == 1) { |
|
Py_UNICODE *s = PyUnicode_AS_UNICODE(arg1); |
|
size_t l = PyUnicode_GET_SIZE(arg1); |
|
Py_UNICODE *medstr = foo.u(l, s, n, sizes, strings, weights, &len); |
|
if (!medstr && len) |
|
result = PyErr_NoMemory(); |
|
else { |
|
result = PyUnicode_FromUnicode(medstr, len); |
|
free(medstr); |
|
} |
|
} |
|
else |
|
PyErr_Format(PyExc_SystemError, "%s internal error", name); |
|
|
|
free(strings); |
|
free(weights); |
|
free(sizes); |
|
return result; |
|
} |
|
|
|
static double* |
|
extract_weightlist(PyObject *wlist, const char *name, size_t n) |
|
{ |
|
size_t i; |
|
double *weights = NULL; |
|
PyObject *seq; |
|
|
|
if (wlist) { |
|
if (!PySequence_Check(wlist)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s second argument must be a Sequence", name); |
|
return NULL; |
|
} |
|
seq = PySequence_Fast(wlist, name); |
|
if (PySequence_Fast_GET_SIZE(wlist) != n) { |
|
PyErr_Format(PyExc_ValueError, |
|
"%s got %i strings but %i weights", |
|
name, n, PyList_GET_SIZE(wlist)); |
|
Py_DECREF(seq); |
|
return NULL; |
|
} |
|
weights = (double*)safe_malloc(n, sizeof(double)); |
|
if (!weights) |
|
return (double*)PyErr_NoMemory(); |
|
for (i = 0; i < n; i++) { |
|
PyObject *item = PySequence_Fast_GET_ITEM(wlist, i); |
|
PyObject *number = PyNumber_Float(item); |
|
|
|
if (!number) { |
|
free(weights); |
|
PyErr_Format(PyExc_TypeError, |
|
"%s weight #%i is not a Number", name, i); |
|
Py_DECREF(seq); |
|
return NULL; |
|
} |
|
weights[i] = PyFloat_AS_DOUBLE(number); |
|
Py_DECREF(number); |
|
if (weights[i] < 0) { |
|
free(weights); |
|
PyErr_Format(PyExc_ValueError, |
|
"%s weight #%i is negative", name, i); |
|
Py_DECREF(seq); |
|
return NULL; |
|
} |
|
} |
|
Py_DECREF(seq); |
|
} |
|
else { |
|
weights = (double*)safe_malloc(n, sizeof(double)); |
|
if (!weights) |
|
return (double*)PyErr_NoMemory(); |
|
for (i = 0; i < n; i++) |
|
weights[i] = 1.0; |
|
} |
|
|
|
return weights; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
static int |
|
extract_stringlist(PyObject *list, const char *name, |
|
size_t n, size_t **sizelist, void *strlist) |
|
{ |
|
size_t i; |
|
PyObject *first; |
|
|
|
|
|
|
|
|
|
first = PySequence_Fast_GET_ITEM(list, 0); |
|
|
|
|
|
if (first == (PyObject*)-1) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s undecomposable Sequence???", name); |
|
return -1; |
|
} |
|
|
|
if (PyObject_TypeCheck(first, &PyString_Type)) { |
|
lev_byte **strings; |
|
size_t *sizes; |
|
|
|
strings = (lev_byte**)safe_malloc(n, sizeof(lev_byte*)); |
|
if (!strings) { |
|
PyErr_Format(PyExc_MemoryError, |
|
"%s cannot allocate memory", name); |
|
return -1; |
|
} |
|
sizes = (size_t*)safe_malloc(n, sizeof(size_t)); |
|
if (!sizes) { |
|
free(strings); |
|
PyErr_Format(PyExc_MemoryError, |
|
"%s cannot allocate memory", name); |
|
return -1; |
|
} |
|
|
|
strings[0] = PyString_AS_STRING(first); |
|
sizes[0] = PyString_GET_SIZE(first); |
|
for (i = 1; i < n; i++) { |
|
PyObject *item = PySequence_Fast_GET_ITEM(list, i); |
|
|
|
if (!PyObject_TypeCheck(item, &PyString_Type)) { |
|
free(strings); |
|
free(sizes); |
|
PyErr_Format(PyExc_TypeError, |
|
"%s item #%i is not a String", name, i); |
|
return -1; |
|
} |
|
strings[i] = PyString_AS_STRING(item); |
|
sizes[i] = PyString_GET_SIZE(item); |
|
} |
|
|
|
*(lev_byte***)strlist = strings; |
|
*sizelist = sizes; |
|
return 0; |
|
} |
|
if (PyObject_TypeCheck(first, &PyUnicode_Type)) { |
|
Py_UNICODE **strings; |
|
size_t *sizes; |
|
|
|
strings = (Py_UNICODE**)safe_malloc(n, sizeof(Py_UNICODE*)); |
|
if (!strings) { |
|
PyErr_NoMemory(); |
|
return -1; |
|
} |
|
sizes = (size_t*)safe_malloc(n, sizeof(size_t)); |
|
if (!sizes) { |
|
free(strings); |
|
PyErr_NoMemory(); |
|
return -1; |
|
} |
|
|
|
strings[0] = PyUnicode_AS_UNICODE(first); |
|
sizes[0] = PyUnicode_GET_SIZE(first); |
|
for (i = 1; i < n; i++) { |
|
PyObject *item = PySequence_Fast_GET_ITEM(list, i); |
|
|
|
if (!PyObject_TypeCheck(item, &PyUnicode_Type)) { |
|
free(strings); |
|
free(sizes); |
|
PyErr_Format(PyExc_TypeError, |
|
"%s item #%i is not a Unicode", name, i); |
|
return -1; |
|
} |
|
strings[i] = PyUnicode_AS_UNICODE(item); |
|
sizes[i] = PyUnicode_GET_SIZE(item); |
|
} |
|
|
|
*(Py_UNICODE***)strlist = strings; |
|
*sizelist = sizes; |
|
return 1; |
|
} |
|
|
|
PyErr_Format(PyExc_TypeError, |
|
"%s expected list of Strings or Unicodes", name); |
|
return -1; |
|
} |
|
|
|
static PyObject* |
|
seqratio_py(PyObject *self, PyObject *args) |
|
{ |
|
SetSeqFuncs engines = { lev_edit_seq_distance, lev_u_edit_seq_distance }; |
|
size_t lensum; |
|
double r = setseq_common(args, "seqratio", engines, &lensum); |
|
if (r < 0) |
|
return NULL; |
|
if (lensum == 0) |
|
return PyFloat_FromDouble(1.0); |
|
return PyFloat_FromDouble((lensum - r)/lensum); |
|
} |
|
|
|
static PyObject* |
|
setratio_py(PyObject *self, PyObject *args) |
|
{ |
|
SetSeqFuncs engines = { lev_set_distance, lev_u_set_distance }; |
|
size_t lensum; |
|
double r = setseq_common(args, "setratio", engines, &lensum); |
|
if (r < 0) |
|
return NULL; |
|
if (lensum == 0) |
|
return PyFloat_FromDouble(1.0); |
|
return PyFloat_FromDouble((lensum - r)/lensum); |
|
} |
|
|
|
static double |
|
setseq_common(PyObject *args, const char *name, SetSeqFuncs foo, |
|
size_t *lensum) |
|
{ |
|
size_t n1, n2; |
|
void *strings1 = NULL; |
|
void *strings2 = NULL; |
|
size_t *sizes1 = NULL; |
|
size_t *sizes2 = NULL; |
|
PyObject *strlist1; |
|
PyObject *strlist2; |
|
PyObject *strseq1; |
|
PyObject *strseq2; |
|
int stringtype1, stringtype2; |
|
double r = -1.0; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &strlist1, &strlist2)) |
|
return r; |
|
|
|
if (!PySequence_Check(strlist1)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s first argument must be a Sequence", name); |
|
return r; |
|
} |
|
if (!PySequence_Check(strlist2)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s second argument must be a Sequence", name); |
|
return r; |
|
} |
|
|
|
strseq1 = PySequence_Fast(strlist1, name); |
|
strseq2 = PySequence_Fast(strlist2, name); |
|
|
|
n1 = PySequence_Fast_GET_SIZE(strseq1); |
|
n2 = PySequence_Fast_GET_SIZE(strseq2); |
|
*lensum = n1 + n2; |
|
if (n1 == 0) { |
|
Py_DECREF(strseq1); |
|
Py_DECREF(strseq2); |
|
return (double)n2; |
|
} |
|
if (n2 == 0) { |
|
Py_DECREF(strseq1); |
|
Py_DECREF(strseq2); |
|
return (double)n1; |
|
} |
|
|
|
stringtype1 = extract_stringlist(strseq1, name, n1, &sizes1, &strings1); |
|
Py_DECREF(strseq1); |
|
if (stringtype1 < 0) { |
|
Py_DECREF(strseq2); |
|
return r; |
|
} |
|
stringtype2 = extract_stringlist(strseq2, name, n2, &sizes2, &strings2); |
|
Py_DECREF(strseq2); |
|
if (stringtype2 < 0) { |
|
free(sizes1); |
|
free(strings1); |
|
return r; |
|
} |
|
|
|
if (stringtype1 != stringtype2) { |
|
PyErr_Format(PyExc_TypeError, |
|
"%s both sequences must consist of items of the same type", |
|
name); |
|
} |
|
else if (stringtype1 == 0) { |
|
r = foo.s(n1, sizes1, strings1, n2, sizes2, strings2); |
|
if (r < 0.0) |
|
PyErr_NoMemory(); |
|
} |
|
else if (stringtype1 == 1) { |
|
r = foo.u(n1, sizes1, strings1, n2, sizes2, strings2); |
|
if (r < 0.0) |
|
PyErr_NoMemory(); |
|
} |
|
else |
|
PyErr_Format(PyExc_SystemError, "%s internal error", name); |
|
|
|
free(strings1); |
|
free(strings2); |
|
free(sizes1); |
|
free(sizes2); |
|
return r; |
|
} |
|
|
|
static inline LevEditType |
|
string_to_edittype(PyObject *string) |
|
{ |
|
const char *s; |
|
size_t i, len; |
|
|
|
for (i = 0; i < N_OPCODE_NAMES; i++) { |
|
if (string == opcode_names[i].pystring) |
|
return i; |
|
} |
|
|
|
|
|
|
|
#ifdef LEV_PYTHON3 |
|
|
|
if (!PyUnicode_Check(string)) { |
|
return LEV_EDIT_LAST; |
|
} |
|
|
|
for (i = 0; i < N_OPCODE_NAMES; i++) { |
|
if (PyUnicode_CompareWithASCIIString(string, opcode_names[i].cstring) == 0) { |
|
return i; |
|
} |
|
} |
|
|
|
#else |
|
|
|
if (!PyString_Check(string)) |
|
return LEV_EDIT_LAST; |
|
|
|
s = PyString_AS_STRING(string); |
|
len = PyString_GET_SIZE(string); |
|
for (i = 0; i < N_OPCODE_NAMES; i++) { |
|
if (len == opcode_names[i].len |
|
&& memcmp(s, opcode_names[i].cstring, len) == 0) { |
|
return i; |
|
} |
|
} |
|
#endif |
|
|
|
return LEV_EDIT_LAST; |
|
} |
|
|
|
static LevEditOp* |
|
extract_editops(PyObject *list) |
|
{ |
|
LevEditOp *ops; |
|
size_t i; |
|
LevEditType type; |
|
size_t n = PyList_GET_SIZE(list); |
|
|
|
ops = (LevEditOp*)safe_malloc(n, sizeof(LevEditOp)); |
|
if (!ops) |
|
return (LevEditOp*)PyErr_NoMemory(); |
|
for (i = 0; i < n; i++) { |
|
PyObject *item; |
|
PyObject *tuple = PyList_GET_ITEM(list, i); |
|
|
|
if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 3) { |
|
free(ops); |
|
return NULL; |
|
} |
|
item = PyTuple_GET_ITEM(tuple, 0); |
|
if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) { |
|
free(ops); |
|
return NULL; |
|
} |
|
ops[i].type = type; |
|
item = PyTuple_GET_ITEM(tuple, 1); |
|
if (!PyInt_Check(item)) { |
|
free(ops); |
|
return NULL; |
|
} |
|
ops[i].spos = (size_t)PyInt_AS_LONG(item); |
|
item = PyTuple_GET_ITEM(tuple, 2); |
|
if (!PyInt_Check(item)) { |
|
free(ops); |
|
return NULL; |
|
} |
|
ops[i].dpos = (size_t)PyInt_AS_LONG(item); |
|
} |
|
return ops; |
|
} |
|
|
|
static LevOpCode* |
|
extract_opcodes(PyObject *list) |
|
{ |
|
LevOpCode *bops; |
|
size_t i; |
|
LevEditType type; |
|
size_t nb = PyList_GET_SIZE(list); |
|
|
|
bops = (LevOpCode*)safe_malloc(nb, sizeof(LevOpCode)); |
|
if (!bops) |
|
return (LevOpCode*)PyErr_NoMemory(); |
|
for (i = 0; i < nb; i++) { |
|
PyObject *item; |
|
PyObject *tuple = PyList_GET_ITEM(list, i); |
|
|
|
if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 5) { |
|
free(bops); |
|
return NULL; |
|
} |
|
|
|
item = PyTuple_GET_ITEM(tuple, 0); |
|
if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) { |
|
free(bops); |
|
return NULL; |
|
} |
|
bops[i].type = type; |
|
|
|
item = PyTuple_GET_ITEM(tuple, 1); |
|
if (!PyInt_Check(item)) { |
|
free(bops); |
|
return NULL; |
|
} |
|
bops[i].sbeg = (size_t)PyInt_AS_LONG(item); |
|
|
|
item = PyTuple_GET_ITEM(tuple, 2); |
|
if (!PyInt_Check(item)) { |
|
free(bops); |
|
return NULL; |
|
} |
|
bops[i].send = (size_t)PyInt_AS_LONG(item); |
|
|
|
item = PyTuple_GET_ITEM(tuple, 3); |
|
if (!PyInt_Check(item)) { |
|
free(bops); |
|
return NULL; |
|
} |
|
bops[i].dbeg = (size_t)PyInt_AS_LONG(item); |
|
|
|
item = PyTuple_GET_ITEM(tuple, 4); |
|
if (!PyInt_Check(item)) { |
|
free(bops); |
|
return NULL; |
|
} |
|
bops[i].dend = (size_t)PyInt_AS_LONG(item); |
|
} |
|
return bops; |
|
} |
|
|
|
static PyObject* |
|
editops_to_tuple_list(size_t n, LevEditOp *ops) |
|
{ |
|
PyObject *list; |
|
size_t i; |
|
|
|
list = PyList_New(n); |
|
for (i = 0; i < n; i++, ops++) { |
|
PyObject *tuple = PyTuple_New(3); |
|
PyObject *is = opcode_names[ops->type].pystring; |
|
Py_INCREF(is); |
|
PyTuple_SET_ITEM(tuple, 0, is); |
|
PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)ops->spos)); |
|
PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)ops->dpos)); |
|
PyList_SET_ITEM(list, i, tuple); |
|
} |
|
|
|
return list; |
|
} |
|
|
|
static PyObject* |
|
matching_blocks_to_tuple_list(size_t len1, size_t len2, |
|
size_t nmb, LevMatchingBlock *mblocks) |
|
{ |
|
PyObject *list, *tuple; |
|
size_t i; |
|
|
|
list = PyList_New(nmb + 1); |
|
for (i = 0; i < nmb; i++, mblocks++) { |
|
tuple = PyTuple_New(3); |
|
PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)mblocks->spos)); |
|
PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)mblocks->dpos)); |
|
PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)mblocks->len)); |
|
PyList_SET_ITEM(list, i, tuple); |
|
} |
|
tuple = PyTuple_New(3); |
|
PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)len1)); |
|
PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)len2)); |
|
PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)0)); |
|
PyList_SET_ITEM(list, nmb, tuple); |
|
|
|
return list; |
|
} |
|
|
|
static size_t |
|
get_length_of_anything(PyObject *object) |
|
{ |
|
if (PyInt_Check(object)) { |
|
long int len = PyInt_AS_LONG(object); |
|
if (len < 0) |
|
len = -1; |
|
return (size_t)len; |
|
} |
|
if (PySequence_Check(object)) |
|
return PySequence_Length(object); |
|
return (size_t)-1; |
|
} |
|
|
|
static PyObject* |
|
editops_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *arg1, *arg2, *arg3 = NULL; |
|
PyObject *oplist; |
|
size_t len1, len2, n; |
|
LevEditOp *ops; |
|
LevOpCode *bops; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX("editops"), 2, 3, |
|
&arg1, &arg2, &arg3)) { |
|
return NULL; |
|
} |
|
|
|
|
|
if (arg3) { |
|
if (!PyList_Check(arg1)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"editops first argument must be a List of edit operations"); |
|
return NULL; |
|
} |
|
n = PyList_GET_SIZE(arg1); |
|
if (!n) { |
|
Py_INCREF(arg1); |
|
return arg1; |
|
} |
|
len1 = get_length_of_anything(arg2); |
|
len2 = get_length_of_anything(arg3); |
|
if (len1 == (size_t)-1 || len2 == (size_t)-1) { |
|
PyErr_Format(PyExc_ValueError, |
|
"editops second and third argument must specify sizes"); |
|
return NULL; |
|
} |
|
|
|
if ((bops = extract_opcodes(arg1)) != NULL) { |
|
if (lev_opcodes_check_errors(len1, len2, n, bops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"editops edit operation list is invalid"); |
|
free(bops); |
|
return NULL; |
|
} |
|
ops = lev_opcodes_to_editops(n, bops, &n, 0); |
|
if (!ops && n) { |
|
free(bops); |
|
return PyErr_NoMemory(); |
|
} |
|
oplist = editops_to_tuple_list(n, ops); |
|
free(ops); |
|
free(bops); |
|
return oplist; |
|
} |
|
if ((ops = extract_editops(arg1)) != NULL) { |
|
if (lev_editops_check_errors(len1, len2, n, ops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"editops edit operation list is invalid"); |
|
free(ops); |
|
return NULL; |
|
} |
|
free(ops); |
|
Py_INCREF(arg1); |
|
return arg1; |
|
} |
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"editops first argument must be a List of edit operations"); |
|
return NULL; |
|
} |
|
|
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2; |
|
|
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
ops = lev_editops_find(len1, string1, len2, string2, &n); |
|
} |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2; |
|
|
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
ops = lev_u_editops_find(len1, string1, len2, string2, &n); |
|
} |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"editops expected two Strings or two Unicodes"); |
|
return NULL; |
|
} |
|
if (!ops && n) |
|
return PyErr_NoMemory(); |
|
oplist = editops_to_tuple_list(n, ops); |
|
free(ops); |
|
return oplist; |
|
} |
|
|
|
static PyObject* |
|
opcodes_to_tuple_list(size_t nb, LevOpCode *bops) |
|
{ |
|
PyObject *list; |
|
size_t i; |
|
|
|
list = PyList_New(nb); |
|
for (i = 0; i < nb; i++, bops++) { |
|
PyObject *tuple = PyTuple_New(5); |
|
PyObject *is = opcode_names[bops->type].pystring; |
|
Py_INCREF(is); |
|
PyTuple_SET_ITEM(tuple, 0, is); |
|
PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)bops->sbeg)); |
|
PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)bops->send)); |
|
PyTuple_SET_ITEM(tuple, 3, PyInt_FromLong((long)bops->dbeg)); |
|
PyTuple_SET_ITEM(tuple, 4, PyInt_FromLong((long)bops->dend)); |
|
PyList_SET_ITEM(list, i, tuple); |
|
} |
|
|
|
return list; |
|
} |
|
|
|
static PyObject* |
|
opcodes_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *arg1, *arg2, *arg3 = NULL; |
|
PyObject *oplist; |
|
size_t len1, len2, n, nb; |
|
LevEditOp *ops; |
|
LevOpCode *bops; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX("opcodes"), 2, 3, |
|
&arg1, &arg2, &arg3)) |
|
return NULL; |
|
|
|
|
|
if (arg3) { |
|
if (!PyList_Check(arg1)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"opcodes first argument must be a List of edit operations"); |
|
return NULL; |
|
} |
|
n = PyList_GET_SIZE(arg1); |
|
len1 = get_length_of_anything(arg2); |
|
len2 = get_length_of_anything(arg3); |
|
if (len1 == (size_t)-1 || len2 == (size_t)-1) { |
|
PyErr_Format(PyExc_ValueError, |
|
"opcodes second and third argument must specify sizes"); |
|
return NULL; |
|
} |
|
|
|
if ((ops = extract_editops(arg1)) != NULL) { |
|
if (lev_editops_check_errors(len1, len2, n, ops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"opcodes edit operation list is invalid"); |
|
free(ops); |
|
return NULL; |
|
} |
|
bops = lev_editops_to_opcodes(n, ops, &n, len1, len2); |
|
if (!bops && n) { |
|
free(ops); |
|
return PyErr_NoMemory(); |
|
} |
|
oplist = opcodes_to_tuple_list(n, bops); |
|
free(bops); |
|
free(ops); |
|
return oplist; |
|
} |
|
if ((bops = extract_opcodes(arg1)) != NULL) { |
|
if (lev_opcodes_check_errors(len1, len2, n, bops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"opcodes edit operation list is invalid"); |
|
free(bops); |
|
return NULL; |
|
} |
|
free(bops); |
|
Py_INCREF(arg1); |
|
return arg1; |
|
} |
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"opcodes first argument must be a List of edit operations"); |
|
return NULL; |
|
} |
|
|
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2; |
|
|
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
ops = lev_editops_find(len1, string1, len2, string2, &n); |
|
} |
|
else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2; |
|
|
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
ops = lev_u_editops_find(len1, string1, len2, string2, &n); |
|
} |
|
else { |
|
PyErr_Format(PyExc_TypeError, |
|
"opcodes expected two Strings or two Unicodes"); |
|
return NULL; |
|
} |
|
if (!ops && n) |
|
return PyErr_NoMemory(); |
|
bops = lev_editops_to_opcodes(n, ops, &nb, len1, len2); |
|
free(ops); |
|
if (!bops && nb) |
|
return PyErr_NoMemory(); |
|
oplist = opcodes_to_tuple_list(nb, bops); |
|
free(bops); |
|
return oplist; |
|
} |
|
|
|
static PyObject* |
|
inverse_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *list, *result; |
|
size_t n; |
|
LevEditOp *ops; |
|
LevOpCode *bops; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX("inverse"), 1, 1, &list) |
|
|| !PyList_Check(list)) |
|
return NULL; |
|
|
|
n = PyList_GET_SIZE(list); |
|
if (!n) { |
|
Py_INCREF(list); |
|
return list; |
|
} |
|
if ((ops = extract_editops(list)) != NULL) { |
|
lev_editops_invert(n, ops); |
|
result = editops_to_tuple_list(n, ops); |
|
free(ops); |
|
return result; |
|
} |
|
if ((bops = extract_opcodes(list)) != NULL) { |
|
lev_opcodes_invert(n, bops); |
|
result = opcodes_to_tuple_list(n, bops); |
|
free(bops); |
|
return result; |
|
} |
|
|
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"inverse expected a list of edit operations"); |
|
return NULL; |
|
} |
|
|
|
static PyObject* |
|
apply_edit_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *list, *result, *arg1, *arg2; |
|
size_t n, len, len1, len2; |
|
LevEditOp *ops; |
|
LevOpCode *bops; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX("apply_edit"), 3, 3, |
|
&list, &arg1, &arg2)) |
|
return NULL; |
|
|
|
if (!PyList_Check(list)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"apply_edit first argument must be a List of edit operations"); |
|
return NULL; |
|
} |
|
n = PyList_GET_SIZE(list); |
|
|
|
if (PyObject_TypeCheck(arg1, &PyString_Type) |
|
&& PyObject_TypeCheck(arg2, &PyString_Type)) { |
|
lev_byte *string1, *string2, *s; |
|
|
|
if (!n) { |
|
Py_INCREF(arg1); |
|
return arg1; |
|
} |
|
len1 = PyString_GET_SIZE(arg1); |
|
len2 = PyString_GET_SIZE(arg2); |
|
string1 = PyString_AS_STRING(arg1); |
|
string2 = PyString_AS_STRING(arg2); |
|
|
|
if ((ops = extract_editops(list)) != NULL) { |
|
if (lev_editops_check_errors(len1, len2, n, ops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"apply_edit edit operations are invalid or inapplicable"); |
|
free(ops); |
|
return NULL; |
|
} |
|
s = lev_editops_apply(len1, string1, len2, string2, |
|
n, ops, &len); |
|
free(ops); |
|
if (!s && len) |
|
return PyErr_NoMemory(); |
|
result = PyString_FromStringAndSize(s, len); |
|
free(s); |
|
return result; |
|
} |
|
if ((bops = extract_opcodes(list)) != NULL) { |
|
if (lev_opcodes_check_errors(len1, len2, n, bops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"apply_edit edit operations are invalid or inapplicable"); |
|
free(bops); |
|
return NULL; |
|
} |
|
s = lev_opcodes_apply(len1, string1, len2, string2, |
|
n, bops, &len); |
|
free(bops); |
|
if (!s && len) |
|
return PyErr_NoMemory(); |
|
result = PyString_FromStringAndSize(s, len); |
|
free(s); |
|
return result; |
|
} |
|
|
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"apply_edit first argument must be " |
|
"a list of edit operations"); |
|
return NULL; |
|
} |
|
if (PyObject_TypeCheck(arg1, &PyUnicode_Type) |
|
&& PyObject_TypeCheck(arg2, &PyUnicode_Type)) { |
|
Py_UNICODE *string1, *string2, *s; |
|
|
|
if (!n) { |
|
Py_INCREF(arg1); |
|
return arg1; |
|
} |
|
len1 = PyUnicode_GET_SIZE(arg1); |
|
len2 = PyUnicode_GET_SIZE(arg2); |
|
string1 = PyUnicode_AS_UNICODE(arg1); |
|
string2 = PyUnicode_AS_UNICODE(arg2); |
|
|
|
if ((ops = extract_editops(list)) != NULL) { |
|
if (lev_editops_check_errors(len1, len2, n, ops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"apply_edit edit operations are invalid or inapplicable"); |
|
free(ops); |
|
return NULL; |
|
} |
|
s = lev_u_editops_apply(len1, string1, len2, string2, |
|
n, ops, &len); |
|
free(ops); |
|
if (!s && len) |
|
return PyErr_NoMemory(); |
|
result = PyUnicode_FromUnicode(s, len); |
|
free(s); |
|
return result; |
|
} |
|
if ((bops = extract_opcodes(list)) != NULL) { |
|
if (lev_opcodes_check_errors(len1, len2, n, bops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"apply_edit edit operations are invalid or inapplicable"); |
|
free(bops); |
|
return NULL; |
|
} |
|
s = lev_u_opcodes_apply(len1, string1, len2, string2, |
|
n, bops, &len); |
|
free(bops); |
|
if (!s && len) |
|
return PyErr_NoMemory(); |
|
result = PyUnicode_FromUnicode(s, len); |
|
free(s); |
|
return result; |
|
} |
|
|
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"apply_edit first argument must be " |
|
"a list of edit operations"); |
|
return NULL; |
|
} |
|
|
|
PyErr_Format(PyExc_TypeError, |
|
"apply_edit expected two Strings or two Unicodes"); |
|
return NULL; |
|
} |
|
|
|
static PyObject* |
|
matching_blocks_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *list, *arg1, *arg2, *result; |
|
size_t n, nmb, len1, len2; |
|
LevEditOp *ops; |
|
LevOpCode *bops; |
|
LevMatchingBlock *mblocks; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX("matching_blocks"), 3, 3, |
|
&list, &arg1, &arg2) |
|
|| !PyList_Check(list)) |
|
return NULL; |
|
|
|
if (!PyList_Check(list)) { |
|
PyErr_Format(PyExc_TypeError, |
|
"matching_blocks first argument must be " |
|
"a List of edit operations"); |
|
return NULL; |
|
} |
|
n = PyList_GET_SIZE(list); |
|
len1 = get_length_of_anything(arg1); |
|
len2 = get_length_of_anything(arg2); |
|
if (len1 == (size_t)-1 || len2 == (size_t)-1) { |
|
PyErr_Format(PyExc_ValueError, |
|
"matching_blocks second and third argument " |
|
"must specify sizes"); |
|
return NULL; |
|
} |
|
|
|
if ((ops = extract_editops(list)) != NULL) { |
|
if (lev_editops_check_errors(len1, len2, n, ops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"apply_edit edit operations are invalid or inapplicable"); |
|
free(ops); |
|
return NULL; |
|
} |
|
mblocks = lev_editops_matching_blocks(len1, len2, n, ops, &nmb); |
|
free(ops); |
|
if (!mblocks && nmb) |
|
return PyErr_NoMemory(); |
|
result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks); |
|
free(mblocks); |
|
return result; |
|
} |
|
if ((bops = extract_opcodes(list)) != NULL) { |
|
if (lev_opcodes_check_errors(len1, len2, n, bops)) { |
|
PyErr_Format(PyExc_ValueError, |
|
"apply_edit edit operations are invalid or inapplicable"); |
|
free(bops); |
|
return NULL; |
|
} |
|
mblocks = lev_opcodes_matching_blocks(len1, len2, n, bops, &nmb); |
|
free(bops); |
|
if (!mblocks && nmb) |
|
return PyErr_NoMemory(); |
|
result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks); |
|
free(mblocks); |
|
return result; |
|
} |
|
|
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"inverse expected a list of edit operations"); |
|
return NULL; |
|
} |
|
|
|
static PyObject* |
|
subtract_edit_py(PyObject *self, PyObject *args) |
|
{ |
|
PyObject *list, *sub, *result; |
|
size_t n, ns, nr; |
|
LevEditOp *ops, *osub, *orem; |
|
|
|
if (!PyArg_UnpackTuple(args, PYARGCFIX("subtract_edit"), 2, 2, &list, &sub) |
|
|| !PyList_Check(list)) |
|
return NULL; |
|
|
|
ns = PyList_GET_SIZE(sub); |
|
if (!ns) { |
|
Py_INCREF(list); |
|
return list; |
|
} |
|
n = PyList_GET_SIZE(list); |
|
if (!n) { |
|
PyErr_Format(PyExc_ValueError, |
|
"subtract_edit subsequence is not a subsequence " |
|
"or is invalid"); |
|
return NULL; |
|
} |
|
|
|
if ((ops = extract_editops(list)) != NULL) { |
|
if ((osub = extract_editops(sub)) != NULL) { |
|
orem = lev_editops_subtract(n, ops, ns, osub, &nr); |
|
free(ops); |
|
free(osub); |
|
|
|
if (!orem && nr == -1) { |
|
PyErr_Format(PyExc_ValueError, |
|
"subtract_edit subsequence is not a subsequence " |
|
"or is invalid"); |
|
return NULL; |
|
} |
|
result = editops_to_tuple_list(nr, orem); |
|
free(orem); |
|
|
|
return result; |
|
} |
|
free(ops); |
|
} |
|
|
|
if (!PyErr_Occurred()) |
|
PyErr_Format(PyExc_TypeError, |
|
"subtract_edit expected two lists of edit operations"); |
|
return NULL; |
|
} |
|
|
|
|
|
PY_MOD_INIT_FUNC_DEF(_levenshtein) |
|
{ |
|
#ifdef LEV_PYTHON3 |
|
PyObject *module; |
|
#endif |
|
size_t i; |
|
|
|
PY_INIT_MOD(module, "_levenshtein", Levenshtein_DESC, methods) |
|
|
|
if (opcode_names[0].pystring) |
|
abort(); |
|
for (i = 0; i < N_OPCODE_NAMES; i++) { |
|
#ifdef LEV_PYTHON3 |
|
opcode_names[i].pystring |
|
= PyUnicode_InternFromString(opcode_names[i].cstring); |
|
#else |
|
opcode_names[i].pystring |
|
= PyString_InternFromString(opcode_names[i].cstring); |
|
#endif |
|
opcode_names[i].len = strlen(opcode_names[i].cstring); |
|
} |
|
lev_init_rng(0); |
|
#ifdef LEV_PYTHON3 |
|
return module; |
|
#endif |
|
} |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static inline unsigned long |
|
taus113_get(taus113_state_t *state) |
|
{ |
|
unsigned long b1, b2, b3, b4; |
|
|
|
b1 = ((((state->z1 << 6UL) & TAUS113_MASK) ^ state->z1) >> 13UL); |
|
state->z1 = ((((state->z1 & 4294967294UL) << 18UL) & TAUS113_MASK) ^ b1); |
|
|
|
b2 = ((((state->z2 << 2UL) & TAUS113_MASK) ^ state->z2) >> 27UL); |
|
state->z2 = ((((state->z2 & 4294967288UL) << 2UL) & TAUS113_MASK) ^ b2); |
|
|
|
b3 = ((((state->z3 << 13UL) & TAUS113_MASK) ^ state->z3) >> 21UL); |
|
state->z3 = ((((state->z3 & 4294967280UL) << 7UL) & TAUS113_MASK) ^ b3); |
|
|
|
b4 = ((((state->z4 << 3UL) & TAUS113_MASK) ^ state->z4) >> 12UL); |
|
state->z4 = ((((state->z4 & 4294967168UL) << 13UL) & TAUS113_MASK) ^ b4); |
|
|
|
return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4); |
|
|
|
} |
|
|
|
static void |
|
taus113_set(taus113_state_t *state, |
|
unsigned long int s) |
|
{ |
|
if (!s) |
|
s = 1UL; |
|
|
|
state->z1 = TAUS113_LCG(s); |
|
if (state->z1 < 2UL) |
|
state->z1 += 2UL; |
|
|
|
state->z2 = TAUS113_LCG(state->z1); |
|
if (state->z2 < 8UL) |
|
state->z2 += 8UL; |
|
|
|
state->z3 = TAUS113_LCG(state->z2); |
|
if (state->z3 < 16UL) |
|
state->z3 += 16UL; |
|
|
|
state->z4 = TAUS113_LCG(state->z3); |
|
if (state->z4 < 128UL) |
|
state->z4 += 128UL; |
|
|
|
|
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
taus113_get(state); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY void |
|
lev_init_rng(unsigned long int seed) |
|
{ |
|
static taus113_state_t state; |
|
|
|
taus113_set(&state, seed); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_edit_distance(size_t len1, const lev_byte *string1, |
|
size_t len2, const lev_byte *string2, |
|
int xcost) |
|
{ |
|
size_t i; |
|
size_t *row; |
|
size_t *end; |
|
size_t half; |
|
|
|
|
|
while (len1 > 0 && len2 > 0 && *string1 == *string2) { |
|
len1--; |
|
len2--; |
|
string1++; |
|
string2++; |
|
} |
|
|
|
|
|
while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { |
|
len1--; |
|
len2--; |
|
} |
|
|
|
|
|
if (len1 == 0) |
|
return len2; |
|
if (len2 == 0) |
|
return len1; |
|
|
|
|
|
if (len1 > len2) { |
|
size_t nx = len1; |
|
const lev_byte *sx = string1; |
|
len1 = len2; |
|
len2 = nx; |
|
string1 = string2; |
|
string2 = sx; |
|
} |
|
|
|
if (len1 == 1) { |
|
if (xcost) |
|
return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL); |
|
else |
|
return len2 - (memchr(string2, *string1, len2) != NULL); |
|
} |
|
len1++; |
|
len2++; |
|
half = len1 >> 1; |
|
|
|
|
|
row = (size_t*)safe_malloc(len2, sizeof(size_t)); |
|
if (!row) |
|
return (size_t)(-1); |
|
end = row + len2 - 1; |
|
for (i = 0; i < len2 - (xcost ? 0 : half); i++) |
|
row[i] = i; |
|
|
|
|
|
|
|
|
|
if (xcost) { |
|
for (i = 1; i < len1; i++) { |
|
size_t *p = row + 1; |
|
const lev_byte char1 = string1[i - 1]; |
|
const lev_byte *char2p = string2; |
|
size_t D = i; |
|
size_t x = i; |
|
while (p <= end) { |
|
if (char1 == *(char2p++)) |
|
x = --D; |
|
else |
|
x++; |
|
D = *p; |
|
D++; |
|
if (x > D) |
|
x = D; |
|
*(p++) = x; |
|
} |
|
} |
|
} |
|
else { |
|
|
|
|
|
|
|
|
|
row[0] = len1 - half - 1; |
|
for (i = 1; i < len1; i++) { |
|
size_t *p; |
|
const lev_byte char1 = string1[i - 1]; |
|
const lev_byte *char2p; |
|
size_t D, x; |
|
|
|
if (i >= len1 - half) { |
|
size_t offset = i - (len1 - half); |
|
size_t c3; |
|
|
|
char2p = string2 + offset; |
|
p = row + offset; |
|
c3 = *(p++) + (char1 != *(char2p++)); |
|
x = *p; |
|
x++; |
|
D = x; |
|
if (x > c3) |
|
x = c3; |
|
*(p++) = x; |
|
} |
|
else { |
|
p = row + 1; |
|
char2p = string2; |
|
D = x = i; |
|
} |
|
|
|
if (i <= half + 1) |
|
end = row + len2 + i - half - 2; |
|
|
|
while (p <= end) { |
|
size_t c3 = --D + (char1 != *(char2p++)); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
D = *p; |
|
D++; |
|
if (x > D) |
|
x = D; |
|
*(p++) = x; |
|
} |
|
|
|
if (i <= half) { |
|
size_t c3 = --D + (char1 != *char2p); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
*p = x; |
|
} |
|
} |
|
} |
|
|
|
i = *end; |
|
free(row); |
|
return i; |
|
} |
|
|
|
_LEV_STATIC_PY double |
|
lev_edit_distance_sod(size_t len, const lev_byte *string, |
|
size_t n, const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
int xcost) |
|
{ |
|
size_t i, d; |
|
double sum = 0.0; |
|
|
|
for (i = 0; i < n; i++) { |
|
d = lev_edit_distance(len, string, lengths[i], strings[i], xcost); |
|
if (d == (size_t)-1) |
|
return -1.0; |
|
sum += weights[i]*d; |
|
} |
|
return sum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_u_edit_distance(size_t len1, const lev_wchar *string1, |
|
size_t len2, const lev_wchar *string2, |
|
int xcost) |
|
{ |
|
size_t i; |
|
size_t *row; |
|
size_t *end; |
|
size_t half; |
|
|
|
|
|
while (len1 > 0 && len2 > 0 && *string1 == *string2) { |
|
len1--; |
|
len2--; |
|
string1++; |
|
string2++; |
|
} |
|
|
|
|
|
while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { |
|
len1--; |
|
len2--; |
|
} |
|
|
|
|
|
if (len1 == 0) |
|
return len2; |
|
if (len2 == 0) |
|
return len1; |
|
|
|
|
|
if (len1 > len2) { |
|
size_t nx = len1; |
|
const lev_wchar *sx = string1; |
|
len1 = len2; |
|
len2 = nx; |
|
string1 = string2; |
|
string2 = sx; |
|
} |
|
|
|
if (len1 == 1) { |
|
lev_wchar z = *string1; |
|
const lev_wchar *p = string2; |
|
for (i = len2; i; i--) { |
|
if (*(p++) == z) |
|
return len2 - 1; |
|
} |
|
return len2 + (xcost != 0); |
|
} |
|
len1++; |
|
len2++; |
|
half = len1 >> 1; |
|
|
|
|
|
row = (size_t*)safe_malloc(len2, sizeof(size_t)); |
|
if (!row) |
|
return (size_t)(-1); |
|
end = row + len2 - 1; |
|
for (i = 0; i < len2 - (xcost ? 0 : half); i++) |
|
row[i] = i; |
|
|
|
|
|
|
|
|
|
if (xcost) { |
|
for (i = 1; i < len1; i++) { |
|
size_t *p = row + 1; |
|
const lev_wchar char1 = string1[i - 1]; |
|
const lev_wchar *char2p = string2; |
|
size_t D = i - 1; |
|
size_t x = i; |
|
while (p <= end) { |
|
if (char1 == *(char2p++)) |
|
x = D; |
|
else |
|
x++; |
|
D = *p; |
|
if (x > D + 1) |
|
x = D + 1; |
|
*(p++) = x; |
|
} |
|
} |
|
} |
|
else { |
|
|
|
|
|
|
|
|
|
row[0] = len1 - half - 1; |
|
for (i = 1; i < len1; i++) { |
|
size_t *p; |
|
const lev_wchar char1 = string1[i - 1]; |
|
const lev_wchar *char2p; |
|
size_t D, x; |
|
|
|
if (i >= len1 - half) { |
|
size_t offset = i - (len1 - half); |
|
size_t c3; |
|
|
|
char2p = string2 + offset; |
|
p = row + offset; |
|
c3 = *(p++) + (char1 != *(char2p++)); |
|
x = *p; |
|
x++; |
|
D = x; |
|
if (x > c3) |
|
x = c3; |
|
*(p++) = x; |
|
} |
|
else { |
|
p = row + 1; |
|
char2p = string2; |
|
D = x = i; |
|
} |
|
|
|
if (i <= half + 1) |
|
end = row + len2 + i - half - 2; |
|
|
|
while (p <= end) { |
|
size_t c3 = --D + (char1 != *(char2p++)); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
D = *p; |
|
D++; |
|
if (x > D) |
|
x = D; |
|
*(p++) = x; |
|
} |
|
|
|
if (i <= half) { |
|
size_t c3 = --D + (char1 != *char2p); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
*p = x; |
|
} |
|
} |
|
} |
|
|
|
i = *end; |
|
free(row); |
|
return i; |
|
} |
|
|
|
_LEV_STATIC_PY double |
|
lev_u_edit_distance_sod(size_t len, const lev_wchar *string, |
|
size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], |
|
const double *weights, |
|
int xcost) |
|
{ |
|
size_t i, d; |
|
double sum = 0.0; |
|
|
|
for (i = 0; i < n; i++) { |
|
d = lev_u_edit_distance(len, string, lengths[i], strings[i], xcost); |
|
if (d == (size_t)-1) |
|
return -1.0; |
|
sum += weights[i]*d; |
|
} |
|
return sum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_hamming_distance(size_t len, |
|
const lev_byte *string1, |
|
const lev_byte *string2) |
|
{ |
|
size_t dist, i; |
|
|
|
dist = 0; |
|
for (i = len; i; i--) { |
|
if (*(string1++) != *(string2++)) |
|
dist++; |
|
} |
|
|
|
return dist; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_u_hamming_distance(size_t len, |
|
const lev_wchar *string1, |
|
const lev_wchar *string2) |
|
{ |
|
size_t dist, i; |
|
|
|
dist = 0; |
|
for (i = len; i; i--) { |
|
if (*(string1++) != *(string2++)) |
|
dist++; |
|
} |
|
|
|
return dist; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_jaro_ratio(size_t len1, const lev_byte *string1, |
|
size_t len2, const lev_byte *string2) |
|
{ |
|
size_t i, j, halflen, trans, match, to; |
|
size_t *idx; |
|
double md; |
|
|
|
if (len1 == 0 || len2 == 0) { |
|
if (len1 == 0 && len2 == 0) |
|
return 1.0; |
|
return 0.0; |
|
} |
|
|
|
if (len1 > len2) { |
|
const lev_byte *b; |
|
|
|
b = string1; |
|
string1 = string2; |
|
string2 = b; |
|
|
|
i = len1; |
|
len1 = len2; |
|
len2 = i; |
|
} |
|
|
|
halflen = (len1 + 1)/2; |
|
idx = (size_t*)calloc(len1, sizeof(size_t)); |
|
if (!idx) |
|
return -1.0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
match = 0; |
|
|
|
for (i = 0; i < halflen; i++) { |
|
for (j = 0; j <= i+halflen; j++) { |
|
if (string1[j] == string2[i] && !idx[j]) { |
|
match++; |
|
idx[j] = match; |
|
break; |
|
} |
|
} |
|
} |
|
|
|
to = len1+halflen < len2 ? len1+halflen : len2; |
|
for (i = halflen; i < to; i++) { |
|
for (j = i - halflen; j < len1; j++) { |
|
if (string1[j] == string2[i] && !idx[j]) { |
|
match++; |
|
idx[j] = match; |
|
break; |
|
} |
|
} |
|
} |
|
if (!match) { |
|
free(idx); |
|
return 0.0; |
|
} |
|
|
|
i = 0; |
|
trans = 0; |
|
for (j = 0; j < len1; j++) { |
|
if (idx[j]) { |
|
i++; |
|
if (idx[j] != i) |
|
trans++; |
|
} |
|
} |
|
free(idx); |
|
|
|
md = (double)match; |
|
return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_u_jaro_ratio(size_t len1, const lev_wchar *string1, |
|
size_t len2, const lev_wchar *string2) |
|
{ |
|
size_t i, j, halflen, trans, match, to; |
|
size_t *idx; |
|
double md; |
|
|
|
if (len1 == 0 || len2 == 0) { |
|
if (len1 == 0 && len2 == 0) |
|
return 1.0; |
|
return 0.0; |
|
} |
|
|
|
if (len1 > len2) { |
|
const lev_wchar *b; |
|
|
|
b = string1; |
|
string1 = string2; |
|
string2 = b; |
|
|
|
i = len1; |
|
len1 = len2; |
|
len2 = i; |
|
} |
|
|
|
halflen = (len1 + 1)/2; |
|
idx = (size_t*)calloc(len1, sizeof(size_t)); |
|
if (!idx) |
|
return -1.0; |
|
|
|
match = 0; |
|
|
|
for (i = 0; i < halflen; i++) { |
|
for (j = 0; j <= i+halflen; j++) { |
|
if (string1[j] == string2[i] && !idx[j]) { |
|
match++; |
|
idx[j] = match; |
|
break; |
|
} |
|
} |
|
} |
|
|
|
to = len1+halflen < len2 ? len1+halflen : len2; |
|
for (i = halflen; i < to; i++) { |
|
for (j = i - halflen; j < len1; j++) { |
|
if (string1[j] == string2[i] && !idx[j]) { |
|
match++; |
|
idx[j] = match; |
|
break; |
|
} |
|
} |
|
} |
|
if (!match) { |
|
free(idx); |
|
return 0.0; |
|
} |
|
|
|
i = 0; |
|
trans = 0; |
|
for (j = 0; j < len1; j++) { |
|
if (idx[j]) { |
|
i++; |
|
if (idx[j] != i) |
|
trans++; |
|
} |
|
} |
|
free(idx); |
|
|
|
md = (double)match; |
|
return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1, |
|
size_t len2, const lev_byte *string2, |
|
double pfweight) |
|
{ |
|
double j; |
|
size_t p, m; |
|
|
|
j = lev_jaro_ratio(len1, string1, len2, string2); |
|
m = len1 < len2 ? len1 : len2; |
|
for (p = 0; p < m; p++) { |
|
if (string1[p] != string2[p]) |
|
break; |
|
} |
|
j += (1.0 - j)*p*pfweight; |
|
return j > 1.0 ? 1.0 : j; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1, |
|
size_t len2, const lev_wchar *string2, |
|
double pfweight) |
|
{ |
|
double j; |
|
size_t p, m; |
|
|
|
j = lev_u_jaro_ratio(len1, string1, len2, string2); |
|
m = len1 < len2 ? len1 : len2; |
|
for (p = 0; p < m; p++) { |
|
if (string1[p] != string2[p]) |
|
break; |
|
} |
|
j += (1.0 - j)*p*pfweight; |
|
return j > 1.0 ? 1.0 : j; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static lev_byte* |
|
make_symlist(size_t n, const size_t *lengths, |
|
const lev_byte *strings[], size_t *symlistlen) |
|
{ |
|
short int *symset; |
|
|
|
size_t i, j; |
|
lev_byte *symlist; |
|
|
|
symset = calloc(0x100, sizeof(short int)); |
|
if (!symset) { |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
*symlistlen = 0; |
|
for (i = 0; i < n; i++) { |
|
const lev_byte *stri = strings[i]; |
|
for (j = 0; j < lengths[i]; j++) { |
|
int c = stri[j]; |
|
if (!symset[c]) { |
|
(*symlistlen)++; |
|
symset[c] = 1; |
|
} |
|
} |
|
} |
|
if (!*symlistlen) { |
|
free(symset); |
|
return NULL; |
|
} |
|
|
|
|
|
|
|
{ |
|
size_t pos = 0; |
|
symlist = (lev_byte*)safe_malloc((*symlistlen), sizeof(lev_byte)); |
|
if (!symlist) { |
|
*symlistlen = (size_t)(-1); |
|
free(symset); |
|
return NULL; |
|
} |
|
for (j = 0; j < 0x100; j++) { |
|
if (symset[j]) |
|
symlist[pos++] = (lev_byte)j; |
|
} |
|
} |
|
free(symset); |
|
|
|
return symlist; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_byte* |
|
lev_greedy_median(size_t n, const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t i; |
|
size_t j; |
|
size_t len; |
|
lev_byte *symlist; |
|
|
|
|
|
size_t symlistlen; |
|
size_t maxlen; |
|
size_t stoplen; |
|
|
|
|
|
size_t **rows; |
|
|
|
size_t *row; |
|
|
|
lev_byte *median; |
|
double *mediandist; |
|
|
|
|
|
|
|
size_t bestlen; |
|
|
|
|
|
symlist = make_symlist(n, lengths, strings, &symlistlen); |
|
if (!symlist) { |
|
*medlength = 0; |
|
if (symlistlen != 0) |
|
return NULL; |
|
else |
|
return calloc(1, sizeof(lev_byte)); |
|
} |
|
|
|
|
|
rows = (size_t**)safe_malloc(n, sizeof(size_t*)); |
|
if (!rows) { |
|
free(symlist); |
|
return NULL; |
|
} |
|
maxlen = 0; |
|
for (i = 0; i < n; i++) { |
|
size_t *ri; |
|
size_t leni = lengths[i]; |
|
if (leni > maxlen) |
|
maxlen = leni; |
|
ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); |
|
if (!ri) { |
|
for (j = 0; j < i; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
for (j = 0; j <= leni; j++) |
|
ri[j] = j; |
|
} |
|
stoplen = 2*maxlen + 1; |
|
row = (size_t*)safe_malloc((stoplen + 1), sizeof(size_t)); |
|
if (!row) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
|
|
|
|
|
|
median = (lev_byte*)safe_malloc(stoplen, sizeof(lev_byte)); |
|
if (!median) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
return NULL; |
|
} |
|
mediandist = (double*)safe_malloc((stoplen + 1), sizeof(double)); |
|
if (!mediandist) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
free(median); |
|
return NULL; |
|
} |
|
mediandist[0] = 0.0; |
|
for (i = 0; i < n; i++) |
|
mediandist[0] += lengths[i]*weights[i]; |
|
|
|
|
|
|
|
for (len = 1; len <= stoplen; len++) { |
|
lev_byte symbol; |
|
double minminsum = LEV_INFINITY; |
|
row[0] = len; |
|
|
|
for (j = 0; j < symlistlen; j++) { |
|
double totaldist = 0.0; |
|
double minsum = 0.0; |
|
symbol = symlist[j]; |
|
|
|
for (i = 0; i < n; i++) { |
|
const lev_byte *stri = strings[i]; |
|
size_t *p = rows[i]; |
|
size_t leni = lengths[i]; |
|
size_t *end = rows[i] + leni; |
|
size_t min = len; |
|
size_t x = len; |
|
|
|
|
|
while (p < end) { |
|
size_t D = *(p++) + (symbol != *(stri++)); |
|
x++; |
|
if (x > D) |
|
x = D; |
|
if (x > *p + 1) |
|
x = *p + 1; |
|
if (x < min) |
|
min = x; |
|
} |
|
minsum += min*weights[i]; |
|
totaldist += x*weights[i]; |
|
} |
|
|
|
if (minsum < minminsum) { |
|
minminsum = minsum; |
|
mediandist[len] = totaldist; |
|
median[len - 1] = symbol; |
|
} |
|
} |
|
|
|
|
|
|
|
if (len == stoplen |
|
|| (len > maxlen && mediandist[len] > mediandist[len - 1])) { |
|
stoplen = len; |
|
break; |
|
} |
|
|
|
|
|
symbol = median[len - 1]; |
|
for (i = 0; i < n; i++) { |
|
const lev_byte *stri = strings[i]; |
|
size_t *oldrow = rows[i]; |
|
size_t leni = lengths[i]; |
|
size_t k; |
|
|
|
for (k = 1; k <= leni; k++) { |
|
size_t c1 = oldrow[k] + 1; |
|
size_t c2 = row[k - 1] + 1; |
|
size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); |
|
row[k] = c2 > c3 ? c3 : c2; |
|
if (row[k] > c1) |
|
row[k] = c1; |
|
} |
|
memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); |
|
} |
|
} |
|
|
|
|
|
bestlen = 0; |
|
for (len = 1; len <= stoplen; len++) { |
|
if (mediandist[len] < mediandist[bestlen]) |
|
bestlen = len; |
|
} |
|
|
|
|
|
for (i = 0; i < n; i++) |
|
free(rows[i]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
free(mediandist); |
|
|
|
|
|
{ |
|
lev_byte *result = (lev_byte*)safe_malloc(bestlen, sizeof(lev_byte)); |
|
if (!result) { |
|
free(median); |
|
return NULL; |
|
} |
|
memcpy(result, median, bestlen*sizeof(lev_byte)); |
|
free(median); |
|
*medlength = bestlen; |
|
return result; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static double |
|
finish_distance_computations(size_t len1, lev_byte *string1, |
|
size_t n, const size_t *lengths, |
|
const lev_byte **strings, |
|
const double *weights, size_t **rows, |
|
size_t *row) |
|
{ |
|
size_t *end; |
|
size_t i, j; |
|
size_t offset; |
|
double distsum = 0.0; |
|
|
|
|
|
if (len1 == 0) { |
|
for (j = 0; j < n; j++) |
|
distsum += rows[j][lengths[j]]*weights[j]; |
|
return distsum; |
|
} |
|
|
|
|
|
for (j = 0; j < n; j++) { |
|
size_t *rowi = rows[j]; |
|
size_t leni = lengths[j]; |
|
size_t len = len1; |
|
const lev_byte *stringi = strings[j]; |
|
|
|
|
|
while (len && leni && stringi[leni-1] == string1[len-1]) { |
|
len--; |
|
leni--; |
|
} |
|
|
|
|
|
if (len == 0) { |
|
distsum += rowi[leni]*weights[j]; |
|
continue; |
|
} |
|
offset = rowi[0]; |
|
if (leni == 0) { |
|
distsum += (offset + len)*weights[j]; |
|
continue; |
|
} |
|
|
|
|
|
memcpy(row, rowi, (leni + 1)*sizeof(size_t)); |
|
end = row + leni; |
|
|
|
for (i = 1; i <= len; i++) { |
|
size_t *p = row + 1; |
|
const lev_byte char1 = string1[i - 1]; |
|
const lev_byte *char2p = stringi; |
|
size_t D, x; |
|
|
|
D = x = i + offset; |
|
while (p <= end) { |
|
size_t c3 = --D + (char1 != *(char2p++)); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
D = *p; |
|
D++; |
|
if (x > D) |
|
x = D; |
|
*(p++) = x; |
|
} |
|
} |
|
distsum += weights[j]*(*end); |
|
} |
|
|
|
return distsum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_byte* |
|
lev_median_improve(size_t len, const lev_byte *s, |
|
size_t n, const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t i; |
|
size_t j; |
|
size_t pos; |
|
|
|
lev_byte *symlist; |
|
|
|
|
|
size_t symlistlen; |
|
size_t maxlen; |
|
size_t stoplen; |
|
|
|
|
|
size_t **rows; |
|
|
|
size_t *row; |
|
|
|
lev_byte *median; |
|
size_t medlen; |
|
double minminsum; |
|
|
|
|
|
symlist = make_symlist(n, lengths, strings, &symlistlen); |
|
if (!symlist) { |
|
*medlength = 0; |
|
if (symlistlen != 0) |
|
return NULL; |
|
else |
|
return calloc(1, sizeof(lev_byte)); |
|
} |
|
|
|
|
|
rows = (size_t**)safe_malloc(n, sizeof(size_t*)); |
|
if (!rows) { |
|
free(symlist); |
|
return NULL; |
|
} |
|
maxlen = 0; |
|
for (i = 0; i < n; i++) { |
|
size_t *ri; |
|
size_t leni = lengths[i]; |
|
if (leni > maxlen) |
|
maxlen = leni; |
|
ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); |
|
if (!ri) { |
|
for (j = 0; j < i; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
for (j = 0; j <= leni; j++) |
|
ri[j] = j; |
|
} |
|
stoplen = 2*maxlen + 1; |
|
row = (size_t*)safe_malloc((stoplen + 2), sizeof(size_t)); |
|
if (!row) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
|
|
|
|
median = (lev_byte*)safe_malloc((stoplen+1), sizeof(lev_byte)); |
|
if (!median) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
return NULL; |
|
} |
|
median++; |
|
medlen = len; |
|
memcpy(median, s, (medlen)*sizeof(lev_byte)); |
|
minminsum = finish_distance_computations(medlen, median, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
|
|
|
|
for (pos = 0; pos <= medlen; ) { |
|
lev_byte orig_symbol, symbol; |
|
LevEditType operation; |
|
double sum; |
|
|
|
symbol = median[pos]; |
|
operation = LEV_EDIT_KEEP; |
|
|
|
|
|
if (pos < medlen) { |
|
orig_symbol = median[pos]; |
|
for (j = 0; j < symlistlen; j++) { |
|
if (symlist[j] == orig_symbol) |
|
continue; |
|
median[pos] = symlist[j]; |
|
sum = finish_distance_computations(medlen - pos, median + pos, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
if (sum < minminsum) { |
|
minminsum = sum; |
|
symbol = symlist[j]; |
|
operation = LEV_EDIT_REPLACE; |
|
} |
|
} |
|
median[pos] = orig_symbol; |
|
} |
|
|
|
|
|
|
|
orig_symbol = *(median + pos - 1); |
|
for (j = 0; j < symlistlen; j++) { |
|
*(median + pos - 1) = symlist[j]; |
|
sum = finish_distance_computations(medlen - pos + 1, median + pos - 1, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
if (sum < minminsum) { |
|
minminsum = sum; |
|
symbol = symlist[j]; |
|
operation = LEV_EDIT_INSERT; |
|
} |
|
} |
|
*(median + pos - 1) = orig_symbol; |
|
|
|
|
|
if (pos < medlen) { |
|
sum = finish_distance_computations(medlen - pos - 1, median + pos + 1, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
if (sum < minminsum) { |
|
minminsum = sum; |
|
operation = LEV_EDIT_DELETE; |
|
} |
|
} |
|
|
|
switch (operation) { |
|
case LEV_EDIT_REPLACE: |
|
median[pos] = symbol; |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
memmove(median+pos+1, median+pos, |
|
(medlen - pos)*sizeof(lev_byte)); |
|
median[pos] = symbol; |
|
medlen++; |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
memmove(median+pos, median + pos+1, |
|
(medlen - pos-1)*sizeof(lev_byte)); |
|
medlen--; |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
assert(medlen <= stoplen); |
|
|
|
if (operation != LEV_EDIT_DELETE) { |
|
symbol = median[pos]; |
|
row[0] = pos + 1; |
|
for (i = 0; i < n; i++) { |
|
const lev_byte *stri = strings[i]; |
|
size_t *oldrow = rows[i]; |
|
size_t leni = lengths[i]; |
|
size_t k; |
|
|
|
for (k = 1; k <= leni; k++) { |
|
size_t c1 = oldrow[k] + 1; |
|
size_t c2 = row[k - 1] + 1; |
|
size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); |
|
row[k] = c2 > c3 ? c3 : c2; |
|
if (row[k] > c1) |
|
row[k] = c1; |
|
} |
|
memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); |
|
} |
|
pos++; |
|
} |
|
} |
|
|
|
|
|
for (i = 0; i < n; i++) |
|
free(rows[i]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
|
|
|
|
{ |
|
lev_byte *result = (lev_byte*)safe_malloc(medlen, sizeof(lev_byte)); |
|
if (!result) { |
|
free(median); |
|
return NULL; |
|
} |
|
*medlength = medlen; |
|
memcpy(result, median, medlen*sizeof(lev_byte)); |
|
median--; |
|
free(median); |
|
return result; |
|
} |
|
} |
|
|
|
|
|
typedef struct _HItem HItem; |
|
struct _HItem { |
|
lev_wchar c; |
|
HItem *n; |
|
}; |
|
|
|
|
|
|
|
static void |
|
free_usymlist_hash(HItem *symmap) |
|
{ |
|
size_t j; |
|
|
|
for (j = 0; j < 0x100; j++) { |
|
HItem *p = symmap + j; |
|
if (p->n == symmap || p->n == NULL) |
|
continue; |
|
p = p->n; |
|
while (p) { |
|
HItem *q = p; |
|
p = p->n; |
|
free(q); |
|
} |
|
} |
|
free(symmap); |
|
} |
|
|
|
|
|
|
|
|
|
static lev_wchar* |
|
make_usymlist(size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], size_t *symlistlen) |
|
{ |
|
lev_wchar *symlist; |
|
size_t i, j; |
|
HItem *symmap; |
|
|
|
j = 0; |
|
for (i = 0; i < n; i++) |
|
j += lengths[i]; |
|
|
|
*symlistlen = 0; |
|
if (j == 0) |
|
return NULL; |
|
|
|
|
|
symmap = (HItem*)safe_malloc(0x100, sizeof(HItem)); |
|
if (!symmap) { |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < 0x100; i++) |
|
symmap[i].n = symmap; |
|
for (i = 0; i < n; i++) { |
|
const lev_wchar *stri = strings[i]; |
|
for (j = 0; j < lengths[i]; j++) { |
|
int c = stri[j]; |
|
int key = (c + (c >> 7)) & 0xff; |
|
HItem *p = symmap + key; |
|
if (p->n == symmap) { |
|
p->c = c; |
|
p->n = NULL; |
|
(*symlistlen)++; |
|
continue; |
|
} |
|
while (p->c != c && p->n != NULL) |
|
p = p->n; |
|
if (p->c != c) { |
|
p->n = (HItem*)malloc(sizeof(HItem)); |
|
if (!p->n) { |
|
free_usymlist_hash(symmap); |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
p = p->n; |
|
p->n = NULL; |
|
p->c = c; |
|
(*symlistlen)++; |
|
} |
|
} |
|
} |
|
|
|
|
|
{ |
|
size_t pos = 0; |
|
symlist = (lev_wchar*)safe_malloc((*symlistlen), sizeof(lev_wchar)); |
|
if (!symlist) { |
|
free_usymlist_hash(symmap); |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
for (j = 0; j < 0x100; j++) { |
|
HItem *p = symmap + j; |
|
while (p != NULL && p->n != symmap) { |
|
symlist[pos++] = p->c; |
|
p = p->n; |
|
} |
|
} |
|
} |
|
|
|
|
|
free_usymlist_hash(symmap); |
|
|
|
return symlist; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_wchar* |
|
lev_u_greedy_median(size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t i; |
|
size_t j; |
|
size_t len; |
|
lev_wchar *symlist; |
|
|
|
|
|
size_t symlistlen; |
|
size_t maxlen; |
|
size_t stoplen; |
|
|
|
|
|
size_t **rows; |
|
|
|
size_t *row; |
|
|
|
lev_wchar *median; |
|
double *mediandist; |
|
|
|
|
|
|
|
size_t bestlen; |
|
|
|
|
|
symlist = make_usymlist(n, lengths, strings, &symlistlen); |
|
if (!symlist) { |
|
*medlength = 0; |
|
if (symlistlen != 0) |
|
return NULL; |
|
else |
|
return calloc(1, sizeof(lev_wchar)); |
|
} |
|
|
|
|
|
rows = (size_t**)safe_malloc(n, sizeof(size_t*)); |
|
if (!rows) { |
|
free(symlist); |
|
return NULL; |
|
} |
|
maxlen = 0; |
|
for (i = 0; i < n; i++) { |
|
size_t *ri; |
|
size_t leni = lengths[i]; |
|
if (leni > maxlen) |
|
maxlen = leni; |
|
ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); |
|
if (!ri) { |
|
for (j = 0; j < i; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
for (j = 0; j <= leni; j++) |
|
ri[j] = j; |
|
} |
|
stoplen = 2*maxlen + 1; |
|
row = (size_t*)safe_malloc((stoplen + 1), sizeof(size_t)); |
|
if (!row) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
|
|
|
|
|
|
median = (lev_wchar*)safe_malloc(stoplen, sizeof(lev_wchar)); |
|
if (!median) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
return NULL; |
|
} |
|
mediandist = (double*)safe_malloc((stoplen + 1), sizeof(double)); |
|
if (!mediandist) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
free(median); |
|
return NULL; |
|
} |
|
mediandist[0] = 0.0; |
|
for (i = 0; i < n; i++) |
|
mediandist[0] += lengths[i]*weights[i]; |
|
|
|
|
|
|
|
for (len = 1; len <= stoplen; len++) { |
|
lev_wchar symbol; |
|
double minminsum = LEV_INFINITY; |
|
row[0] = len; |
|
|
|
for (j = 0; j < symlistlen; j++) { |
|
double totaldist = 0.0; |
|
double minsum = 0.0; |
|
symbol = symlist[j]; |
|
|
|
for (i = 0; i < n; i++) { |
|
const lev_wchar *stri = strings[i]; |
|
size_t *p = rows[i]; |
|
size_t leni = lengths[i]; |
|
size_t *end = rows[i] + leni; |
|
size_t min = len; |
|
size_t x = len; |
|
|
|
|
|
while (p < end) { |
|
size_t D = *(p++) + (symbol != *(stri++)); |
|
x++; |
|
if (x > D) |
|
x = D; |
|
if (x > *p + 1) |
|
x = *p + 1; |
|
if (x < min) |
|
min = x; |
|
} |
|
minsum += min*weights[i]; |
|
totaldist += x*weights[i]; |
|
} |
|
|
|
if (minsum < minminsum) { |
|
minminsum = minsum; |
|
mediandist[len] = totaldist; |
|
median[len - 1] = symbol; |
|
} |
|
} |
|
|
|
|
|
|
|
if (len == stoplen |
|
|| (len > maxlen && mediandist[len] > mediandist[len - 1])) { |
|
stoplen = len; |
|
break; |
|
} |
|
|
|
|
|
symbol = median[len - 1]; |
|
for (i = 0; i < n; i++) { |
|
const lev_wchar *stri = strings[i]; |
|
size_t *oldrow = rows[i]; |
|
size_t leni = lengths[i]; |
|
size_t k; |
|
|
|
for (k = 1; k <= leni; k++) { |
|
size_t c1 = oldrow[k] + 1; |
|
size_t c2 = row[k - 1] + 1; |
|
size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); |
|
row[k] = c2 > c3 ? c3 : c2; |
|
if (row[k] > c1) |
|
row[k] = c1; |
|
} |
|
memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); |
|
} |
|
} |
|
|
|
|
|
bestlen = 0; |
|
for (len = 1; len <= stoplen; len++) { |
|
if (mediandist[len] < mediandist[bestlen]) |
|
bestlen = len; |
|
} |
|
|
|
|
|
for (i = 0; i < n; i++) |
|
free(rows[i]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
free(mediandist); |
|
|
|
|
|
{ |
|
lev_wchar *result = (lev_wchar*)safe_malloc(bestlen, sizeof(lev_wchar)); |
|
if (!result) { |
|
free(median); |
|
return NULL; |
|
} |
|
memcpy(result, median, bestlen*sizeof(lev_wchar)); |
|
free(median); |
|
*medlength = bestlen; |
|
return result; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static double |
|
finish_udistance_computations(size_t len1, lev_wchar *string1, |
|
size_t n, const size_t *lengths, |
|
const lev_wchar **strings, |
|
const double *weights, size_t **rows, |
|
size_t *row) |
|
{ |
|
size_t *end; |
|
size_t i, j; |
|
size_t offset; |
|
double distsum = 0.0; |
|
|
|
|
|
if (len1 == 0) { |
|
for (j = 0; j < n; j++) |
|
distsum += rows[j][lengths[j]]*weights[j]; |
|
return distsum; |
|
} |
|
|
|
|
|
for (j = 0; j < n; j++) { |
|
size_t *rowi = rows[j]; |
|
size_t leni = lengths[j]; |
|
size_t len = len1; |
|
const lev_wchar *stringi = strings[j]; |
|
|
|
|
|
while (len && leni && stringi[leni-1] == string1[len-1]) { |
|
len--; |
|
leni--; |
|
} |
|
|
|
|
|
if (len == 0) { |
|
distsum += rowi[leni]*weights[j]; |
|
continue; |
|
} |
|
offset = rowi[0]; |
|
if (leni == 0) { |
|
distsum += (offset + len)*weights[j]; |
|
continue; |
|
} |
|
|
|
|
|
memcpy(row, rowi, (leni + 1)*sizeof(size_t)); |
|
end = row + leni; |
|
|
|
for (i = 1; i <= len; i++) { |
|
size_t *p = row + 1; |
|
const lev_wchar char1 = string1[i - 1]; |
|
const lev_wchar *char2p = stringi; |
|
size_t D, x; |
|
|
|
D = x = i + offset; |
|
while (p <= end) { |
|
size_t c3 = --D + (char1 != *(char2p++)); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
D = *p; |
|
D++; |
|
if (x > D) |
|
x = D; |
|
*(p++) = x; |
|
} |
|
} |
|
distsum += weights[j]*(*end); |
|
} |
|
|
|
return distsum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_wchar* |
|
lev_u_median_improve(size_t len, const lev_wchar *s, |
|
size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t i; |
|
size_t j; |
|
size_t pos; |
|
|
|
lev_wchar *symlist; |
|
|
|
|
|
size_t symlistlen; |
|
size_t maxlen; |
|
size_t stoplen; |
|
|
|
|
|
size_t **rows; |
|
|
|
size_t *row; |
|
|
|
lev_wchar *median; |
|
size_t medlen; |
|
double minminsum; |
|
|
|
|
|
symlist = make_usymlist(n, lengths, strings, &symlistlen); |
|
if (!symlist) { |
|
*medlength = 0; |
|
if (symlistlen != 0) |
|
return NULL; |
|
else |
|
return calloc(1, sizeof(lev_wchar)); |
|
} |
|
|
|
|
|
rows = (size_t**)safe_malloc(n, sizeof(size_t*)); |
|
if (!rows) { |
|
free(symlist); |
|
return NULL; |
|
} |
|
maxlen = 0; |
|
for (i = 0; i < n; i++) { |
|
size_t *ri; |
|
size_t leni = lengths[i]; |
|
if (leni > maxlen) |
|
maxlen = leni; |
|
ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); |
|
if (!ri) { |
|
for (j = 0; j < i; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
for (j = 0; j <= leni; j++) |
|
ri[j] = j; |
|
} |
|
stoplen = 2*maxlen + 1; |
|
row = (size_t*)safe_malloc((stoplen + 2), sizeof(size_t)); |
|
if (!row) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(symlist); |
|
return NULL; |
|
} |
|
|
|
|
|
median = (lev_wchar*)safe_malloc((stoplen+1), sizeof(lev_wchar)); |
|
if (!median) { |
|
for (j = 0; j < n; j++) |
|
free(rows[j]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
return NULL; |
|
} |
|
median++; |
|
medlen = len; |
|
memcpy(median, s, (medlen)*sizeof(lev_wchar)); |
|
minminsum = finish_udistance_computations(medlen, median, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
|
|
|
|
for (pos = 0; pos <= medlen; ) { |
|
lev_wchar orig_symbol, symbol; |
|
LevEditType operation; |
|
double sum; |
|
|
|
symbol = median[pos]; |
|
operation = LEV_EDIT_KEEP; |
|
|
|
|
|
if (pos < medlen) { |
|
orig_symbol = median[pos]; |
|
for (j = 0; j < symlistlen; j++) { |
|
if (symlist[j] == orig_symbol) |
|
continue; |
|
median[pos] = symlist[j]; |
|
sum = finish_udistance_computations(medlen - pos, median + pos, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
if (sum < minminsum) { |
|
minminsum = sum; |
|
symbol = symlist[j]; |
|
operation = LEV_EDIT_REPLACE; |
|
} |
|
} |
|
median[pos] = orig_symbol; |
|
} |
|
|
|
|
|
|
|
orig_symbol = *(median + pos - 1); |
|
for (j = 0; j < symlistlen; j++) { |
|
*(median + pos - 1) = symlist[j]; |
|
sum = finish_udistance_computations(medlen - pos + 1, median + pos - 1, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
if (sum < minminsum) { |
|
minminsum = sum; |
|
symbol = symlist[j]; |
|
operation = LEV_EDIT_INSERT; |
|
} |
|
} |
|
*(median + pos - 1) = orig_symbol; |
|
|
|
|
|
if (pos < medlen) { |
|
sum = finish_udistance_computations(medlen - pos - 1, median + pos + 1, |
|
n, lengths, strings, |
|
weights, rows, row); |
|
if (sum < minminsum) { |
|
minminsum = sum; |
|
operation = LEV_EDIT_DELETE; |
|
} |
|
} |
|
|
|
switch (operation) { |
|
case LEV_EDIT_REPLACE: |
|
median[pos] = symbol; |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
memmove(median+pos+1, median+pos, |
|
(medlen - pos)*sizeof(lev_wchar)); |
|
median[pos] = symbol; |
|
medlen++; |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
memmove(median+pos, median + pos+1, |
|
(medlen - pos-1)*sizeof(lev_wchar)); |
|
medlen--; |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
assert(medlen <= stoplen); |
|
|
|
if (operation != LEV_EDIT_DELETE) { |
|
symbol = median[pos]; |
|
row[0] = pos + 1; |
|
for (i = 0; i < n; i++) { |
|
const lev_wchar *stri = strings[i]; |
|
size_t *oldrow = rows[i]; |
|
size_t leni = lengths[i]; |
|
size_t k; |
|
|
|
for (k = 1; k <= leni; k++) { |
|
size_t c1 = oldrow[k] + 1; |
|
size_t c2 = row[k - 1] + 1; |
|
size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); |
|
row[k] = c2 > c3 ? c3 : c2; |
|
if (row[k] > c1) |
|
row[k] = c1; |
|
} |
|
memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); |
|
} |
|
pos++; |
|
} |
|
} |
|
|
|
|
|
for (i = 0; i < n; i++) |
|
free(rows[i]); |
|
free(rows); |
|
free(row); |
|
free(symlist); |
|
|
|
|
|
{ |
|
lev_wchar *result = (lev_wchar*)safe_malloc(medlen, sizeof(lev_wchar)); |
|
if (!result) { |
|
free(median); |
|
return NULL; |
|
} |
|
*medlength = medlen; |
|
memcpy(result, median, medlen*sizeof(lev_wchar)); |
|
median--; |
|
free(median); |
|
return result; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static lev_byte* |
|
make_symlistset(size_t n, const size_t *lengths, |
|
const lev_byte *strings[], size_t *symlistlen, |
|
double *symset) |
|
{ |
|
size_t i, j; |
|
lev_byte *symlist; |
|
|
|
if (!symset) { |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
memset(symset, 0, 0x100*sizeof(double)); |
|
*symlistlen = 0; |
|
for (i = 0; i < n; i++) { |
|
const lev_byte *stri = strings[i]; |
|
for (j = 0; j < lengths[i]; j++) { |
|
int c = stri[j]; |
|
if (!symset[c]) { |
|
(*symlistlen)++; |
|
symset[c] = 1.0; |
|
} |
|
} |
|
} |
|
if (!*symlistlen) |
|
return NULL; |
|
|
|
|
|
|
|
{ |
|
size_t pos = 0; |
|
symlist = (lev_byte*)safe_malloc((*symlistlen), sizeof(lev_byte)); |
|
if (!symlist) { |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
for (j = 0; j < 0x100; j++) { |
|
if (symset[j]) |
|
symlist[pos++] = (lev_byte)j; |
|
} |
|
} |
|
|
|
return symlist; |
|
} |
|
|
|
_LEV_STATIC_PY lev_byte* |
|
lev_quick_median(size_t n, |
|
const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t symlistlen, len, i, j, k; |
|
lev_byte *symlist; |
|
lev_byte *median; |
|
double *symset; |
|
double ml, wl; |
|
|
|
|
|
|
|
ml = wl = 0.0; |
|
for (i = 0; i < n; i++) { |
|
ml += lengths[i]*weights[i]; |
|
wl += weights[i]; |
|
} |
|
if (wl == 0.0) |
|
return (lev_byte*)calloc(1, sizeof(lev_byte)); |
|
ml = floor(ml/wl + 0.499999); |
|
*medlength = len = ml; |
|
if (!len) |
|
return (lev_byte*)calloc(1, sizeof(lev_byte)); |
|
median = (lev_byte*)safe_malloc(len, sizeof(lev_byte)); |
|
if (!median) |
|
return NULL; |
|
|
|
|
|
|
|
symset = (double*)calloc(0x100, sizeof(double)); |
|
if (!symset) { |
|
free(median); |
|
return NULL; |
|
} |
|
symlist = make_symlistset(n, lengths, strings, &symlistlen, symset); |
|
if (!symlist) { |
|
free(median); |
|
free(symset); |
|
return NULL; |
|
} |
|
|
|
for (j = 0; j < len; j++) { |
|
|
|
if (symlistlen < 32) { |
|
for (i = 0; i < symlistlen; i++) |
|
symset[symlist[i]] = 0.0; |
|
} |
|
else |
|
memset(symset, 0, 0x100*sizeof(double)); |
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
const lev_byte *stri = strings[i]; |
|
double weighti = weights[i]; |
|
size_t lengthi = lengths[i]; |
|
double start = lengthi/ml*j; |
|
double end = start + lengthi/ml; |
|
size_t istart = floor(start); |
|
size_t iend = ceil(end); |
|
|
|
|
|
if (iend > lengthi) |
|
iend = lengthi; |
|
|
|
for (k = istart+1; k < iend; k++) |
|
symset[stri[k]] += weighti; |
|
symset[stri[istart]] += weighti*(1+istart - start); |
|
symset[stri[iend-1]] -= weighti*(iend - end); |
|
} |
|
|
|
|
|
k = symlist[0]; |
|
for (i = 1; i < symlistlen; i++) { |
|
if (symset[symlist[i]] > symset[k]) |
|
k = symlist[i]; |
|
} |
|
median[j] = k; |
|
} |
|
|
|
free(symset); |
|
free(symlist); |
|
|
|
return median; |
|
} |
|
|
|
|
|
typedef struct _HQItem HQItem; |
|
struct _HQItem { |
|
lev_wchar c; |
|
double s; |
|
HQItem *n; |
|
}; |
|
|
|
|
|
|
|
static void |
|
free_usymlistset_hash(HQItem *symmap) |
|
{ |
|
size_t j; |
|
|
|
for (j = 0; j < 0x100; j++) { |
|
HQItem *p = symmap + j; |
|
if (p->n == symmap || p->n == NULL) |
|
continue; |
|
p = p->n; |
|
while (p) { |
|
HQItem *q = p; |
|
p = p->n; |
|
free(q); |
|
} |
|
} |
|
free(symmap); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
static lev_wchar* |
|
make_usymlistset(size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], size_t *symlistlen, |
|
HQItem *symmap) |
|
{ |
|
lev_wchar *symlist; |
|
size_t i, j; |
|
|
|
j = 0; |
|
for (i = 0; i < n; i++) |
|
j += lengths[i]; |
|
|
|
*symlistlen = 0; |
|
if (j == 0) |
|
return NULL; |
|
|
|
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < 0x100; i++) |
|
symmap[i].n = symmap; |
|
for (i = 0; i < n; i++) { |
|
const lev_wchar *stri = strings[i]; |
|
for (j = 0; j < lengths[i]; j++) { |
|
int c = stri[j]; |
|
int key = (c + (c >> 7)) & 0xff; |
|
HQItem *p = symmap + key; |
|
if (p->n == symmap) { |
|
p->c = c; |
|
p->n = NULL; |
|
(*symlistlen)++; |
|
continue; |
|
} |
|
while (p->c != c && p->n != NULL) |
|
p = p->n; |
|
if (p->c != c) { |
|
p->n = (HQItem*)malloc(sizeof(HQItem)); |
|
if (!p->n) { |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
p = p->n; |
|
p->n = NULL; |
|
p->c = c; |
|
(*symlistlen)++; |
|
} |
|
} |
|
} |
|
|
|
|
|
{ |
|
size_t pos = 0; |
|
symlist = (lev_wchar*)safe_malloc((*symlistlen), sizeof(lev_wchar)); |
|
if (!symlist) { |
|
*symlistlen = (size_t)(-1); |
|
return NULL; |
|
} |
|
for (j = 0; j < 0x100; j++) { |
|
HQItem *p = symmap + j; |
|
while (p != NULL && p->n != symmap) { |
|
symlist[pos++] = p->c; |
|
p = p->n; |
|
} |
|
} |
|
} |
|
|
|
return symlist; |
|
} |
|
|
|
_LEV_STATIC_PY lev_wchar* |
|
lev_u_quick_median(size_t n, |
|
const size_t *lengths, |
|
const lev_wchar *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t symlistlen, len, i, j, k; |
|
lev_wchar *symlist; |
|
lev_wchar *median; |
|
HQItem *symmap; |
|
double ml, wl; |
|
|
|
|
|
|
|
ml = wl = 0.0; |
|
for (i = 0; i < n; i++) { |
|
ml += lengths[i]*weights[i]; |
|
wl += weights[i]; |
|
} |
|
if (wl == 0.0) |
|
return (lev_wchar*)calloc(1, sizeof(lev_wchar)); |
|
ml = floor(ml/wl + 0.499999); |
|
*medlength = len = ml; |
|
if (!len) |
|
return (lev_wchar*)calloc(1, sizeof(lev_wchar)); |
|
median = (lev_wchar*)safe_malloc(len, sizeof(lev_wchar)); |
|
if (!median) |
|
return NULL; |
|
|
|
|
|
|
|
symmap = (HQItem*)safe_malloc(0x100, sizeof(HQItem)); |
|
if (!symmap) { |
|
free(median); |
|
return NULL; |
|
} |
|
symlist = make_usymlistset(n, lengths, strings, &symlistlen, symmap); |
|
if (!symlist) { |
|
free(median); |
|
free_usymlistset_hash(symmap); |
|
return NULL; |
|
} |
|
|
|
for (j = 0; j < len; j++) { |
|
|
|
for (i = 0; i < 0x100; i++) { |
|
HQItem *p = symmap + i; |
|
if (p->n == symmap) |
|
continue; |
|
while (p) { |
|
p->s = 0.0; |
|
p = p->n; |
|
} |
|
} |
|
|
|
|
|
for (i = 0; i < n; i++) { |
|
const lev_wchar *stri = strings[i]; |
|
double weighti = weights[i]; |
|
size_t lengthi = lengths[i]; |
|
double start = lengthi/ml*j; |
|
double end = start + lengthi/ml; |
|
size_t istart = floor(start); |
|
size_t iend = ceil(end); |
|
|
|
|
|
if (iend > lengthi) |
|
iend = lengthi; |
|
|
|
|
|
for (k = istart+1; k < iend; k++) { |
|
int c = stri[k]; |
|
int key = (c + (c >> 7)) & 0xff; |
|
HQItem *p = symmap + key; |
|
while (p->c != c) |
|
p = p->n; |
|
p->s += weighti; |
|
} |
|
|
|
{ |
|
int c = stri[istart]; |
|
int key = (c + (c >> 7)) & 0xff; |
|
HQItem *p = symmap + key; |
|
while (p->c != c) |
|
p = p->n; |
|
p->s += weighti*(1+istart - start); |
|
} |
|
|
|
|
|
|
|
|
|
{ |
|
int c = stri[iend-1]; |
|
int key = (c + (c >> 7)) & 0xff; |
|
HQItem *p = symmap + key; |
|
while (p->c != c) |
|
p = p->n; |
|
p->s -= weighti*(iend - end); |
|
} |
|
} |
|
|
|
|
|
{ |
|
HQItem *max = NULL; |
|
|
|
for (i = 0; i < 0x100; i++) { |
|
HQItem *p = symmap + i; |
|
if (p->n == symmap) |
|
continue; |
|
while (p) { |
|
if (!max || p->s > max->s) |
|
max = p; |
|
p = p->n; |
|
} |
|
} |
|
median[j] = max->c; |
|
} |
|
} |
|
|
|
free_usymlistset_hash(symmap); |
|
free(symlist); |
|
|
|
return median; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_set_median_index(size_t n, const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights) |
|
{ |
|
size_t minidx = 0; |
|
double mindist = LEV_INFINITY; |
|
size_t i; |
|
long int *distances; |
|
|
|
distances = (long int*)safe_malloc((n*(n - 1)/2), sizeof(long int)); |
|
if (!distances) |
|
return (size_t)-1; |
|
|
|
memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); |
|
for (i = 0; i < n; i++) { |
|
size_t j = 0; |
|
double dist = 0.0; |
|
const lev_byte *stri = strings[i]; |
|
size_t leni = lengths[i]; |
|
|
|
while (j < i && dist < mindist) { |
|
size_t dindex = (i - 1)*(i - 2)/2 + j; |
|
long int d; |
|
if (distances[dindex] >= 0) |
|
d = distances[dindex]; |
|
else { |
|
d = lev_edit_distance(lengths[j], strings[j], leni, stri, 0); |
|
if (d < 0) { |
|
free(distances); |
|
return (size_t)-1; |
|
} |
|
} |
|
dist += weights[j]*d; |
|
j++; |
|
} |
|
j++; |
|
|
|
while (j < n && dist < mindist) { |
|
size_t dindex = (j - 1)*(j - 2)/2 + i; |
|
distances[dindex] = lev_edit_distance(lengths[j], strings[j], |
|
leni, stri, 0); |
|
if (distances[dindex] < 0) { |
|
free(distances); |
|
return (size_t)-1; |
|
} |
|
dist += weights[j]*distances[dindex]; |
|
j++; |
|
} |
|
|
|
if (dist < mindist) { |
|
mindist = dist; |
|
minidx = i; |
|
} |
|
} |
|
|
|
free(distances); |
|
return minidx; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_u_set_median_index(size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], |
|
const double *weights) |
|
{ |
|
size_t minidx = 0; |
|
double mindist = LEV_INFINITY; |
|
size_t i; |
|
long int *distances; |
|
|
|
distances = (long int*)safe_malloc((n*(n - 1)/2), sizeof(long int)); |
|
if (!distances) |
|
return (size_t)-1; |
|
|
|
memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); |
|
for (i = 0; i < n; i++) { |
|
size_t j = 0; |
|
double dist = 0.0; |
|
const lev_wchar *stri = strings[i]; |
|
size_t leni = lengths[i]; |
|
|
|
while (j < i && dist < mindist) { |
|
size_t dindex = (i - 1)*(i - 2)/2 + j; |
|
long int d; |
|
if (distances[dindex] >= 0) |
|
d = distances[dindex]; |
|
else { |
|
d = lev_u_edit_distance(lengths[j], strings[j], leni, stri, 0); |
|
if (d < 0) { |
|
free(distances); |
|
return (size_t)-1; |
|
} |
|
} |
|
dist += weights[j]*d; |
|
j++; |
|
} |
|
j++; |
|
|
|
while (j < n && dist < mindist) { |
|
size_t dindex = (j - 1)*(j - 2)/2 + i; |
|
distances[dindex] = lev_u_edit_distance(lengths[j], strings[j], |
|
leni, stri, 0); |
|
if (distances[dindex] < 0) { |
|
free(distances); |
|
return (size_t)-1; |
|
} |
|
dist += weights[j]*distances[dindex]; |
|
j++; |
|
} |
|
|
|
if (dist < mindist) { |
|
mindist = dist; |
|
minidx = i; |
|
} |
|
} |
|
|
|
free(distances); |
|
return minidx; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_byte* |
|
lev_set_median(size_t n, const size_t *lengths, |
|
const lev_byte *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t minidx = lev_set_median_index(n, lengths, strings, weights); |
|
lev_byte *result; |
|
|
|
if (minidx == (size_t)-1) |
|
return NULL; |
|
|
|
*medlength = lengths[minidx]; |
|
if (!lengths[minidx]) |
|
return (lev_byte*)calloc(1, sizeof(lev_byte)); |
|
|
|
result = (lev_byte*)safe_malloc(lengths[minidx], sizeof(lev_byte)); |
|
if (!result) |
|
return NULL; |
|
return memcpy(result, strings[minidx], lengths[minidx]*sizeof(lev_byte)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_wchar* |
|
lev_u_set_median(size_t n, const size_t *lengths, |
|
const lev_wchar *strings[], |
|
const double *weights, |
|
size_t *medlength) |
|
{ |
|
size_t minidx = lev_u_set_median_index(n, lengths, strings, weights); |
|
lev_wchar *result; |
|
|
|
if (minidx == (size_t)-1) |
|
return NULL; |
|
|
|
*medlength = lengths[minidx]; |
|
if (!lengths[minidx]) |
|
return (lev_wchar*)calloc(1, sizeof(lev_wchar)); |
|
|
|
result = (lev_wchar*)safe_malloc(lengths[minidx], sizeof(lev_wchar)); |
|
if (!result) |
|
return NULL; |
|
return memcpy(result, strings[minidx], lengths[minidx]*sizeof(lev_wchar)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_edit_seq_distance(size_t n1, const size_t *lengths1, |
|
const lev_byte *strings1[], |
|
size_t n2, const size_t *lengths2, |
|
const lev_byte *strings2[]) |
|
{ |
|
size_t i; |
|
double *row; |
|
double *end; |
|
|
|
|
|
while (n1 > 0 && n2 > 0 |
|
&& *lengths1 == *lengths2 |
|
&& memcmp(*strings1, *strings2, |
|
*lengths1*sizeof(lev_byte)) == 0) { |
|
n1--; |
|
n2--; |
|
strings1++; |
|
strings2++; |
|
lengths1++; |
|
lengths2++; |
|
} |
|
|
|
|
|
while (n1 > 0 && n2 > 0 |
|
&& lengths1[n1-1] == lengths2[n2-1] |
|
&& memcmp(strings1[n1-1], strings2[n2-1], |
|
lengths1[n1-1]*sizeof(lev_byte)) == 0) { |
|
n1--; |
|
n2--; |
|
} |
|
|
|
|
|
if (n1 == 0) |
|
return n2; |
|
if (n2 == 0) |
|
return n1; |
|
|
|
|
|
if (n1 > n2) { |
|
size_t nx = n1; |
|
const size_t *lx = lengths1; |
|
const lev_byte **sx = strings1; |
|
n1 = n2; |
|
n2 = nx; |
|
lengths1 = lengths2; |
|
lengths2 = lx; |
|
strings1 = strings2; |
|
strings2 = sx; |
|
} |
|
n1++; |
|
n2++; |
|
|
|
|
|
row = (double*)safe_malloc(n2, sizeof(double)); |
|
if (!row) |
|
return -1.0; |
|
end = row + n2 - 1; |
|
for (i = 0; i < n2; i++) |
|
row[i] = (double)i; |
|
|
|
|
|
|
|
|
|
for (i = 1; i < n1; i++) { |
|
double *p = row + 1; |
|
const lev_byte *str1 = strings1[i - 1]; |
|
const size_t len1 = lengths1[i - 1]; |
|
const lev_byte **str2p = strings2; |
|
const size_t *len2p = lengths2; |
|
double D = i - 1.0; |
|
double x = i; |
|
while (p <= end) { |
|
size_t l = len1 + *len2p; |
|
double q; |
|
if (l == 0) |
|
q = D; |
|
else { |
|
size_t d = lev_edit_distance(len1, str1, *(len2p++), *(str2p++), 1); |
|
if (d == (size_t)(-1)) { |
|
free(row); |
|
return -1.0; |
|
} |
|
q = D + 2.0/l*d; |
|
} |
|
x += 1.0; |
|
if (x > q) |
|
x = q; |
|
D = *p; |
|
if (x > D + 1.0) |
|
x = D + 1.0; |
|
*(p++) = x; |
|
} |
|
} |
|
|
|
{ |
|
double q = *end; |
|
free(row); |
|
return q; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_u_edit_seq_distance(size_t n1, const size_t *lengths1, |
|
const lev_wchar *strings1[], |
|
size_t n2, const size_t *lengths2, |
|
const lev_wchar *strings2[]) |
|
{ |
|
size_t i; |
|
double *row; |
|
double *end; |
|
|
|
|
|
while (n1 > 0 && n2 > 0 |
|
&& *lengths1 == *lengths2 |
|
&& memcmp(*strings1, *strings2, |
|
*lengths1*sizeof(lev_wchar)) == 0) { |
|
n1--; |
|
n2--; |
|
strings1++; |
|
strings2++; |
|
lengths1++; |
|
lengths2++; |
|
} |
|
|
|
|
|
while (n1 > 0 && n2 > 0 |
|
&& lengths1[n1-1] == lengths2[n2-1] |
|
&& memcmp(strings1[n1-1], strings2[n2-1], |
|
lengths1[n1-1]*sizeof(lev_wchar)) == 0) { |
|
n1--; |
|
n2--; |
|
} |
|
|
|
|
|
if (n1 == 0) |
|
return (double)n2; |
|
if (n2 == 0) |
|
return (double)n1; |
|
|
|
|
|
if (n1 > n2) { |
|
size_t nx = n1; |
|
const size_t *lx = lengths1; |
|
const lev_wchar **sx = strings1; |
|
n1 = n2; |
|
n2 = nx; |
|
lengths1 = lengths2; |
|
lengths2 = lx; |
|
strings1 = strings2; |
|
strings2 = sx; |
|
} |
|
n1++; |
|
n2++; |
|
|
|
|
|
row = (double*)safe_malloc(n2, sizeof(double)); |
|
if (!row) |
|
return -1.0; |
|
end = row + n2 - 1; |
|
for (i = 0; i < n2; i++) |
|
row[i] = (double)i; |
|
|
|
|
|
|
|
|
|
for (i = 1; i < n1; i++) { |
|
double *p = row + 1; |
|
const lev_wchar *str1 = strings1[i - 1]; |
|
const size_t len1 = lengths1[i - 1]; |
|
const lev_wchar **str2p = strings2; |
|
const size_t *len2p = lengths2; |
|
double D = i - 1.0; |
|
double x = i; |
|
while (p <= end) { |
|
size_t l = len1 + *len2p; |
|
double q; |
|
if (l == 0) |
|
q = D; |
|
else { |
|
size_t d = lev_u_edit_distance(len1, str1, *(len2p++), *(str2p++), 1); |
|
if (d == (size_t)(-1)) { |
|
free(row); |
|
return -1.0; |
|
} |
|
q = D + 2.0/l*d; |
|
} |
|
x += 1.0; |
|
if (x > q) |
|
x = q; |
|
D = *p; |
|
if (x > D + 1.0) |
|
x = D + 1.0; |
|
*(p++) = x; |
|
} |
|
} |
|
|
|
{ |
|
double q = *end; |
|
free(row); |
|
return q; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_set_distance(size_t n1, const size_t *lengths1, |
|
const lev_byte *strings1[], |
|
size_t n2, const size_t *lengths2, |
|
const lev_byte *strings2[]) |
|
{ |
|
double *dists; |
|
double *r; |
|
size_t i, j; |
|
size_t *map; |
|
double sum; |
|
|
|
|
|
if (n1 == 0) |
|
return (double)n2; |
|
if (n2 == 0) |
|
return (double)n1; |
|
|
|
|
|
if (n1 > n2) { |
|
size_t nx = n1; |
|
const size_t *lx = lengths1; |
|
const lev_byte **sx = strings1; |
|
n1 = n2; |
|
n2 = nx; |
|
lengths1 = lengths2; |
|
lengths2 = lx; |
|
strings1 = strings2; |
|
strings2 = sx; |
|
} |
|
|
|
|
|
r = dists = (double*)safe_malloc_3(n1, n2, sizeof(double)); |
|
if (!r) |
|
return -1.0; |
|
for (i = 0; i < n2; i++) { |
|
size_t len2 = lengths2[i]; |
|
const lev_byte *str2 = strings2[i]; |
|
const size_t *len1p = lengths1; |
|
const lev_byte **str1p = strings1; |
|
for (j = 0; j < n1; j++) { |
|
size_t l = len2 + *len1p; |
|
if (l == 0) |
|
*(r++) = 0.0; |
|
else { |
|
size_t d = lev_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1); |
|
if (d == (size_t)(-1)) { |
|
free(r); |
|
return -1.0; |
|
} |
|
*(r++) = (double)d/l; |
|
} |
|
} |
|
} |
|
|
|
|
|
map = munkers_blackman(n1, n2, dists); |
|
if (!map) |
|
return -1.0; |
|
|
|
|
|
sum = n2 - n1; |
|
for (j = 0; j < n1; j++) { |
|
size_t l; |
|
i = map[j]; |
|
l = lengths1[j] + lengths2[i]; |
|
if (l > 0) { |
|
size_t d = lev_edit_distance(lengths1[j], strings1[j], |
|
lengths2[i], strings2[i], 1); |
|
if (d == (size_t)(-1)) { |
|
free(map); |
|
return -1.0; |
|
} |
|
sum += 2.0*d/l; |
|
} |
|
} |
|
free(map); |
|
|
|
return sum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY double |
|
lev_u_set_distance(size_t n1, const size_t *lengths1, |
|
const lev_wchar *strings1[], |
|
size_t n2, const size_t *lengths2, |
|
const lev_wchar *strings2[]) |
|
{ |
|
double *dists; |
|
double *r; |
|
size_t i, j; |
|
size_t *map; |
|
double sum; |
|
|
|
|
|
if (n1 == 0) |
|
return (double)n2; |
|
if (n2 == 0) |
|
return (double)n1; |
|
|
|
|
|
if (n1 > n2) { |
|
size_t nx = n1; |
|
const size_t *lx = lengths1; |
|
const lev_wchar **sx = strings1; |
|
n1 = n2; |
|
n2 = nx; |
|
lengths1 = lengths2; |
|
lengths2 = lx; |
|
strings1 = strings2; |
|
strings2 = sx; |
|
} |
|
|
|
|
|
r = dists = (double*)safe_malloc_3(n1, n2, sizeof(double)); |
|
if (!r) |
|
return -1.0; |
|
for (i = 0; i < n2; i++) { |
|
size_t len2 = lengths2[i]; |
|
const lev_wchar *str2 = strings2[i]; |
|
const size_t *len1p = lengths1; |
|
const lev_wchar **str1p = strings1; |
|
for (j = 0; j < n1; j++) { |
|
size_t l = len2 + *len1p; |
|
if (l == 0) |
|
*(r++) = 0.0; |
|
else { |
|
size_t d = lev_u_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1); |
|
if (d == (size_t)(-1)) { |
|
free(r); |
|
return -1.0; |
|
} |
|
*(r++) = (double)d/l; |
|
} |
|
} |
|
} |
|
|
|
|
|
map = munkers_blackman(n1, n2, dists); |
|
if (!map) |
|
return -1.0; |
|
|
|
|
|
sum = n2 - n1; |
|
for (j = 0; j < n1; j++) { |
|
size_t l; |
|
i = map[j]; |
|
l = lengths1[j] + lengths2[i]; |
|
if (l > 0) { |
|
size_t d = lev_u_edit_distance(lengths1[j], strings1[j], |
|
lengths2[i], strings2[i], 1); |
|
if (d == (size_t)(-1)) { |
|
free(map); |
|
return -1.0; |
|
} |
|
sum += 2.0*d/l; |
|
} |
|
} |
|
free(map); |
|
|
|
return sum; |
|
} |
|
|
|
|
|
|
|
|
|
static size_t* |
|
munkers_blackman(size_t n1, size_t n2, double *dists) |
|
{ |
|
size_t i, j; |
|
size_t *covc, *covr; |
|
|
|
|
|
|
|
|
|
size_t *zstarr, *zstarc, *zprimer; |
|
|
|
|
|
covc = calloc(n1, sizeof(size_t)); |
|
if (!covc) |
|
return NULL; |
|
zstarc = calloc(n1, sizeof(size_t)); |
|
if (!zstarc) { |
|
free(covc); |
|
return NULL; |
|
} |
|
covr = calloc(n2, sizeof(size_t)); |
|
if (!covr) { |
|
free(zstarc); |
|
free(covc); |
|
return NULL; |
|
} |
|
zstarr = calloc(n2, sizeof(size_t)); |
|
if (!zstarr) { |
|
free(covr); |
|
free(zstarc); |
|
free(covc); |
|
return NULL; |
|
} |
|
zprimer = calloc(n2, sizeof(size_t)); |
|
if (!zprimer) { |
|
free(zstarr); |
|
free(covr); |
|
free(zstarc); |
|
free(covc); |
|
return NULL; |
|
} |
|
|
|
|
|
for (j = 0; j < n1; j++) { |
|
size_t minidx = 0; |
|
double *col = dists + j; |
|
double min = *col; |
|
double *p = col + n1; |
|
for (i = 1; i < n2; i++) { |
|
if (min > *p) { |
|
minidx = i; |
|
min = *p; |
|
} |
|
p += n1; |
|
} |
|
|
|
p = col; |
|
for (i = 0; i < n2; i++) { |
|
*p -= min; |
|
if (*p < LEV_EPSILON) |
|
*p = 0.0; |
|
p += n1; |
|
} |
|
|
|
if (!zstarc[j] && !zstarr[minidx]) { |
|
zstarc[j] = minidx + 1; |
|
zstarr[minidx] = j + 1; |
|
} |
|
else { |
|
|
|
p = col; |
|
for (i = 0; i < n2; i++) { |
|
if (i != minidx && *p == 0.0 |
|
&& !zstarc[j] && !zstarr[i]) { |
|
zstarc[j] = i + 1; |
|
zstarr[i] = j + 1; |
|
break; |
|
} |
|
p += n1; |
|
} |
|
} |
|
} |
|
|
|
|
|
while (1) { |
|
|
|
{ |
|
size_t nc = 0; |
|
for (j = 0; j < n1; j++) { |
|
if (zstarc[j]) { |
|
covc[j] = 1; |
|
nc++; |
|
} |
|
} |
|
if (nc == n1) |
|
break; |
|
} |
|
|
|
|
|
while (1) { |
|
step_3: |
|
|
|
for (j = 0; j < n1; j++) { |
|
double *p = dists + j; |
|
if (covc[j]) |
|
continue; |
|
for (i = 0; i < n2; i++) { |
|
if (!covr[i] && *p == 0.0) { |
|
|
|
zprimer[i] = j + 1; |
|
if (zstarr[i]) { |
|
|
|
|
|
covr[i] = 1; |
|
covc[zstarr[i] - 1] = 0; |
|
goto step_3; |
|
} |
|
|
|
|
|
|
|
goto step_4; |
|
} |
|
p += n1; |
|
} |
|
} |
|
|
|
|
|
|
|
{ |
|
|
|
double min = LEV_INFINITY; |
|
for (j = 0; j < n1; j++) { |
|
double *p = dists + j; |
|
if (covc[j]) |
|
continue; |
|
for (i = 0; i < n2; i++) { |
|
if (!covr[i] && min > *p) { |
|
min = *p; |
|
} |
|
p += n1; |
|
} |
|
} |
|
|
|
for (i = 0; i < n2; i++) { |
|
double *p = dists + i*n1; |
|
if (!covr[i]) |
|
continue; |
|
for (j = 0; j < n1; j++) |
|
*(p++) += min; |
|
} |
|
|
|
for (j = 0; j < n1; j++) { |
|
double *p = dists + j; |
|
if (covc[j]) |
|
continue; |
|
for (i = 0; i < n2; i++) { |
|
*p -= min; |
|
if (*p < LEV_EPSILON) |
|
*p = 0.0; |
|
p += n1; |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
|
|
step_4: |
|
i++; |
|
do { |
|
size_t x = i; |
|
|
|
i--; |
|
j = zprimer[i] - 1; |
|
zstarr[i] = j + 1; |
|
i = zstarc[j]; |
|
zstarc[j] = x; |
|
} while (i); |
|
memset(zprimer, 0, n2*sizeof(size_t)); |
|
memset(covr, 0, n2*sizeof(size_t)); |
|
memset(covc, 0, n1*sizeof(size_t)); |
|
} |
|
|
|
free(dists); |
|
free(covc); |
|
free(covr); |
|
|
|
free(zstarr); |
|
free(zprimer); |
|
|
|
for (j = 0; j < n1; j++) |
|
zstarc[j]--; |
|
return zstarc; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY int |
|
lev_editops_check_errors(size_t len1, size_t len2, |
|
size_t n, const LevEditOp *ops) |
|
{ |
|
const LevEditOp *o; |
|
size_t i; |
|
|
|
if (!n) |
|
return LEV_EDIT_ERR_OK; |
|
|
|
|
|
o = ops; |
|
for (i = n; i; i--, o++) { |
|
if (o->type >= LEV_EDIT_LAST) |
|
return LEV_EDIT_ERR_TYPE; |
|
if (o->spos > len1 || o->dpos > len2) |
|
return LEV_EDIT_ERR_OUT; |
|
if (o->spos == len1 && o->type != LEV_EDIT_INSERT) |
|
return LEV_EDIT_ERR_OUT; |
|
if (o->dpos == len2 && o->type != LEV_EDIT_DELETE) |
|
return LEV_EDIT_ERR_OUT; |
|
} |
|
|
|
|
|
o = ops + 1; |
|
for (i = n - 1; i; i--, o++, ops++) { |
|
if (o->spos < ops->spos || o->dpos < ops->dpos) |
|
return LEV_EDIT_ERR_ORDER; |
|
} |
|
|
|
return LEV_EDIT_ERR_OK; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY int |
|
lev_opcodes_check_errors(size_t len1, size_t len2, |
|
size_t nb, const LevOpCode *bops) |
|
{ |
|
const LevOpCode *b; |
|
size_t i; |
|
|
|
if (!nb) |
|
return 1; |
|
|
|
|
|
if (bops->sbeg || bops->dbeg |
|
|| bops[nb - 1].send != len1 || bops[nb - 1].dend != len2) |
|
return LEV_EDIT_ERR_SPAN; |
|
|
|
|
|
b = bops; |
|
for (i = nb; i; i--, b++) { |
|
if (b->send > len1 || b->dend > len2) |
|
return LEV_EDIT_ERR_OUT; |
|
switch (b->type) { |
|
case LEV_EDIT_KEEP: |
|
case LEV_EDIT_REPLACE: |
|
if (b->dend - b->dbeg != b->send - b->sbeg || b->dend == b->dbeg) |
|
return LEV_EDIT_ERR_BLOCK; |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
if (b->dend - b->dbeg == 0 || b->send - b->sbeg != 0) |
|
return LEV_EDIT_ERR_BLOCK; |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
if (b->send - b->sbeg == 0 || b->dend - b->dbeg != 0) |
|
return LEV_EDIT_ERR_BLOCK; |
|
break; |
|
|
|
default: |
|
return LEV_EDIT_ERR_TYPE; |
|
break; |
|
} |
|
} |
|
|
|
|
|
b = bops + 1; |
|
for (i = nb - 1; i; i--, b++, bops++) { |
|
if (b->sbeg != bops->send || b->dbeg != bops->dend) |
|
return LEV_EDIT_ERR_ORDER; |
|
} |
|
|
|
return LEV_EDIT_ERR_OK; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY void |
|
lev_editops_invert(size_t n, LevEditOp *ops) |
|
{ |
|
size_t i; |
|
|
|
for (i = n; i; i--, ops++) { |
|
size_t z; |
|
|
|
z = ops->dpos; |
|
ops->dpos = ops->spos; |
|
ops->spos = z; |
|
if (ops->type & 2) |
|
ops->type ^= 1; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_byte* |
|
lev_editops_apply(size_t len1, const lev_byte *string1, |
|
__attribute__((unused)) size_t len2, const lev_byte *string2, |
|
size_t n, const LevEditOp *ops, |
|
size_t *len) |
|
{ |
|
lev_byte *dst, *dpos; |
|
const lev_byte *spos; |
|
size_t i, j; |
|
|
|
|
|
|
|
dpos = dst = (lev_byte*)safe_malloc((n + len1), sizeof(lev_byte)); |
|
if (!dst) { |
|
*len = (size_t)(-1); |
|
return NULL; |
|
} |
|
spos = string1; |
|
for (i = n; i; i--, ops++) { |
|
|
|
|
|
j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP); |
|
if (j) { |
|
memcpy(dpos, spos, j*sizeof(lev_byte)); |
|
spos += j; |
|
dpos += j; |
|
} |
|
switch (ops->type) { |
|
case LEV_EDIT_DELETE: |
|
spos++; |
|
break; |
|
|
|
case LEV_EDIT_REPLACE: |
|
spos++; |
|
case LEV_EDIT_INSERT: |
|
*(dpos++) = string2[ops->dpos]; |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
j = len1 - (spos - string1); |
|
if (j) { |
|
memcpy(dpos, spos, j*sizeof(lev_byte)); |
|
spos += j; |
|
dpos += j; |
|
} |
|
|
|
*len = dpos - dst; |
|
|
|
return realloc(dst, *len*sizeof(lev_byte)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_wchar* |
|
lev_u_editops_apply(size_t len1, const lev_wchar *string1, |
|
__attribute__((unused)) size_t len2, |
|
const lev_wchar *string2, |
|
size_t n, const LevEditOp *ops, |
|
size_t *len) |
|
{ |
|
lev_wchar *dst, *dpos; |
|
const lev_wchar *spos; |
|
size_t i, j; |
|
|
|
|
|
|
|
dpos = dst = (lev_wchar*)safe_malloc((n + len1), sizeof(lev_wchar)); |
|
if (!dst) { |
|
*len = (size_t)(-1); |
|
return NULL; |
|
} |
|
spos = string1; |
|
for (i = n; i; i--, ops++) { |
|
|
|
|
|
j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP); |
|
if (j) { |
|
memcpy(dpos, spos, j*sizeof(lev_wchar)); |
|
spos += j; |
|
dpos += j; |
|
} |
|
switch (ops->type) { |
|
case LEV_EDIT_DELETE: |
|
spos++; |
|
break; |
|
|
|
case LEV_EDIT_REPLACE: |
|
spos++; |
|
case LEV_EDIT_INSERT: |
|
*(dpos++) = string2[ops->dpos]; |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
j = len1 - (spos - string1); |
|
if (j) { |
|
memcpy(dpos, spos, j*sizeof(lev_wchar)); |
|
spos += j; |
|
dpos += j; |
|
} |
|
|
|
*len = dpos - dst; |
|
|
|
return realloc(dst, *len*sizeof(lev_wchar)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static LevEditOp* |
|
editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1, |
|
size_t len2, const lev_byte *string2, size_t off2, |
|
size_t *matrix, size_t *n) |
|
{ |
|
size_t *p; |
|
size_t i, j, pos; |
|
LevEditOp *ops; |
|
int dir = 0; |
|
|
|
pos = *n = matrix[len1*len2 - 1]; |
|
if (!*n) { |
|
free(matrix); |
|
return NULL; |
|
} |
|
ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); |
|
if (!ops) { |
|
free(matrix); |
|
*n = (size_t)(-1); |
|
return NULL; |
|
} |
|
i = len1 - 1; |
|
j = len2 - 1; |
|
p = matrix + len1*len2 - 1; |
|
while (i || j) { |
|
|
|
if (dir < 0 && j && *p == *(p - 1) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_INSERT; |
|
ops[pos].spos = i + off1; |
|
ops[pos].dpos = --j + off2; |
|
p--; |
|
continue; |
|
} |
|
if (dir > 0 && i && *p == *(p - len2) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_DELETE; |
|
ops[pos].spos = --i + off1; |
|
ops[pos].dpos = j + off2; |
|
p -= len2; |
|
continue; |
|
} |
|
if (i && j && *p == *(p - len2 - 1) |
|
&& string1[i - 1] == string2[j - 1]) { |
|
|
|
i--; |
|
j--; |
|
p -= len2 + 1; |
|
dir = 0; |
|
continue; |
|
} |
|
if (i && j && *p == *(p - len2 - 1) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_REPLACE; |
|
ops[pos].spos = --i + off1; |
|
ops[pos].dpos = --j + off2; |
|
p -= len2 + 1; |
|
dir = 0; |
|
continue; |
|
} |
|
|
|
|
|
if (dir == 0 && j && *p == *(p - 1) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_INSERT; |
|
ops[pos].spos = i + off1; |
|
ops[pos].dpos = --j + off2; |
|
p--; |
|
dir = -1; |
|
continue; |
|
} |
|
if (dir == 0 && i && *p == *(p - len2) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_DELETE; |
|
ops[pos].spos = --i + off1; |
|
ops[pos].dpos = j + off2; |
|
p -= len2; |
|
dir = 1; |
|
continue; |
|
} |
|
|
|
assert("lost in the cost matrix" == NULL); |
|
} |
|
free(matrix); |
|
|
|
return ops; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevEditOp* |
|
lev_editops_find(size_t len1, const lev_byte *string1, |
|
size_t len2, const lev_byte *string2, |
|
size_t *n) |
|
{ |
|
size_t len1o, len2o; |
|
size_t i; |
|
size_t *matrix; |
|
|
|
|
|
len1o = 0; |
|
while (len1 > 0 && len2 > 0 && *string1 == *string2) { |
|
len1--; |
|
len2--; |
|
string1++; |
|
string2++; |
|
len1o++; |
|
} |
|
len2o = len1o; |
|
|
|
|
|
while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { |
|
len1--; |
|
len2--; |
|
} |
|
len1++; |
|
len2++; |
|
|
|
|
|
matrix = (size_t*)safe_malloc_3(len1, len2, sizeof(size_t)); |
|
if (!matrix) { |
|
*n = (size_t)(-1); |
|
return NULL; |
|
} |
|
for (i = 0; i < len2; i++) |
|
matrix[i] = i; |
|
for (i = 1; i < len1; i++) |
|
matrix[len2*i] = i; |
|
|
|
|
|
for (i = 1; i < len1; i++) { |
|
size_t *prev = matrix + (i - 1)*len2; |
|
size_t *p = matrix + i*len2; |
|
size_t *end = p + len2 - 1; |
|
const lev_byte char1 = string1[i - 1]; |
|
const lev_byte *char2p = string2; |
|
size_t x = i; |
|
p++; |
|
while (p <= end) { |
|
size_t c3 = *(prev++) + (char1 != *(char2p++)); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
c3 = *prev + 1; |
|
if (x > c3) |
|
x = c3; |
|
*(p++) = x; |
|
} |
|
} |
|
|
|
|
|
return editops_from_cost_matrix(len1, string1, len1o, |
|
len2, string2, len2o, |
|
matrix, n); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static LevEditOp* |
|
ueditops_from_cost_matrix(size_t len1, const lev_wchar *string1, size_t o1, |
|
size_t len2, const lev_wchar *string2, size_t o2, |
|
size_t *matrix, size_t *n) |
|
{ |
|
size_t *p; |
|
size_t i, j, pos; |
|
LevEditOp *ops; |
|
int dir = 0; |
|
|
|
pos = *n = matrix[len1*len2 - 1]; |
|
if (!*n) { |
|
free(matrix); |
|
return NULL; |
|
} |
|
ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); |
|
if (!ops) { |
|
free(matrix); |
|
*n = (size_t)(-1); |
|
return NULL; |
|
} |
|
i = len1 - 1; |
|
j = len2 - 1; |
|
p = matrix + len1*len2 - 1; |
|
while (i || j) { |
|
|
|
if (dir < 0 && j && *p == *(p - 1) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_INSERT; |
|
ops[pos].spos = i + o1; |
|
ops[pos].dpos = --j + o2; |
|
p--; |
|
continue; |
|
} |
|
if (dir > 0 && i && *p == *(p - len2) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_DELETE; |
|
ops[pos].spos = --i + o1; |
|
ops[pos].dpos = j + o2; |
|
p -= len2; |
|
continue; |
|
} |
|
if (i && j && *p == *(p - len2 - 1) |
|
&& string1[i - 1] == string2[j - 1]) { |
|
|
|
i--; |
|
j--; |
|
p -= len2 + 1; |
|
dir = 0; |
|
continue; |
|
} |
|
if (i && j && *p == *(p - len2 - 1) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_REPLACE; |
|
ops[pos].spos = --i + o1; |
|
ops[pos].dpos = --j + o2; |
|
p -= len2 + 1; |
|
dir = 0; |
|
continue; |
|
} |
|
|
|
|
|
if (dir == 0 && j && *p == *(p - 1) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_INSERT; |
|
ops[pos].spos = i + o1; |
|
ops[pos].dpos = --j + o2; |
|
p--; |
|
dir = -1; |
|
continue; |
|
} |
|
if (dir == 0 && i && *p == *(p - len2) + 1) { |
|
pos--; |
|
ops[pos].type = LEV_EDIT_DELETE; |
|
ops[pos].spos = --i + o1; |
|
ops[pos].dpos = j + o2; |
|
p -= len2; |
|
dir = 1; |
|
continue; |
|
} |
|
|
|
assert("lost in the cost matrix" == NULL); |
|
} |
|
free(matrix); |
|
|
|
return ops; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevEditOp* |
|
lev_u_editops_find(size_t len1, const lev_wchar *string1, |
|
size_t len2, const lev_wchar *string2, |
|
size_t *n) |
|
{ |
|
size_t len1o, len2o; |
|
size_t i; |
|
size_t *matrix; |
|
|
|
|
|
len1o = 0; |
|
while (len1 > 0 && len2 > 0 && *string1 == *string2) { |
|
len1--; |
|
len2--; |
|
string1++; |
|
string2++; |
|
len1o++; |
|
} |
|
len2o = len1o; |
|
|
|
|
|
while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { |
|
len1--; |
|
len2--; |
|
} |
|
len1++; |
|
len2++; |
|
|
|
|
|
matrix = (size_t*)safe_malloc_3(len1, len2, sizeof(size_t)); |
|
if (!matrix) { |
|
*n = (size_t)(-1); |
|
return NULL; |
|
} |
|
for (i = 0; i < len2; i++) |
|
matrix[i] = i; |
|
for (i = 1; i < len1; i++) |
|
matrix[len2*i] = i; |
|
|
|
|
|
for (i = 1; i < len1; i++) { |
|
size_t *prev = matrix + (i - 1)*len2; |
|
size_t *p = matrix + i*len2; |
|
size_t *end = p + len2 - 1; |
|
const lev_wchar char1 = string1[i - 1]; |
|
const lev_wchar *char2p = string2; |
|
size_t x = i; |
|
p++; |
|
while (p <= end) { |
|
size_t c3 = *(prev++) + (char1 != *(char2p++)); |
|
x++; |
|
if (x > c3) |
|
x = c3; |
|
c3 = *prev + 1; |
|
if (x > c3) |
|
x = c3; |
|
*(p++) = x; |
|
} |
|
} |
|
|
|
|
|
return ueditops_from_cost_matrix(len1, string1, len1o, |
|
len2, string2, len2o, |
|
matrix, n); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevEditOp* |
|
lev_opcodes_to_editops(size_t nb, const LevOpCode *bops, |
|
size_t *n, int keepkeep) |
|
{ |
|
size_t i; |
|
const LevOpCode *b; |
|
LevEditOp *ops, *o; |
|
|
|
|
|
*n = 0; |
|
if (!nb) |
|
return NULL; |
|
|
|
b = bops; |
|
if (keepkeep) { |
|
for (i = nb; i; i--, b++) { |
|
size_t sd = b->send - b->sbeg; |
|
size_t dd = b->dend - b->dbeg; |
|
*n += (sd > dd ? sd : dd); |
|
} |
|
} |
|
else { |
|
for (i = nb; i; i--, b++) { |
|
size_t sd = b->send - b->sbeg; |
|
size_t dd = b->dend - b->dbeg; |
|
*n += (b->type != LEV_EDIT_KEEP ? (sd > dd ? sd : dd) : 0); |
|
} |
|
} |
|
|
|
|
|
o = ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); |
|
if (!ops) { |
|
*n = (size_t)(-1); |
|
return NULL; |
|
} |
|
b = bops; |
|
for (i = nb; i; i--, b++) { |
|
size_t j; |
|
|
|
switch (b->type) { |
|
case LEV_EDIT_KEEP: |
|
if (keepkeep) { |
|
for (j = 0; j < b->send - b->sbeg; j++, o++) { |
|
o->type = LEV_EDIT_KEEP; |
|
o->spos = b->sbeg + j; |
|
o->dpos = b->dbeg + j; |
|
} |
|
} |
|
break; |
|
|
|
case LEV_EDIT_REPLACE: |
|
for (j = 0; j < b->send - b->sbeg; j++, o++) { |
|
o->type = LEV_EDIT_REPLACE; |
|
o->spos = b->sbeg + j; |
|
o->dpos = b->dbeg + j; |
|
} |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
for (j = 0; j < b->send - b->sbeg; j++, o++) { |
|
o->type = LEV_EDIT_DELETE; |
|
o->spos = b->sbeg + j; |
|
o->dpos = b->dbeg; |
|
} |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
for (j = 0; j < b->dend - b->dbeg; j++, o++) { |
|
o->type = LEV_EDIT_INSERT; |
|
o->spos = b->sbeg; |
|
o->dpos = b->dbeg + j; |
|
} |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
assert((size_t)(o - ops) == *n); |
|
|
|
return ops; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevOpCode* |
|
lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb, |
|
size_t len1, size_t len2) |
|
{ |
|
size_t nbl, i, spos, dpos; |
|
const LevEditOp *o; |
|
LevOpCode *bops, *b; |
|
LevEditType type; |
|
|
|
|
|
nbl = 0; |
|
o = ops; |
|
spos = dpos = 0; |
|
type = LEV_EDIT_KEEP; |
|
for (i = n; i; ) { |
|
|
|
while (o->type == LEV_EDIT_KEEP && --i) |
|
o++; |
|
if (!i) |
|
break; |
|
if (spos < o->spos || dpos < o->dpos) { |
|
nbl++; |
|
spos = o->spos; |
|
dpos = o->dpos; |
|
} |
|
nbl++; |
|
type = o->type; |
|
switch (type) { |
|
case LEV_EDIT_REPLACE: |
|
do { |
|
spos++; |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
do { |
|
spos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
do { |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
if (spos < len1 || dpos < len2) |
|
nbl++; |
|
|
|
|
|
b = bops = (LevOpCode*)safe_malloc(nbl, sizeof(LevOpCode)); |
|
if (!bops) { |
|
*nb = (size_t)(-1); |
|
return NULL; |
|
} |
|
o = ops; |
|
spos = dpos = 0; |
|
type = LEV_EDIT_KEEP; |
|
for (i = n; i; ) { |
|
|
|
while (o->type == LEV_EDIT_KEEP && --i) |
|
o++; |
|
if (!i) |
|
break; |
|
b->sbeg = spos; |
|
b->dbeg = dpos; |
|
if (spos < o->spos || dpos < o->dpos) { |
|
b->type = LEV_EDIT_KEEP; |
|
spos = b->send = o->spos; |
|
dpos = b->dend = o->dpos; |
|
b++; |
|
b->sbeg = spos; |
|
b->dbeg = dpos; |
|
} |
|
type = o->type; |
|
switch (type) { |
|
case LEV_EDIT_REPLACE: |
|
do { |
|
spos++; |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
do { |
|
spos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
do { |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
b->type = type; |
|
b->send = spos; |
|
b->dend = dpos; |
|
b++; |
|
} |
|
if (spos < len1 || dpos < len2) { |
|
assert(len1 - spos == len2 - dpos); |
|
b->type = LEV_EDIT_KEEP; |
|
b->sbeg = spos; |
|
b->dbeg = dpos; |
|
b->send = len1; |
|
b->dend = len2; |
|
b++; |
|
} |
|
assert((size_t)(b - bops) == nbl); |
|
|
|
*nb = nbl; |
|
return bops; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_byte* |
|
lev_opcodes_apply(size_t len1, const lev_byte *string1, |
|
size_t len2, const lev_byte *string2, |
|
size_t nb, const LevOpCode *bops, |
|
size_t *len) |
|
{ |
|
lev_byte *dst, *dpos; |
|
const lev_byte *spos; |
|
size_t i; |
|
|
|
|
|
|
|
dpos = dst = (lev_byte*)safe_malloc((len1 + len2), sizeof(lev_byte)); |
|
if (!dst) { |
|
*len = (size_t)(-1); |
|
return NULL; |
|
} |
|
spos = string1; |
|
for (i = nb; i; i--, bops++) { |
|
switch (bops->type) { |
|
case LEV_EDIT_INSERT: |
|
case LEV_EDIT_REPLACE: |
|
memcpy(dpos, string2 + bops->dbeg, |
|
(bops->dend - bops->dbeg)*sizeof(lev_byte)); |
|
break; |
|
|
|
case LEV_EDIT_KEEP: |
|
memcpy(dpos, string1 + bops->sbeg, |
|
(bops->send - bops->sbeg)*sizeof(lev_byte)); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
spos += bops->send - bops->sbeg; |
|
dpos += bops->dend - bops->dbeg; |
|
} |
|
|
|
*len = dpos - dst; |
|
|
|
return realloc(dst, *len*sizeof(lev_byte)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY lev_wchar* |
|
lev_u_opcodes_apply(size_t len1, const lev_wchar *string1, |
|
size_t len2, const lev_wchar *string2, |
|
size_t nb, const LevOpCode *bops, |
|
size_t *len) |
|
{ |
|
lev_wchar *dst, *dpos; |
|
const lev_wchar *spos; |
|
size_t i; |
|
|
|
|
|
|
|
dpos = dst = (lev_wchar*)safe_malloc((len1 + len2), sizeof(lev_wchar)); |
|
if (!dst) { |
|
*len = (size_t)(-1); |
|
return NULL; |
|
} |
|
spos = string1; |
|
for (i = nb; i; i--, bops++) { |
|
switch (bops->type) { |
|
case LEV_EDIT_INSERT: |
|
case LEV_EDIT_REPLACE: |
|
memcpy(dpos, string2 + bops->dbeg, |
|
(bops->dend - bops->dbeg)*sizeof(lev_wchar)); |
|
break; |
|
|
|
case LEV_EDIT_KEEP: |
|
memcpy(dpos, string1 + bops->sbeg, |
|
(bops->send - bops->sbeg)*sizeof(lev_wchar)); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
spos += bops->send - bops->sbeg; |
|
dpos += bops->dend - bops->dbeg; |
|
} |
|
|
|
*len = dpos - dst; |
|
|
|
return realloc(dst, *len*sizeof(lev_wchar)); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY void |
|
lev_opcodes_invert(size_t nb, LevOpCode *bops) |
|
{ |
|
size_t i; |
|
|
|
for (i = nb; i; i--, bops++) { |
|
size_t z; |
|
|
|
z = bops->dbeg; |
|
bops->dbeg = bops->sbeg; |
|
bops->sbeg = z; |
|
z = bops->dend; |
|
bops->dend = bops->send; |
|
bops->send = z; |
|
if (bops->type & 2) |
|
bops->type ^= 1; |
|
} |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevMatchingBlock* |
|
lev_editops_matching_blocks(size_t len1, |
|
size_t len2, |
|
size_t n, |
|
const LevEditOp *ops, |
|
size_t *nmblocks) |
|
{ |
|
size_t nmb, i, spos, dpos; |
|
LevEditType type; |
|
const LevEditOp *o; |
|
LevMatchingBlock *mblocks, *mb; |
|
|
|
|
|
nmb = 0; |
|
o = ops; |
|
spos = dpos = 0; |
|
type = LEV_EDIT_KEEP; |
|
for (i = n; i; ) { |
|
|
|
while (o->type == LEV_EDIT_KEEP && --i) |
|
o++; |
|
if (!i) |
|
break; |
|
if (spos < o->spos || dpos < o->dpos) { |
|
nmb++; |
|
spos = o->spos; |
|
dpos = o->dpos; |
|
} |
|
type = o->type; |
|
switch (type) { |
|
case LEV_EDIT_REPLACE: |
|
do { |
|
spos++; |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
do { |
|
spos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
do { |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
if (spos < len1 || dpos < len2) |
|
nmb++; |
|
|
|
|
|
mb = mblocks = (LevMatchingBlock*)safe_malloc(nmb, sizeof(LevOpCode)); |
|
if (!mblocks) { |
|
*nmblocks = (size_t)(-1); |
|
return NULL; |
|
} |
|
o = ops; |
|
spos = dpos = 0; |
|
type = LEV_EDIT_KEEP; |
|
for (i = n; i; ) { |
|
|
|
while (o->type == LEV_EDIT_KEEP && --i) |
|
o++; |
|
if (!i) |
|
break; |
|
if (spos < o->spos || dpos < o->dpos) { |
|
mb->spos = spos; |
|
mb->dpos = dpos; |
|
mb->len = o->spos - spos; |
|
spos = o->spos; |
|
dpos = o->dpos; |
|
mb++; |
|
} |
|
type = o->type; |
|
switch (type) { |
|
case LEV_EDIT_REPLACE: |
|
do { |
|
spos++; |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_DELETE: |
|
do { |
|
spos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
do { |
|
dpos++; |
|
i--; |
|
o++; |
|
} while (i && o->type == type && spos == o->spos && dpos == o->dpos); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
if (spos < len1 || dpos < len2) { |
|
assert(len1 - spos == len2 - dpos); |
|
mb->spos = spos; |
|
mb->dpos = dpos; |
|
mb->len = len1 - spos; |
|
mb++; |
|
} |
|
assert((size_t)(mb - mblocks) == nmb); |
|
|
|
*nmblocks = nmb; |
|
return mblocks; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevMatchingBlock* |
|
lev_opcodes_matching_blocks(size_t len1, |
|
__attribute__((unused)) size_t len2, |
|
size_t nb, |
|
const LevOpCode *bops, |
|
size_t *nmblocks) |
|
{ |
|
size_t nmb, i; |
|
const LevOpCode *b; |
|
LevMatchingBlock *mblocks, *mb; |
|
|
|
|
|
nmb = 0; |
|
b = bops; |
|
for (i = nb; i; i--, b++) { |
|
if (b->type == LEV_EDIT_KEEP) { |
|
nmb++; |
|
|
|
while (i && b->type == LEV_EDIT_KEEP) { |
|
i--; |
|
b++; |
|
} |
|
if (!i) |
|
break; |
|
} |
|
} |
|
|
|
|
|
mb = mblocks = (LevMatchingBlock*)safe_malloc(nmb, sizeof(LevOpCode)); |
|
if (!mblocks) { |
|
*nmblocks = (size_t)(-1); |
|
return NULL; |
|
} |
|
b = bops; |
|
for (i = nb; i; i--, b++) { |
|
if (b->type == LEV_EDIT_KEEP) { |
|
mb->spos = b->sbeg; |
|
mb->dpos = b->dbeg; |
|
|
|
while (i && b->type == LEV_EDIT_KEEP) { |
|
i--; |
|
b++; |
|
} |
|
if (!i) { |
|
mb->len = len1 - mb->spos; |
|
mb++; |
|
break; |
|
} |
|
mb->len = b->sbeg - mb->spos; |
|
mb++; |
|
} |
|
} |
|
assert((size_t)(mb - mblocks) == nmb); |
|
|
|
*nmblocks = nmb; |
|
return mblocks; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_editops_total_cost(size_t n, |
|
const LevEditOp *ops) |
|
{ |
|
size_t i, sum = 0; |
|
|
|
for (i = n; i; i--, ops++) |
|
sum += !!ops->type; |
|
|
|
return sum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevEditOp* |
|
lev_editops_normalize(size_t n, |
|
const LevEditOp *ops, |
|
size_t *nnorm) |
|
{ |
|
size_t nx, i; |
|
const LevEditOp *o; |
|
LevEditOp *opsnorm, *on; |
|
|
|
if (!n || !ops) { |
|
*nnorm = 0; |
|
return NULL; |
|
} |
|
|
|
nx = 0; |
|
o = ops; |
|
for (i = n; i; i--, o++) |
|
nx += (o->type == LEV_EDIT_KEEP); |
|
|
|
*nnorm = n - nx; |
|
if (!*nnorm) |
|
return NULL; |
|
|
|
opsnorm = on = (LevEditOp*)safe_malloc((n - nx), sizeof(LevEditOp)); |
|
o = ops; |
|
for (i = n; i; i--, o++) { |
|
if (o->type == LEV_EDIT_KEEP) |
|
continue; |
|
memcpy(on++, o, sizeof(LevEditOp)); |
|
} |
|
|
|
return opsnorm; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY size_t |
|
lev_opcodes_total_cost(size_t nb, |
|
const LevOpCode *bops) |
|
{ |
|
size_t i, sum = 0; |
|
|
|
for (i = nb; i; i--, bops++) { |
|
switch (bops->type) { |
|
case LEV_EDIT_REPLACE: |
|
case LEV_EDIT_DELETE: |
|
sum += (bops->send - bops->sbeg); |
|
break; |
|
|
|
case LEV_EDIT_INSERT: |
|
sum += (bops->dend - bops->dbeg); |
|
break; |
|
|
|
default: |
|
break; |
|
} |
|
} |
|
|
|
return sum; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_LEV_STATIC_PY LevEditOp* |
|
lev_editops_subtract(size_t n, |
|
const LevEditOp *ops, |
|
size_t ns, |
|
const LevEditOp *sub, |
|
size_t *nrem) |
|
{ |
|
static const int shifts[] = { 0, 0, 1, -1 }; |
|
LevEditOp *rem; |
|
size_t i, j, nr, nn; |
|
int shift; |
|
|
|
|
|
*nrem = -1; |
|
nr = nn = 0; |
|
for (i = 0; i < n; i++) { |
|
if (ops[i].type != LEV_EDIT_KEEP) |
|
nr++; |
|
} |
|
for (i = 0; i < ns; i++) { |
|
if (sub[i].type != LEV_EDIT_KEEP) |
|
nn++; |
|
} |
|
if (nn > nr) |
|
return NULL; |
|
nr -= nn; |
|
|
|
|
|
|
|
|
|
|
|
|
|
rem = nr ? (LevEditOp*)safe_malloc(nr, sizeof(LevEditOp)) : NULL; |
|
j = nn = shift = 0; |
|
for (i = 0; i < ns; i++) { |
|
while ((ops[j].spos != sub[i].spos |
|
|| ops[j].dpos != sub[i].dpos |
|
|| ops[j].type != sub[i].type) |
|
&& j < n) { |
|
if (ops[j].type != LEV_EDIT_KEEP) { |
|
rem[nn] = ops[j]; |
|
rem[nn].spos += shift; |
|
nn++; |
|
} |
|
j++; |
|
} |
|
if (j == n) { |
|
free(rem); |
|
return NULL; |
|
} |
|
|
|
shift += shifts[sub[i].type]; |
|
j++; |
|
} |
|
|
|
while (j < n) { |
|
if (ops[j].type != LEV_EDIT_KEEP) { |
|
rem[nn] = ops[j]; |
|
rem[nn].spos += shift; |
|
nn++; |
|
} |
|
j++; |
|
} |
|
assert(nn == nr); |
|
|
|
*nrem = nr; |
|
return rem; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|