diff --git "a/.venv/lib/python3.9/site-packages/Levenshtein/_levenshtein.c" "b/.venv/lib/python3.9/site-packages/Levenshtein/_levenshtein.c" new file mode 100644--- /dev/null +++ "b/.venv/lib/python3.9/site-packages/Levenshtein/_levenshtein.c" @@ -0,0 +1,6853 @@ +/* + * Levenshtein.c + * @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $ + * Python extension computing Levenshtein distances, string similarities, + * median strings and other goodies. + * + * Copyright (C) 2002-2003 David Necas (Yeti) . + * + * The Taus113 random generator: + * Copyright (C) 2002 Atakan Gurkan + * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough + * (see below for more) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + **/ + +/** + * TODO: + * + * - Implement weighted string averaging, see: + * H. Bunke et. al.: On the Weighted Mean of a Pair of Strings, + * Pattern Analysis and Applications 2002, 5(1): 23-30. + * X. Jiang et. al.: Dynamic Computations of Generalized Median Strings, + * Pattern Analysis and Applications 2002, ???. + * The latter also contains an interesting median-search algorithm. + * + * - Deal with stray symbols in greedy median() and median_improve(). + * There are two possibilities: + * (i) Remember which strings contain which symbols. This allows certain + * small optimizations when processing them. + * (ii) Use some overall heuristics to find symbols which don't worth + * trying. This is very appealing, but hard to do properly + * (requires some inequality strong enough to allow practical exclusion + * of certain symbols -- at certain positions) + * + * - Editops should be an object that only *looks* like a list (which means + * it is a list in duck typing) to avoid never-ending conversions from + * Python lists to LevEditOp arrays and back + * + * - Optimize munkers_blackman(), it's pretty dumb (no memory of visited + * columns/rows) + * + * - Make it really usable as a C library (needs some wrappers, headers, ..., + * and maybe even documentation ;-) + * + * - Add interface to various interesting auxiliary results, namely + * set and sequence distance (only ratio is exported), the map from + * munkers_blackman() itself, ... + * + * - Generalizations: + * - character weight matrix/function + * - arbitrary edit operation costs, decomposable edit operations + * + * - Create a test suite + * + * - Add more interesting algorithms ;-) + * + * Postponed TODO (investigated, and a big `but' was found): + * + * - A linear approximate set median algorithm: + * P. Indyk: Sublinear time algorithms for metric space problems, + * STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html. + * BUT: The algorithm seems to be advantageous only in the case of very + * large sets -- if my estimates are correct (the article itself is quite + * `asymptotic'), say 10^5 at least. 
On smaller sets either one would get + * only an extermely rough median estimate, or the number of distance + * computations would be in fact higher than in the dumb O(n^2) algorithm. + * + * - Improve setmedian() speed with triangular inequality, see: + * Juan, A., E. Vidal: An Algorithm for Fast Median Search, + * 1997, http://citeseer.nj.nec.com/article/juan97algorithm.html + * BUT: It doesn't seem to help much in spaces of high dimension (see the + * discussion and graphs in the article itself), a few percents at most, + * and strings behave like a space with a very high dimension (locally), so + * who knows, it probably wouldn't help much. + * + **/ +#ifdef NO_PYTHON +#define _GNU_SOURCE +#include +#include +/* for debugging */ +#include +#else /* NO_PYTHON */ +#define _LEV_STATIC_PY static +#define lev_wchar Py_UNICODE +#include +#endif /* NO_PYTHON */ + +#if PY_MAJOR_VERSION >= 3 +#define LEV_PYTHON3 +#define PyString_Type PyBytes_Type +#define PyString_GET_SIZE PyBytes_GET_SIZE +#define PyString_AS_STRING PyBytes_AS_STRING +#define PyString_Check PyBytes_Check +#define PyString_FromStringAndSize PyBytes_FromStringAndSize +#define PyString_InternFromString PyUnicode_InternFromString +#define PyInt_AS_LONG PyLong_AsLong +#define PyInt_FromLong PyLong_FromLong +#define PyInt_Check PyLong_Check +#define PY_INIT_MOD(module, name, doc, methods) \ + static struct PyModuleDef moduledef = { \ + PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ + module = PyModule_Create(&moduledef); + #define PY_MOD_INIT_FUNC_DEF(name) PyObject* PyInit_##name(void) +#else + #define PY_INIT_MOD(module, name, doc, methods) \ + Py_InitModule3(name, methods, doc); + #define PY_MOD_INIT_FUNC_DEF(name) void init##name(void) +#endif /* PY_MAJOR_VERSION */ + +#include +#include "_levenshtein.h" + +/* FIXME: inline avaliability should be solved in setup.py, somehow, or + * even better in Python.h, like const is... + * this should inline at least with gcc and msvc */ +#ifndef __GNUC__ +# ifdef _MSC_VER +# define inline __inline +# else +# define inline /* */ +# endif +# define __attribute__(x) /* */ +#endif + +#define LEV_EPSILON 1e-14 +#define LEV_INFINITY 1e100 + +/* Me thinks the second argument of PyArg_UnpackTuple() should be const. + * Anyway I habitually pass a constant string. + * A cast to placate the compiler. 
*/ +#define PYARGCFIX(x) (char*)(x) + +/* local functions */ +static size_t* +munkers_blackman(size_t n1, + size_t n2, + double *dists); + +#define TAUS113_LCG(n) ((69069UL * n) & 0xffffffffUL) +#define TAUS113_MASK 0xffffffffUL + +typedef struct { + unsigned long int z1, z2, z3, z4; +} taus113_state_t; + +static inline unsigned long int +taus113_get(taus113_state_t *state); + +static void +taus113_set(taus113_state_t *state, + unsigned long int s); + +#ifndef NO_PYTHON +/* python interface and wrappers */ +/* declarations and docstrings {{{ */ +static PyObject* distance_py(PyObject *self, PyObject *args); +static PyObject* ratio_py(PyObject *self, PyObject *args); +static PyObject* hamming_py(PyObject *self, PyObject *args); +static PyObject* jaro_py(PyObject *self, PyObject *args); +static PyObject* jaro_winkler_py(PyObject *self, PyObject *args); +static PyObject* median_py(PyObject *self, PyObject *args); +static PyObject* median_improve_py(PyObject *self, PyObject *args); +static PyObject* quickmedian_py(PyObject *self, PyObject *args); +static PyObject* setmedian_py(PyObject *self, PyObject *args); +static PyObject* seqratio_py(PyObject *self, PyObject *args); +static PyObject* setratio_py(PyObject *self, PyObject *args); +static PyObject* editops_py(PyObject *self, PyObject *args); +static PyObject* opcodes_py(PyObject *self, PyObject *args); +static PyObject* inverse_py(PyObject *self, PyObject *args); +static PyObject* apply_edit_py(PyObject *self, PyObject *args); +static PyObject* matching_blocks_py(PyObject *self, PyObject *args); +static PyObject* subtract_edit_py(PyObject *self, PyObject *args); + +#define Levenshtein_DESC \ + "A C extension module for fast computation of:\n" \ + "- Levenshtein (edit) distance and edit sequence manipulation\n" \ + "- string similarity\n" \ + "- approximate median strings, and generally string averaging\n" \ + "- string sequence and set similarity\n" \ + "\n" \ + "Levenshtein has a some overlap with difflib (SequenceMatcher). It\n" \ + "supports only strings, not arbitrary sequence types, but on the\n" \ + "other hand it's much faster.\n" \ + "\n" \ + "It supports both normal and Unicode strings, but can't mix them, all\n" \ + "arguments to a function (method) have to be of the same type (or its\n" \ + "subclasses).\n" + +#define distance_DESC \ + "Compute absolute Levenshtein distance of two strings.\n" \ + "\n" \ + "distance(string1, string2)\n" \ + "\n" \ + "Examples (it's hard to spell Levenshtein correctly):\n" \ + "\n" \ + ">>> distance('Levenshtein', 'Lenvinsten')\n" \ + "4\n" \ + ">>> distance('Levenshtein', 'Levensthein')\n" \ + "2\n" \ + ">>> distance('Levenshtein', 'Levenshten')\n" \ + "1\n" \ + ">>> distance('Levenshtein', 'Levenshtein')\n" \ + "0\n" \ + "\n" \ + "Yeah, we've managed it at last.\n" + +#define ratio_DESC \ + "Compute similarity of two strings.\n" \ + "\n" \ + "ratio(string1, string2)\n" \ + "\n" \ + "The similarity is a number between 0 and 1, it's usually equal or\n" \ + "somewhat higher than difflib.SequenceMatcher.ratio(), because it's\n" \ + "based on real minimal edit distance.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> ratio('Hello world!', 'Holly grail!') # doctest: +ELLIPSIS\n" \ + "0.583333...\n" \ + ">>> ratio('Brian', 'Jesus')\n" \ + "0.0\n" \ + "\n" \ + "Really? 
I thought there was some similarity.\n" + +#define hamming_DESC \ + "Compute Hamming distance of two strings.\n" \ + "\n" \ + "hamming(string1, string2)\n" \ + "\n" \ + "The Hamming distance is simply the number of differing characters.\n" \ + "That means the length of the strings must be the same.\n" \ + "\n" \ + "Examples:\n" \ + ">>> hamming('Hello world!', 'Holly grail!')\n" \ + "7\n" \ + ">>> hamming('Brian', 'Jesus')\n" \ + "5\n" + +#define jaro_DESC \ + "Compute Jaro string similarity metric of two strings.\n" \ + "\n" \ + "jaro(string1, string2)\n" \ + "\n" \ + "The Jaro string similarity metric is intended for short strings like\n" \ + "personal last names. It is 0 for completely different strings and\n" \ + "1 for identical strings.\n" \ + "\n" \ + "Examples:\n" \ + ">>> jaro('Brian', 'Jesus')\n" \ + "0.0\n" \ + ">>> jaro('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \ + "0.779761...\n" \ + ">>> jaro('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \ + "0.708333...\n" + +#define jaro_winkler_DESC \ + "Compute Jaro string similarity metric of two strings.\n" \ + "\n" \ + "jaro_winkler(string1, string2[, prefix_weight])\n" \ + "\n" \ + "The Jaro-Winkler string similarity metric is a modification of Jaro\n" \ + "metric giving more weight to common prefix, as spelling mistakes are\n" \ + "more likely to occur near ends of words.\n" \ + "\n" \ + "The prefix weight is inverse value of common prefix length sufficient\n" \ + "to consider the strings *identical*. If no prefix weight is\n" \ + "specified, 1/10 is used.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> jaro_winkler('Brian', 'Jesus')\n" \ + "0.0\n" \ + ">>> jaro_winkler('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \ + "0.867857...\n" \ + ">>> jaro_winkler('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \ + "0.7375...\n" \ + ">>> jaro_winkler('Thorkel', 'Thorgier', 0.25)\n" \ + "1.0\n" + +#define median_DESC \ + "Find an approximate generalized median string using greedy algorithm.\n" \ + "\n" \ + "median(string_sequence[, weight_sequence])\n" \ + "\n" \ + "You can optionally pass a weight for each string as the second\n" \ + "argument. The weights are interpreted as item multiplicities,\n" \ + "although any non-negative real numbers are accepted. Use them to\n" \ + "improve computation speed when strings often appear multiple times\n" \ + "in the sequence.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> median(['SpSm', 'mpamm', 'Spam', 'Spa', 'Sua', 'hSam'])\n" \ + "'Spam'\n" \ + ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ + "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ + "... 'Levenhtin', 'evenshtei']\n" \ + ">>> median(fixme)\n" \ + "'Levenshtein'\n" \ + "\n" \ + "Hm. Even a computer program can spell Levenshtein better than me.\n" + +#define median_improve_DESC \ + "Improve an approximate generalized median string by perturbations.\n" \ + "\n" \ + "median_improve(string, string_sequence[, weight_sequence])\n" \ + "\n" \ + "The first argument is the estimated generalized median string you\n" \ + "want to improve, the others are the same as in median(). It returns\n" \ + "a string with total distance less or equal to that of the given string.\n" \ + "\n" \ + "Note this is much slower than median(). 
Also note it performs only\n" \ + "one improvement step, calling median_improve() again on the result\n" \ + "may improve it further, though this is unlikely to happen unless the\n" \ + "given string was not very similar to the actual generalized median.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ + "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ + "... 'Levenhtin', 'evenshtei']\n" \ + ">>> median_improve('spam', fixme)\n" \ + "'enhtein'\n" \ + ">>> median_improve(median_improve('spam', fixme), fixme)\n" \ + "'Levenshtein'\n" \ + "\n" \ + "It takes some work to change spam to Levenshtein.\n" + +#define quickmedian_DESC \ + "Find a very approximate generalized median string, but fast.\n" \ + "\n" \ + "quickmedian(string[, weight_sequence])\n" \ + "\n" \ + "See median() for argument description.\n" \ + "\n" \ + "This method is somewhere between setmedian() and picking a random\n" \ + "string from the set; both speedwise and quality-wise.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ + "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ + "... 'Levenhtin', 'evenshtei']\n" \ + ">>> quickmedian(fixme)\n" \ + "'Levnshein'\n" + +#define setmedian_DESC \ + "Find set median of a string set (passed as a sequence).\n" \ + "\n" \ + "setmedian(string[, weight_sequence])\n" \ + "\n" \ + "See median() for argument description.\n" \ + "\n" \ + "The returned string is always one of the strings in the sequence.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> setmedian(['ehee', 'cceaes', 'chees', 'chreesc',\n" \ + "... 'chees', 'cheesee', 'cseese', 'chetese'])\n" \ + "'chees'\n" \ + "\n" \ + "You haven't asked me about Limburger, sir.\n" + +#define seqratio_DESC \ + "Compute similarity ratio of two sequences of strings.\n" \ + "\n" \ + "seqratio(string_sequence1, string_sequence2)\n" \ + "\n" \ + "This is like ratio(), but for string sequences. A kind of ratio()\n" \ + "is used to to measure the cost of item change operation for the\n" \ + "strings.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> seqratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \ + "... ['caribou', 'sausage', 'gorn', 'woody'])\n" \ + "0.21517857142857144\n" + +#define setratio_DESC \ + "Compute similarity ratio of two strings sets (passed as sequences).\n" \ + "\n" \ + "setratio(string_sequence1, string_sequence2)\n" \ + "\n" \ + "The best match between any strings in the first set and the second\n" \ + "set (passed as sequences) is attempted. I.e., the order doesn't\n" \ + "matter here.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> setratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \ + "... ['caribou', 'sausage', 'gorn', 'woody']) # doctest: +ELLIPSIS\n" \ + "0.281845...\n" \ + "\n" \ + "No, even reordering doesn't help the tinny words to match the\n" \ + "woody ones.\n" + +#define editops_DESC \ + "Find sequence of edit operations transforming one string to another.\n" \ + "\n" \ + "editops(source_string, destination_string)\n" \ + "editops(edit_operations, source_length, destination_length)\n" \ + "\n" \ + "The result is a list of triples (operation, spos, dpos), where\n" \ + "operation is one of 'equal', 'replace', 'insert', or 'delete'; spos\n" \ + "and dpos are position of characters in the first (source) and the\n" \ + "second (destination) strings. These are operations on signle\n" \ + "characters. 
In fact the returned list doesn't contain the 'equal',\n" \ + "but all the related functions accept both lists with and without\n" \ + "'equal's.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> editops('spam', 'park')\n" \ + "[('delete', 0, 0), ('insert', 3, 2), ('replace', 3, 3)]\n" \ + "\n" \ + "The alternate form editops(opcodes, source_string, destination_string)\n" \ + "can be used for conversion from opcodes (5-tuples) to editops (you can\n" \ + "pass strings or their lengths, it doesn't matter).\n" + +#define opcodes_DESC \ + "Find sequence of edit operations transforming one string to another.\n" \ + "\n" \ + "opcodes(source_string, destination_string)\n" \ + "opcodes(edit_operations, source_length, destination_length)\n" \ + "\n" \ + "The result is a list of 5-tuples with the same meaning as in\n" \ + "SequenceMatcher's get_opcodes() output. But since the algorithms\n" \ + "differ, the actual sequences from Levenshtein and SequenceMatcher\n" \ + "may differ too.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> for x in opcodes('spam', 'park'):\n" \ + "... print(x)\n" \ + "...\n" \ + "('delete', 0, 1, 0, 0)\n" \ + "('equal', 1, 3, 0, 2)\n" \ + "('insert', 3, 3, 2, 3)\n" \ + "('replace', 3, 4, 3, 4)\n" \ + "\n" \ + "The alternate form opcodes(editops, source_string, destination_string)\n" \ + "can be used for conversion from editops (triples) to opcodes (you can\n" \ + "pass strings or their lengths, it doesn't matter).\n" + +#define inverse_DESC \ + "Invert the sense of an edit operation sequence.\n" \ + "\n" \ + "inverse(edit_operations)\n" \ + "\n" \ + "In other words, it returns a list of edit operations transforming the\n" \ + "second (destination) string to the first (source). It can be used\n" \ + "with both editops and opcodes.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> inverse(editops('spam', 'park'))\n" \ + "[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" \ + ">>> editops('park', 'spam')\n" \ + "[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" + +#define apply_edit_DESC \ + "Apply a sequence of edit operations to a string.\n" \ + "\n" \ + "apply_edit(edit_operations, source_string, destination_string)\n" \ + "\n" \ + "In the case of editops, the sequence can be arbitrary ordered subset\n" \ + "of the edit sequence transforming source_string to destination_string.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> e = editops('man', 'scotsman')\n" \ + ">>> apply_edit(e, 'man', 'scotsman')\n" \ + "'scotsman'\n" \ + ">>> apply_edit(e[:3], 'man', 'scotsman')\n" \ + "'scoman'\n" \ + "\n" \ + "The other form of edit operations, opcodes, is not very suitable for\n" \ + "such a tricks, because it has to always span over complete strings,\n" \ + "subsets can be created by carefully replacing blocks with 'equal'\n" \ + "blocks, or by enlarging 'equal' block at the expense of other blocks\n" \ + "and adjusting the other blocks accordingly.\n" \ + "\n" \ + "Examples:\n" \ + ">>> a, b = 'spam and eggs', 'foo and bar'\n" \ + ">>> e = opcodes(a, b)\n" \ + ">>> apply_edit(inverse(e), b, a)\n" \ + "'spam and eggs'\n" \ + ">>> e[4] = ('equal', 10, 13, 8, 11)\n" \ + ">>> e\n" \ + "[('delete', 0, 1, 0, 0), ('replace', 1, 4, 0, 3), ('equal', 4, 9, 3, 8), ('delete', 9, 10, 8, 8), ('equal', 10, 13, 8, 11)]\n" \ + ">>> apply_edit(e, a, b)\n" \ + "'foo and ggs'\n" + +#define matching_blocks_DESC \ + "Find identical blocks in two strings.\n" \ + "\n" \ + "matching_blocks(edit_operations, source_length, destination_length)\n" \ + "\n" \ + "The result is a list of triples with 
the same meaning as in\n" \ + "SequenceMatcher's get_matching_blocks() output. It can be used with\n" \ + "both editops and opcodes. The second and third arguments don't\n" \ + "have to be actually strings, their lengths are enough.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> a, b = 'spam', 'park'\n" \ + ">>> matching_blocks(editops(a, b), a, b)\n" \ + "[(1, 0, 2), (4, 4, 0)]\n" \ + ">>> matching_blocks(editops(a, b), len(a), len(b))\n" \ + "[(1, 0, 2), (4, 4, 0)]\n" \ + "\n" \ + "The last zero-length block is not an error, but it's there for\n" \ + "compatibility with difflib which always emits it.\n" \ + "\n" \ + "One can join the matching blocks to get two identical strings:\n" \ + ">>> a, b = 'dog kennels', 'mattresses'\n" \ + ">>> mb = matching_blocks(editops(a,b), a, b)\n" \ + ">>> ''.join([a[x[0]:x[0]+x[2]] for x in mb])\n" \ + "'ees'\n" \ + ">>> ''.join([b[x[1]:x[1]+x[2]] for x in mb])\n" \ + "'ees'\n" + +#define subtract_edit_DESC \ + "Subtract an edit subsequence from a sequence.\n" \ + "\n" \ + "subtract_edit(edit_operations, subsequence)\n" \ + "\n" \ + "The result is equivalent to\n" \ + "editops(apply_edit(subsequence, s1, s2), s2), except that is\n" \ + "constructed directly from the edit operations. That is, if you apply\n" \ + "it to the result of subsequence application, you get the same final\n" \ + "string as from application complete edit_operations. It may be not\n" \ + "identical, though (in amibuous cases, like insertion of a character\n" \ + "next to the same character).\n" \ + "\n" \ + "The subtracted subsequence must be an ordered subset of\n" \ + "edit_operations.\n" \ + "\n" \ + "Note this function does not accept difflib-style opcodes as no one in\n" \ + "his right mind wants to create subsequences from them.\n" \ + "\n" \ + "Examples:\n" \ + "\n" \ + ">>> e = editops('man', 'scotsman')\n" \ + ">>> e1 = e[:3]\n" \ + ">>> bastard = apply_edit(e1, 'man', 'scotsman')\n" \ + ">>> bastard\n" \ + "'scoman'\n" \ + ">>> apply_edit(subtract_edit(e, e1), bastard, 'scotsman')\n" \ + "'scotsman'\n" \ + +#define METHODS_ITEM(x) { #x, x##_py, METH_VARARGS, x##_DESC } +static PyMethodDef methods[] = { + METHODS_ITEM(distance), + METHODS_ITEM(ratio), + METHODS_ITEM(hamming), + METHODS_ITEM(jaro), + METHODS_ITEM(jaro_winkler), + METHODS_ITEM(median), + METHODS_ITEM(median_improve), + METHODS_ITEM(quickmedian), + METHODS_ITEM(setmedian), + METHODS_ITEM(seqratio), + METHODS_ITEM(setratio), + METHODS_ITEM(editops), + METHODS_ITEM(opcodes), + METHODS_ITEM(inverse), + METHODS_ITEM(apply_edit), + METHODS_ITEM(matching_blocks), + METHODS_ITEM(subtract_edit), + { NULL, NULL, 0, NULL }, +}; + +/* opcode names, these are to be initialized in the init func, + * indexed by LevEditType values */ +struct { + PyObject* pystring; + const char *cstring; + size_t len; +} +static opcode_names[] = { + { NULL, "equal", 0 }, + { NULL, "replace", 0 }, + { NULL, "insert", 0 }, + { NULL, "delete", 0 }, +}; +#define N_OPCODE_NAMES ((sizeof(opcode_names)/sizeof(opcode_names[0]))) + +typedef lev_byte *(*MedianFuncString)(size_t n, + const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength); +typedef Py_UNICODE *(*MedianFuncUnicode)(size_t n, + const size_t *lengths, + const Py_UNICODE *strings[], + const double *weights, + size_t *medlength); +typedef struct { + MedianFuncString s; + MedianFuncUnicode u; +} MedianFuncs; + +typedef lev_byte *(*MedianImproveFuncString)(size_t len, const lev_byte *s, + size_t n, + const size_t *lengths, + const lev_byte 
*strings[], + const double *weights, + size_t *medlength); +typedef Py_UNICODE *(*MedianImproveFuncUnicode)(size_t len, const Py_UNICODE *s, + size_t n, + const size_t *lengths, + const Py_UNICODE *strings[], + const double *weights, + size_t *medlength); +typedef struct { + MedianImproveFuncString s; + MedianImproveFuncUnicode u; +} MedianImproveFuncs; + +typedef double (*SetSeqFuncString)(size_t n1, + const size_t *lengths1, + const lev_byte *strings1[], + size_t n2, + const size_t *lengths2, + const lev_byte *strings2[]); +typedef double (*SetSeqFuncUnicode)(size_t n1, + const size_t *lengths1, + const Py_UNICODE *strings1[], + size_t n2, + const size_t *lengths2, + const Py_UNICODE *strings2[]); + +typedef struct { + SetSeqFuncString s; + SetSeqFuncUnicode u; +} SetSeqFuncs; + +static long int +levenshtein_common(PyObject *args, + const char *name, + size_t xcost, + size_t *lensum); + +static int +extract_stringlist(PyObject *list, + const char *name, + size_t n, + size_t **sizelist, + void *strlist); + +static double* +extract_weightlist(PyObject *wlist, + const char *name, + size_t n); + +static PyObject* +median_common(PyObject *args, + const char *name, + MedianFuncs foo); + +static PyObject* +median_improve_common(PyObject *args, + const char *name, + MedianImproveFuncs foo); + +static double +setseq_common(PyObject *args, + const char *name, + SetSeqFuncs foo, + size_t *lensum); + +static void * +safe_malloc(size_t nmemb, size_t size) { + /* extra-conservative overflow check */ + if (SIZE_MAX / size <= nmemb) { + return NULL; + } + return malloc(nmemb * size); +} + +static void * +safe_malloc_3(size_t nmemb1, size_t nmemb2, size_t size) { + /* extra-conservative overflow check */ + if (SIZE_MAX / size <= nmemb2) { + return NULL; + } + return safe_malloc(nmemb1, nmemb2 * size); +} + +/* }}} */ + +/**************************************************************************** + * + * Python interface and subroutines + * + ****************************************************************************/ +/* {{{ */ + +static long int +levenshtein_common(PyObject *args, const char *name, size_t xcost, + size_t *lensum) +{ + PyObject *arg1, *arg2; + size_t len1, len2; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) + return -1; + + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2; + + len1 = PyString_GET_SIZE(arg1); + len2 = PyString_GET_SIZE(arg2); + *lensum = len1 + len2; + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + { + size_t d = lev_edit_distance(len1, string1, len2, string2, xcost); + if (d == (size_t)(-1)) { + PyErr_NoMemory(); + return -1; + } + return d; + } + } + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2; + + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + *lensum = len1 + len2; + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + { + size_t d = lev_u_edit_distance(len1, string1, len2, string2, xcost); + if (d == (size_t)(-1)) { + PyErr_NoMemory(); + return -1; + } + return d; + } + } + else { + PyErr_Format(PyExc_TypeError, + "%s expected two Strings or two Unicodes", name); + return -1; + } +} + +static PyObject* +distance_py(PyObject *self, PyObject *args) +{ + size_t lensum; + long int ldist; + + if ((ldist = levenshtein_common(args, "distance", 0, &lensum)) < 0) + return NULL; + + return 
PyInt_FromLong((long)ldist); +} + +static PyObject* +ratio_py(PyObject *self, PyObject *args) +{ + size_t lensum; + long int ldist; + + if ((ldist = levenshtein_common(args, "ratio", 1, &lensum)) < 0) + return NULL; + + if (lensum == 0) + return PyFloat_FromDouble(1.0); + + return PyFloat_FromDouble((double)(lensum - ldist)/(lensum)); +} + +static PyObject* +hamming_py(PyObject *self, PyObject *args) +{ + PyObject *arg1, *arg2; + const char *name = "hamming"; + size_t len1, len2; + long int dist; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) + return NULL; + + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2; + + len1 = PyString_GET_SIZE(arg1); + len2 = PyString_GET_SIZE(arg2); + if (len1 != len2) { + PyErr_Format(PyExc_ValueError, + "%s expected two strings of the same length", name); + return NULL; + } + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + dist = lev_hamming_distance(len1, string1, string2); + return PyInt_FromLong(dist); + } + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2; + + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + if (len1 != len2) { + PyErr_Format(PyExc_ValueError, + "%s expected two unicodes of the same length", name); + return NULL; + } + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + dist = lev_u_hamming_distance(len1, string1, string2); + return PyInt_FromLong(dist); + } + else { + PyErr_Format(PyExc_TypeError, + "%s expected two Strings or two Unicodes", name); + return NULL; + } +} + +static PyObject* +jaro_py(PyObject *self, PyObject *args) +{ + PyObject *arg1, *arg2; + const char *name = "jaro"; + size_t len1, len2; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) + return NULL; + + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2; + + len1 = PyString_GET_SIZE(arg1); + len2 = PyString_GET_SIZE(arg2); + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + return PyFloat_FromDouble(lev_jaro_ratio(len1, string1, len2, string2)); + } + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2; + + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + return PyFloat_FromDouble(lev_u_jaro_ratio(len1, string1, len2, string2)); + } + else { + PyErr_Format(PyExc_TypeError, + "%s expected two Strings or two Unicodes", name); + return NULL; + } +} + +static PyObject* +jaro_winkler_py(PyObject *self, PyObject *args) +{ + PyObject *arg1, *arg2, *arg3 = NULL; + double pfweight = 0.1; + const char *name = "jaro_winkler"; + size_t len1, len2; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &arg2, &arg3)) + return NULL; + + if (arg3) { + if (!PyObject_TypeCheck(arg3, &PyFloat_Type)) { + PyErr_Format(PyExc_TypeError, "%s third argument must be a Float", name); + return NULL; + } + pfweight = PyFloat_AS_DOUBLE(arg3); + if (pfweight < 0.0) { + PyErr_Format(PyExc_ValueError, "%s negative prefix weight", name); + return NULL; + } + } + + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2; + + len1 = PyString_GET_SIZE(arg1); + len2 = 
PyString_GET_SIZE(arg2); + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + return PyFloat_FromDouble(lev_jaro_winkler_ratio(len1, string1, + len2, string2, + pfweight)); + } + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2; + + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + return PyFloat_FromDouble(lev_u_jaro_winkler_ratio(len1, string1, + len2, string2, + pfweight)); + } + else { + PyErr_Format(PyExc_TypeError, + "%s expected two Strings or two Unicodes", name); + return NULL; + } +} + +static PyObject* +median_py(PyObject *self, PyObject *args) +{ + MedianFuncs engines = { lev_greedy_median, lev_u_greedy_median }; + return median_common(args, "median", engines); +} + +static PyObject* +median_improve_py(PyObject *self, PyObject *args) +{ + MedianImproveFuncs engines = { lev_median_improve, lev_u_median_improve }; + return median_improve_common(args, "median_improve", engines); +} + +static PyObject* +quickmedian_py(PyObject *self, PyObject *args) +{ + MedianFuncs engines = { lev_quick_median, lev_u_quick_median }; + return median_common(args, "quickmedian", engines); +} + +static PyObject* +setmedian_py(PyObject *self, PyObject *args) +{ + MedianFuncs engines = { lev_set_median, lev_u_set_median }; + return median_common(args, "setmedian", engines); +} + +static PyObject* +median_common(PyObject *args, const char *name, MedianFuncs foo) +{ + size_t n, len; + void *strings = NULL; + size_t *sizes = NULL; + PyObject *strlist = NULL; + PyObject *wlist = NULL; + PyObject *strseq = NULL; + double *weights; + int stringtype; + PyObject *result = NULL; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 1, 2, &strlist, &wlist)) + return NULL; + + if (!PySequence_Check(strlist)) { + PyErr_Format(PyExc_TypeError, + "%s first argument must be a Sequence", name); + return NULL; + } + strseq = PySequence_Fast(strlist, name); + + n = PySequence_Fast_GET_SIZE(strseq); + if (n == 0) { + Py_INCREF(Py_None); + Py_DECREF(strseq); + return Py_None; + } + + /* get (optional) weights, use 1 if none specified. 
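+ * A NULL wlist means every string simply gets weight 1.0; otherwise
+ * extract_weightlist() rejects negative weights and returns NULL with an
+ * exception set on any error.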
*/ + weights = extract_weightlist(wlist, name, n); + if (!weights) { + Py_DECREF(strseq); + return NULL; + } + + stringtype = extract_stringlist(strseq, name, n, &sizes, &strings); + Py_DECREF(strseq); + if (stringtype < 0) { + free(weights); + return NULL; + } + + if (stringtype == 0) { + lev_byte *medstr = foo.s(n, sizes, strings, weights, &len); + if (!medstr && len) + result = PyErr_NoMemory(); + else { + result = PyString_FromStringAndSize(medstr, len); + free(medstr); + } + } + else if (stringtype == 1) { + Py_UNICODE *medstr = foo.u(n, sizes, strings, weights, &len); + if (!medstr && len) + result = PyErr_NoMemory(); + else { + result = PyUnicode_FromUnicode(medstr, len); + free(medstr); + } + } + else + PyErr_Format(PyExc_SystemError, "%s internal error", name); + + free(strings); + free(weights); + free(sizes); + return result; +} + +static PyObject* +median_improve_common(PyObject *args, const char *name, MedianImproveFuncs foo) +{ + size_t n, len; + void *strings = NULL; + size_t *sizes = NULL; + PyObject *arg1 = NULL; + PyObject *strlist = NULL; + PyObject *wlist = NULL; + PyObject *strseq = NULL; + double *weights; + int stringtype; + PyObject *result = NULL; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &strlist, &wlist)) + return NULL; + + if (PyObject_TypeCheck(arg1, &PyString_Type)) + stringtype = 0; + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)) + stringtype = 1; + else { + PyErr_Format(PyExc_TypeError, + "%s first argument must be a String or Unicode", name); + return NULL; + } + + if (!PySequence_Check(strlist)) { + PyErr_Format(PyExc_TypeError, + "%s second argument must be a Sequence", name); + return NULL; + } + strseq = PySequence_Fast(strlist, name); + + n = PySequence_Fast_GET_SIZE(strseq); + if (n == 0) { + Py_INCREF(Py_None); + Py_DECREF(strseq); + return Py_None; + } + + /* get (optional) weights, use 1 if none specified. 
*/ + weights = extract_weightlist(wlist, name, n); + if (!weights) { + Py_DECREF(strseq); + return NULL; + } + + if (extract_stringlist(strseq, name, n, &sizes, &strings) != stringtype) { + PyErr_Format(PyExc_TypeError, + "%s argument types don't match", name); + free(weights); + return NULL; + } + + Py_DECREF(strseq); + if (stringtype == 0) { + lev_byte *s = PyString_AS_STRING(arg1); + size_t l = PyString_GET_SIZE(arg1); + lev_byte *medstr = foo.s(l, s, n, sizes, strings, weights, &len); + if (!medstr && len) + result = PyErr_NoMemory(); + else { + result = PyString_FromStringAndSize(medstr, len); + free(medstr); + } + } + else if (stringtype == 1) { + Py_UNICODE *s = PyUnicode_AS_UNICODE(arg1); + size_t l = PyUnicode_GET_SIZE(arg1); + Py_UNICODE *medstr = foo.u(l, s, n, sizes, strings, weights, &len); + if (!medstr && len) + result = PyErr_NoMemory(); + else { + result = PyUnicode_FromUnicode(medstr, len); + free(medstr); + } + } + else + PyErr_Format(PyExc_SystemError, "%s internal error", name); + + free(strings); + free(weights); + free(sizes); + return result; +} + +static double* +extract_weightlist(PyObject *wlist, const char *name, size_t n) +{ + size_t i; + double *weights = NULL; + PyObject *seq; + + if (wlist) { + if (!PySequence_Check(wlist)) { + PyErr_Format(PyExc_TypeError, + "%s second argument must be a Sequence", name); + return NULL; + } + seq = PySequence_Fast(wlist, name); + if (PySequence_Fast_GET_SIZE(wlist) != n) { + PyErr_Format(PyExc_ValueError, + "%s got %i strings but %i weights", + name, n, PyList_GET_SIZE(wlist)); + Py_DECREF(seq); + return NULL; + } + weights = (double*)safe_malloc(n, sizeof(double)); + if (!weights) + return (double*)PyErr_NoMemory(); + for (i = 0; i < n; i++) { + PyObject *item = PySequence_Fast_GET_ITEM(wlist, i); + PyObject *number = PyNumber_Float(item); + + if (!number) { + free(weights); + PyErr_Format(PyExc_TypeError, + "%s weight #%i is not a Number", name, i); + Py_DECREF(seq); + return NULL; + } + weights[i] = PyFloat_AS_DOUBLE(number); + Py_DECREF(number); + if (weights[i] < 0) { + free(weights); + PyErr_Format(PyExc_ValueError, + "%s weight #%i is negative", name, i); + Py_DECREF(seq); + return NULL; + } + } + Py_DECREF(seq); + } + else { + weights = (double*)safe_malloc(n, sizeof(double)); + if (!weights) + return (double*)PyErr_NoMemory(); + for (i = 0; i < n; i++) + weights[i] = 1.0; + } + + return weights; +} + +/* extract a list of strings or unicode strings, returns + * 0 -- strings + * 1 -- unicode strings + * <0 -- failure + */ +static int +extract_stringlist(PyObject *list, const char *name, + size_t n, size_t **sizelist, void *strlist) +{ + size_t i; + PyObject *first; + + /* check first object type. when it's a string then all others must be + * strings too; when it's a unicode string then all others must be unicode + * strings too. 
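+ * Callers always hand us a non-empty sequence (they return early when the
+ * sequence length is zero), so item 0 is guaranteed to exist.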
*/ + first = PySequence_Fast_GET_ITEM(list, 0); + /* i don't exactly understand why the problem doesn't exhibit itself earlier + * but a queer error message is better than a segfault :o) */ + if (first == (PyObject*)-1) { + PyErr_Format(PyExc_TypeError, + "%s undecomposable Sequence???", name); + return -1; + } + + if (PyObject_TypeCheck(first, &PyString_Type)) { + lev_byte **strings; + size_t *sizes; + + strings = (lev_byte**)safe_malloc(n, sizeof(lev_byte*)); + if (!strings) { + PyErr_Format(PyExc_MemoryError, + "%s cannot allocate memory", name); + return -1; + } + sizes = (size_t*)safe_malloc(n, sizeof(size_t)); + if (!sizes) { + free(strings); + PyErr_Format(PyExc_MemoryError, + "%s cannot allocate memory", name); + return -1; + } + + strings[0] = PyString_AS_STRING(first); + sizes[0] = PyString_GET_SIZE(first); + for (i = 1; i < n; i++) { + PyObject *item = PySequence_Fast_GET_ITEM(list, i); + + if (!PyObject_TypeCheck(item, &PyString_Type)) { + free(strings); + free(sizes); + PyErr_Format(PyExc_TypeError, + "%s item #%i is not a String", name, i); + return -1; + } + strings[i] = PyString_AS_STRING(item); + sizes[i] = PyString_GET_SIZE(item); + } + + *(lev_byte***)strlist = strings; + *sizelist = sizes; + return 0; + } + if (PyObject_TypeCheck(first, &PyUnicode_Type)) { + Py_UNICODE **strings; + size_t *sizes; + + strings = (Py_UNICODE**)safe_malloc(n, sizeof(Py_UNICODE*)); + if (!strings) { + PyErr_NoMemory(); + return -1; + } + sizes = (size_t*)safe_malloc(n, sizeof(size_t)); + if (!sizes) { + free(strings); + PyErr_NoMemory(); + return -1; + } + + strings[0] = PyUnicode_AS_UNICODE(first); + sizes[0] = PyUnicode_GET_SIZE(first); + for (i = 1; i < n; i++) { + PyObject *item = PySequence_Fast_GET_ITEM(list, i); + + if (!PyObject_TypeCheck(item, &PyUnicode_Type)) { + free(strings); + free(sizes); + PyErr_Format(PyExc_TypeError, + "%s item #%i is not a Unicode", name, i); + return -1; + } + strings[i] = PyUnicode_AS_UNICODE(item); + sizes[i] = PyUnicode_GET_SIZE(item); + } + + *(Py_UNICODE***)strlist = strings; + *sizelist = sizes; + return 1; + } + + PyErr_Format(PyExc_TypeError, + "%s expected list of Strings or Unicodes", name); + return -1; +} + +static PyObject* +seqratio_py(PyObject *self, PyObject *args) +{ + SetSeqFuncs engines = { lev_edit_seq_distance, lev_u_edit_seq_distance }; + size_t lensum; + double r = setseq_common(args, "seqratio", engines, &lensum); + if (r < 0) + return NULL; + if (lensum == 0) + return PyFloat_FromDouble(1.0); + return PyFloat_FromDouble((lensum - r)/lensum); +} + +static PyObject* +setratio_py(PyObject *self, PyObject *args) +{ + SetSeqFuncs engines = { lev_set_distance, lev_u_set_distance }; + size_t lensum; + double r = setseq_common(args, "setratio", engines, &lensum); + if (r < 0) + return NULL; + if (lensum == 0) + return PyFloat_FromDouble(1.0); + return PyFloat_FromDouble((lensum - r)/lensum); +} + +static double +setseq_common(PyObject *args, const char *name, SetSeqFuncs foo, + size_t *lensum) +{ + size_t n1, n2; + void *strings1 = NULL; + void *strings2 = NULL; + size_t *sizes1 = NULL; + size_t *sizes2 = NULL; + PyObject *strlist1; + PyObject *strlist2; + PyObject *strseq1; + PyObject *strseq2; + int stringtype1, stringtype2; + double r = -1.0; + + if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &strlist1, &strlist2)) + return r; + + if (!PySequence_Check(strlist1)) { + PyErr_Format(PyExc_TypeError, + "%s first argument must be a Sequence", name); + return r; + } + if (!PySequence_Check(strlist2)) { + PyErr_Format(PyExc_TypeError, + 
"%s second argument must be a Sequence", name); + return r; + } + + strseq1 = PySequence_Fast(strlist1, name); + strseq2 = PySequence_Fast(strlist2, name); + + n1 = PySequence_Fast_GET_SIZE(strseq1); + n2 = PySequence_Fast_GET_SIZE(strseq2); + *lensum = n1 + n2; + if (n1 == 0) { + Py_DECREF(strseq1); + Py_DECREF(strseq2); + return (double)n2; + } + if (n2 == 0) { + Py_DECREF(strseq1); + Py_DECREF(strseq2); + return (double)n1; + } + + stringtype1 = extract_stringlist(strseq1, name, n1, &sizes1, &strings1); + Py_DECREF(strseq1); + if (stringtype1 < 0) { + Py_DECREF(strseq2); + return r; + } + stringtype2 = extract_stringlist(strseq2, name, n2, &sizes2, &strings2); + Py_DECREF(strseq2); + if (stringtype2 < 0) { + free(sizes1); + free(strings1); + return r; + } + + if (stringtype1 != stringtype2) { + PyErr_Format(PyExc_TypeError, + "%s both sequences must consist of items of the same type", + name); + } + else if (stringtype1 == 0) { + r = foo.s(n1, sizes1, strings1, n2, sizes2, strings2); + if (r < 0.0) + PyErr_NoMemory(); + } + else if (stringtype1 == 1) { + r = foo.u(n1, sizes1, strings1, n2, sizes2, strings2); + if (r < 0.0) + PyErr_NoMemory(); + } + else + PyErr_Format(PyExc_SystemError, "%s internal error", name); + + free(strings1); + free(strings2); + free(sizes1); + free(sizes2); + return r; +} + +static inline LevEditType +string_to_edittype(PyObject *string) +{ + const char *s; + size_t i, len; + + for (i = 0; i < N_OPCODE_NAMES; i++) { + if (string == opcode_names[i].pystring) + return i; + } + + /* With Python >= 2.2, we shouldn't get here, except when the strings are + * not Strings but subtypes. */ +#ifdef LEV_PYTHON3 + /* For Python 3, the string is an unicode object; use CompareWithAsciiString */ + if (!PyUnicode_Check(string)) { + return LEV_EDIT_LAST; + } + + for (i = 0; i < N_OPCODE_NAMES; i++) { + if (PyUnicode_CompareWithASCIIString(string, opcode_names[i].cstring) == 0) { + return i; + } + } + +#else + + if (!PyString_Check(string)) + return LEV_EDIT_LAST; + + s = PyString_AS_STRING(string); + len = PyString_GET_SIZE(string); + for (i = 0; i < N_OPCODE_NAMES; i++) { + if (len == opcode_names[i].len + && memcmp(s, opcode_names[i].cstring, len) == 0) { + return i; + } + } +#endif + + return LEV_EDIT_LAST; +} + +static LevEditOp* +extract_editops(PyObject *list) +{ + LevEditOp *ops; + size_t i; + LevEditType type; + size_t n = PyList_GET_SIZE(list); + + ops = (LevEditOp*)safe_malloc(n, sizeof(LevEditOp)); + if (!ops) + return (LevEditOp*)PyErr_NoMemory(); + for (i = 0; i < n; i++) { + PyObject *item; + PyObject *tuple = PyList_GET_ITEM(list, i); + + if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 3) { + free(ops); + return NULL; + } + item = PyTuple_GET_ITEM(tuple, 0); + if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) { + free(ops); + return NULL; + } + ops[i].type = type; + item = PyTuple_GET_ITEM(tuple, 1); + if (!PyInt_Check(item)) { + free(ops); + return NULL; + } + ops[i].spos = (size_t)PyInt_AS_LONG(item); + item = PyTuple_GET_ITEM(tuple, 2); + if (!PyInt_Check(item)) { + free(ops); + return NULL; + } + ops[i].dpos = (size_t)PyInt_AS_LONG(item); + } + return ops; +} + +static LevOpCode* +extract_opcodes(PyObject *list) +{ + LevOpCode *bops; + size_t i; + LevEditType type; + size_t nb = PyList_GET_SIZE(list); + + bops = (LevOpCode*)safe_malloc(nb, sizeof(LevOpCode)); + if (!bops) + return (LevOpCode*)PyErr_NoMemory(); + for (i = 0; i < nb; i++) { + PyObject *item; + PyObject *tuple = PyList_GET_ITEM(list, i); + + if (!PyTuple_Check(tuple) || 
PyTuple_GET_SIZE(tuple) != 5) { + free(bops); + return NULL; + } + + item = PyTuple_GET_ITEM(tuple, 0); + if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) { + free(bops); + return NULL; + } + bops[i].type = type; + + item = PyTuple_GET_ITEM(tuple, 1); + if (!PyInt_Check(item)) { + free(bops); + return NULL; + } + bops[i].sbeg = (size_t)PyInt_AS_LONG(item); + + item = PyTuple_GET_ITEM(tuple, 2); + if (!PyInt_Check(item)) { + free(bops); + return NULL; + } + bops[i].send = (size_t)PyInt_AS_LONG(item); + + item = PyTuple_GET_ITEM(tuple, 3); + if (!PyInt_Check(item)) { + free(bops); + return NULL; + } + bops[i].dbeg = (size_t)PyInt_AS_LONG(item); + + item = PyTuple_GET_ITEM(tuple, 4); + if (!PyInt_Check(item)) { + free(bops); + return NULL; + } + bops[i].dend = (size_t)PyInt_AS_LONG(item); + } + return bops; +} + +static PyObject* +editops_to_tuple_list(size_t n, LevEditOp *ops) +{ + PyObject *list; + size_t i; + + list = PyList_New(n); + for (i = 0; i < n; i++, ops++) { + PyObject *tuple = PyTuple_New(3); + PyObject *is = opcode_names[ops->type].pystring; + Py_INCREF(is); + PyTuple_SET_ITEM(tuple, 0, is); + PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)ops->spos)); + PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)ops->dpos)); + PyList_SET_ITEM(list, i, tuple); + } + + return list; +} + +static PyObject* +matching_blocks_to_tuple_list(size_t len1, size_t len2, + size_t nmb, LevMatchingBlock *mblocks) +{ + PyObject *list, *tuple; + size_t i; + + list = PyList_New(nmb + 1); + for (i = 0; i < nmb; i++, mblocks++) { + tuple = PyTuple_New(3); + PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)mblocks->spos)); + PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)mblocks->dpos)); + PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)mblocks->len)); + PyList_SET_ITEM(list, i, tuple); + } + tuple = PyTuple_New(3); + PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)len1)); + PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)len2)); + PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)0)); + PyList_SET_ITEM(list, nmb, tuple); + + return list; +} + +static size_t +get_length_of_anything(PyObject *object) +{ + if (PyInt_Check(object)) { + long int len = PyInt_AS_LONG(object); + if (len < 0) + len = -1; + return (size_t)len; + } + if (PySequence_Check(object)) + return PySequence_Length(object); + return (size_t)-1; +} + +static PyObject* +editops_py(PyObject *self, PyObject *args) +{ + PyObject *arg1, *arg2, *arg3 = NULL; + PyObject *oplist; + size_t len1, len2, n; + LevEditOp *ops; + LevOpCode *bops; + + if (!PyArg_UnpackTuple(args, PYARGCFIX("editops"), 2, 3, + &arg1, &arg2, &arg3)) { + return NULL; + } + + /* convert: we were called (bops, s1, s2) */ + if (arg3) { + if (!PyList_Check(arg1)) { + PyErr_Format(PyExc_ValueError, + "editops first argument must be a List of edit operations"); + return NULL; + } + n = PyList_GET_SIZE(arg1); + if (!n) { + Py_INCREF(arg1); + return arg1; + } + len1 = get_length_of_anything(arg2); + len2 = get_length_of_anything(arg3); + if (len1 == (size_t)-1 || len2 == (size_t)-1) { + PyErr_Format(PyExc_ValueError, + "editops second and third argument must specify sizes"); + return NULL; + } + + if ((bops = extract_opcodes(arg1)) != NULL) { + if (lev_opcodes_check_errors(len1, len2, n, bops)) { + PyErr_Format(PyExc_ValueError, + "editops edit operation list is invalid"); + free(bops); + return NULL; + } + ops = lev_opcodes_to_editops(n, bops, &n, 0); /* XXX: different n's! 
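+ * n goes in as the opcode count and comes back as the editops count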
*/ + if (!ops && n) { + free(bops); + return PyErr_NoMemory(); + } + oplist = editops_to_tuple_list(n, ops); + free(ops); + free(bops); + return oplist; + } + if ((ops = extract_editops(arg1)) != NULL) { + if (lev_editops_check_errors(len1, len2, n, ops)) { + PyErr_Format(PyExc_ValueError, + "editops edit operation list is invalid"); + free(ops); + return NULL; + } + free(ops); + Py_INCREF(arg1); /* editops -> editops is identity */ + return arg1; + } + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "editops first argument must be a List of edit operations"); + return NULL; + } + + /* find editops: we were called (s1, s2) */ + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2; + + len1 = PyString_GET_SIZE(arg1); + len2 = PyString_GET_SIZE(arg2); + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + ops = lev_editops_find(len1, string1, len2, string2, &n); + } + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2; + + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + ops = lev_u_editops_find(len1, string1, len2, string2, &n); + } + else { + PyErr_Format(PyExc_TypeError, + "editops expected two Strings or two Unicodes"); + return NULL; + } + if (!ops && n) + return PyErr_NoMemory(); + oplist = editops_to_tuple_list(n, ops); + free(ops); + return oplist; +} + +static PyObject* +opcodes_to_tuple_list(size_t nb, LevOpCode *bops) +{ + PyObject *list; + size_t i; + + list = PyList_New(nb); + for (i = 0; i < nb; i++, bops++) { + PyObject *tuple = PyTuple_New(5); + PyObject *is = opcode_names[bops->type].pystring; + Py_INCREF(is); + PyTuple_SET_ITEM(tuple, 0, is); + PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)bops->sbeg)); + PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)bops->send)); + PyTuple_SET_ITEM(tuple, 3, PyInt_FromLong((long)bops->dbeg)); + PyTuple_SET_ITEM(tuple, 4, PyInt_FromLong((long)bops->dend)); + PyList_SET_ITEM(list, i, tuple); + } + + return list; +} + +static PyObject* +opcodes_py(PyObject *self, PyObject *args) +{ + PyObject *arg1, *arg2, *arg3 = NULL; + PyObject *oplist; + size_t len1, len2, n, nb; + LevEditOp *ops; + LevOpCode *bops; + + if (!PyArg_UnpackTuple(args, PYARGCFIX("opcodes"), 2, 3, + &arg1, &arg2, &arg3)) + return NULL; + + /* convert: we were called (ops, s1, s2) */ + if (arg3) { + if (!PyList_Check(arg1)) { + PyErr_Format(PyExc_TypeError, + "opcodes first argument must be a List of edit operations"); + return NULL; + } + n = PyList_GET_SIZE(arg1); + len1 = get_length_of_anything(arg2); + len2 = get_length_of_anything(arg3); + if (len1 == (size_t)-1 || len2 == (size_t)-1) { + PyErr_Format(PyExc_ValueError, + "opcodes second and third argument must specify sizes"); + return NULL; + } + + if ((ops = extract_editops(arg1)) != NULL) { + if (lev_editops_check_errors(len1, len2, n, ops)) { + PyErr_Format(PyExc_ValueError, + "opcodes edit operation list is invalid"); + free(ops); + return NULL; + } + bops = lev_editops_to_opcodes(n, ops, &n, len1, len2); /* XXX: n != n */ + if (!bops && n) { + free(ops); + return PyErr_NoMemory(); + } + oplist = opcodes_to_tuple_list(n, bops); + free(bops); + free(ops); + return oplist; + } + if ((bops = extract_opcodes(arg1)) != NULL) { + if (lev_opcodes_check_errors(len1, len2, n, bops)) { + PyErr_Format(PyExc_ValueError, + "opcodes edit operation 
list is invalid"); + free(bops); + return NULL; + } + free(bops); + Py_INCREF(arg1); /* opcodes -> opcodes is identity */ + return arg1; + } + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "opcodes first argument must be a List of edit operations"); + return NULL; + } + + /* find opcodes: we were called (s1, s2) */ + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2; + + len1 = PyString_GET_SIZE(arg1); + len2 = PyString_GET_SIZE(arg2); + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + ops = lev_editops_find(len1, string1, len2, string2, &n); + } + else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2; + + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + ops = lev_u_editops_find(len1, string1, len2, string2, &n); + } + else { + PyErr_Format(PyExc_TypeError, + "opcodes expected two Strings or two Unicodes"); + return NULL; + } + if (!ops && n) + return PyErr_NoMemory(); + bops = lev_editops_to_opcodes(n, ops, &nb, len1, len2); + free(ops); + if (!bops && nb) + return PyErr_NoMemory(); + oplist = opcodes_to_tuple_list(nb, bops); + free(bops); + return oplist; +} + +static PyObject* +inverse_py(PyObject *self, PyObject *args) +{ + PyObject *list, *result; + size_t n; + LevEditOp *ops; + LevOpCode *bops; + + if (!PyArg_UnpackTuple(args, PYARGCFIX("inverse"), 1, 1, &list) + || !PyList_Check(list)) + return NULL; + + n = PyList_GET_SIZE(list); + if (!n) { + Py_INCREF(list); + return list; + } + if ((ops = extract_editops(list)) != NULL) { + lev_editops_invert(n, ops); + result = editops_to_tuple_list(n, ops); + free(ops); + return result; + } + if ((bops = extract_opcodes(list)) != NULL) { + lev_opcodes_invert(n, bops); + result = opcodes_to_tuple_list(n, bops); + free(bops); + return result; + } + + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "inverse expected a list of edit operations"); + return NULL; +} + +static PyObject* +apply_edit_py(PyObject *self, PyObject *args) +{ + PyObject *list, *result, *arg1, *arg2; + size_t n, len, len1, len2; + LevEditOp *ops; + LevOpCode *bops; + + if (!PyArg_UnpackTuple(args, PYARGCFIX("apply_edit"), 3, 3, + &list, &arg1, &arg2)) + return NULL; + + if (!PyList_Check(list)) { + PyErr_Format(PyExc_TypeError, + "apply_edit first argument must be a List of edit operations"); + return NULL; + } + n = PyList_GET_SIZE(list); + + if (PyObject_TypeCheck(arg1, &PyString_Type) + && PyObject_TypeCheck(arg2, &PyString_Type)) { + lev_byte *string1, *string2, *s; + + if (!n) { + Py_INCREF(arg1); + return arg1; + } + len1 = PyString_GET_SIZE(arg1); + len2 = PyString_GET_SIZE(arg2); + string1 = PyString_AS_STRING(arg1); + string2 = PyString_AS_STRING(arg2); + + if ((ops = extract_editops(list)) != NULL) { + if (lev_editops_check_errors(len1, len2, n, ops)) { + PyErr_Format(PyExc_ValueError, + "apply_edit edit operations are invalid or inapplicable"); + free(ops); + return NULL; + } + s = lev_editops_apply(len1, string1, len2, string2, + n, ops, &len); + free(ops); + if (!s && len) + return PyErr_NoMemory(); + result = PyString_FromStringAndSize(s, len); + free(s); + return result; + } + if ((bops = extract_opcodes(list)) != NULL) { + if (lev_opcodes_check_errors(len1, len2, n, bops)) { + PyErr_Format(PyExc_ValueError, + "apply_edit edit operations are invalid or 
inapplicable"); + free(bops); + return NULL; + } + s = lev_opcodes_apply(len1, string1, len2, string2, + n, bops, &len); + free(bops); + if (!s && len) + return PyErr_NoMemory(); + result = PyString_FromStringAndSize(s, len); + free(s); + return result; + } + + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "apply_edit first argument must be " + "a list of edit operations"); + return NULL; + } + if (PyObject_TypeCheck(arg1, &PyUnicode_Type) + && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { + Py_UNICODE *string1, *string2, *s; + + if (!n) { + Py_INCREF(arg1); + return arg1; + } + len1 = PyUnicode_GET_SIZE(arg1); + len2 = PyUnicode_GET_SIZE(arg2); + string1 = PyUnicode_AS_UNICODE(arg1); + string2 = PyUnicode_AS_UNICODE(arg2); + + if ((ops = extract_editops(list)) != NULL) { + if (lev_editops_check_errors(len1, len2, n, ops)) { + PyErr_Format(PyExc_ValueError, + "apply_edit edit operations are invalid or inapplicable"); + free(ops); + return NULL; + } + s = lev_u_editops_apply(len1, string1, len2, string2, + n, ops, &len); + free(ops); + if (!s && len) + return PyErr_NoMemory(); + result = PyUnicode_FromUnicode(s, len); + free(s); + return result; + } + if ((bops = extract_opcodes(list)) != NULL) { + if (lev_opcodes_check_errors(len1, len2, n, bops)) { + PyErr_Format(PyExc_ValueError, + "apply_edit edit operations are invalid or inapplicable"); + free(bops); + return NULL; + } + s = lev_u_opcodes_apply(len1, string1, len2, string2, + n, bops, &len); + free(bops); + if (!s && len) + return PyErr_NoMemory(); + result = PyUnicode_FromUnicode(s, len); + free(s); + return result; + } + + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "apply_edit first argument must be " + "a list of edit operations"); + return NULL; + } + + PyErr_Format(PyExc_TypeError, + "apply_edit expected two Strings or two Unicodes"); + return NULL; +} + +static PyObject* +matching_blocks_py(PyObject *self, PyObject *args) +{ + PyObject *list, *arg1, *arg2, *result; + size_t n, nmb, len1, len2; + LevEditOp *ops; + LevOpCode *bops; + LevMatchingBlock *mblocks; + + if (!PyArg_UnpackTuple(args, PYARGCFIX("matching_blocks"), 3, 3, + &list, &arg1, &arg2) + || !PyList_Check(list)) + return NULL; + + if (!PyList_Check(list)) { + PyErr_Format(PyExc_TypeError, + "matching_blocks first argument must be " + "a List of edit operations"); + return NULL; + } + n = PyList_GET_SIZE(list); + len1 = get_length_of_anything(arg1); + len2 = get_length_of_anything(arg2); + if (len1 == (size_t)-1 || len2 == (size_t)-1) { + PyErr_Format(PyExc_ValueError, + "matching_blocks second and third argument " + "must specify sizes"); + return NULL; + } + + if ((ops = extract_editops(list)) != NULL) { + if (lev_editops_check_errors(len1, len2, n, ops)) { + PyErr_Format(PyExc_ValueError, + "apply_edit edit operations are invalid or inapplicable"); + free(ops); + return NULL; + } + mblocks = lev_editops_matching_blocks(len1, len2, n, ops, &nmb); + free(ops); + if (!mblocks && nmb) + return PyErr_NoMemory(); + result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks); + free(mblocks); + return result; + } + if ((bops = extract_opcodes(list)) != NULL) { + if (lev_opcodes_check_errors(len1, len2, n, bops)) { + PyErr_Format(PyExc_ValueError, + "apply_edit edit operations are invalid or inapplicable"); + free(bops); + return NULL; + } + mblocks = lev_opcodes_matching_blocks(len1, len2, n, bops, &nmb); + free(bops); + if (!mblocks && nmb) + return PyErr_NoMemory(); + result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks); + 
free(mblocks); + return result; + } + + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "inverse expected a list of edit operations"); + return NULL; +} + +static PyObject* +subtract_edit_py(PyObject *self, PyObject *args) +{ + PyObject *list, *sub, *result; + size_t n, ns, nr; + LevEditOp *ops, *osub, *orem; + + if (!PyArg_UnpackTuple(args, PYARGCFIX("subtract_edit"), 2, 2, &list, &sub) + || !PyList_Check(list)) + return NULL; + + ns = PyList_GET_SIZE(sub); + if (!ns) { + Py_INCREF(list); + return list; + } + n = PyList_GET_SIZE(list); + if (!n) { + PyErr_Format(PyExc_ValueError, + "subtract_edit subsequence is not a subsequence " + "or is invalid"); + return NULL; + } + + if ((ops = extract_editops(list)) != NULL) { + if ((osub = extract_editops(sub)) != NULL) { + orem = lev_editops_subtract(n, ops, ns, osub, &nr); + free(ops); + free(osub); + + if (!orem && nr == -1) { + PyErr_Format(PyExc_ValueError, + "subtract_edit subsequence is not a subsequence " + "or is invalid"); + return NULL; + } + result = editops_to_tuple_list(nr, orem); + free(orem); + + return result; + } + free(ops); + } + + if (!PyErr_Occurred()) + PyErr_Format(PyExc_TypeError, + "subtract_edit expected two lists of edit operations"); + return NULL; +} + + +PY_MOD_INIT_FUNC_DEF(_levenshtein) +{ +#ifdef LEV_PYTHON3 + PyObject *module; +#endif + size_t i; + + PY_INIT_MOD(module, "_levenshtein", Levenshtein_DESC, methods) + /* create intern strings for edit operation names */ + if (opcode_names[0].pystring) + abort(); + for (i = 0; i < N_OPCODE_NAMES; i++) { +#ifdef LEV_PYTHON3 + opcode_names[i].pystring + = PyUnicode_InternFromString(opcode_names[i].cstring); +#else + opcode_names[i].pystring + = PyString_InternFromString(opcode_names[i].cstring); +#endif + opcode_names[i].len = strlen(opcode_names[i].cstring); + } + lev_init_rng(0); +#ifdef LEV_PYTHON3 + return module; +#endif +} +/* }}} */ +#endif /* not NO_PYTHON */ + +/**************************************************************************** + * + * C (i.e. 
executive) part + * + ****************************************************************************/ + +/**************************************************************************** + * + * Taus113 + * + ****************************************************************************/ +/* {{{ */ + +/* + * Based on Tausworthe random generator implementation rng/taus113.c + * from the GNU Scientific Library (http://sources.redhat.com/gsl/) + * which has the notice + * Copyright (C) 2002 Atakan Gurkan + * Based on the file taus.c which has the notice + * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough + */ + +static inline unsigned long +taus113_get(taus113_state_t *state) +{ + unsigned long b1, b2, b3, b4; + + b1 = ((((state->z1 << 6UL) & TAUS113_MASK) ^ state->z1) >> 13UL); + state->z1 = ((((state->z1 & 4294967294UL) << 18UL) & TAUS113_MASK) ^ b1); + + b2 = ((((state->z2 << 2UL) & TAUS113_MASK) ^ state->z2) >> 27UL); + state->z2 = ((((state->z2 & 4294967288UL) << 2UL) & TAUS113_MASK) ^ b2); + + b3 = ((((state->z3 << 13UL) & TAUS113_MASK) ^ state->z3) >> 21UL); + state->z3 = ((((state->z3 & 4294967280UL) << 7UL) & TAUS113_MASK) ^ b3); + + b4 = ((((state->z4 << 3UL) & TAUS113_MASK) ^ state->z4) >> 12UL); + state->z4 = ((((state->z4 & 4294967168UL) << 13UL) & TAUS113_MASK) ^ b4); + + return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4); + +} + +static void +taus113_set(taus113_state_t *state, + unsigned long int s) +{ + if (!s) + s = 1UL; /* default seed is 1 */ + + state->z1 = TAUS113_LCG(s); + if (state->z1 < 2UL) + state->z1 += 2UL; + + state->z2 = TAUS113_LCG(state->z1); + if (state->z2 < 8UL) + state->z2 += 8UL; + + state->z3 = TAUS113_LCG(state->z2); + if (state->z3 < 16UL) + state->z3 += 16UL; + + state->z4 = TAUS113_LCG(state->z3); + if (state->z4 < 128UL) + state->z4 += 128UL; + + /* Calling RNG ten times to satify recurrence condition */ + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); + taus113_get(state); +} + + +/** + * lev_init_rng: + * @seed: A seed. If zero, a default value (currently 1) is used instead. + * + * Initializes the random generator used by some Levenshtein functions. + * + * This does NOT happen automatically when these functions are used. + **/ +_LEV_STATIC_PY void +lev_init_rng(unsigned long int seed) +{ + static taus113_state_t state; + + taus113_set(&state, seed); +} +/* }}} */ + +/**************************************************************************** + * + * Basic stuff, Levenshtein distance + * + ****************************************************************************/ +/* {{{ */ + +/** + * lev_edit_distance: + * @len1: The length of @string1. + * @string1: A sequence of bytes of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A sequence of bytes of length @len2, may contain NUL characters. + * @xcost: If nonzero, the replace operation has weight 2, otherwise all + * edit operations have equal weights of 1. + * + * Computes Levenshtein edit distance of two strings. + * + * Returns: The edit distance. 
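+ *
+ * For illustration (using the weights described above): "kitten" vs.
+ * "sitting" needs two replacements and one insertion, so
+ *
+ *   lev_edit_distance(6, "kitten", 7, "sitting", 0) == 3
+ *   lev_edit_distance(6, "kitten", 7, "sitting", 1) == 5
+ *
+ * the latter because with @xcost each replacement counts as 2.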
+ **/ +_LEV_STATIC_PY size_t +lev_edit_distance(size_t len1, const lev_byte *string1, + size_t len2, const lev_byte *string2, + int xcost) +{ + size_t i; + size_t *row; /* we only need to keep one row of costs */ + size_t *end; + size_t half; + + /* strip common prefix */ + while (len1 > 0 && len2 > 0 && *string1 == *string2) { + len1--; + len2--; + string1++; + string2++; + } + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { + len1--; + len2--; + } + + /* catch trivial cases */ + if (len1 == 0) + return len2; + if (len2 == 0) + return len1; + + /* make the inner cycle (i.e. string2) the longer one */ + if (len1 > len2) { + size_t nx = len1; + const lev_byte *sx = string1; + len1 = len2; + len2 = nx; + string1 = string2; + string2 = sx; + } + /* check len1 == 1 separately */ + if (len1 == 1) { + if (xcost) + return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL); + else + return len2 - (memchr(string2, *string1, len2) != NULL); + } + len1++; + len2++; + half = len1 >> 1; + + /* initalize first row */ + row = (size_t*)safe_malloc(len2, sizeof(size_t)); + if (!row) + return (size_t)(-1); + end = row + len2 - 1; + for (i = 0; i < len2 - (xcost ? 0 : half); i++) + row[i] = i; + + /* go through the matrix and compute the costs. yes, this is an extremely + * obfuscated version, but also extremely memory-conservative and relatively + * fast. */ + if (xcost) { + for (i = 1; i < len1; i++) { + size_t *p = row + 1; + const lev_byte char1 = string1[i - 1]; + const lev_byte *char2p = string2; + size_t D = i; + size_t x = i; + while (p <= end) { + if (char1 == *(char2p++)) + x = --D; + else + x++; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + } + } + else { + /* in this case we don't have to scan two corner triangles (of size len1/2) + * in the matrix because no best path can go throught them. note this + * breaks when len1 == len2 == 2 so the memchr() special case above is + * necessary */ + row[0] = len1 - half - 1; + for (i = 1; i < len1; i++) { + size_t *p; + const lev_byte char1 = string1[i - 1]; + const lev_byte *char2p; + size_t D, x; + /* skip the upper triangle */ + if (i >= len1 - half) { + size_t offset = i - (len1 - half); + size_t c3; + + char2p = string2 + offset; + p = row + offset; + c3 = *(p++) + (char1 != *(char2p++)); + x = *p; + x++; + D = x; + if (x > c3) + x = c3; + *(p++) = x; + } + else { + p = row + 1; + char2p = string2; + D = x = i; + } + /* skip the lower triangle */ + if (i <= half + 1) + end = row + len2 + i - half - 2; + /* main */ + while (p <= end) { + size_t c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + /* lower triangle sentinel */ + if (i <= half) { + size_t c3 = --D + (char1 != *char2p); + x++; + if (x > c3) + x = c3; + *p = x; + } + } + } + + i = *end; + free(row); + return i; +} + +_LEV_STATIC_PY double +lev_edit_distance_sod(size_t len, const lev_byte *string, + size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights, + int xcost) +{ + size_t i, d; + double sum = 0.0; + + for (i = 0; i < n; i++) { + d = lev_edit_distance(len, string, lengths[i], strings[i], xcost); + if (d == (size_t)-1) + return -1.0; + sum += weights[i]*d; + } + return sum; +} + +/** + * lev_u_edit_distance: + * @len1: The length of @string1. + * @string1: A sequence of Unicode characters of length @len1, may contain NUL + * characters. + * @len2: The length of @string2. 
+ * @string2: A sequence of Unicode characters of length @len2, may contain NUL + * characters. + * @xcost: If nonzero, the replace operation has weight 2, otherwise all + * edit operations have equal weights of 1. + * + * Computes Levenshtein edit distance of two Unicode strings. + * + * Returns: The edit distance. + **/ +_LEV_STATIC_PY size_t +lev_u_edit_distance(size_t len1, const lev_wchar *string1, + size_t len2, const lev_wchar *string2, + int xcost) +{ + size_t i; + size_t *row; /* we only need to keep one row of costs */ + size_t *end; + size_t half; + + /* strip common prefix */ + while (len1 > 0 && len2 > 0 && *string1 == *string2) { + len1--; + len2--; + string1++; + string2++; + } + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { + len1--; + len2--; + } + + /* catch trivial cases */ + if (len1 == 0) + return len2; + if (len2 == 0) + return len1; + + /* make the inner cycle (i.e. string2) the longer one */ + if (len1 > len2) { + size_t nx = len1; + const lev_wchar *sx = string1; + len1 = len2; + len2 = nx; + string1 = string2; + string2 = sx; + } + /* check len1 == 1 separately */ + if (len1 == 1) { + lev_wchar z = *string1; + const lev_wchar *p = string2; + for (i = len2; i; i--) { + if (*(p++) == z) + return len2 - 1; + } + return len2 + (xcost != 0); + } + len1++; + len2++; + half = len1 >> 1; + + /* initalize first row */ + row = (size_t*)safe_malloc(len2, sizeof(size_t)); + if (!row) + return (size_t)(-1); + end = row + len2 - 1; + for (i = 0; i < len2 - (xcost ? 0 : half); i++) + row[i] = i; + + /* go through the matrix and compute the costs. yes, this is an extremely + * obfuscated version, but also extremely memory-conservative and relatively + * fast. */ + if (xcost) { + for (i = 1; i < len1; i++) { + size_t *p = row + 1; + const lev_wchar char1 = string1[i - 1]; + const lev_wchar *char2p = string2; + size_t D = i - 1; + size_t x = i; + while (p <= end) { + if (char1 == *(char2p++)) + x = D; + else + x++; + D = *p; + if (x > D + 1) + x = D + 1; + *(p++) = x; + } + } + } + else { + /* in this case we don't have to scan two corner triangles (of size len1/2) + * in the matrix because no best path can go throught them. 
note this + * breaks when len1 == len2 == 2 so the memchr() special case above is + * necessary */ + row[0] = len1 - half - 1; + for (i = 1; i < len1; i++) { + size_t *p; + const lev_wchar char1 = string1[i - 1]; + const lev_wchar *char2p; + size_t D, x; + /* skip the upper triangle */ + if (i >= len1 - half) { + size_t offset = i - (len1 - half); + size_t c3; + + char2p = string2 + offset; + p = row + offset; + c3 = *(p++) + (char1 != *(char2p++)); + x = *p; + x++; + D = x; + if (x > c3) + x = c3; + *(p++) = x; + } + else { + p = row + 1; + char2p = string2; + D = x = i; + } + /* skip the lower triangle */ + if (i <= half + 1) + end = row + len2 + i - half - 2; + /* main */ + while (p <= end) { + size_t c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + /* lower triangle sentinel */ + if (i <= half) { + size_t c3 = --D + (char1 != *char2p); + x++; + if (x > c3) + x = c3; + *p = x; + } + } + } + + i = *end; + free(row); + return i; +} + +_LEV_STATIC_PY double +lev_u_edit_distance_sod(size_t len, const lev_wchar *string, + size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + int xcost) +{ + size_t i, d; + double sum = 0.0; + + for (i = 0; i < n; i++) { + d = lev_u_edit_distance(len, string, lengths[i], strings[i], xcost); + if (d == (size_t)-1) + return -1.0; + sum += weights[i]*d; + } + return sum; +} +/* }}} */ + + +/**************************************************************************** + * + * Other simple distances: Hamming, Jaro, Jaro-Winkler + * + ****************************************************************************/ +/* {{{ */ +/** + * lev_hamming_distance: + * @len: The length of @string1 and @string2. + * @string1: A sequence of bytes of length @len1, may contain NUL characters. + * @string2: A sequence of bytes of length @len2, may contain NUL characters. + * + * Computes Hamming distance of two strings. + * + * The strings must have the same length. + * + * Returns: The Hamming distance. + **/ +_LEV_STATIC_PY size_t +lev_hamming_distance(size_t len, + const lev_byte *string1, + const lev_byte *string2) +{ + size_t dist, i; + + dist = 0; + for (i = len; i; i--) { + if (*(string1++) != *(string2++)) + dist++; + } + + return dist; +} + +/** + * lev_u_hamming_distance: + * @len: The length of @string1 and @string2. + * @string1: A sequence of Unicode characters of length @len1, may contain NUL + * characters. + * @string2: A sequence of Unicode characters of length @len2, may contain NUL + * characters. + * + * Computes Hamming distance of two strings. + * + * The strings must have the same length. + * + * Returns: The Hamming distance. + **/ +_LEV_STATIC_PY size_t +lev_u_hamming_distance(size_t len, + const lev_wchar *string1, + const lev_wchar *string2) +{ + size_t dist, i; + + dist = 0; + for (i = len; i; i--) { + if (*(string1++) != *(string2++)) + dist++; + } + + return dist; +} + +/** + * lev_jaro_ratio: + * @len1: The length of @string1. + * @string1: A sequence of bytes of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A sequence of bytes of length @len2, may contain NUL characters. + * + * Computes Jaro string similarity metric of two strings. + * + * Returns: The Jaro metric of @string1 and @string2. 
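+ *
+ * For illustration: "MARTHA" vs. "MARHTA" has six matching characters
+ * with the T/H pair counted as one transposition, giving
+ * (6/6 + 6/6 + (6 - 1)/6)/3 = 17/18, roughly 0.944.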
+ **/ +_LEV_STATIC_PY double +lev_jaro_ratio(size_t len1, const lev_byte *string1, + size_t len2, const lev_byte *string2) +{ + size_t i, j, halflen, trans, match, to; + size_t *idx; + double md; + + if (len1 == 0 || len2 == 0) { + if (len1 == 0 && len2 == 0) + return 1.0; + return 0.0; + } + /* make len1 always shorter (or equally long) */ + if (len1 > len2) { + const lev_byte *b; + + b = string1; + string1 = string2; + string2 = b; + + i = len1; + len1 = len2; + len2 = i; + } + + halflen = (len1 + 1)/2; + idx = (size_t*)calloc(len1, sizeof(size_t)); + if (!idx) + return -1.0; + + /* The literature about Jaro metric is confusing as the method of assigment + * of common characters is nowhere specified. There are several possible + * deterministic mutual assignments of common characters of two strings. + * We use earliest-position method, which is however suboptimal (e.g., it + * gives two transpositions in jaro("Jaro", "Joaro") because of assigment + * of the first `o'). No reasonable algorithm for the optimal one is + * currently known to me. */ + match = 0; + /* the part with allowed range overlapping left */ + for (i = 0; i < halflen; i++) { + for (j = 0; j <= i+halflen; j++) { + if (string1[j] == string2[i] && !idx[j]) { + match++; + idx[j] = match; + break; + } + } + } + /* the part with allowed range overlapping right */ + to = len1+halflen < len2 ? len1+halflen : len2; + for (i = halflen; i < to; i++) { + for (j = i - halflen; j < len1; j++) { + if (string1[j] == string2[i] && !idx[j]) { + match++; + idx[j] = match; + break; + } + } + } + if (!match) { + free(idx); + return 0.0; + } + /* count transpositions */ + i = 0; + trans = 0; + for (j = 0; j < len1; j++) { + if (idx[j]) { + i++; + if (idx[j] != i) + trans++; + } + } + free(idx); + + md = (double)match; + return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0; +} + +/** + * lev_u_jaro_ratio: + * @len1: The length of @string1. + * @string1: A sequence of Unicode characters of length @len1, may contain NUL + * characters. + * @len2: The length of @string2. + * @string2: A sequence of Unicode characters of length @len2, may contain NUL + * characters. + * + * Computes Jaro string similarity metric of two Unicode strings. + * + * Returns: The Jaro metric of @string1 and @string2. + **/ +_LEV_STATIC_PY double +lev_u_jaro_ratio(size_t len1, const lev_wchar *string1, + size_t len2, const lev_wchar *string2) +{ + size_t i, j, halflen, trans, match, to; + size_t *idx; + double md; + + if (len1 == 0 || len2 == 0) { + if (len1 == 0 && len2 == 0) + return 1.0; + return 0.0; + } + /* make len1 always shorter (or equally long) */ + if (len1 > len2) { + const lev_wchar *b; + + b = string1; + string1 = string2; + string2 = b; + + i = len1; + len1 = len2; + len2 = i; + } + + halflen = (len1 + 1)/2; + idx = (size_t*)calloc(len1, sizeof(size_t)); + if (!idx) + return -1.0; + + match = 0; + /* the part with allowed range overlapping left */ + for (i = 0; i < halflen; i++) { + for (j = 0; j <= i+halflen; j++) { + if (string1[j] == string2[i] && !idx[j]) { + match++; + idx[j] = match; + break; + } + } + } + /* the part with allowed range overlapping right */ + to = len1+halflen < len2 ? 
len1+halflen : len2; + for (i = halflen; i < to; i++) { + for (j = i - halflen; j < len1; j++) { + if (string1[j] == string2[i] && !idx[j]) { + match++; + idx[j] = match; + break; + } + } + } + if (!match) { + free(idx); + return 0.0; + } + /* count transpositions */ + i = 0; + trans = 0; + for (j = 0; j < len1; j++) { + if (idx[j]) { + i++; + if (idx[j] != i) + trans++; + } + } + free(idx); + + md = (double)match; + return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0; +} + +/** + * lev_jaro_winkler_ratio: + * @len1: The length of @string1. + * @string1: A sequence of bytes of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A sequence of bytes of length @len2, may contain NUL characters. + * @pfweight: Prefix weight, i.e., how much weight should be given to a + * common prefix. + * + * Computes Jaro-Winkler string similarity metric of two strings. + * + * The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the + * length of common prefix. + * + * Returns: The Jaro-Winkler metric of @string1 and @string2. + **/ +_LEV_STATIC_PY double +lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1, + size_t len2, const lev_byte *string2, + double pfweight) +{ + double j; + size_t p, m; + + j = lev_jaro_ratio(len1, string1, len2, string2); + m = len1 < len2 ? len1 : len2; + for (p = 0; p < m; p++) { + if (string1[p] != string2[p]) + break; + } + j += (1.0 - j)*p*pfweight; + return j > 1.0 ? 1.0 : j; +} + +/** + * lev_u_jaro_winkler_ratio: + * @len1: The length of @string1. + * @string1: A sequence of Unicode characters of length @len1, may contain NUL + * characters. + * @len2: The length of @string2. + * @string2: A sequence of Unicode characters of length @len2, may contain NUL + * characters. + * @pfweight: Prefix weight, i.e., how much weight should be given to a + * common prefix. + * + * Computes Jaro-Winkler string similarity metric of two Unicode strings. + * + * The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the + * length of common prefix. + * + * Returns: The Jaro-Winkler metric of @string1 and @string2. + **/ +_LEV_STATIC_PY double +lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1, + size_t len2, const lev_wchar *string2, + double pfweight) +{ + double j; + size_t p, m; + + j = lev_u_jaro_ratio(len1, string1, len2, string2); + m = len1 < len2 ? len1 : len2; + for (p = 0; p < m; p++) { + if (string1[p] != string2[p]) + break; + } + j += (1.0 - j)*p*pfweight; + return j > 1.0 ? 1.0 : j; +} +/* }}} */ + +/**************************************************************************** + * + * Generalized medians, the greedy algorithm, and greedy improvements + * + ****************************************************************************/ +/* {{{ */ + +/* compute the sets of symbols each string contains, and the set of symbols + * in any of them (symset). meanwhile, count how many different symbols + * there are (used below for symlist). 
*/ +static lev_byte* +make_symlist(size_t n, const size_t *lengths, + const lev_byte *strings[], size_t *symlistlen) +{ + short int *symset; /* indexed by ALL symbols, contains 1 for symbols + present in the strings, zero for others */ + size_t i, j; + lev_byte *symlist; + + symset = calloc(0x100, sizeof(short int)); + if (!symset) { + *symlistlen = (size_t)(-1); + return NULL; + } + *symlistlen = 0; + for (i = 0; i < n; i++) { + const lev_byte *stri = strings[i]; + for (j = 0; j < lengths[i]; j++) { + int c = stri[j]; + if (!symset[c]) { + (*symlistlen)++; + symset[c] = 1; + } + } + } + if (!*symlistlen) { + free(symset); + return NULL; + } + + /* create dense symbol table, so we can easily iterate over only characters + * present in the strings */ + { + size_t pos = 0; + symlist = (lev_byte*)safe_malloc((*symlistlen), sizeof(lev_byte)); + if (!symlist) { + *symlistlen = (size_t)(-1); + free(symset); + return NULL; + } + for (j = 0; j < 0x100; j++) { + if (symset[j]) + symlist[pos++] = (lev_byte)j; + } + } + free(symset); + + return symlist; +} + +/** + * lev_greedy_median: + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * @medlength: Where the length of the median should be stored. + * + * Finds a generalized median string of @strings using the greedy algorithm. + * + * Note it's considerably more efficient to give a string with weight 2 + * than to store two identical strings in @strings (with weights 1). + * + * Returns: The generalized median, as a newly allocated string; its length + * is stored in @medlength. + **/ +_LEV_STATIC_PY lev_byte* +lev_greedy_median(size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength) +{ + size_t i; /* usually iterates over strings (n) */ + size_t j; /* usually iterates over characters */ + size_t len; /* usually iterates over the approximate median string */ + lev_byte *symlist; /* list of symbols present in the strings, + we iterate over it insead of set of all + existing symbols */ + size_t symlistlen; /* length of symlist */ + size_t maxlen; /* maximum input string length */ + size_t stoplen; /* maximum tried median string length -- this is slightly + higher than maxlen, because the median string may be + longer than any of the input strings */ + size_t **rows; /* Levenshtein matrix rows for each string, we need to keep + only one previous row to construct the current one */ + size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, + shared among all strings */ + lev_byte *median; /* the resulting approximate median string */ + double *mediandist; /* the total distance of the best median string of + given length. warning! 
mediandist[0] is total + distance for empty string, while median[] itself + is normally zero-based */ + size_t bestlen; /* the best approximate median string length */ + + /* find all symbols */ + symlist = make_symlist(n, lengths, strings, &symlistlen); + if (!symlist) { + *medlength = 0; + if (symlistlen != 0) + return NULL; + else + return calloc(1, sizeof(lev_byte)); + } + + /* allocate and initialize per-string matrix rows and a common work buffer */ + rows = (size_t**)safe_malloc(n, sizeof(size_t*)); + if (!rows) { + free(symlist); + return NULL; + } + maxlen = 0; + for (i = 0; i < n; i++) { + size_t *ri; + size_t leni = lengths[i]; + if (leni > maxlen) + maxlen = leni; + ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); + if (!ri) { + for (j = 0; j < i; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + for (j = 0; j <= leni; j++) + ri[j] = j; + } + stoplen = 2*maxlen + 1; + row = (size_t*)safe_malloc((stoplen + 1), sizeof(size_t)); + if (!row) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + + /* compute final cost of string of length 0 (empty string may be also + * a valid answer) */ + median = (lev_byte*)safe_malloc(stoplen, sizeof(lev_byte)); + if (!median) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(row); + free(symlist); + return NULL; + } + mediandist = (double*)safe_malloc((stoplen + 1), sizeof(double)); + if (!mediandist) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(row); + free(symlist); + free(median); + return NULL; + } + mediandist[0] = 0.0; + for (i = 0; i < n; i++) + mediandist[0] += lengths[i]*weights[i]; + + /* build up the approximate median string symbol by symbol + * XXX: we actually exit on break below, but on the same condition */ + for (len = 1; len <= stoplen; len++) { + lev_byte symbol; + double minminsum = LEV_INFINITY; + row[0] = len; + /* iterate over all symbols we may want to add */ + for (j = 0; j < symlistlen; j++) { + double totaldist = 0.0; + double minsum = 0.0; + symbol = symlist[j]; + /* sum Levenshtein distances from all the strings, with given weights */ + for (i = 0; i < n; i++) { + const lev_byte *stri = strings[i]; + size_t *p = rows[i]; + size_t leni = lengths[i]; + size_t *end = rows[i] + leni; + size_t min = len; + size_t x = len; /* == row[0] */ + /* compute how another row of Levenshtein matrix would look for median + * string with this symbol added */ + while (p < end) { + size_t D = *(p++) + (symbol != *(stri++)); + x++; + if (x > D) + x = D; + if (x > *p + 1) + x = *p + 1; + if (x < min) + min = x; + } + minsum += min*weights[i]; + totaldist += x*weights[i]; + } + /* is this symbol better than all the others? 
*/ + if (minsum < minminsum) { + minminsum = minsum; + mediandist[len] = totaldist; + median[len - 1] = symbol; + } + } + /* stop the iteration if we no longer need to recompute the matrix rows + * or when we are over maxlen and adding more characters doesn't seem + * useful */ + if (len == stoplen + || (len > maxlen && mediandist[len] > mediandist[len - 1])) { + stoplen = len; + break; + } + /* now the best symbol is known, so recompute all matrix rows for this + * one */ + symbol = median[len - 1]; + for (i = 0; i < n; i++) { + const lev_byte *stri = strings[i]; + size_t *oldrow = rows[i]; + size_t leni = lengths[i]; + size_t k; + /* compute a row of Levenshtein matrix */ + for (k = 1; k <= leni; k++) { + size_t c1 = oldrow[k] + 1; + size_t c2 = row[k - 1] + 1; + size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); + row[k] = c2 > c3 ? c3 : c2; + if (row[k] > c1) + row[k] = c1; + } + memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); + } + } + + /* find the string with minimum total distance */ + bestlen = 0; + for (len = 1; len <= stoplen; len++) { + if (mediandist[len] < mediandist[bestlen]) + bestlen = len; + } + + /* clean up */ + for (i = 0; i < n; i++) + free(rows[i]); + free(rows); + free(row); + free(symlist); + free(mediandist); + + /* return result */ + { + lev_byte *result = (lev_byte*)safe_malloc(bestlen, sizeof(lev_byte)); + if (!result) { + free(median); + return NULL; + } + memcpy(result, median, bestlen*sizeof(lev_byte)); + free(median); + *medlength = bestlen; + return result; + } +} + +/* + * Knowing the distance matrices up to some row, finish the distance + * computations. + * + * string1, len1 are already shortened. + */ +static double +finish_distance_computations(size_t len1, lev_byte *string1, + size_t n, const size_t *lengths, + const lev_byte **strings, + const double *weights, size_t **rows, + size_t *row) +{ + size_t *end; + size_t i, j; + size_t offset; /* row[0]; offset + len1 give together real len of string1 */ + double distsum = 0.0; /* sum of distances */ + + /* catch trivia case */ + if (len1 == 0) { + for (j = 0; j < n; j++) + distsum += rows[j][lengths[j]]*weights[j]; + return distsum; + } + + /* iterate through the strings and sum the distances */ + for (j = 0; j < n; j++) { + size_t *rowi = rows[j]; /* current row */ + size_t leni = lengths[j]; /* current length */ + size_t len = len1; /* temporary len1 for suffix stripping */ + const lev_byte *stringi = strings[j]; /* current string */ + + /* strip common suffix (prefix CAN'T be stripped) */ + while (len && leni && stringi[leni-1] == string1[len-1]) { + len--; + leni--; + } + + /* catch trivial cases */ + if (len == 0) { + distsum += rowi[leni]*weights[j]; + continue; + } + offset = rowi[0]; + if (leni == 0) { + distsum += (offset + len)*weights[j]; + continue; + } + + /* complete the matrix */ + memcpy(row, rowi, (leni + 1)*sizeof(size_t)); + end = row + leni; + + for (i = 1; i <= len; i++) { + size_t *p = row + 1; + const lev_byte char1 = string1[i - 1]; + const lev_byte *char2p = stringi; + size_t D, x; + + D = x = i + offset; + while (p <= end) { + size_t c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + } + distsum += weights[j]*(*end); + } + + return distsum; +} + +/** + * lev_median_improve: + * @len: The length of @s. + * @s: The approximate generalized median string to be improved. + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. 
+ * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * @medlength: Where the new length of the median should be stored. + * + * Tries to make @s a better generalized median string of @strings with + * small perturbations. + * + * It never returns a string with larger SOD than @s; in the worst case, a + * string identical to @s is returned. + * + * Returns: The improved generalized median, as a newly allocated string; its + * length is stored in @medlength. + **/ +_LEV_STATIC_PY lev_byte* +lev_median_improve(size_t len, const lev_byte *s, + size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength) +{ + size_t i; /* usually iterates over strings (n) */ + size_t j; /* usually iterates over characters */ + size_t pos; /* the position in the approximate median string we are + trying to change */ + lev_byte *symlist; /* list of symbols present in the strings, + we iterate over it insead of set of all + existing symbols */ + size_t symlistlen; /* length of symlist */ + size_t maxlen; /* maximum input string length */ + size_t stoplen; /* maximum tried median string length -- this is slightly + higher than maxlen, because the median string may be + longer than any of the input strings */ + size_t **rows; /* Levenshtein matrix rows for each string, we need to keep + only one previous row to construct the current one */ + size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, + shared among all strings */ + lev_byte *median; /* the resulting approximate median string */ + size_t medlen; /* the current approximate median string length */ + double minminsum; /* the current total distance sum */ + + /* find all symbols */ + symlist = make_symlist(n, lengths, strings, &symlistlen); + if (!symlist) { + *medlength = 0; + if (symlistlen != 0) + return NULL; + else + return calloc(1, sizeof(lev_byte)); + } + + /* allocate and initialize per-string matrix rows and a common work buffer */ + rows = (size_t**)safe_malloc(n, sizeof(size_t*)); + if (!rows) { + free(symlist); + return NULL; + } + maxlen = 0; + for (i = 0; i < n; i++) { + size_t *ri; + size_t leni = lengths[i]; + if (leni > maxlen) + maxlen = leni; + ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); + if (!ri) { + for (j = 0; j < i; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + for (j = 0; j <= leni; j++) + ri[j] = j; + } + stoplen = 2*maxlen + 1; + row = (size_t*)safe_malloc((stoplen + 2), sizeof(size_t)); + if (!row) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + + /* initialize median to given string */ + median = (lev_byte*)safe_malloc((stoplen+1), sizeof(lev_byte)); + if (!median) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(row); + free(symlist); + return NULL; + } + median++; /* we need -1st element for insertions a pos 0 */ + medlen = len; + memcpy(median, s, (medlen)*sizeof(lev_byte)); + minminsum = finish_distance_computations(medlen, median, + n, lengths, strings, + weights, rows, row); + + /* sequentially try perturbations on all positions */ + for (pos = 0; pos <= medlen; ) { + lev_byte orig_symbol, symbol; + LevEditType operation; + double sum; + + symbol = median[pos]; + operation = LEV_EDIT_KEEP; + /* IF pos < medlength: FOREACH symbol: try to replace the symbol + * at pos, if some lower the total 
distance, chooste the best */ + if (pos < medlen) { + orig_symbol = median[pos]; + for (j = 0; j < symlistlen; j++) { + if (symlist[j] == orig_symbol) + continue; + median[pos] = symlist[j]; + sum = finish_distance_computations(medlen - pos, median + pos, + n, lengths, strings, + weights, rows, row); + if (sum < minminsum) { + minminsum = sum; + symbol = symlist[j]; + operation = LEV_EDIT_REPLACE; + } + } + median[pos] = orig_symbol; + } + /* FOREACH symbol: try to add it at pos, if some lower the total + * distance, chooste the best (increase medlength) + * We simulate insertion by replacing the character at pos-1 */ + orig_symbol = *(median + pos - 1); + for (j = 0; j < symlistlen; j++) { + *(median + pos - 1) = symlist[j]; + sum = finish_distance_computations(medlen - pos + 1, median + pos - 1, + n, lengths, strings, + weights, rows, row); + if (sum < minminsum) { + minminsum = sum; + symbol = symlist[j]; + operation = LEV_EDIT_INSERT; + } + } + *(median + pos - 1) = orig_symbol; + /* IF pos < medlength: try to delete the symbol at pos, if it lowers + * the total distance remember it (decrease medlength) */ + if (pos < medlen) { + sum = finish_distance_computations(medlen - pos - 1, median + pos + 1, + n, lengths, strings, + weights, rows, row); + if (sum < minminsum) { + minminsum = sum; + operation = LEV_EDIT_DELETE; + } + } + /* actually perform the operation */ + switch (operation) { + case LEV_EDIT_REPLACE: + median[pos] = symbol; + break; + + case LEV_EDIT_INSERT: + memmove(median+pos+1, median+pos, + (medlen - pos)*sizeof(lev_byte)); + median[pos] = symbol; + medlen++; + break; + + case LEV_EDIT_DELETE: + memmove(median+pos, median + pos+1, + (medlen - pos-1)*sizeof(lev_byte)); + medlen--; + break; + + default: + break; + } + assert(medlen <= stoplen); + /* now the result is known, so recompute all matrix rows and move on */ + if (operation != LEV_EDIT_DELETE) { + symbol = median[pos]; + row[0] = pos + 1; + for (i = 0; i < n; i++) { + const lev_byte *stri = strings[i]; + size_t *oldrow = rows[i]; + size_t leni = lengths[i]; + size_t k; + /* compute a row of Levenshtein matrix */ + for (k = 1; k <= leni; k++) { + size_t c1 = oldrow[k] + 1; + size_t c2 = row[k - 1] + 1; + size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); + row[k] = c2 > c3 ? c3 : c2; + if (row[k] > c1) + row[k] = c1; + } + memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); + } + pos++; + } + } + + /* clean up */ + for (i = 0; i < n; i++) + free(rows[i]); + free(rows); + free(row); + free(symlist); + + /* return result */ + { + lev_byte *result = (lev_byte*)safe_malloc(medlen, sizeof(lev_byte)); + if (!result) { + free(median); + return NULL; + } + *medlength = medlen; + memcpy(result, median, medlen*sizeof(lev_byte)); + median--; + free(median); + return result; + } +} + +/* used internally in make_usymlist */ +typedef struct _HItem HItem; +struct _HItem { + lev_wchar c; + HItem *n; +}; + +/* free usmylist hash + * this is a separate function because we need it in more than one place */ +static void +free_usymlist_hash(HItem *symmap) +{ + size_t j; + + for (j = 0; j < 0x100; j++) { + HItem *p = symmap + j; + if (p->n == symmap || p->n == NULL) + continue; + p = p->n; + while (p) { + HItem *q = p; + p = p->n; + free(q); + } + } + free(symmap); +} + +/* compute the sets of symbols each string contains, and the set of symbols + * in any of them (symset). meanwhile, count how many different symbols + * there are (used below for symlist). 
*/ +static lev_wchar* +make_usymlist(size_t n, const size_t *lengths, + const lev_wchar *strings[], size_t *symlistlen) +{ + lev_wchar *symlist; + size_t i, j; + HItem *symmap; + + j = 0; + for (i = 0; i < n; i++) + j += lengths[i]; + + *symlistlen = 0; + if (j == 0) + return NULL; + + /* find all symbols, use a kind of hash for storage */ + symmap = (HItem*)safe_malloc(0x100, sizeof(HItem)); + if (!symmap) { + *symlistlen = (size_t)(-1); + return NULL; + } + /* this is an ugly memory allocation avoiding hack: most hash elements + * will probably contain none or one symbols only so, when p->n is equal + * to symmap, it means there're no symbols yet, afters insterting the + * first one, p->n becomes normally NULL and then it behaves like an + * usual singly linked list */ + for (i = 0; i < 0x100; i++) + symmap[i].n = symmap; + for (i = 0; i < n; i++) { + const lev_wchar *stri = strings[i]; + for (j = 0; j < lengths[i]; j++) { + int c = stri[j]; + int key = (c + (c >> 7)) & 0xff; + HItem *p = symmap + key; + if (p->n == symmap) { + p->c = c; + p->n = NULL; + (*symlistlen)++; + continue; + } + while (p->c != c && p->n != NULL) + p = p->n; + if (p->c != c) { + p->n = (HItem*)malloc(sizeof(HItem)); + if (!p->n) { + free_usymlist_hash(symmap); + *symlistlen = (size_t)(-1); + return NULL; + } + p = p->n; + p->n = NULL; + p->c = c; + (*symlistlen)++; + } + } + } + /* create dense symbol table, so we can easily iterate over only characters + * present in the strings */ + { + size_t pos = 0; + symlist = (lev_wchar*)safe_malloc((*symlistlen), sizeof(lev_wchar)); + if (!symlist) { + free_usymlist_hash(symmap); + *symlistlen = (size_t)(-1); + return NULL; + } + for (j = 0; j < 0x100; j++) { + HItem *p = symmap + j; + while (p != NULL && p->n != symmap) { + symlist[pos++] = p->c; + p = p->n; + } + } + } + + /* free memory */ + free_usymlist_hash(symmap); + + return symlist; +} + +/** + * lev_u_greedy_median: + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * @medlength: Where the length of the median should be stored. + * + * Finds a generalized median string of @strings using the greedy algorithm. + * + * Note it's considerably more efficient to give a string with weight 2 + * than to store two identical strings in @strings (with weights 1). + * + * Returns: The generalized median, as a newly allocated string; its length + * is stored in @medlength. 
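+ *
+ * The run time of the greedy search grows roughly with (tried median
+ * length) x (number of distinct symbols) x (total length of @strings),
+ * since every candidate symbol is scored against one Levenshtein matrix
+ * row per input string.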
+ **/ +_LEV_STATIC_PY lev_wchar* +lev_u_greedy_median(size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength) +{ + size_t i; /* usually iterates over strings (n) */ + size_t j; /* usually iterates over characters */ + size_t len; /* usually iterates over the approximate median string */ + lev_wchar *symlist; /* list of symbols present in the strings, + we iterate over it insead of set of all + existing symbols */ + size_t symlistlen; /* length of symlistle */ + size_t maxlen; /* maximum input string length */ + size_t stoplen; /* maximum tried median string length -- this is slightly + higher than maxlen, because the median string may be + longer than any of the input strings */ + size_t **rows; /* Levenshtein matrix rows for each string, we need to keep + only one previous row to construct the current one */ + size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, + shared among all strings */ + lev_wchar *median; /* the resulting approximate median string */ + double *mediandist; /* the total distance of the best median string of + given length. warning! mediandist[0] is total + distance for empty string, while median[] itself + is normally zero-based */ + size_t bestlen; /* the best approximate median string length */ + + /* find all symbols */ + symlist = make_usymlist(n, lengths, strings, &symlistlen); + if (!symlist) { + *medlength = 0; + if (symlistlen != 0) + return NULL; + else + return calloc(1, sizeof(lev_wchar)); + } + + /* allocate and initialize per-string matrix rows and a common work buffer */ + rows = (size_t**)safe_malloc(n, sizeof(size_t*)); + if (!rows) { + free(symlist); + return NULL; + } + maxlen = 0; + for (i = 0; i < n; i++) { + size_t *ri; + size_t leni = lengths[i]; + if (leni > maxlen) + maxlen = leni; + ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); + if (!ri) { + for (j = 0; j < i; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + for (j = 0; j <= leni; j++) + ri[j] = j; + } + stoplen = 2*maxlen + 1; + row = (size_t*)safe_malloc((stoplen + 1), sizeof(size_t)); + if (!row) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + + /* compute final cost of string of length 0 (empty string may be also + * a valid answer) */ + median = (lev_wchar*)safe_malloc(stoplen, sizeof(lev_wchar)); + if (!median) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(row); + free(symlist); + return NULL; + } + mediandist = (double*)safe_malloc((stoplen + 1), sizeof(double)); + if (!mediandist) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(row); + free(symlist); + free(median); + return NULL; + } + mediandist[0] = 0.0; + for (i = 0; i < n; i++) + mediandist[0] += lengths[i]*weights[i]; + + /* build up the approximate median string symbol by symbol + * XXX: we actually exit on break below, but on the same condition */ + for (len = 1; len <= stoplen; len++) { + lev_wchar symbol; + double minminsum = LEV_INFINITY; + row[0] = len; + /* iterate over all symbols we may want to add */ + for (j = 0; j < symlistlen; j++) { + double totaldist = 0.0; + double minsum = 0.0; + symbol = symlist[j]; + /* sum Levenshtein distances from all the strings, with given weights */ + for (i = 0; i < n; i++) { + const lev_wchar *stri = strings[i]; + size_t *p = rows[i]; + size_t leni = lengths[i]; + size_t *end = rows[i] + leni; + size_t min = len; + size_t x = len; /* == row[0] */ + /* compute how another row of 
Levenshtein matrix would look for median + * string with this symbol added */ + while (p < end) { + size_t D = *(p++) + (symbol != *(stri++)); + x++; + if (x > D) + x = D; + if (x > *p + 1) + x = *p + 1; + if (x < min) + min = x; + } + minsum += min*weights[i]; + totaldist += x*weights[i]; + } + /* is this symbol better than all the others? */ + if (minsum < minminsum) { + minminsum = minsum; + mediandist[len] = totaldist; + median[len - 1] = symbol; + } + } + /* stop the iteration if we no longer need to recompute the matrix rows + * or when we are over maxlen and adding more characters doesn't seem + * useful */ + if (len == stoplen + || (len > maxlen && mediandist[len] > mediandist[len - 1])) { + stoplen = len; + break; + } + /* now the best symbol is known, so recompute all matrix rows for this + * one */ + symbol = median[len - 1]; + for (i = 0; i < n; i++) { + const lev_wchar *stri = strings[i]; + size_t *oldrow = rows[i]; + size_t leni = lengths[i]; + size_t k; + /* compute a row of Levenshtein matrix */ + for (k = 1; k <= leni; k++) { + size_t c1 = oldrow[k] + 1; + size_t c2 = row[k - 1] + 1; + size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); + row[k] = c2 > c3 ? c3 : c2; + if (row[k] > c1) + row[k] = c1; + } + memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); + } + } + + /* find the string with minimum total distance */ + bestlen = 0; + for (len = 1; len <= stoplen; len++) { + if (mediandist[len] < mediandist[bestlen]) + bestlen = len; + } + + /* clean up */ + for (i = 0; i < n; i++) + free(rows[i]); + free(rows); + free(row); + free(symlist); + free(mediandist); + + /* return result */ + { + lev_wchar *result = (lev_wchar*)safe_malloc(bestlen, sizeof(lev_wchar)); + if (!result) { + free(median); + return NULL; + } + memcpy(result, median, bestlen*sizeof(lev_wchar)); + free(median); + *medlength = bestlen; + return result; + } +} + +/* + * Knowing the distance matrices up to some row, finish the distance + * computations. + * + * string1, len1 are already shortened. 
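+ *
+ * In other words, rows[] hold the Levenshtein matrix rows already computed
+ * for the part of the median preceding string1, and row[0] of each stored
+ * row records the length of that prefix (the offset), so offset + len1 is
+ * the real length of the first string being compared.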
+ */ +static double +finish_udistance_computations(size_t len1, lev_wchar *string1, + size_t n, const size_t *lengths, + const lev_wchar **strings, + const double *weights, size_t **rows, + size_t *row) +{ + size_t *end; + size_t i, j; + size_t offset; /* row[0]; offset + len1 give together real len of string1 */ + double distsum = 0.0; /* sum of distances */ + + /* catch trivia case */ + if (len1 == 0) { + for (j = 0; j < n; j++) + distsum += rows[j][lengths[j]]*weights[j]; + return distsum; + } + + /* iterate through the strings and sum the distances */ + for (j = 0; j < n; j++) { + size_t *rowi = rows[j]; /* current row */ + size_t leni = lengths[j]; /* current length */ + size_t len = len1; /* temporary len1 for suffix stripping */ + const lev_wchar *stringi = strings[j]; /* current string */ + + /* strip common suffix (prefix CAN'T be stripped) */ + while (len && leni && stringi[leni-1] == string1[len-1]) { + len--; + leni--; + } + + /* catch trivial cases */ + if (len == 0) { + distsum += rowi[leni]*weights[j]; + continue; + } + offset = rowi[0]; + if (leni == 0) { + distsum += (offset + len)*weights[j]; + continue; + } + + /* complete the matrix */ + memcpy(row, rowi, (leni + 1)*sizeof(size_t)); + end = row + leni; + + for (i = 1; i <= len; i++) { + size_t *p = row + 1; + const lev_wchar char1 = string1[i - 1]; + const lev_wchar *char2p = stringi; + size_t D, x; + + D = x = i + offset; + while (p <= end) { + size_t c3 = --D + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + D = *p; + D++; + if (x > D) + x = D; + *(p++) = x; + } + } + distsum += weights[j]*(*end); + } + + return distsum; +} + +/** + * lev_u_median_improve: + * @len: The length of @s. + * @s: The approximate generalized median string to be improved. + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * @medlength: Where the new length of the median should be stored. + * + * Tries to make @s a better generalized median string of @strings with + * small perturbations. + * + * It never returns a string with larger SOD than @s; in the worst case, a + * string identical to @s is returned. + * + * Returns: The improved generalized median, as a newly allocated string; its + * length is stored in @medlength. 
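+ *
+ * A typical use is to refine an initial estimate, for example the result
+ * of lev_u_greedy_median(); any starting string of length @len is accepted.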
+ **/ +_LEV_STATIC_PY lev_wchar* +lev_u_median_improve(size_t len, const lev_wchar *s, + size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength) +{ + size_t i; /* usually iterates over strings (n) */ + size_t j; /* usually iterates over characters */ + size_t pos; /* the position in the approximate median string we are + trying to change */ + lev_wchar *symlist; /* list of symbols present in the strings, + we iterate over it insead of set of all + existing symbols */ + size_t symlistlen; /* length of symlist */ + size_t maxlen; /* maximum input string length */ + size_t stoplen; /* maximum tried median string length -- this is slightly + higher than maxlen, because the median string may be + longer than any of the input strings */ + size_t **rows; /* Levenshtein matrix rows for each string, we need to keep + only one previous row to construct the current one */ + size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, + shared among all strings */ + lev_wchar *median; /* the resulting approximate median string */ + size_t medlen; /* the current approximate median string length */ + double minminsum; /* the current total distance sum */ + + /* find all symbols */ + symlist = make_usymlist(n, lengths, strings, &symlistlen); + if (!symlist) { + *medlength = 0; + if (symlistlen != 0) + return NULL; + else + return calloc(1, sizeof(lev_wchar)); + } + + /* allocate and initialize per-string matrix rows and a common work buffer */ + rows = (size_t**)safe_malloc(n, sizeof(size_t*)); + if (!rows) { + free(symlist); + return NULL; + } + maxlen = 0; + for (i = 0; i < n; i++) { + size_t *ri; + size_t leni = lengths[i]; + if (leni > maxlen) + maxlen = leni; + ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); + if (!ri) { + for (j = 0; j < i; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + for (j = 0; j <= leni; j++) + ri[j] = j; + } + stoplen = 2*maxlen + 1; + row = (size_t*)safe_malloc((stoplen + 2), sizeof(size_t)); + if (!row) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(symlist); + return NULL; + } + + /* initialize median to given string */ + median = (lev_wchar*)safe_malloc((stoplen+1), sizeof(lev_wchar)); + if (!median) { + for (j = 0; j < n; j++) + free(rows[j]); + free(rows); + free(row); + free(symlist); + return NULL; + } + median++; /* we need -1st element for insertions a pos 0 */ + medlen = len; + memcpy(median, s, (medlen)*sizeof(lev_wchar)); + minminsum = finish_udistance_computations(medlen, median, + n, lengths, strings, + weights, rows, row); + + /* sequentially try perturbations on all positions */ + for (pos = 0; pos <= medlen; ) { + lev_wchar orig_symbol, symbol; + LevEditType operation; + double sum; + + symbol = median[pos]; + operation = LEV_EDIT_KEEP; + /* IF pos < medlength: FOREACH symbol: try to replace the symbol + * at pos, if some lower the total distance, chooste the best */ + if (pos < medlen) { + orig_symbol = median[pos]; + for (j = 0; j < symlistlen; j++) { + if (symlist[j] == orig_symbol) + continue; + median[pos] = symlist[j]; + sum = finish_udistance_computations(medlen - pos, median + pos, + n, lengths, strings, + weights, rows, row); + if (sum < minminsum) { + minminsum = sum; + symbol = symlist[j]; + operation = LEV_EDIT_REPLACE; + } + } + median[pos] = orig_symbol; + } + /* FOREACH symbol: try to add it at pos, if some lower the total + * distance, chooste the best (increase medlength) + * We simulate insertion by replacing 
the character at pos-1 */ + orig_symbol = *(median + pos - 1); + for (j = 0; j < symlistlen; j++) { + *(median + pos - 1) = symlist[j]; + sum = finish_udistance_computations(medlen - pos + 1, median + pos - 1, + n, lengths, strings, + weights, rows, row); + if (sum < minminsum) { + minminsum = sum; + symbol = symlist[j]; + operation = LEV_EDIT_INSERT; + } + } + *(median + pos - 1) = orig_symbol; + /* IF pos < medlength: try to delete the symbol at pos, if it lowers + * the total distance remember it (decrease medlength) */ + if (pos < medlen) { + sum = finish_udistance_computations(medlen - pos - 1, median + pos + 1, + n, lengths, strings, + weights, rows, row); + if (sum < minminsum) { + minminsum = sum; + operation = LEV_EDIT_DELETE; + } + } + /* actually perform the operation */ + switch (operation) { + case LEV_EDIT_REPLACE: + median[pos] = symbol; + break; + + case LEV_EDIT_INSERT: + memmove(median+pos+1, median+pos, + (medlen - pos)*sizeof(lev_wchar)); + median[pos] = symbol; + medlen++; + break; + + case LEV_EDIT_DELETE: + memmove(median+pos, median + pos+1, + (medlen - pos-1)*sizeof(lev_wchar)); + medlen--; + break; + + default: + break; + } + assert(medlen <= stoplen); + /* now the result is known, so recompute all matrix rows and move on */ + if (operation != LEV_EDIT_DELETE) { + symbol = median[pos]; + row[0] = pos + 1; + for (i = 0; i < n; i++) { + const lev_wchar *stri = strings[i]; + size_t *oldrow = rows[i]; + size_t leni = lengths[i]; + size_t k; + /* compute a row of Levenshtein matrix */ + for (k = 1; k <= leni; k++) { + size_t c1 = oldrow[k] + 1; + size_t c2 = row[k - 1] + 1; + size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); + row[k] = c2 > c3 ? c3 : c2; + if (row[k] > c1) + row[k] = c1; + } + memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); + } + pos++; + } + } + + /* clean up */ + for (i = 0; i < n; i++) + free(rows[i]); + free(rows); + free(row); + free(symlist); + + /* return result */ + { + lev_wchar *result = (lev_wchar*)safe_malloc(medlen, sizeof(lev_wchar)); + if (!result) { + free(median); + return NULL; + } + *medlength = medlen; + memcpy(result, median, medlen*sizeof(lev_wchar)); + median--; + free(median); + return result; + } +} +/* }}} */ + +/**************************************************************************** + * + * Quick (voting) medians + * + ****************************************************************************/ +/* {{{ */ + +/* compute the sets of symbols each string contains, and the set of symbols + * in any of them (symset). meanwhile, count how many different symbols + * there are (used below for symlist). + * the symset is passed as an argument to avoid its allocation and + * deallocation when it's used in the caller too */ +static lev_byte* +make_symlistset(size_t n, const size_t *lengths, + const lev_byte *strings[], size_t *symlistlen, + double *symset) +{ + size_t i, j; + lev_byte *symlist; + + if (!symset) { + *symlistlen = (size_t)(-1); + return NULL; + } + memset(symset, 0, 0x100*sizeof(double)); /* XXX: needs IEEE doubles?! 
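+ * (memset()ing the bytes to zero gives +0.0 only where the all-zero bit
+ * pattern represents 0.0, as it does with IEEE-754 doubles, which is what
+ * the `if (!symset[c])' test below relies on; a strictly portable variant
+ * would assign 0.0 in a loop.)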
*/ + *symlistlen = 0; + for (i = 0; i < n; i++) { + const lev_byte *stri = strings[i]; + for (j = 0; j < lengths[i]; j++) { + int c = stri[j]; + if (!symset[c]) { + (*symlistlen)++; + symset[c] = 1.0; + } + } + } + if (!*symlistlen) + return NULL; + + /* create dense symbol table, so we can easily iterate over only characters + * present in the strings */ + { + size_t pos = 0; + symlist = (lev_byte*)safe_malloc((*symlistlen), sizeof(lev_byte)); + if (!symlist) { + *symlistlen = (size_t)(-1); + return NULL; + } + for (j = 0; j < 0x100; j++) { + if (symset[j]) + symlist[pos++] = (lev_byte)j; + } + } + + return symlist; +} + +_LEV_STATIC_PY lev_byte* +lev_quick_median(size_t n, + const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength) +{ + size_t symlistlen, len, i, j, k; + lev_byte *symlist; + lev_byte *median; /* the resulting string */ + double *symset; + double ml, wl; + + /* first check whether the result would be an empty string + * and compute resulting string length */ + ml = wl = 0.0; + for (i = 0; i < n; i++) { + ml += lengths[i]*weights[i]; + wl += weights[i]; + } + if (wl == 0.0) + return (lev_byte*)calloc(1, sizeof(lev_byte)); + ml = floor(ml/wl + 0.499999); + *medlength = len = ml; + if (!len) + return (lev_byte*)calloc(1, sizeof(lev_byte)); + median = (lev_byte*)safe_malloc(len, sizeof(lev_byte)); + if (!median) + return NULL; + + /* find the symbol set; + * now an empty symbol set is really a failure */ + symset = (double*)calloc(0x100, sizeof(double)); + if (!symset) { + free(median); + return NULL; + } + symlist = make_symlistset(n, lengths, strings, &symlistlen, symset); + if (!symlist) { + free(median); + free(symset); + return NULL; + } + + for (j = 0; j < len; j++) { + /* clear the symbol probabilities */ + if (symlistlen < 32) { + for (i = 0; i < symlistlen; i++) + symset[symlist[i]] = 0.0; + } + else + memset(symset, 0, 0x100*sizeof(double)); + + /* let all strings vote */ + for (i = 0; i < n; i++) { + const lev_byte *stri = strings[i]; + double weighti = weights[i]; + size_t lengthi = lengths[i]; + double start = lengthi/ml*j; + double end = start + lengthi/ml; + size_t istart = floor(start); + size_t iend = ceil(end); + + /* rounding errors can overflow the buffer */ + if (iend > lengthi) + iend = lengthi; + + for (k = istart+1; k < iend; k++) + symset[stri[k]] += weighti; + symset[stri[istart]] += weighti*(1+istart - start); + symset[stri[iend-1]] -= weighti*(iend - end); + } + + /* find the elected symbol */ + k = symlist[0]; + for (i = 1; i < symlistlen; i++) { + if (symset[symlist[i]] > symset[k]) + k = symlist[i]; + } + median[j] = k; + } + + free(symset); + free(symlist); + + return median; +} + +/* used internally in make_usymlistset */ +typedef struct _HQItem HQItem; +struct _HQItem { + lev_wchar c; + double s; + HQItem *n; +}; + +/* free usmylistset hash + * this is a separate function because we need it in more than one place */ +static void +free_usymlistset_hash(HQItem *symmap) +{ + size_t j; + + for (j = 0; j < 0x100; j++) { + HQItem *p = symmap + j; + if (p->n == symmap || p->n == NULL) + continue; + p = p->n; + while (p) { + HQItem *q = p; + p = p->n; + free(q); + } + } + free(symmap); +} + +/* compute the sets of symbols each string contains, and the set of symbols + * in any of them (symset). meanwhile, count how many different symbols + * there are (used below for symlist). 
+ * the symset is passed as an argument to avoid its allocation and + * deallocation when it's used in the caller too */ +static lev_wchar* +make_usymlistset(size_t n, const size_t *lengths, + const lev_wchar *strings[], size_t *symlistlen, + HQItem *symmap) +{ + lev_wchar *symlist; + size_t i, j; + + j = 0; + for (i = 0; i < n; i++) + j += lengths[i]; + + *symlistlen = 0; + if (j == 0) + return NULL; + + /* this is an ugly memory allocation avoiding hack: most hash elements + * will probably contain none or one symbols only so, when p->n is equal + * to symmap, it means there're no symbols yet, afters insterting the + * first one, p->n becomes normally NULL and then it behaves like an + * usual singly linked list */ + for (i = 0; i < 0x100; i++) + symmap[i].n = symmap; + for (i = 0; i < n; i++) { + const lev_wchar *stri = strings[i]; + for (j = 0; j < lengths[i]; j++) { + int c = stri[j]; + int key = (c + (c >> 7)) & 0xff; + HQItem *p = symmap + key; + if (p->n == symmap) { + p->c = c; + p->n = NULL; + (*symlistlen)++; + continue; + } + while (p->c != c && p->n != NULL) + p = p->n; + if (p->c != c) { + p->n = (HQItem*)malloc(sizeof(HQItem)); + if (!p->n) { + *symlistlen = (size_t)(-1); + return NULL; + } + p = p->n; + p->n = NULL; + p->c = c; + (*symlistlen)++; + } + } + } + /* create dense symbol table, so we can easily iterate over only characters + * present in the strings */ + { + size_t pos = 0; + symlist = (lev_wchar*)safe_malloc((*symlistlen), sizeof(lev_wchar)); + if (!symlist) { + *symlistlen = (size_t)(-1); + return NULL; + } + for (j = 0; j < 0x100; j++) { + HQItem *p = symmap + j; + while (p != NULL && p->n != symmap) { + symlist[pos++] = p->c; + p = p->n; + } + } + } + + return symlist; +} + +_LEV_STATIC_PY lev_wchar* +lev_u_quick_median(size_t n, + const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength) +{ + size_t symlistlen, len, i, j, k; + lev_wchar *symlist; + lev_wchar *median; /* the resulting string */ + HQItem *symmap; + double ml, wl; + + /* first check whether the result would be an empty string + * and compute resulting string length */ + ml = wl = 0.0; + for (i = 0; i < n; i++) { + ml += lengths[i]*weights[i]; + wl += weights[i]; + } + if (wl == 0.0) + return (lev_wchar*)calloc(1, sizeof(lev_wchar)); + ml = floor(ml/wl + 0.499999); + *medlength = len = ml; + if (!len) + return (lev_wchar*)calloc(1, sizeof(lev_wchar)); + median = (lev_wchar*)safe_malloc(len, sizeof(lev_wchar)); + if (!median) + return NULL; + + /* find the symbol set; + * now an empty symbol set is really a failure */ + symmap = (HQItem*)safe_malloc(0x100, sizeof(HQItem)); + if (!symmap) { + free(median); + return NULL; + } + symlist = make_usymlistset(n, lengths, strings, &symlistlen, symmap); + if (!symlist) { + free(median); + free_usymlistset_hash(symmap); + return NULL; + } + + for (j = 0; j < len; j++) { + /* clear the symbol probabilities */ + for (i = 0; i < 0x100; i++) { + HQItem *p = symmap + i; + if (p->n == symmap) + continue; + while (p) { + p->s = 0.0; + p = p->n; + } + } + + /* let all strings vote */ + for (i = 0; i < n; i++) { + const lev_wchar *stri = strings[i]; + double weighti = weights[i]; + size_t lengthi = lengths[i]; + double start = lengthi/ml*j; + double end = start + lengthi/ml; + size_t istart = floor(start); + size_t iend = ceil(end); + + /* rounding errors can overflow the buffer */ + if (iend > lengthi) + iend = lengthi; + + /* the inner part, including the complete last character */ + for (k = istart+1; k < iend; k++) { + 
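+        /* hash the symbol into one of the 0x100 buckets and walk the chain
+         * to its HQItem, then add this string's weight to the symbol's vote */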
int c = stri[k]; + int key = (c + (c >> 7)) & 0xff; + HQItem *p = symmap + key; + while (p->c != c) + p = p->n; + p->s += weighti; + } + /* the initial fraction */ + { + int c = stri[istart]; + int key = (c + (c >> 7)) & 0xff; + HQItem *p = symmap + key; + while (p->c != c) + p = p->n; + p->s += weighti*(1+istart - start); + } + /* subtract what we counted from the last character but doesn't + * actually belong here. + * this strategy works also when istart+1 == iend (i.e., everything + * happens inside a one character) */ + { + int c = stri[iend-1]; + int key = (c + (c >> 7)) & 0xff; + HQItem *p = symmap + key; + while (p->c != c) + p = p->n; + p->s -= weighti*(iend - end); + } + } + + /* find the elected symbol */ + { + HQItem *max = NULL; + + for (i = 0; i < 0x100; i++) { + HQItem *p = symmap + i; + if (p->n == symmap) + continue; + while (p) { + if (!max || p->s > max->s) + max = p; + p = p->n; + } + } + median[j] = max->c; + } + } + + free_usymlistset_hash(symmap); + free(symlist); + + return median; +} +/* }}} */ + +/**************************************************************************** + * + * Set medians + * + ****************************************************************************/ +/* {{{ */ + +/** + * lev_set_median_index: + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * + * Finds the median string of a string set @strings. + * + * Returns: An index in @strings pointing to the set median, -1 in case of + * failure. + **/ +_LEV_STATIC_PY size_t +lev_set_median_index(size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights) +{ + size_t minidx = 0; + double mindist = LEV_INFINITY; + size_t i; + long int *distances; + + distances = (long int*)safe_malloc((n*(n - 1)/2), sizeof(long int)); + if (!distances) + return (size_t)-1; + + memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); /* XXX */ + for (i = 0; i < n; i++) { + size_t j = 0; + double dist = 0.0; + const lev_byte *stri = strings[i]; + size_t leni = lengths[i]; + /* below diagonal */ + while (j < i && dist < mindist) { + size_t dindex = (i - 1)*(i - 2)/2 + j; + long int d; + if (distances[dindex] >= 0) + d = distances[dindex]; + else { + d = lev_edit_distance(lengths[j], strings[j], leni, stri, 0); + if (d < 0) { + free(distances); + return (size_t)-1; + } + } + dist += weights[j]*d; + j++; + } + j++; /* no need to compare item with itself */ + /* above diagonal */ + while (j < n && dist < mindist) { + size_t dindex = (j - 1)*(j - 2)/2 + i; + distances[dindex] = lev_edit_distance(lengths[j], strings[j], + leni, stri, 0); + if (distances[dindex] < 0) { + free(distances); + return (size_t)-1; + } + dist += weights[j]*distances[dindex]; + j++; + } + + if (dist < mindist) { + mindist = dist; + minidx = i; + } + } + + free(distances); + return minidx; +} + +/** + * lev_u_set_median_index: + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * + * Finds the median string of a string set @strings. + * + * Returns: An index in @strings pointing to the set median, -1 in case of + * failure. 
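+ *
+ * When several strings tie for the minimal total distance, the one with the
+ * lowest index is returned (the comparison is strict).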
+ **/ +_LEV_STATIC_PY size_t +lev_u_set_median_index(size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights) +{ + size_t minidx = 0; + double mindist = LEV_INFINITY; + size_t i; + long int *distances; + + distances = (long int*)safe_malloc((n*(n - 1)/2), sizeof(long int)); + if (!distances) + return (size_t)-1; + + memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); /* XXX */ + for (i = 0; i < n; i++) { + size_t j = 0; + double dist = 0.0; + const lev_wchar *stri = strings[i]; + size_t leni = lengths[i]; + /* below diagonal */ + while (j < i && dist < mindist) { + size_t dindex = (i - 1)*(i - 2)/2 + j; + long int d; + if (distances[dindex] >= 0) + d = distances[dindex]; + else { + d = lev_u_edit_distance(lengths[j], strings[j], leni, stri, 0); + if (d < 0) { + free(distances); + return (size_t)-1; + } + } + dist += weights[j]*d; + j++; + } + j++; /* no need to compare item with itself */ + /* above diagonal */ + while (j < n && dist < mindist) { + size_t dindex = (j - 1)*(j - 2)/2 + i; + distances[dindex] = lev_u_edit_distance(lengths[j], strings[j], + leni, stri, 0); + if (distances[dindex] < 0) { + free(distances); + return (size_t)-1; + } + dist += weights[j]*distances[dindex]; + j++; + } + + if (dist < mindist) { + mindist = dist; + minidx = i; + } + } + + free(distances); + return minidx; +} + +/** + * lev_set_median: + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * @medlength: Where the length of the median string should be stored. + * + * Finds the median string of a string set @strings. + * + * Returns: The set median as a newly allocate string, its length is stored + * in @medlength. %NULL in the case of failure. + **/ +_LEV_STATIC_PY lev_byte* +lev_set_median(size_t n, const size_t *lengths, + const lev_byte *strings[], + const double *weights, + size_t *medlength) +{ + size_t minidx = lev_set_median_index(n, lengths, strings, weights); + lev_byte *result; + + if (minidx == (size_t)-1) + return NULL; + + *medlength = lengths[minidx]; + if (!lengths[minidx]) + return (lev_byte*)calloc(1, sizeof(lev_byte)); + + result = (lev_byte*)safe_malloc(lengths[minidx], sizeof(lev_byte)); + if (!result) + return NULL; + return memcpy(result, strings[minidx], lengths[minidx]*sizeof(lev_byte)); +} + +/** + * lev_u_set_median: + * @n: The size of @lengths, @strings, and @weights. + * @lengths: The lengths of @strings. + * @strings: An array of strings, that may contain NUL characters. + * @weights: The string weights (they behave exactly as multiplicities, though + * any positive value is allowed, not just integers). + * @medlength: Where the length of the median string should be stored. + * + * Finds the median string of a string set @strings. + * + * Returns: The set median as a newly allocate string, its length is stored + * in @medlength. %NULL in the case of failure. 
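+ *
+ * Unlike the generalized medians computed above, the set median is always one
+ * of the input strings; a copy of the minimizing string is returned.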
+ **/ +_LEV_STATIC_PY lev_wchar* +lev_u_set_median(size_t n, const size_t *lengths, + const lev_wchar *strings[], + const double *weights, + size_t *medlength) +{ + size_t minidx = lev_u_set_median_index(n, lengths, strings, weights); + lev_wchar *result; + + if (minidx == (size_t)-1) + return NULL; + + *medlength = lengths[minidx]; + if (!lengths[minidx]) + return (lev_wchar*)calloc(1, sizeof(lev_wchar)); + + result = (lev_wchar*)safe_malloc(lengths[minidx], sizeof(lev_wchar)); + if (!result) + return NULL; + return memcpy(result, strings[minidx], lengths[minidx]*sizeof(lev_wchar)); +} +/* }}} */ + +/**************************************************************************** + * + * Set, sequence distances + * + ****************************************************************************/ +/* {{{ */ + +/** + * lev_edit_seq_distance: + * @n1: The length of @lengths1 and @strings1. + * @lengths1: The lengths of strings in @strings1. + * @strings1: An array of strings that may contain NUL characters. + * @n2: The length of @lengths2 and @strings2. + * @lengths2: The lengths of strings in @strings2. + * @strings2: An array of strings that may contain NUL characters. + * + * Finds the distance between string sequences @strings1 and @strings2. + * + * In other words, this is a double-Levenshtein algorithm. + * + * The cost of string replace operation is based on string similarity: it's + * zero for identical strings and 2 for completely unsimilar strings. + * + * Returns: The distance of the two sequences. + **/ +_LEV_STATIC_PY double +lev_edit_seq_distance(size_t n1, const size_t *lengths1, + const lev_byte *strings1[], + size_t n2, const size_t *lengths2, + const lev_byte *strings2[]) +{ + size_t i; + double *row; /* we only need to keep one row of costs */ + double *end; + + /* strip common prefix */ + while (n1 > 0 && n2 > 0 + && *lengths1 == *lengths2 + && memcmp(*strings1, *strings2, + *lengths1*sizeof(lev_byte)) == 0) { + n1--; + n2--; + strings1++; + strings2++; + lengths1++; + lengths2++; + } + + /* strip common suffix */ + while (n1 > 0 && n2 > 0 + && lengths1[n1-1] == lengths2[n2-1] + && memcmp(strings1[n1-1], strings2[n2-1], + lengths1[n1-1]*sizeof(lev_byte)) == 0) { + n1--; + n2--; + } + + /* catch trivial cases */ + if (n1 == 0) + return n2; + if (n2 == 0) + return n1; + + /* make the inner cycle (i.e. strings2) the longer one */ + if (n1 > n2) { + size_t nx = n1; + const size_t *lx = lengths1; + const lev_byte **sx = strings1; + n1 = n2; + n2 = nx; + lengths1 = lengths2; + lengths2 = lx; + strings1 = strings2; + strings2 = sx; + } + n1++; + n2++; + + /* initalize first row */ + row = (double*)safe_malloc(n2, sizeof(double)); + if (!row) + return -1.0; + end = row + n2 - 1; + for (i = 0; i < n2; i++) + row[i] = (double)i; + + /* go through the matrix and compute the costs. yes, this is an extremely + * obfuscated version, but also extremely memory-conservative and relatively + * fast. 
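+ * (x carries the value of the cell to the left in the current row, D the
+ * diagonal value from the previous row; inserting or deleting a string costs
+ * 1, replacing one string by another costs 2*d/l, i.e. at most 2.)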
*/ + for (i = 1; i < n1; i++) { + double *p = row + 1; + const lev_byte *str1 = strings1[i - 1]; + const size_t len1 = lengths1[i - 1]; + const lev_byte **str2p = strings2; + const size_t *len2p = lengths2; + double D = i - 1.0; + double x = i; + while (p <= end) { + size_t l = len1 + *len2p; + double q; + if (l == 0) + q = D; + else { + size_t d = lev_edit_distance(len1, str1, *(len2p++), *(str2p++), 1); + if (d == (size_t)(-1)) { + free(row); + return -1.0; + } + q = D + 2.0/l*d; + } + x += 1.0; + if (x > q) + x = q; + D = *p; + if (x > D + 1.0) + x = D + 1.0; + *(p++) = x; + } + } + + { + double q = *end; + free(row); + return q; + } +} + +/** + * lev_u_edit_seq_distance: + * @n1: The length of @lengths1 and @strings1. + * @lengths1: The lengths of strings in @strings1. + * @strings1: An array of strings that may contain NUL characters. + * @n2: The length of @lengths2 and @strings2. + * @lengths2: The lengths of strings in @strings2. + * @strings2: An array of strings that may contain NUL characters. + * + * Finds the distance between string sequences @strings1 and @strings2. + * + * In other words, this is a double-Levenshtein algorithm. + * + * The cost of string replace operation is based on string similarity: it's + * zero for identical strings and 2 for completely unsimilar strings. + * + * Returns: The distance of the two sequences. + **/ +_LEV_STATIC_PY double +lev_u_edit_seq_distance(size_t n1, const size_t *lengths1, + const lev_wchar *strings1[], + size_t n2, const size_t *lengths2, + const lev_wchar *strings2[]) +{ + size_t i; + double *row; /* we only need to keep one row of costs */ + double *end; + + /* strip common prefix */ + while (n1 > 0 && n2 > 0 + && *lengths1 == *lengths2 + && memcmp(*strings1, *strings2, + *lengths1*sizeof(lev_wchar)) == 0) { + n1--; + n2--; + strings1++; + strings2++; + lengths1++; + lengths2++; + } + + /* strip common suffix */ + while (n1 > 0 && n2 > 0 + && lengths1[n1-1] == lengths2[n2-1] + && memcmp(strings1[n1-1], strings2[n2-1], + lengths1[n1-1]*sizeof(lev_wchar)) == 0) { + n1--; + n2--; + } + + /* catch trivial cases */ + if (n1 == 0) + return (double)n2; + if (n2 == 0) + return (double)n1; + + /* make the inner cycle (i.e. strings2) the longer one */ + if (n1 > n2) { + size_t nx = n1; + const size_t *lx = lengths1; + const lev_wchar **sx = strings1; + n1 = n2; + n2 = nx; + lengths1 = lengths2; + lengths2 = lx; + strings1 = strings2; + strings2 = sx; + } + n1++; + n2++; + + /* initalize first row */ + row = (double*)safe_malloc(n2, sizeof(double)); + if (!row) + return -1.0; + end = row + n2 - 1; + for (i = 0; i < n2; i++) + row[i] = (double)i; + + /* go through the matrix and compute the costs. yes, this is an extremely + * obfuscated version, but also extremely memory-conservative and relatively + * fast. */ + for (i = 1; i < n1; i++) { + double *p = row + 1; + const lev_wchar *str1 = strings1[i - 1]; + const size_t len1 = lengths1[i - 1]; + const lev_wchar **str2p = strings2; + const size_t *len2p = lengths2; + double D = i - 1.0; + double x = i; + while (p <= end) { + size_t l = len1 + *len2p; + double q; + if (l == 0) + q = D; + else { + size_t d = lev_u_edit_distance(len1, str1, *(len2p++), *(str2p++), 1); + if (d == (size_t)(-1)) { + free(row); + return -1.0; + } + q = D + 2.0/l*d; + } + x += 1.0; + if (x > q) + x = q; + D = *p; + if (x > D + 1.0) + x = D + 1.0; + *(p++) = x; + } + } + + { + double q = *end; + free(row); + return q; + } +} + +/** + * lev_set_distance: + * @n1: The length of @lengths1 and @strings1. 
+ * @lengths1: The lengths of strings in @strings1. + * @strings1: An array of strings that may contain NUL characters. + * @n2: The length of @lengths2 and @strings2. + * @lengths2: The lengths of strings in @strings2. + * @strings2: An array of strings that may contain NUL characters. + * + * Finds the distance between string sets @strings1 and @strings2. + * + * The difference from lev_edit_seq_distance() is that order doesn't matter. + * The optimal association of @strings1 and @strings2 is found first and + * the similarity is computed for that. + * + * Uses sequential Munkers-Blackman algorithm. + * + * Returns: The distance of the two sets. + **/ +_LEV_STATIC_PY double +lev_set_distance(size_t n1, const size_t *lengths1, + const lev_byte *strings1[], + size_t n2, const size_t *lengths2, + const lev_byte *strings2[]) +{ + double *dists; /* the (modified) distance matrix, indexed [row*n1 + col] */ + double *r; + size_t i, j; + size_t *map; + double sum; + + /* catch trivial cases */ + if (n1 == 0) + return (double)n2; + if (n2 == 0) + return (double)n1; + + /* make the number of columns (n1) smaller than the number of rows */ + if (n1 > n2) { + size_t nx = n1; + const size_t *lx = lengths1; + const lev_byte **sx = strings1; + n1 = n2; + n2 = nx; + lengths1 = lengths2; + lengths2 = lx; + strings1 = strings2; + strings2 = sx; + } + + /* compute distances from each to each */ + r = dists = (double*)safe_malloc_3(n1, n2, sizeof(double)); + if (!r) + return -1.0; + for (i = 0; i < n2; i++) { + size_t len2 = lengths2[i]; + const lev_byte *str2 = strings2[i]; + const size_t *len1p = lengths1; + const lev_byte **str1p = strings1; + for (j = 0; j < n1; j++) { + size_t l = len2 + *len1p; + if (l == 0) + *(r++) = 0.0; + else { + size_t d = lev_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1); + if (d == (size_t)(-1)) { + free(r); + return -1.0; + } + *(r++) = (double)d/l; + } + } + } + + /* find the optimal mapping between the two sets */ + map = munkers_blackman(n1, n2, dists); + if (!map) + return -1.0; + + /* sum the set distance */ + sum = n2 - n1; + for (j = 0; j < n1; j++) { + size_t l; + i = map[j]; + l = lengths1[j] + lengths2[i]; + if (l > 0) { + size_t d = lev_edit_distance(lengths1[j], strings1[j], + lengths2[i], strings2[i], 1); + if (d == (size_t)(-1)) { + free(map); + return -1.0; + } + sum += 2.0*d/l; + } + } + free(map); + + return sum; +} + +/** + * lev_u_set_distance: + * @n1: The length of @lengths1 and @strings1. + * @lengths1: The lengths of strings in @strings1. + * @strings1: An array of strings that may contain NUL characters. + * @n2: The length of @lengths2 and @strings2. + * @lengths2: The lengths of strings in @strings2. + * @strings2: An array of strings that may contain NUL characters. + * + * Finds the distance between string sets @strings1 and @strings2. + * + * The difference from lev_u_edit_seq_distance() is that order doesn't matter. + * The optimal association of @strings1 and @strings2 is found first and + * the similarity is computed for that. + * + * Uses sequential Munkers-Blackman algorithm. + * + * Returns: The distance of the two sets. 
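+ *
+ * Each matched pair of strings contributes between 0 (identical) and 2
+ * (completely dissimilar) to the result; every unmatched string contributes 1.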
+ **/ +_LEV_STATIC_PY double +lev_u_set_distance(size_t n1, const size_t *lengths1, + const lev_wchar *strings1[], + size_t n2, const size_t *lengths2, + const lev_wchar *strings2[]) +{ + double *dists; /* the (modified) distance matrix, indexed [row*n1 + col] */ + double *r; + size_t i, j; + size_t *map; + double sum; + + /* catch trivial cases */ + if (n1 == 0) + return (double)n2; + if (n2 == 0) + return (double)n1; + + /* make the number of columns (n1) smaller than the number of rows */ + if (n1 > n2) { + size_t nx = n1; + const size_t *lx = lengths1; + const lev_wchar **sx = strings1; + n1 = n2; + n2 = nx; + lengths1 = lengths2; + lengths2 = lx; + strings1 = strings2; + strings2 = sx; + } + + /* compute distances from each to each */ + r = dists = (double*)safe_malloc_3(n1, n2, sizeof(double)); + if (!r) + return -1.0; + for (i = 0; i < n2; i++) { + size_t len2 = lengths2[i]; + const lev_wchar *str2 = strings2[i]; + const size_t *len1p = lengths1; + const lev_wchar **str1p = strings1; + for (j = 0; j < n1; j++) { + size_t l = len2 + *len1p; + if (l == 0) + *(r++) = 0.0; + else { + size_t d = lev_u_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1); + if (d == (size_t)(-1)) { + free(r); + return -1.0; + } + *(r++) = (double)d/l; + } + } + } + + /* find the optimal mapping between the two sets */ + map = munkers_blackman(n1, n2, dists); + if (!map) + return -1.0; + + /* sum the set distance */ + sum = n2 - n1; + for (j = 0; j < n1; j++) { + size_t l; + i = map[j]; + l = lengths1[j] + lengths2[i]; + if (l > 0) { + size_t d = lev_u_edit_distance(lengths1[j], strings1[j], + lengths2[i], strings2[i], 1); + if (d == (size_t)(-1)) { + free(map); + return -1.0; + } + sum += 2.0*d/l; + } + } + free(map); + + return sum; +} + +/* + * Munkers-Blackman algorithm. 
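+ * Finds a minimum-total-cost assignment of the n1 columns to the n2 rows of
+ * the dists matrix (a variant of the Hungarian/Munkres assignment algorithm).
+ * Returns a newly allocated array of n1 row indices, one per column, and
+ * frees dists.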
+ */ +static size_t* +munkers_blackman(size_t n1, size_t n2, double *dists) +{ + size_t i, j; + size_t *covc, *covr; /* 1 if column/row is covered */ + /* these contain 1-based indices, so we can use zero as `none' + * zstarr: column of a z* in given row + * zstarc: row of a z* in given column + * zprimer: column of a z' in given row */ + size_t *zstarr, *zstarc, *zprimer; + + /* allocate memory */ + covc = calloc(n1, sizeof(size_t)); + if (!covc) + return NULL; + zstarc = calloc(n1, sizeof(size_t)); + if (!zstarc) { + free(covc); + return NULL; + } + covr = calloc(n2, sizeof(size_t)); + if (!covr) { + free(zstarc); + free(covc); + return NULL; + } + zstarr = calloc(n2, sizeof(size_t)); + if (!zstarr) { + free(covr); + free(zstarc); + free(covc); + return NULL; + } + zprimer = calloc(n2, sizeof(size_t)); + if (!zprimer) { + free(zstarr); + free(covr); + free(zstarc); + free(covc); + return NULL; + } + + /* step 0 (subtract minimal distance) and step 1 (find zeroes) */ + for (j = 0; j < n1; j++) { + size_t minidx = 0; + double *col = dists + j; + double min = *col; + double *p = col + n1; + for (i = 1; i < n2; i++) { + if (min > *p) { + minidx = i; + min = *p; + } + p += n1; + } + /* subtract */ + p = col; + for (i = 0; i < n2; i++) { + *p -= min; + if (*p < LEV_EPSILON) + *p = 0.0; + p += n1; + } + /* star the zero, if possible */ + if (!zstarc[j] && !zstarr[minidx]) { + zstarc[j] = minidx + 1; + zstarr[minidx] = j + 1; + } + else { + /* otherwise try to find some other */ + p = col; + for (i = 0; i < n2; i++) { + if (i != minidx && *p == 0.0 + && !zstarc[j] && !zstarr[i]) { + zstarc[j] = i + 1; + zstarr[i] = j + 1; + break; + } + p += n1; + } + } + } + + /* main */ + while (1) { + /* step 2 (cover columns containing z*) */ + { + size_t nc = 0; + for (j = 0; j < n1; j++) { + if (zstarc[j]) { + covc[j] = 1; + nc++; + } + } + if (nc == n1) + break; + } + + /* step 3 (find uncovered zeroes) */ + while (1) { + step_3: + /* search uncovered matrix entries */ + for (j = 0; j < n1; j++) { + double *p = dists + j; + if (covc[j]) + continue; + for (i = 0; i < n2; i++) { + if (!covr[i] && *p == 0.0) { + /* when a zero is found, prime it */ + zprimer[i] = j + 1; + if (zstarr[i]) { + /* if there's a z* in the same row, + * uncover the column, cover the row and redo */ + covr[i] = 1; + covc[zstarr[i] - 1] = 0; + goto step_3; + } + /* if there's no z*, + * we are at the end of our path an can convert z' + * to z* */ + goto step_4; + } + p += n1; + } + } + + /* step 5 (new zero manufacturer) + * we can't get here, unless no zero is found at all */ + { + /* find the smallest uncovered entry */ + double min = LEV_INFINITY; + for (j = 0; j < n1; j++) { + double *p = dists + j; + if (covc[j]) + continue; + for (i = 0; i < n2; i++) { + if (!covr[i] && min > *p) { + min = *p; + } + p += n1; + } + } + /* add it to all covered rows */ + for (i = 0; i < n2; i++) { + double *p = dists + i*n1; + if (!covr[i]) + continue; + for (j = 0; j < n1; j++) + *(p++) += min; + } + /* subtract if from all uncovered columns */ + for (j = 0; j < n1; j++) { + double *p = dists + j; + if (covc[j]) + continue; + for (i = 0; i < n2; i++) { + *p -= min; + if (*p < LEV_EPSILON) + *p = 0.0; + p += n1; + } + } + } + } + + /* step 4 (increment the number of z*) + * i is the row number (we get it from step 3) */ + step_4: + i++; + do { + size_t x = i; + + i--; + j = zprimer[i] - 1; /* move to z' in the same row */ + zstarr[i] = j + 1; /* mark it as z* in row buffer */ + i = zstarc[j]; /* move to z* in the same column */ + zstarc[j] = x; 
/* mark the z' as being new z* */ + } while (i); + memset(zprimer, 0, n2*sizeof(size_t)); + memset(covr, 0, n2*sizeof(size_t)); + memset(covc, 0, n1*sizeof(size_t)); + } + + free(dists); + free(covc); + free(covr); + /*free(zstarc); this is the result */ + free(zstarr); + free(zprimer); + + for (j = 0; j < n1; j++) + zstarc[j]--; + return zstarc; +} +/* }}} */ + +/**************************************************************************** + * + * Editops and other difflib-like stuff. + * + ****************************************************************************/ +/* {{{ */ + +/** + * lev_editops_check_errors: + * @len1: The length of an eventual @ops source string. + * @len2: The length of an eventual @ops destination string. + * @n: The length of @ops. + * @ops: An array of elementary edit operations. + * + * Checks whether @ops is consistent and applicable as a partial edit from a + * string of length @len1 to a string of length @len2. + * + * Returns: Zero if @ops seems OK, a nonzero error code otherwise. + **/ +_LEV_STATIC_PY int +lev_editops_check_errors(size_t len1, size_t len2, + size_t n, const LevEditOp *ops) +{ + const LevEditOp *o; + size_t i; + + if (!n) + return LEV_EDIT_ERR_OK; + + /* check bounds */ + o = ops; + for (i = n; i; i--, o++) { + if (o->type >= LEV_EDIT_LAST) + return LEV_EDIT_ERR_TYPE; + if (o->spos > len1 || o->dpos > len2) + return LEV_EDIT_ERR_OUT; + if (o->spos == len1 && o->type != LEV_EDIT_INSERT) + return LEV_EDIT_ERR_OUT; + if (o->dpos == len2 && o->type != LEV_EDIT_DELETE) + return LEV_EDIT_ERR_OUT; + } + + /* check ordering */ + o = ops + 1; + for (i = n - 1; i; i--, o++, ops++) { + if (o->spos < ops->spos || o->dpos < ops->dpos) + return LEV_EDIT_ERR_ORDER; + } + + return LEV_EDIT_ERR_OK; +} + +/** + * lev_opcodes_check_errors: + * @len1: The length of an eventual @ops source string. + * @len2: The length of an eventual @ops destination string. + * @nb: The length of @bops. + * @bops: An array of difflib block edit operation codes. + * + * Checks whether @bops is consistent and applicable as an edit from a + * string of length @len1 to a string of length @len2. + * + * Returns: Zero if @bops seems OK, a nonzero error code otherwise. + **/ +_LEV_STATIC_PY int +lev_opcodes_check_errors(size_t len1, size_t len2, + size_t nb, const LevOpCode *bops) +{ + const LevOpCode *b; + size_t i; + + if (!nb) + return 1; + + /* check completenes */ + if (bops->sbeg || bops->dbeg + || bops[nb - 1].send != len1 || bops[nb - 1].dend != len2) + return LEV_EDIT_ERR_SPAN; + + /* check bounds and block consistency */ + b = bops; + for (i = nb; i; i--, b++) { + if (b->send > len1 || b->dend > len2) + return LEV_EDIT_ERR_OUT; + switch (b->type) { + case LEV_EDIT_KEEP: + case LEV_EDIT_REPLACE: + if (b->dend - b->dbeg != b->send - b->sbeg || b->dend == b->dbeg) + return LEV_EDIT_ERR_BLOCK; + break; + + case LEV_EDIT_INSERT: + if (b->dend - b->dbeg == 0 || b->send - b->sbeg != 0) + return LEV_EDIT_ERR_BLOCK; + break; + + case LEV_EDIT_DELETE: + if (b->send - b->sbeg == 0 || b->dend - b->dbeg != 0) + return LEV_EDIT_ERR_BLOCK; + break; + + default: + return LEV_EDIT_ERR_TYPE; + break; + } + } + + /* check ordering */ + b = bops + 1; + for (i = nb - 1; i; i--, b++, bops++) { + if (b->sbeg != bops->send || b->dbeg != bops->dend) + return LEV_EDIT_ERR_ORDER; + } + + return LEV_EDIT_ERR_OK; +} + +/** + * lev_editops_invert: + * @n: The length of @ops. + * @ops: An array of elementary edit operations. + * + * Inverts the sense of @ops. It is modified in place. 
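+ * (Source and destination positions are swapped, insert operations become
+ * deletes and vice versa, while keep and replace operations keep their type.)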
+ * + * In other words, @ops becomes a valid partial edit for the original source + * and destination strings with their roles exchanged. + **/ +_LEV_STATIC_PY void +lev_editops_invert(size_t n, LevEditOp *ops) +{ + size_t i; + + for (i = n; i; i--, ops++) { + size_t z; + + z = ops->dpos; + ops->dpos = ops->spos; + ops->spos = z; + if (ops->type & 2) + ops->type ^= 1; + } +} + +/** + * lev_editops_apply: + * @len1: The length of @string1. + * @string1: A string of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A string of length @len2, may contain NUL characters. + * @n: The size of @ops. + * @ops: An array of elementary edit operations. + * @len: Where the size of the resulting string should be stored. + * + * Applies a partial edit @ops from @string1 to @string2. + * + * NB: @ops is not checked for applicability. + * + * Returns: The result of the partial edit as a newly allocated string, its + * length is stored in @len. + **/ +_LEV_STATIC_PY lev_byte* +lev_editops_apply(size_t len1, const lev_byte *string1, + __attribute__((unused)) size_t len2, const lev_byte *string2, + size_t n, const LevEditOp *ops, + size_t *len) +{ + lev_byte *dst, *dpos; /* destination string */ + const lev_byte *spos; /* source string position */ + size_t i, j; + + /* this looks too complex for such a simple task, but note ops is not + * a complete edit sequence, we have to be able to apply anything anywhere */ + dpos = dst = (lev_byte*)safe_malloc((n + len1), sizeof(lev_byte)); + if (!dst) { + *len = (size_t)(-1); + return NULL; + } + spos = string1; + for (i = n; i; i--, ops++) { + /* XXX: this fine with gcc internal memcpy, but when memcpy is + * actually a function, it may be pretty slow */ + j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP); + if (j) { + memcpy(dpos, spos, j*sizeof(lev_byte)); + spos += j; + dpos += j; + } + switch (ops->type) { + case LEV_EDIT_DELETE: + spos++; + break; + + case LEV_EDIT_REPLACE: + spos++; + case LEV_EDIT_INSERT: + *(dpos++) = string2[ops->dpos]; + break; + + default: + break; + } + } + j = len1 - (spos - string1); + if (j) { + memcpy(dpos, spos, j*sizeof(lev_byte)); + spos += j; + dpos += j; + } + + *len = dpos - dst; + /* possible realloc failure is detected with *len != 0 */ + return realloc(dst, *len*sizeof(lev_byte)); +} + +/** + * lev_u_editops_apply: + * @len1: The length of @string1. + * @string1: A string of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A string of length @len2, may contain NUL characters. + * @n: The size of @ops. + * @ops: An array of elementary edit operations. + * @len: Where the size of the resulting string should be stored. + * + * Applies a partial edit @ops from @string1 to @string2. + * + * NB: @ops is not checked for applicability. + * + * Returns: The result of the partial edit as a newly allocated string, its + * length is stored in @len. 
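+ *
+ * In particular, applying the complete sequence returned by
+ * lev_u_editops_find() to @string1 yields @string2.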
+ **/ +_LEV_STATIC_PY lev_wchar* +lev_u_editops_apply(size_t len1, const lev_wchar *string1, + __attribute__((unused)) size_t len2, + const lev_wchar *string2, + size_t n, const LevEditOp *ops, + size_t *len) +{ + lev_wchar *dst, *dpos; /* destination string */ + const lev_wchar *spos; /* source string position */ + size_t i, j; + + /* this looks too complex for such a simple task, but note ops is not + * a complete edit sequence, we have to be able to apply anything anywhere */ + dpos = dst = (lev_wchar*)safe_malloc((n + len1), sizeof(lev_wchar)); + if (!dst) { + *len = (size_t)(-1); + return NULL; + } + spos = string1; + for (i = n; i; i--, ops++) { + /* XXX: this is fine with gcc internal memcpy, but when memcpy is + * actually a function, it may be pretty slow */ + j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP); + if (j) { + memcpy(dpos, spos, j*sizeof(lev_wchar)); + spos += j; + dpos += j; + } + switch (ops->type) { + case LEV_EDIT_DELETE: + spos++; + break; + + case LEV_EDIT_REPLACE: + spos++; + case LEV_EDIT_INSERT: + *(dpos++) = string2[ops->dpos]; + break; + + default: + break; + } + } + j = len1 - (spos - string1); + if (j) { + memcpy(dpos, spos, j*sizeof(lev_wchar)); + spos += j; + dpos += j; + } + + *len = dpos - dst; + /* possible realloc failure is detected with *len != 0 */ + return realloc(dst, *len*sizeof(lev_wchar)); +} + +/** + * editops_from_cost_matrix: + * @len1: The length of @string1. + * @string1: A string of length @len1, may contain NUL characters. + * @o1: The offset where the matrix starts from the start of @string1. + * @len2: The length of @string2. + * @string2: A string of length @len2, may contain NUL characters. + * @o2: The offset where the matrix starts from the start of @string2. + * @matrix: The cost matrix. + * @n: Where the number of edit operations should be stored. + * + * Reconstructs the optimal edit sequence from the cost matrix @matrix. + * + * The matrix is freed. + * + * Returns: The optimal edit sequence, as a newly allocated array of + * elementary edit operations, it length is stored in @n. 
+ **/ +static LevEditOp* +editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1, + size_t len2, const lev_byte *string2, size_t off2, + size_t *matrix, size_t *n) +{ + size_t *p; + size_t i, j, pos; + LevEditOp *ops; + int dir = 0; + + pos = *n = matrix[len1*len2 - 1]; + if (!*n) { + free(matrix); + return NULL; + } + ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); + if (!ops) { + free(matrix); + *n = (size_t)(-1); + return NULL; + } + i = len1 - 1; + j = len2 - 1; + p = matrix + len1*len2 - 1; + while (i || j) { + /* prefer contiuning in the same direction */ + if (dir < 0 && j && *p == *(p - 1) + 1) { + pos--; + ops[pos].type = LEV_EDIT_INSERT; + ops[pos].spos = i + off1; + ops[pos].dpos = --j + off2; + p--; + continue; + } + if (dir > 0 && i && *p == *(p - len2) + 1) { + pos--; + ops[pos].type = LEV_EDIT_DELETE; + ops[pos].spos = --i + off1; + ops[pos].dpos = j + off2; + p -= len2; + continue; + } + if (i && j && *p == *(p - len2 - 1) + && string1[i - 1] == string2[j - 1]) { + /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */ + i--; + j--; + p -= len2 + 1; + dir = 0; + continue; + } + if (i && j && *p == *(p - len2 - 1) + 1) { + pos--; + ops[pos].type = LEV_EDIT_REPLACE; + ops[pos].spos = --i + off1; + ops[pos].dpos = --j + off2; + p -= len2 + 1; + dir = 0; + continue; + } + /* we cant't turn directly from -1 to 1, in this case it would be better + * to go diagonally, but check it (dir == 0) */ + if (dir == 0 && j && *p == *(p - 1) + 1) { + pos--; + ops[pos].type = LEV_EDIT_INSERT; + ops[pos].spos = i + off1; + ops[pos].dpos = --j + off2; + p--; + dir = -1; + continue; + } + if (dir == 0 && i && *p == *(p - len2) + 1) { + pos--; + ops[pos].type = LEV_EDIT_DELETE; + ops[pos].spos = --i + off1; + ops[pos].dpos = j + off2; + p -= len2; + dir = 1; + continue; + } + /* coredump right now, later might be too late ;-) */ + assert("lost in the cost matrix" == NULL); + } + free(matrix); + + return ops; +} + +/** + * lev_editops_find: + * @len1: The length of @string1. + * @string1: A string of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A string of length @len2, may contain NUL characters. + * @n: Where the number of edit operations should be stored. + * + * Find an optimal edit sequence from @string1 to @string2. + * + * When there's more than one optimal sequence, a one is arbitrarily (though + * deterministically) chosen. + * + * Returns: The optimal edit sequence, as a newly allocated array of + * elementary edit operations, it length is stored in @n. + * It is normalized, i.e., keep operations are not included. 
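+ *
+ * For example (illustrative), one optimal sequence turning "spam" into
+ * "spark" consists of two operations: a replace with spos = 3, dpos = 3
+ * (m -> r) followed by an insert with spos = 4, dpos = 4 (k).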
+ **/ +_LEV_STATIC_PY LevEditOp* +lev_editops_find(size_t len1, const lev_byte *string1, + size_t len2, const lev_byte *string2, + size_t *n) +{ + size_t len1o, len2o; + size_t i; + size_t *matrix; /* cost matrix */ + + /* strip common prefix */ + len1o = 0; + while (len1 > 0 && len2 > 0 && *string1 == *string2) { + len1--; + len2--; + string1++; + string2++; + len1o++; + } + len2o = len1o; + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { + len1--; + len2--; + } + len1++; + len2++; + + /* initialize first row and column */ + matrix = (size_t*)safe_malloc_3(len1, len2, sizeof(size_t)); + if (!matrix) { + *n = (size_t)(-1); + return NULL; + } + for (i = 0; i < len2; i++) + matrix[i] = i; + for (i = 1; i < len1; i++) + matrix[len2*i] = i; + + /* find the costs and fill the matrix */ + for (i = 1; i < len1; i++) { + size_t *prev = matrix + (i - 1)*len2; + size_t *p = matrix + i*len2; + size_t *end = p + len2 - 1; + const lev_byte char1 = string1[i - 1]; + const lev_byte *char2p = string2; + size_t x = i; + p++; + while (p <= end) { + size_t c3 = *(prev++) + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + c3 = *prev + 1; + if (x > c3) + x = c3; + *(p++) = x; + } + } + + /* find the way back */ + return editops_from_cost_matrix(len1, string1, len1o, + len2, string2, len2o, + matrix, n); +} + +/** + * ueditops_from_cost_matrix: + * @len1: The length of @string1. + * @string1: A string of length @len1, may contain NUL characters. + * @o1: The offset where the matrix starts from the start of @string1. + * @len2: The length of @string2. + * @string2: A string of length @len2, may contain NUL characters. + * @o2: The offset where the matrix starts from the start of @string2. + * @matrix: The cost matrix. + * @n: Where the number of edit operations should be stored. + * + * Reconstructs the optimal edit sequence from the cost matrix @matrix. + * + * The matrix is freed. + * + * Returns: The optimal edit sequence, as a newly allocated array of + * elementary edit operations, it length is stored in @n. 
+ **/ +static LevEditOp* +ueditops_from_cost_matrix(size_t len1, const lev_wchar *string1, size_t o1, + size_t len2, const lev_wchar *string2, size_t o2, + size_t *matrix, size_t *n) +{ + size_t *p; + size_t i, j, pos; + LevEditOp *ops; + int dir = 0; + + pos = *n = matrix[len1*len2 - 1]; + if (!*n) { + free(matrix); + return NULL; + } + ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); + if (!ops) { + free(matrix); + *n = (size_t)(-1); + return NULL; + } + i = len1 - 1; + j = len2 - 1; + p = matrix + len1*len2 - 1; + while (i || j) { + /* prefer contiuning in the same direction */ + if (dir < 0 && j && *p == *(p - 1) + 1) { + pos--; + ops[pos].type = LEV_EDIT_INSERT; + ops[pos].spos = i + o1; + ops[pos].dpos = --j + o2; + p--; + continue; + } + if (dir > 0 && i && *p == *(p - len2) + 1) { + pos--; + ops[pos].type = LEV_EDIT_DELETE; + ops[pos].spos = --i + o1; + ops[pos].dpos = j + o2; + p -= len2; + continue; + } + if (i && j && *p == *(p - len2 - 1) + && string1[i - 1] == string2[j - 1]) { + /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */ + i--; + j--; + p -= len2 + 1; + dir = 0; + continue; + } + if (i && j && *p == *(p - len2 - 1) + 1) { + pos--; + ops[pos].type = LEV_EDIT_REPLACE; + ops[pos].spos = --i + o1; + ops[pos].dpos = --j + o2; + p -= len2 + 1; + dir = 0; + continue; + } + /* we cant't turn directly from -1 to 1, in this case it would be better + * to go diagonally, but check it (dir == 0) */ + if (dir == 0 && j && *p == *(p - 1) + 1) { + pos--; + ops[pos].type = LEV_EDIT_INSERT; + ops[pos].spos = i + o1; + ops[pos].dpos = --j + o2; + p--; + dir = -1; + continue; + } + if (dir == 0 && i && *p == *(p - len2) + 1) { + pos--; + ops[pos].type = LEV_EDIT_DELETE; + ops[pos].spos = --i + o1; + ops[pos].dpos = j + o2; + p -= len2; + dir = 1; + continue; + } + /* coredump right now, later might be too late ;-) */ + assert("lost in the cost matrix" == NULL); + } + free(matrix); + + return ops; +} + +/** + * lev_u_editops_find: + * @len1: The length of @string1. + * @string1: A string of length @len1, may contain NUL characters. + * @len2: The length of @string2. + * @string2: A string of length @len2, may contain NUL characters. + * @n: Where the number of edit operations should be stored. + * + * Find an optimal edit sequence from @string1 to @string2. + * + * When there's more than one optimal sequence, a one is arbitrarily (though + * deterministically) chosen. + * + * Returns: The optimal edit sequence, as a newly allocated array of + * elementary edit operations, it length is stored in @n. + * It is normalized, i.e., keep operations are not included. 
+ **/ +_LEV_STATIC_PY LevEditOp* +lev_u_editops_find(size_t len1, const lev_wchar *string1, + size_t len2, const lev_wchar *string2, + size_t *n) +{ + size_t len1o, len2o; + size_t i; + size_t *matrix; /* cost matrix */ + + /* strip common prefix */ + len1o = 0; + while (len1 > 0 && len2 > 0 && *string1 == *string2) { + len1--; + len2--; + string1++; + string2++; + len1o++; + } + len2o = len1o; + + /* strip common suffix */ + while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { + len1--; + len2--; + } + len1++; + len2++; + + /* initalize first row and column */ + matrix = (size_t*)safe_malloc_3(len1, len2, sizeof(size_t)); + if (!matrix) { + *n = (size_t)(-1); + return NULL; + } + for (i = 0; i < len2; i++) + matrix[i] = i; + for (i = 1; i < len1; i++) + matrix[len2*i] = i; + + /* find the costs and fill the matrix */ + for (i = 1; i < len1; i++) { + size_t *prev = matrix + (i - 1)*len2; + size_t *p = matrix + i*len2; + size_t *end = p + len2 - 1; + const lev_wchar char1 = string1[i - 1]; + const lev_wchar *char2p = string2; + size_t x = i; + p++; + while (p <= end) { + size_t c3 = *(prev++) + (char1 != *(char2p++)); + x++; + if (x > c3) + x = c3; + c3 = *prev + 1; + if (x > c3) + x = c3; + *(p++) = x; + } + } + + /* find the way back */ + return ueditops_from_cost_matrix(len1, string1, len1o, + len2, string2, len2o, + matrix, n); +} + +/** + * lev_opcodes_to_editops: + * @nb: The length of @bops. + * @bops: An array of difflib block edit operation codes. + * @n: Where the number of edit operations should be stored. + * @keepkeep: If nonzero, keep operations will be included. Otherwise the + * result will be normalized, i.e. without any keep operations. + * + * Converts difflib block operation codes to elementary edit operations. + * + * Returns: The converted edit operatiosn, as a newly allocated array; its + * size is stored in @n. + **/ +_LEV_STATIC_PY LevEditOp* +lev_opcodes_to_editops(size_t nb, const LevOpCode *bops, + size_t *n, int keepkeep) +{ + size_t i; + const LevOpCode *b; + LevEditOp *ops, *o; + + /* compute the number of atomic operations */ + *n = 0; + if (!nb) + return NULL; + + b = bops; + if (keepkeep) { + for (i = nb; i; i--, b++) { + size_t sd = b->send - b->sbeg; + size_t dd = b->dend - b->dbeg; + *n += (sd > dd ? sd : dd); + } + } + else { + for (i = nb; i; i--, b++) { + size_t sd = b->send - b->sbeg; + size_t dd = b->dend - b->dbeg; + *n += (b->type != LEV_EDIT_KEEP ? (sd > dd ? sd : dd) : 0); + } + } + + /* convert */ + o = ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); + if (!ops) { + *n = (size_t)(-1); + return NULL; + } + b = bops; + for (i = nb; i; i--, b++) { + size_t j; + + switch (b->type) { + case LEV_EDIT_KEEP: + if (keepkeep) { + for (j = 0; j < b->send - b->sbeg; j++, o++) { + o->type = LEV_EDIT_KEEP; + o->spos = b->sbeg + j; + o->dpos = b->dbeg + j; + } + } + break; + + case LEV_EDIT_REPLACE: + for (j = 0; j < b->send - b->sbeg; j++, o++) { + o->type = LEV_EDIT_REPLACE; + o->spos = b->sbeg + j; + o->dpos = b->dbeg + j; + } + break; + + case LEV_EDIT_DELETE: + for (j = 0; j < b->send - b->sbeg; j++, o++) { + o->type = LEV_EDIT_DELETE; + o->spos = b->sbeg + j; + o->dpos = b->dbeg; + } + break; + + case LEV_EDIT_INSERT: + for (j = 0; j < b->dend - b->dbeg; j++, o++) { + o->type = LEV_EDIT_INSERT; + o->spos = b->sbeg; + o->dpos = b->dbeg + j; + } + break; + + default: + break; + } + } + assert((size_t)(o - ops) == *n); + + return ops; +} + +/** + * lev_editops_to_opcodes: + * @n: The size of @ops. 
+ * @ops: An array of elementary edit operations. + * @nb: Where the number of difflib block operation codes should be stored. + * @len1: The length of the source string. + * @len2: The length of the destination string. + * + * Converts elementary edit operations to difflib block operation codes. + * + * Note the string lengths are necessary since difflib doesn't allow omitting + * keep operations. + * + * Returns: The converted block operation codes, as a newly allocated array; + * its length is stored in @nb. + **/ +_LEV_STATIC_PY LevOpCode* +lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb, + size_t len1, size_t len2) +{ + size_t nbl, i, spos, dpos; + const LevEditOp *o; + LevOpCode *bops, *b; + LevEditType type; + + /* compute the number of blocks */ + nbl = 0; + o = ops; + spos = dpos = 0; + type = LEV_EDIT_KEEP; + for (i = n; i; ) { + /* simply pretend there are no keep blocks */ + while (o->type == LEV_EDIT_KEEP && --i) + o++; + if (!i) + break; + if (spos < o->spos || dpos < o->dpos) { + nbl++; + spos = o->spos; + dpos = o->dpos; + } + nbl++; + type = o->type; + switch (type) { + case LEV_EDIT_REPLACE: + do { + spos++; + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_DELETE: + do { + spos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_INSERT: + do { + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + default: + break; + } + } + if (spos < len1 || dpos < len2) + nbl++; + + /* convert */ + b = bops = (LevOpCode*)safe_malloc(nbl, sizeof(LevOpCode)); + if (!bops) { + *nb = (size_t)(-1); + return NULL; + } + o = ops; + spos = dpos = 0; + type = LEV_EDIT_KEEP; + for (i = n; i; ) { + /* simply pretend there are no keep blocks */ + while (o->type == LEV_EDIT_KEEP && --i) + o++; + if (!i) + break; + b->sbeg = spos; + b->dbeg = dpos; + if (spos < o->spos || dpos < o->dpos) { + b->type = LEV_EDIT_KEEP; + spos = b->send = o->spos; + dpos = b->dend = o->dpos; + b++; + b->sbeg = spos; + b->dbeg = dpos; + } + type = o->type; + switch (type) { + case LEV_EDIT_REPLACE: + do { + spos++; + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_DELETE: + do { + spos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_INSERT: + do { + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + default: + break; + } + b->type = type; + b->send = spos; + b->dend = dpos; + b++; + } + if (spos < len1 || dpos < len2) { + assert(len1 - spos == len2 - dpos); + b->type = LEV_EDIT_KEEP; + b->sbeg = spos; + b->dbeg = dpos; + b->send = len1; + b->dend = len2; + b++; + } + assert((size_t)(b - bops) == nbl); + + *nb = nbl; + return bops; +} + +/** + * lev_opcodes_apply: + * @len1: The length of the source string. + * @string1: A string of length @len1, may contain NUL characters. + * @len2: The length of the destination string. + * @string2: A string of length @len2, may contain NUL characters. + * @nb: The length of @bops. + * @bops: An array of difflib block edit operation codes. + * @len: Where the size of the resulting string should be stored. + * + * Applies a sequence of difflib block operations to a string. + * + * NB: @bops is not checked for applicability. 
+ * + * Returns: The result of the edit as a newly allocated string, its length + * is stored in @len. + **/ +_LEV_STATIC_PY lev_byte* +lev_opcodes_apply(size_t len1, const lev_byte *string1, + size_t len2, const lev_byte *string2, + size_t nb, const LevOpCode *bops, + size_t *len) +{ + lev_byte *dst, *dpos; /* destination string */ + const lev_byte *spos; /* source string position */ + size_t i; + + /* this looks too complex for such a simple task, but note ops is not + * a complete edit sequence, we have to be able to apply anything anywhere */ + dpos = dst = (lev_byte*)safe_malloc((len1 + len2), sizeof(lev_byte)); + if (!dst) { + *len = (size_t)(-1); + return NULL; + } + spos = string1; + for (i = nb; i; i--, bops++) { + switch (bops->type) { + case LEV_EDIT_INSERT: + case LEV_EDIT_REPLACE: + memcpy(dpos, string2 + bops->dbeg, + (bops->dend - bops->dbeg)*sizeof(lev_byte)); + break; + + case LEV_EDIT_KEEP: + memcpy(dpos, string1 + bops->sbeg, + (bops->send - bops->sbeg)*sizeof(lev_byte)); + break; + + default: + break; + } + spos += bops->send - bops->sbeg; + dpos += bops->dend - bops->dbeg; + } + + *len = dpos - dst; + /* possible realloc failure is detected with *len != 0 */ + return realloc(dst, *len*sizeof(lev_byte)); +} + +/** + * lev_u_opcodes_apply: + * @len1: The length of the source string. + * @string1: A string of length @len1, may contain NUL characters. + * @len2: The length of the destination string. + * @string2: A string of length @len2, may contain NUL characters. + * @nb: The length of @bops. + * @bops: An array of difflib block edit operation codes. + * @len: Where the size of the resulting string should be stored. + * + * Applies a sequence of difflib block operations to a string. + * + * NB: @bops is not checked for applicability. + * + * Returns: The result of the edit as a newly allocated string, its length + * is stored in @len. + **/ +_LEV_STATIC_PY lev_wchar* +lev_u_opcodes_apply(size_t len1, const lev_wchar *string1, + size_t len2, const lev_wchar *string2, + size_t nb, const LevOpCode *bops, + size_t *len) +{ + lev_wchar *dst, *dpos; /* destination string */ + const lev_wchar *spos; /* source string position */ + size_t i; + + /* this looks too complex for such a simple task, but note ops is not + * a complete edit sequence, we have to be able to apply anything anywhere */ + dpos = dst = (lev_wchar*)safe_malloc((len1 + len2), sizeof(lev_wchar)); + if (!dst) { + *len = (size_t)(-1); + return NULL; + } + spos = string1; + for (i = nb; i; i--, bops++) { + switch (bops->type) { + case LEV_EDIT_INSERT: + case LEV_EDIT_REPLACE: + memcpy(dpos, string2 + bops->dbeg, + (bops->dend - bops->dbeg)*sizeof(lev_wchar)); + break; + + case LEV_EDIT_KEEP: + memcpy(dpos, string1 + bops->sbeg, + (bops->send - bops->sbeg)*sizeof(lev_wchar)); + break; + + default: + break; + } + spos += bops->send - bops->sbeg; + dpos += bops->dend - bops->dbeg; + } + + *len = dpos - dst; + /* possible realloc failure is detected with *len != 0 */ + return realloc(dst, *len*sizeof(lev_wchar)); +} + +/** + * lev_opcodes_invert: + * @nb: The length of @ops. + * @bops: An array of difflib block edit operation codes. + * + * Inverts the sense of @ops. It is modified in place. + * + * In other words, @ops becomes a partial edit for the original source + * and destination strings with their roles exchanged. 
+ **/ +_LEV_STATIC_PY void +lev_opcodes_invert(size_t nb, LevOpCode *bops) +{ + size_t i; + + for (i = nb; i; i--, bops++) { + size_t z; + + z = bops->dbeg; + bops->dbeg = bops->sbeg; + bops->sbeg = z; + z = bops->dend; + bops->dend = bops->send; + bops->send = z; + if (bops->type & 2) + bops->type ^= 1; + } +} + +/** + * lev_editops_matching_blocks: + * @len1: The length of the source string. + * @len2: The length of the destination string. + * @n: The size of @ops. + * @ops: An array of elementary edit operations. + * @nmblocks: Where the number of matching block should be stored. + * + * Computes the matching block corresponding to an optimal edit @ops. + * + * Returns: The matching blocks as a newly allocated array, it length is + * stored in @nmblocks. + **/ +_LEV_STATIC_PY LevMatchingBlock* +lev_editops_matching_blocks(size_t len1, + size_t len2, + size_t n, + const LevEditOp *ops, + size_t *nmblocks) +{ + size_t nmb, i, spos, dpos; + LevEditType type; + const LevEditOp *o; + LevMatchingBlock *mblocks, *mb; + + /* compute the number of matching blocks */ + nmb = 0; + o = ops; + spos = dpos = 0; + type = LEV_EDIT_KEEP; + for (i = n; i; ) { + /* simply pretend there are no keep blocks */ + while (o->type == LEV_EDIT_KEEP && --i) + o++; + if (!i) + break; + if (spos < o->spos || dpos < o->dpos) { + nmb++; + spos = o->spos; + dpos = o->dpos; + } + type = o->type; + switch (type) { + case LEV_EDIT_REPLACE: + do { + spos++; + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_DELETE: + do { + spos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_INSERT: + do { + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + default: + break; + } + } + if (spos < len1 || dpos < len2) + nmb++; + + /* fill the info */ + mb = mblocks = (LevMatchingBlock*)safe_malloc(nmb, sizeof(LevOpCode)); + if (!mblocks) { + *nmblocks = (size_t)(-1); + return NULL; + } + o = ops; + spos = dpos = 0; + type = LEV_EDIT_KEEP; + for (i = n; i; ) { + /* simply pretend there are no keep blocks */ + while (o->type == LEV_EDIT_KEEP && --i) + o++; + if (!i) + break; + if (spos < o->spos || dpos < o->dpos) { + mb->spos = spos; + mb->dpos = dpos; + mb->len = o->spos - spos; + spos = o->spos; + dpos = o->dpos; + mb++; + } + type = o->type; + switch (type) { + case LEV_EDIT_REPLACE: + do { + spos++; + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_DELETE: + do { + spos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + case LEV_EDIT_INSERT: + do { + dpos++; + i--; + o++; + } while (i && o->type == type && spos == o->spos && dpos == o->dpos); + break; + + default: + break; + } + } + if (spos < len1 || dpos < len2) { + assert(len1 - spos == len2 - dpos); + mb->spos = spos; + mb->dpos = dpos; + mb->len = len1 - spos; + mb++; + } + assert((size_t)(mb - mblocks) == nmb); + + *nmblocks = nmb; + return mblocks; +} + +/** + * lev_opcodes_matching_blocks: + * @len1: The length of the source string. + * @len2: The length of the destination string. + * @nb: The size of @bops. + * @bops: An array of difflib block edit operation codes. + * @nmblocks: Where the number of matching block should be stored. + * + * Computes the matching block corresponding to an optimal edit @bops. 
+ * + * Returns: The matching blocks as a newly allocated array, it length is + * stored in @nmblocks. + **/ +_LEV_STATIC_PY LevMatchingBlock* +lev_opcodes_matching_blocks(size_t len1, + __attribute__((unused)) size_t len2, + size_t nb, + const LevOpCode *bops, + size_t *nmblocks) +{ + size_t nmb, i; + const LevOpCode *b; + LevMatchingBlock *mblocks, *mb; + + /* compute the number of matching blocks */ + nmb = 0; + b = bops; + for (i = nb; i; i--, b++) { + if (b->type == LEV_EDIT_KEEP) { + nmb++; + /* adjacent KEEP blocks -- we never produce it, but... */ + while (i && b->type == LEV_EDIT_KEEP) { + i--; + b++; + } + if (!i) + break; + } + } + + /* convert */ + mb = mblocks = (LevMatchingBlock*)safe_malloc(nmb, sizeof(LevOpCode)); + if (!mblocks) { + *nmblocks = (size_t)(-1); + return NULL; + } + b = bops; + for (i = nb; i; i--, b++) { + if (b->type == LEV_EDIT_KEEP) { + mb->spos = b->sbeg; + mb->dpos = b->dbeg; + /* adjacent KEEP blocks -- we never produce it, but... */ + while (i && b->type == LEV_EDIT_KEEP) { + i--; + b++; + } + if (!i) { + mb->len = len1 - mb->spos; + mb++; + break; + } + mb->len = b->sbeg - mb->spos; + mb++; + } + } + assert((size_t)(mb - mblocks) == nmb); + + *nmblocks = nmb; + return mblocks; +} + +/** + * lev_editops_total_cost: + * @n: Tne size of @ops. + * @ops: An array of elementary edit operations. + * + * Computes the total cost of operations in @ops. + * + * The costs of elementary operations are all 1. + * + * Returns: The total cost. + **/ +_LEV_STATIC_PY size_t +lev_editops_total_cost(size_t n, + const LevEditOp *ops) +{ + size_t i, sum = 0; + + for (i = n; i; i--, ops++) + sum += !!ops->type; + + return sum; +} + +/** + * lev_editops_normalize: + * @n: The size of @ops. + * @ops: An array of elementary edit operations. + * @nnorm: Where to store then length of the normalized array. + * + * Normalizes a list of edit operations to contain no keep operations. + * + * Note it returns %NULL for an empty array. + * + * Returns: A newly allocated array of normalized edit operations, its length + * is stored to @nnorm. + **/ +_LEV_STATIC_PY LevEditOp* +lev_editops_normalize(size_t n, + const LevEditOp *ops, + size_t *nnorm) +{ + size_t nx, i; + const LevEditOp *o; + LevEditOp *opsnorm, *on; + + if (!n || !ops) { + *nnorm = 0; + return NULL; + } + + nx = 0; + o = ops; + for (i = n; i; i--, o++) + nx += (o->type == LEV_EDIT_KEEP); + + *nnorm = n - nx; + if (!*nnorm) + return NULL; + + opsnorm = on = (LevEditOp*)safe_malloc((n - nx), sizeof(LevEditOp)); + o = ops; + for (i = n; i; i--, o++) { + if (o->type == LEV_EDIT_KEEP) + continue; + memcpy(on++, o, sizeof(LevEditOp)); + } + + return opsnorm; +} + +/** + * lev_opcodes_total_cost: + * @nb: Tne size of @bops. + * @bops: An array of difflib block operation codes. + * + * Computes the total cost of operations in @bops. + * + * The costs of elementary operations are all 1. + * + * Returns: The total cost. + **/ +_LEV_STATIC_PY size_t +lev_opcodes_total_cost(size_t nb, + const LevOpCode *bops) +{ + size_t i, sum = 0; + + for (i = nb; i; i--, bops++) { + switch (bops->type) { + case LEV_EDIT_REPLACE: + case LEV_EDIT_DELETE: + sum += (bops->send - bops->sbeg); + break; + + case LEV_EDIT_INSERT: + sum += (bops->dend - bops->dbeg); + break; + + default: + break; + } + } + + return sum; +} + +/** + * lev_editops_subtract: + * @n: The size of @ops. + * @ops: An array of elementary edit operations. + * @ns: The size of @sub. 
+ * @sub: A subsequence (ordered subset) of @ops + * @nrem: Where to store then length of the remainder array. + * + * Subtracts a subsequence of elementary edit operations from a sequence. + * + * The remainder is a sequence that, applied to result of application of @sub, + * gives the same final result as application of @ops to original string. + * + * Returns: A newly allocated array of normalized edit operations, its length + * is stored to @nrem. It is always normalized, i.e, without any + * keep operations. On failure, %NULL is returned. + **/ +_LEV_STATIC_PY LevEditOp* +lev_editops_subtract(size_t n, + const LevEditOp *ops, + size_t ns, + const LevEditOp *sub, + size_t *nrem) +{ + static const int shifts[] = { 0, 0, 1, -1 }; + LevEditOp *rem; + size_t i, j, nr, nn; + int shift; + + /* compute remainder size */ + *nrem = -1; + nr = nn = 0; + for (i = 0; i < n; i++) { + if (ops[i].type != LEV_EDIT_KEEP) + nr++; + } + for (i = 0; i < ns; i++) { + if (sub[i].type != LEV_EDIT_KEEP) + nn++; + } + if (nn > nr) + return NULL; + nr -= nn; + + + /* subtract */ + /* we could simply return NULL when nr == 0, but then it would be possible + * to subtract *any* sequence of the right length to get an empty sequence + * -- clrealy incorrectly; so we have to scan the list to check */ + rem = nr ? (LevEditOp*)safe_malloc(nr, sizeof(LevEditOp)) : NULL; + j = nn = shift = 0; + for (i = 0; i < ns; i++) { + while ((ops[j].spos != sub[i].spos + || ops[j].dpos != sub[i].dpos + || ops[j].type != sub[i].type) + && j < n) { + if (ops[j].type != LEV_EDIT_KEEP) { + rem[nn] = ops[j]; + rem[nn].spos += shift; + nn++; + } + j++; + } + if (j == n) { + free(rem); + return NULL; + } + + shift += shifts[sub[i].type]; + j++; + } + + while (j < n) { + if (ops[j].type != LEV_EDIT_KEEP) { + rem[nn] = ops[j]; + rem[nn].spos += shift; + nn++; + } + j++; + } + assert(nn == nr); + + *nrem = nr; + return rem; +} +/* }}} */ + +/**************************************************************************** + * + * Weighted mean strings + * + ****************************************************************************/ + +/* +_LEV_STATIC_PY lev_byte* +lev_weighted_average(size_t len1, + const lev_byte* string1, + size_t len2, + const lev_byte* string2, + size_t n, + const LevEditOp *ops, + LevAveragingType avtype, + size_t total_cost, + double alpha, + size_t *len) +{ + return NULL; +} +*/ +
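+
+/*
+ * Illustrative sketch (not compiled into the module): how the editops API
+ * above might be driven from plain C.  It assumes the declarations are
+ * visible to the caller (e.g. through _levenshtein.h) and that lev_byte is
+ * a byte type; error handling is abbreviated and both returned buffers must
+ * be free()d by the caller.
+ *
+ *     const lev_byte *s1 = (const lev_byte*)"lenvestein";
+ *     const lev_byte *s2 = (const lev_byte*)"levenshtein";
+ *     size_t n, len;
+ *     LevEditOp *ops = lev_editops_find(10, s1, 11, s2, &n);
+ *     if (ops) {
+ *         lev_byte *res = lev_editops_apply(10, s1, 11, s2, n, ops, &len);
+ *         // res now holds the len (== 11) bytes of "levenshtein",
+ *         // without a terminating NUL
+ *         free(res);
+ *         free(ops);
+ *     }
+ */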