diff --git "a/.venv/lib/python3.9/site-packages/Levenshtein/_levenshtein.c" "b/.venv/lib/python3.9/site-packages/Levenshtein/_levenshtein.c" deleted file mode 100644--- "a/.venv/lib/python3.9/site-packages/Levenshtein/_levenshtein.c" +++ /dev/null @@ -1,6853 +0,0 @@ -/* - * Levenshtein.c - * @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $ - * Python extension computing Levenshtein distances, string similarities, - * median strings and other goodies. - * - * Copyright (C) 2002-2003 David Necas (Yeti) . - * - * The Taus113 random generator: - * Copyright (C) 2002 Atakan Gurkan - * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough - * (see below for more) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - **/ - -/** - * TODO: - * - * - Implement weighted string averaging, see: - * H. Bunke et. al.: On the Weighted Mean of a Pair of Strings, - * Pattern Analysis and Applications 2002, 5(1): 23-30. - * X. Jiang et. al.: Dynamic Computations of Generalized Median Strings, - * Pattern Analysis and Applications 2002, ???. - * The latter also contains an interesting median-search algorithm. - * - * - Deal with stray symbols in greedy median() and median_improve(). - * There are two possibilities: - * (i) Remember which strings contain which symbols. This allows certain - * small optimizations when processing them. - * (ii) Use some overall heuristics to find symbols which don't worth - * trying. This is very appealing, but hard to do properly - * (requires some inequality strong enough to allow practical exclusion - * of certain symbols -- at certain positions) - * - * - Editops should be an object that only *looks* like a list (which means - * it is a list in duck typing) to avoid never-ending conversions from - * Python lists to LevEditOp arrays and back - * - * - Optimize munkers_blackman(), it's pretty dumb (no memory of visited - * columns/rows) - * - * - Make it really usable as a C library (needs some wrappers, headers, ..., - * and maybe even documentation ;-) - * - * - Add interface to various interesting auxiliary results, namely - * set and sequence distance (only ratio is exported), the map from - * munkers_blackman() itself, ... - * - * - Generalizations: - * - character weight matrix/function - * - arbitrary edit operation costs, decomposable edit operations - * - * - Create a test suite - * - * - Add more interesting algorithms ;-) - * - * Postponed TODO (investigated, and a big `but' was found): - * - * - A linear approximate set median algorithm: - * P. Indyk: Sublinear time algorithms for metric space problems, - * STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html. - * BUT: The algorithm seems to be advantageous only in the case of very - * large sets -- if my estimates are correct (the article itself is quite - * `asymptotic'), say 10^5 at least. On smaller sets either one would get - * only an extermely rough median estimate, or the number of distance - * computations would be in fact higher than in the dumb O(n^2) algorithm. - * - * - Improve setmedian() speed with triangular inequality, see: - * Juan, A., E. Vidal: An Algorithm for Fast Median Search, - * 1997, http://citeseer.nj.nec.com/article/juan97algorithm.html - * BUT: It doesn't seem to help much in spaces of high dimension (see the - * discussion and graphs in the article itself), a few percents at most, - * and strings behave like a space with a very high dimension (locally), so - * who knows, it probably wouldn't help much. - * - **/ -#ifdef NO_PYTHON -#define _GNU_SOURCE -#include -#include -/* for debugging */ -#include -#else /* NO_PYTHON */ -#define _LEV_STATIC_PY static -#define lev_wchar Py_UNICODE -#include -#endif /* NO_PYTHON */ - -#if PY_MAJOR_VERSION >= 3 -#define LEV_PYTHON3 -#define PyString_Type PyBytes_Type -#define PyString_GET_SIZE PyBytes_GET_SIZE -#define PyString_AS_STRING PyBytes_AS_STRING -#define PyString_Check PyBytes_Check -#define PyString_FromStringAndSize PyBytes_FromStringAndSize -#define PyString_InternFromString PyUnicode_InternFromString -#define PyInt_AS_LONG PyLong_AsLong -#define PyInt_FromLong PyLong_FromLong -#define PyInt_Check PyLong_Check -#define PY_INIT_MOD(module, name, doc, methods) \ - static struct PyModuleDef moduledef = { \ - PyModuleDef_HEAD_INIT, name, doc, -1, methods, }; \ - module = PyModule_Create(&moduledef); - #define PY_MOD_INIT_FUNC_DEF(name) PyObject* PyInit_##name(void) -#else - #define PY_INIT_MOD(module, name, doc, methods) \ - Py_InitModule3(name, methods, doc); - #define PY_MOD_INIT_FUNC_DEF(name) void init##name(void) -#endif /* PY_MAJOR_VERSION */ - -#include -#include "_levenshtein.h" - -/* FIXME: inline avaliability should be solved in setup.py, somehow, or - * even better in Python.h, like const is... - * this should inline at least with gcc and msvc */ -#ifndef __GNUC__ -# ifdef _MSC_VER -# define inline __inline -# else -# define inline /* */ -# endif -# define __attribute__(x) /* */ -#endif - -#define LEV_EPSILON 1e-14 -#define LEV_INFINITY 1e100 - -/* Me thinks the second argument of PyArg_UnpackTuple() should be const. - * Anyway I habitually pass a constant string. - * A cast to placate the compiler. */ -#define PYARGCFIX(x) (char*)(x) - -/* local functions */ -static size_t* -munkers_blackman(size_t n1, - size_t n2, - double *dists); - -#define TAUS113_LCG(n) ((69069UL * n) & 0xffffffffUL) -#define TAUS113_MASK 0xffffffffUL - -typedef struct { - unsigned long int z1, z2, z3, z4; -} taus113_state_t; - -static inline unsigned long int -taus113_get(taus113_state_t *state); - -static void -taus113_set(taus113_state_t *state, - unsigned long int s); - -#ifndef NO_PYTHON -/* python interface and wrappers */ -/* declarations and docstrings {{{ */ -static PyObject* distance_py(PyObject *self, PyObject *args); -static PyObject* ratio_py(PyObject *self, PyObject *args); -static PyObject* hamming_py(PyObject *self, PyObject *args); -static PyObject* jaro_py(PyObject *self, PyObject *args); -static PyObject* jaro_winkler_py(PyObject *self, PyObject *args); -static PyObject* median_py(PyObject *self, PyObject *args); -static PyObject* median_improve_py(PyObject *self, PyObject *args); -static PyObject* quickmedian_py(PyObject *self, PyObject *args); -static PyObject* setmedian_py(PyObject *self, PyObject *args); -static PyObject* seqratio_py(PyObject *self, PyObject *args); -static PyObject* setratio_py(PyObject *self, PyObject *args); -static PyObject* editops_py(PyObject *self, PyObject *args); -static PyObject* opcodes_py(PyObject *self, PyObject *args); -static PyObject* inverse_py(PyObject *self, PyObject *args); -static PyObject* apply_edit_py(PyObject *self, PyObject *args); -static PyObject* matching_blocks_py(PyObject *self, PyObject *args); -static PyObject* subtract_edit_py(PyObject *self, PyObject *args); - -#define Levenshtein_DESC \ - "A C extension module for fast computation of:\n" \ - "- Levenshtein (edit) distance and edit sequence manipulation\n" \ - "- string similarity\n" \ - "- approximate median strings, and generally string averaging\n" \ - "- string sequence and set similarity\n" \ - "\n" \ - "Levenshtein has a some overlap with difflib (SequenceMatcher). It\n" \ - "supports only strings, not arbitrary sequence types, but on the\n" \ - "other hand it's much faster.\n" \ - "\n" \ - "It supports both normal and Unicode strings, but can't mix them, all\n" \ - "arguments to a function (method) have to be of the same type (or its\n" \ - "subclasses).\n" - -#define distance_DESC \ - "Compute absolute Levenshtein distance of two strings.\n" \ - "\n" \ - "distance(string1, string2)\n" \ - "\n" \ - "Examples (it's hard to spell Levenshtein correctly):\n" \ - "\n" \ - ">>> distance('Levenshtein', 'Lenvinsten')\n" \ - "4\n" \ - ">>> distance('Levenshtein', 'Levensthein')\n" \ - "2\n" \ - ">>> distance('Levenshtein', 'Levenshten')\n" \ - "1\n" \ - ">>> distance('Levenshtein', 'Levenshtein')\n" \ - "0\n" \ - "\n" \ - "Yeah, we've managed it at last.\n" - -#define ratio_DESC \ - "Compute similarity of two strings.\n" \ - "\n" \ - "ratio(string1, string2)\n" \ - "\n" \ - "The similarity is a number between 0 and 1, it's usually equal or\n" \ - "somewhat higher than difflib.SequenceMatcher.ratio(), because it's\n" \ - "based on real minimal edit distance.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> ratio('Hello world!', 'Holly grail!') # doctest: +ELLIPSIS\n" \ - "0.583333...\n" \ - ">>> ratio('Brian', 'Jesus')\n" \ - "0.0\n" \ - "\n" \ - "Really? I thought there was some similarity.\n" - -#define hamming_DESC \ - "Compute Hamming distance of two strings.\n" \ - "\n" \ - "hamming(string1, string2)\n" \ - "\n" \ - "The Hamming distance is simply the number of differing characters.\n" \ - "That means the length of the strings must be the same.\n" \ - "\n" \ - "Examples:\n" \ - ">>> hamming('Hello world!', 'Holly grail!')\n" \ - "7\n" \ - ">>> hamming('Brian', 'Jesus')\n" \ - "5\n" - -#define jaro_DESC \ - "Compute Jaro string similarity metric of two strings.\n" \ - "\n" \ - "jaro(string1, string2)\n" \ - "\n" \ - "The Jaro string similarity metric is intended for short strings like\n" \ - "personal last names. It is 0 for completely different strings and\n" \ - "1 for identical strings.\n" \ - "\n" \ - "Examples:\n" \ - ">>> jaro('Brian', 'Jesus')\n" \ - "0.0\n" \ - ">>> jaro('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \ - "0.779761...\n" \ - ">>> jaro('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \ - "0.708333...\n" - -#define jaro_winkler_DESC \ - "Compute Jaro string similarity metric of two strings.\n" \ - "\n" \ - "jaro_winkler(string1, string2[, prefix_weight])\n" \ - "\n" \ - "The Jaro-Winkler string similarity metric is a modification of Jaro\n" \ - "metric giving more weight to common prefix, as spelling mistakes are\n" \ - "more likely to occur near ends of words.\n" \ - "\n" \ - "The prefix weight is inverse value of common prefix length sufficient\n" \ - "to consider the strings *identical*. If no prefix weight is\n" \ - "specified, 1/10 is used.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> jaro_winkler('Brian', 'Jesus')\n" \ - "0.0\n" \ - ">>> jaro_winkler('Thorkel', 'Thorgier') # doctest: +ELLIPSIS\n" \ - "0.867857...\n" \ - ">>> jaro_winkler('Dinsdale', 'D') # doctest: +ELLIPSIS\n" \ - "0.7375...\n" \ - ">>> jaro_winkler('Thorkel', 'Thorgier', 0.25)\n" \ - "1.0\n" - -#define median_DESC \ - "Find an approximate generalized median string using greedy algorithm.\n" \ - "\n" \ - "median(string_sequence[, weight_sequence])\n" \ - "\n" \ - "You can optionally pass a weight for each string as the second\n" \ - "argument. The weights are interpreted as item multiplicities,\n" \ - "although any non-negative real numbers are accepted. Use them to\n" \ - "improve computation speed when strings often appear multiple times\n" \ - "in the sequence.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> median(['SpSm', 'mpamm', 'Spam', 'Spa', 'Sua', 'hSam'])\n" \ - "'Spam'\n" \ - ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ - "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ - "... 'Levenhtin', 'evenshtei']\n" \ - ">>> median(fixme)\n" \ - "'Levenshtein'\n" \ - "\n" \ - "Hm. Even a computer program can spell Levenshtein better than me.\n" - -#define median_improve_DESC \ - "Improve an approximate generalized median string by perturbations.\n" \ - "\n" \ - "median_improve(string, string_sequence[, weight_sequence])\n" \ - "\n" \ - "The first argument is the estimated generalized median string you\n" \ - "want to improve, the others are the same as in median(). It returns\n" \ - "a string with total distance less or equal to that of the given string.\n" \ - "\n" \ - "Note this is much slower than median(). Also note it performs only\n" \ - "one improvement step, calling median_improve() again on the result\n" \ - "may improve it further, though this is unlikely to happen unless the\n" \ - "given string was not very similar to the actual generalized median.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ - "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ - "... 'Levenhtin', 'evenshtei']\n" \ - ">>> median_improve('spam', fixme)\n" \ - "'enhtein'\n" \ - ">>> median_improve(median_improve('spam', fixme), fixme)\n" \ - "'Levenshtein'\n" \ - "\n" \ - "It takes some work to change spam to Levenshtein.\n" - -#define quickmedian_DESC \ - "Find a very approximate generalized median string, but fast.\n" \ - "\n" \ - "quickmedian(string[, weight_sequence])\n" \ - "\n" \ - "See median() for argument description.\n" \ - "\n" \ - "This method is somewhere between setmedian() and picking a random\n" \ - "string from the set; both speedwise and quality-wise.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',\n" \ - "... 'Leveshtei', 'Lenshtein', 'Lvenstein',\n" \ - "... 'Levenhtin', 'evenshtei']\n" \ - ">>> quickmedian(fixme)\n" \ - "'Levnshein'\n" - -#define setmedian_DESC \ - "Find set median of a string set (passed as a sequence).\n" \ - "\n" \ - "setmedian(string[, weight_sequence])\n" \ - "\n" \ - "See median() for argument description.\n" \ - "\n" \ - "The returned string is always one of the strings in the sequence.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> setmedian(['ehee', 'cceaes', 'chees', 'chreesc',\n" \ - "... 'chees', 'cheesee', 'cseese', 'chetese'])\n" \ - "'chees'\n" \ - "\n" \ - "You haven't asked me about Limburger, sir.\n" - -#define seqratio_DESC \ - "Compute similarity ratio of two sequences of strings.\n" \ - "\n" \ - "seqratio(string_sequence1, string_sequence2)\n" \ - "\n" \ - "This is like ratio(), but for string sequences. A kind of ratio()\n" \ - "is used to to measure the cost of item change operation for the\n" \ - "strings.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> seqratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \ - "... ['caribou', 'sausage', 'gorn', 'woody'])\n" \ - "0.21517857142857144\n" - -#define setratio_DESC \ - "Compute similarity ratio of two strings sets (passed as sequences).\n" \ - "\n" \ - "setratio(string_sequence1, string_sequence2)\n" \ - "\n" \ - "The best match between any strings in the first set and the second\n" \ - "set (passed as sequences) is attempted. I.e., the order doesn't\n" \ - "matter here.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> setratio(['newspaper', 'litter bin', 'tinny', 'antelope'],\n" \ - "... ['caribou', 'sausage', 'gorn', 'woody']) # doctest: +ELLIPSIS\n" \ - "0.281845...\n" \ - "\n" \ - "No, even reordering doesn't help the tinny words to match the\n" \ - "woody ones.\n" - -#define editops_DESC \ - "Find sequence of edit operations transforming one string to another.\n" \ - "\n" \ - "editops(source_string, destination_string)\n" \ - "editops(edit_operations, source_length, destination_length)\n" \ - "\n" \ - "The result is a list of triples (operation, spos, dpos), where\n" \ - "operation is one of 'equal', 'replace', 'insert', or 'delete'; spos\n" \ - "and dpos are position of characters in the first (source) and the\n" \ - "second (destination) strings. These are operations on signle\n" \ - "characters. In fact the returned list doesn't contain the 'equal',\n" \ - "but all the related functions accept both lists with and without\n" \ - "'equal's.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> editops('spam', 'park')\n" \ - "[('delete', 0, 0), ('insert', 3, 2), ('replace', 3, 3)]\n" \ - "\n" \ - "The alternate form editops(opcodes, source_string, destination_string)\n" \ - "can be used for conversion from opcodes (5-tuples) to editops (you can\n" \ - "pass strings or their lengths, it doesn't matter).\n" - -#define opcodes_DESC \ - "Find sequence of edit operations transforming one string to another.\n" \ - "\n" \ - "opcodes(source_string, destination_string)\n" \ - "opcodes(edit_operations, source_length, destination_length)\n" \ - "\n" \ - "The result is a list of 5-tuples with the same meaning as in\n" \ - "SequenceMatcher's get_opcodes() output. But since the algorithms\n" \ - "differ, the actual sequences from Levenshtein and SequenceMatcher\n" \ - "may differ too.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> for x in opcodes('spam', 'park'):\n" \ - "... print(x)\n" \ - "...\n" \ - "('delete', 0, 1, 0, 0)\n" \ - "('equal', 1, 3, 0, 2)\n" \ - "('insert', 3, 3, 2, 3)\n" \ - "('replace', 3, 4, 3, 4)\n" \ - "\n" \ - "The alternate form opcodes(editops, source_string, destination_string)\n" \ - "can be used for conversion from editops (triples) to opcodes (you can\n" \ - "pass strings or their lengths, it doesn't matter).\n" - -#define inverse_DESC \ - "Invert the sense of an edit operation sequence.\n" \ - "\n" \ - "inverse(edit_operations)\n" \ - "\n" \ - "In other words, it returns a list of edit operations transforming the\n" \ - "second (destination) string to the first (source). It can be used\n" \ - "with both editops and opcodes.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> inverse(editops('spam', 'park'))\n" \ - "[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" \ - ">>> editops('park', 'spam')\n" \ - "[('insert', 0, 0), ('delete', 2, 3), ('replace', 3, 3)]\n" - -#define apply_edit_DESC \ - "Apply a sequence of edit operations to a string.\n" \ - "\n" \ - "apply_edit(edit_operations, source_string, destination_string)\n" \ - "\n" \ - "In the case of editops, the sequence can be arbitrary ordered subset\n" \ - "of the edit sequence transforming source_string to destination_string.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> e = editops('man', 'scotsman')\n" \ - ">>> apply_edit(e, 'man', 'scotsman')\n" \ - "'scotsman'\n" \ - ">>> apply_edit(e[:3], 'man', 'scotsman')\n" \ - "'scoman'\n" \ - "\n" \ - "The other form of edit operations, opcodes, is not very suitable for\n" \ - "such a tricks, because it has to always span over complete strings,\n" \ - "subsets can be created by carefully replacing blocks with 'equal'\n" \ - "blocks, or by enlarging 'equal' block at the expense of other blocks\n" \ - "and adjusting the other blocks accordingly.\n" \ - "\n" \ - "Examples:\n" \ - ">>> a, b = 'spam and eggs', 'foo and bar'\n" \ - ">>> e = opcodes(a, b)\n" \ - ">>> apply_edit(inverse(e), b, a)\n" \ - "'spam and eggs'\n" \ - ">>> e[4] = ('equal', 10, 13, 8, 11)\n" \ - ">>> e\n" \ - "[('delete', 0, 1, 0, 0), ('replace', 1, 4, 0, 3), ('equal', 4, 9, 3, 8), ('delete', 9, 10, 8, 8), ('equal', 10, 13, 8, 11)]\n" \ - ">>> apply_edit(e, a, b)\n" \ - "'foo and ggs'\n" - -#define matching_blocks_DESC \ - "Find identical blocks in two strings.\n" \ - "\n" \ - "matching_blocks(edit_operations, source_length, destination_length)\n" \ - "\n" \ - "The result is a list of triples with the same meaning as in\n" \ - "SequenceMatcher's get_matching_blocks() output. It can be used with\n" \ - "both editops and opcodes. The second and third arguments don't\n" \ - "have to be actually strings, their lengths are enough.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> a, b = 'spam', 'park'\n" \ - ">>> matching_blocks(editops(a, b), a, b)\n" \ - "[(1, 0, 2), (4, 4, 0)]\n" \ - ">>> matching_blocks(editops(a, b), len(a), len(b))\n" \ - "[(1, 0, 2), (4, 4, 0)]\n" \ - "\n" \ - "The last zero-length block is not an error, but it's there for\n" \ - "compatibility with difflib which always emits it.\n" \ - "\n" \ - "One can join the matching blocks to get two identical strings:\n" \ - ">>> a, b = 'dog kennels', 'mattresses'\n" \ - ">>> mb = matching_blocks(editops(a,b), a, b)\n" \ - ">>> ''.join([a[x[0]:x[0]+x[2]] for x in mb])\n" \ - "'ees'\n" \ - ">>> ''.join([b[x[1]:x[1]+x[2]] for x in mb])\n" \ - "'ees'\n" - -#define subtract_edit_DESC \ - "Subtract an edit subsequence from a sequence.\n" \ - "\n" \ - "subtract_edit(edit_operations, subsequence)\n" \ - "\n" \ - "The result is equivalent to\n" \ - "editops(apply_edit(subsequence, s1, s2), s2), except that is\n" \ - "constructed directly from the edit operations. That is, if you apply\n" \ - "it to the result of subsequence application, you get the same final\n" \ - "string as from application complete edit_operations. It may be not\n" \ - "identical, though (in amibuous cases, like insertion of a character\n" \ - "next to the same character).\n" \ - "\n" \ - "The subtracted subsequence must be an ordered subset of\n" \ - "edit_operations.\n" \ - "\n" \ - "Note this function does not accept difflib-style opcodes as no one in\n" \ - "his right mind wants to create subsequences from them.\n" \ - "\n" \ - "Examples:\n" \ - "\n" \ - ">>> e = editops('man', 'scotsman')\n" \ - ">>> e1 = e[:3]\n" \ - ">>> bastard = apply_edit(e1, 'man', 'scotsman')\n" \ - ">>> bastard\n" \ - "'scoman'\n" \ - ">>> apply_edit(subtract_edit(e, e1), bastard, 'scotsman')\n" \ - "'scotsman'\n" \ - -#define METHODS_ITEM(x) { #x, x##_py, METH_VARARGS, x##_DESC } -static PyMethodDef methods[] = { - METHODS_ITEM(distance), - METHODS_ITEM(ratio), - METHODS_ITEM(hamming), - METHODS_ITEM(jaro), - METHODS_ITEM(jaro_winkler), - METHODS_ITEM(median), - METHODS_ITEM(median_improve), - METHODS_ITEM(quickmedian), - METHODS_ITEM(setmedian), - METHODS_ITEM(seqratio), - METHODS_ITEM(setratio), - METHODS_ITEM(editops), - METHODS_ITEM(opcodes), - METHODS_ITEM(inverse), - METHODS_ITEM(apply_edit), - METHODS_ITEM(matching_blocks), - METHODS_ITEM(subtract_edit), - { NULL, NULL, 0, NULL }, -}; - -/* opcode names, these are to be initialized in the init func, - * indexed by LevEditType values */ -struct { - PyObject* pystring; - const char *cstring; - size_t len; -} -static opcode_names[] = { - { NULL, "equal", 0 }, - { NULL, "replace", 0 }, - { NULL, "insert", 0 }, - { NULL, "delete", 0 }, -}; -#define N_OPCODE_NAMES ((sizeof(opcode_names)/sizeof(opcode_names[0]))) - -typedef lev_byte *(*MedianFuncString)(size_t n, - const size_t *lengths, - const lev_byte *strings[], - const double *weights, - size_t *medlength); -typedef Py_UNICODE *(*MedianFuncUnicode)(size_t n, - const size_t *lengths, - const Py_UNICODE *strings[], - const double *weights, - size_t *medlength); -typedef struct { - MedianFuncString s; - MedianFuncUnicode u; -} MedianFuncs; - -typedef lev_byte *(*MedianImproveFuncString)(size_t len, const lev_byte *s, - size_t n, - const size_t *lengths, - const lev_byte *strings[], - const double *weights, - size_t *medlength); -typedef Py_UNICODE *(*MedianImproveFuncUnicode)(size_t len, const Py_UNICODE *s, - size_t n, - const size_t *lengths, - const Py_UNICODE *strings[], - const double *weights, - size_t *medlength); -typedef struct { - MedianImproveFuncString s; - MedianImproveFuncUnicode u; -} MedianImproveFuncs; - -typedef double (*SetSeqFuncString)(size_t n1, - const size_t *lengths1, - const lev_byte *strings1[], - size_t n2, - const size_t *lengths2, - const lev_byte *strings2[]); -typedef double (*SetSeqFuncUnicode)(size_t n1, - const size_t *lengths1, - const Py_UNICODE *strings1[], - size_t n2, - const size_t *lengths2, - const Py_UNICODE *strings2[]); - -typedef struct { - SetSeqFuncString s; - SetSeqFuncUnicode u; -} SetSeqFuncs; - -static long int -levenshtein_common(PyObject *args, - const char *name, - size_t xcost, - size_t *lensum); - -static int -extract_stringlist(PyObject *list, - const char *name, - size_t n, - size_t **sizelist, - void *strlist); - -static double* -extract_weightlist(PyObject *wlist, - const char *name, - size_t n); - -static PyObject* -median_common(PyObject *args, - const char *name, - MedianFuncs foo); - -static PyObject* -median_improve_common(PyObject *args, - const char *name, - MedianImproveFuncs foo); - -static double -setseq_common(PyObject *args, - const char *name, - SetSeqFuncs foo, - size_t *lensum); - -static void * -safe_malloc(size_t nmemb, size_t size) { - /* extra-conservative overflow check */ - if (SIZE_MAX / size <= nmemb) { - return NULL; - } - return malloc(nmemb * size); -} - -static void * -safe_malloc_3(size_t nmemb1, size_t nmemb2, size_t size) { - /* extra-conservative overflow check */ - if (SIZE_MAX / size <= nmemb2) { - return NULL; - } - return safe_malloc(nmemb1, nmemb2 * size); -} - -/* }}} */ - -/**************************************************************************** - * - * Python interface and subroutines - * - ****************************************************************************/ -/* {{{ */ - -static long int -levenshtein_common(PyObject *args, const char *name, size_t xcost, - size_t *lensum) -{ - PyObject *arg1, *arg2; - size_t len1, len2; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) - return -1; - - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2; - - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - *lensum = len1 + len2; - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - { - size_t d = lev_edit_distance(len1, string1, len2, string2, xcost); - if (d == (size_t)(-1)) { - PyErr_NoMemory(); - return -1; - } - return d; - } - } - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2; - - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - *lensum = len1 + len2; - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - { - size_t d = lev_u_edit_distance(len1, string1, len2, string2, xcost); - if (d == (size_t)(-1)) { - PyErr_NoMemory(); - return -1; - } - return d; - } - } - else { - PyErr_Format(PyExc_TypeError, - "%s expected two Strings or two Unicodes", name); - return -1; - } -} - -static PyObject* -distance_py(PyObject *self, PyObject *args) -{ - size_t lensum; - long int ldist; - - if ((ldist = levenshtein_common(args, "distance", 0, &lensum)) < 0) - return NULL; - - return PyInt_FromLong((long)ldist); -} - -static PyObject* -ratio_py(PyObject *self, PyObject *args) -{ - size_t lensum; - long int ldist; - - if ((ldist = levenshtein_common(args, "ratio", 1, &lensum)) < 0) - return NULL; - - if (lensum == 0) - return PyFloat_FromDouble(1.0); - - return PyFloat_FromDouble((double)(lensum - ldist)/(lensum)); -} - -static PyObject* -hamming_py(PyObject *self, PyObject *args) -{ - PyObject *arg1, *arg2; - const char *name = "hamming"; - size_t len1, len2; - long int dist; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) - return NULL; - - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2; - - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - if (len1 != len2) { - PyErr_Format(PyExc_ValueError, - "%s expected two strings of the same length", name); - return NULL; - } - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - dist = lev_hamming_distance(len1, string1, string2); - return PyInt_FromLong(dist); - } - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2; - - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - if (len1 != len2) { - PyErr_Format(PyExc_ValueError, - "%s expected two unicodes of the same length", name); - return NULL; - } - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - dist = lev_u_hamming_distance(len1, string1, string2); - return PyInt_FromLong(dist); - } - else { - PyErr_Format(PyExc_TypeError, - "%s expected two Strings or two Unicodes", name); - return NULL; - } -} - -static PyObject* -jaro_py(PyObject *self, PyObject *args) -{ - PyObject *arg1, *arg2; - const char *name = "jaro"; - size_t len1, len2; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &arg1, &arg2)) - return NULL; - - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2; - - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - return PyFloat_FromDouble(lev_jaro_ratio(len1, string1, len2, string2)); - } - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2; - - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - return PyFloat_FromDouble(lev_u_jaro_ratio(len1, string1, len2, string2)); - } - else { - PyErr_Format(PyExc_TypeError, - "%s expected two Strings or two Unicodes", name); - return NULL; - } -} - -static PyObject* -jaro_winkler_py(PyObject *self, PyObject *args) -{ - PyObject *arg1, *arg2, *arg3 = NULL; - double pfweight = 0.1; - const char *name = "jaro_winkler"; - size_t len1, len2; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &arg2, &arg3)) - return NULL; - - if (arg3) { - if (!PyObject_TypeCheck(arg3, &PyFloat_Type)) { - PyErr_Format(PyExc_TypeError, "%s third argument must be a Float", name); - return NULL; - } - pfweight = PyFloat_AS_DOUBLE(arg3); - if (pfweight < 0.0) { - PyErr_Format(PyExc_ValueError, "%s negative prefix weight", name); - return NULL; - } - } - - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2; - - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - return PyFloat_FromDouble(lev_jaro_winkler_ratio(len1, string1, - len2, string2, - pfweight)); - } - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2; - - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - return PyFloat_FromDouble(lev_u_jaro_winkler_ratio(len1, string1, - len2, string2, - pfweight)); - } - else { - PyErr_Format(PyExc_TypeError, - "%s expected two Strings or two Unicodes", name); - return NULL; - } -} - -static PyObject* -median_py(PyObject *self, PyObject *args) -{ - MedianFuncs engines = { lev_greedy_median, lev_u_greedy_median }; - return median_common(args, "median", engines); -} - -static PyObject* -median_improve_py(PyObject *self, PyObject *args) -{ - MedianImproveFuncs engines = { lev_median_improve, lev_u_median_improve }; - return median_improve_common(args, "median_improve", engines); -} - -static PyObject* -quickmedian_py(PyObject *self, PyObject *args) -{ - MedianFuncs engines = { lev_quick_median, lev_u_quick_median }; - return median_common(args, "quickmedian", engines); -} - -static PyObject* -setmedian_py(PyObject *self, PyObject *args) -{ - MedianFuncs engines = { lev_set_median, lev_u_set_median }; - return median_common(args, "setmedian", engines); -} - -static PyObject* -median_common(PyObject *args, const char *name, MedianFuncs foo) -{ - size_t n, len; - void *strings = NULL; - size_t *sizes = NULL; - PyObject *strlist = NULL; - PyObject *wlist = NULL; - PyObject *strseq = NULL; - double *weights; - int stringtype; - PyObject *result = NULL; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 1, 2, &strlist, &wlist)) - return NULL; - - if (!PySequence_Check(strlist)) { - PyErr_Format(PyExc_TypeError, - "%s first argument must be a Sequence", name); - return NULL; - } - strseq = PySequence_Fast(strlist, name); - - n = PySequence_Fast_GET_SIZE(strseq); - if (n == 0) { - Py_INCREF(Py_None); - Py_DECREF(strseq); - return Py_None; - } - - /* get (optional) weights, use 1 if none specified. */ - weights = extract_weightlist(wlist, name, n); - if (!weights) { - Py_DECREF(strseq); - return NULL; - } - - stringtype = extract_stringlist(strseq, name, n, &sizes, &strings); - Py_DECREF(strseq); - if (stringtype < 0) { - free(weights); - return NULL; - } - - if (stringtype == 0) { - lev_byte *medstr = foo.s(n, sizes, strings, weights, &len); - if (!medstr && len) - result = PyErr_NoMemory(); - else { - result = PyString_FromStringAndSize(medstr, len); - free(medstr); - } - } - else if (stringtype == 1) { - Py_UNICODE *medstr = foo.u(n, sizes, strings, weights, &len); - if (!medstr && len) - result = PyErr_NoMemory(); - else { - result = PyUnicode_FromUnicode(medstr, len); - free(medstr); - } - } - else - PyErr_Format(PyExc_SystemError, "%s internal error", name); - - free(strings); - free(weights); - free(sizes); - return result; -} - -static PyObject* -median_improve_common(PyObject *args, const char *name, MedianImproveFuncs foo) -{ - size_t n, len; - void *strings = NULL; - size_t *sizes = NULL; - PyObject *arg1 = NULL; - PyObject *strlist = NULL; - PyObject *wlist = NULL; - PyObject *strseq = NULL; - double *weights; - int stringtype; - PyObject *result = NULL; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 3, &arg1, &strlist, &wlist)) - return NULL; - - if (PyObject_TypeCheck(arg1, &PyString_Type)) - stringtype = 0; - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type)) - stringtype = 1; - else { - PyErr_Format(PyExc_TypeError, - "%s first argument must be a String or Unicode", name); - return NULL; - } - - if (!PySequence_Check(strlist)) { - PyErr_Format(PyExc_TypeError, - "%s second argument must be a Sequence", name); - return NULL; - } - strseq = PySequence_Fast(strlist, name); - - n = PySequence_Fast_GET_SIZE(strseq); - if (n == 0) { - Py_INCREF(Py_None); - Py_DECREF(strseq); - return Py_None; - } - - /* get (optional) weights, use 1 if none specified. */ - weights = extract_weightlist(wlist, name, n); - if (!weights) { - Py_DECREF(strseq); - return NULL; - } - - if (extract_stringlist(strseq, name, n, &sizes, &strings) != stringtype) { - PyErr_Format(PyExc_TypeError, - "%s argument types don't match", name); - free(weights); - return NULL; - } - - Py_DECREF(strseq); - if (stringtype == 0) { - lev_byte *s = PyString_AS_STRING(arg1); - size_t l = PyString_GET_SIZE(arg1); - lev_byte *medstr = foo.s(l, s, n, sizes, strings, weights, &len); - if (!medstr && len) - result = PyErr_NoMemory(); - else { - result = PyString_FromStringAndSize(medstr, len); - free(medstr); - } - } - else if (stringtype == 1) { - Py_UNICODE *s = PyUnicode_AS_UNICODE(arg1); - size_t l = PyUnicode_GET_SIZE(arg1); - Py_UNICODE *medstr = foo.u(l, s, n, sizes, strings, weights, &len); - if (!medstr && len) - result = PyErr_NoMemory(); - else { - result = PyUnicode_FromUnicode(medstr, len); - free(medstr); - } - } - else - PyErr_Format(PyExc_SystemError, "%s internal error", name); - - free(strings); - free(weights); - free(sizes); - return result; -} - -static double* -extract_weightlist(PyObject *wlist, const char *name, size_t n) -{ - size_t i; - double *weights = NULL; - PyObject *seq; - - if (wlist) { - if (!PySequence_Check(wlist)) { - PyErr_Format(PyExc_TypeError, - "%s second argument must be a Sequence", name); - return NULL; - } - seq = PySequence_Fast(wlist, name); - if (PySequence_Fast_GET_SIZE(wlist) != n) { - PyErr_Format(PyExc_ValueError, - "%s got %i strings but %i weights", - name, n, PyList_GET_SIZE(wlist)); - Py_DECREF(seq); - return NULL; - } - weights = (double*)safe_malloc(n, sizeof(double)); - if (!weights) - return (double*)PyErr_NoMemory(); - for (i = 0; i < n; i++) { - PyObject *item = PySequence_Fast_GET_ITEM(wlist, i); - PyObject *number = PyNumber_Float(item); - - if (!number) { - free(weights); - PyErr_Format(PyExc_TypeError, - "%s weight #%i is not a Number", name, i); - Py_DECREF(seq); - return NULL; - } - weights[i] = PyFloat_AS_DOUBLE(number); - Py_DECREF(number); - if (weights[i] < 0) { - free(weights); - PyErr_Format(PyExc_ValueError, - "%s weight #%i is negative", name, i); - Py_DECREF(seq); - return NULL; - } - } - Py_DECREF(seq); - } - else { - weights = (double*)safe_malloc(n, sizeof(double)); - if (!weights) - return (double*)PyErr_NoMemory(); - for (i = 0; i < n; i++) - weights[i] = 1.0; - } - - return weights; -} - -/* extract a list of strings or unicode strings, returns - * 0 -- strings - * 1 -- unicode strings - * <0 -- failure - */ -static int -extract_stringlist(PyObject *list, const char *name, - size_t n, size_t **sizelist, void *strlist) -{ - size_t i; - PyObject *first; - - /* check first object type. when it's a string then all others must be - * strings too; when it's a unicode string then all others must be unicode - * strings too. */ - first = PySequence_Fast_GET_ITEM(list, 0); - /* i don't exactly understand why the problem doesn't exhibit itself earlier - * but a queer error message is better than a segfault :o) */ - if (first == (PyObject*)-1) { - PyErr_Format(PyExc_TypeError, - "%s undecomposable Sequence???", name); - return -1; - } - - if (PyObject_TypeCheck(first, &PyString_Type)) { - lev_byte **strings; - size_t *sizes; - - strings = (lev_byte**)safe_malloc(n, sizeof(lev_byte*)); - if (!strings) { - PyErr_Format(PyExc_MemoryError, - "%s cannot allocate memory", name); - return -1; - } - sizes = (size_t*)safe_malloc(n, sizeof(size_t)); - if (!sizes) { - free(strings); - PyErr_Format(PyExc_MemoryError, - "%s cannot allocate memory", name); - return -1; - } - - strings[0] = PyString_AS_STRING(first); - sizes[0] = PyString_GET_SIZE(first); - for (i = 1; i < n; i++) { - PyObject *item = PySequence_Fast_GET_ITEM(list, i); - - if (!PyObject_TypeCheck(item, &PyString_Type)) { - free(strings); - free(sizes); - PyErr_Format(PyExc_TypeError, - "%s item #%i is not a String", name, i); - return -1; - } - strings[i] = PyString_AS_STRING(item); - sizes[i] = PyString_GET_SIZE(item); - } - - *(lev_byte***)strlist = strings; - *sizelist = sizes; - return 0; - } - if (PyObject_TypeCheck(first, &PyUnicode_Type)) { - Py_UNICODE **strings; - size_t *sizes; - - strings = (Py_UNICODE**)safe_malloc(n, sizeof(Py_UNICODE*)); - if (!strings) { - PyErr_NoMemory(); - return -1; - } - sizes = (size_t*)safe_malloc(n, sizeof(size_t)); - if (!sizes) { - free(strings); - PyErr_NoMemory(); - return -1; - } - - strings[0] = PyUnicode_AS_UNICODE(first); - sizes[0] = PyUnicode_GET_SIZE(first); - for (i = 1; i < n; i++) { - PyObject *item = PySequence_Fast_GET_ITEM(list, i); - - if (!PyObject_TypeCheck(item, &PyUnicode_Type)) { - free(strings); - free(sizes); - PyErr_Format(PyExc_TypeError, - "%s item #%i is not a Unicode", name, i); - return -1; - } - strings[i] = PyUnicode_AS_UNICODE(item); - sizes[i] = PyUnicode_GET_SIZE(item); - } - - *(Py_UNICODE***)strlist = strings; - *sizelist = sizes; - return 1; - } - - PyErr_Format(PyExc_TypeError, - "%s expected list of Strings or Unicodes", name); - return -1; -} - -static PyObject* -seqratio_py(PyObject *self, PyObject *args) -{ - SetSeqFuncs engines = { lev_edit_seq_distance, lev_u_edit_seq_distance }; - size_t lensum; - double r = setseq_common(args, "seqratio", engines, &lensum); - if (r < 0) - return NULL; - if (lensum == 0) - return PyFloat_FromDouble(1.0); - return PyFloat_FromDouble((lensum - r)/lensum); -} - -static PyObject* -setratio_py(PyObject *self, PyObject *args) -{ - SetSeqFuncs engines = { lev_set_distance, lev_u_set_distance }; - size_t lensum; - double r = setseq_common(args, "setratio", engines, &lensum); - if (r < 0) - return NULL; - if (lensum == 0) - return PyFloat_FromDouble(1.0); - return PyFloat_FromDouble((lensum - r)/lensum); -} - -static double -setseq_common(PyObject *args, const char *name, SetSeqFuncs foo, - size_t *lensum) -{ - size_t n1, n2; - void *strings1 = NULL; - void *strings2 = NULL; - size_t *sizes1 = NULL; - size_t *sizes2 = NULL; - PyObject *strlist1; - PyObject *strlist2; - PyObject *strseq1; - PyObject *strseq2; - int stringtype1, stringtype2; - double r = -1.0; - - if (!PyArg_UnpackTuple(args, PYARGCFIX(name), 2, 2, &strlist1, &strlist2)) - return r; - - if (!PySequence_Check(strlist1)) { - PyErr_Format(PyExc_TypeError, - "%s first argument must be a Sequence", name); - return r; - } - if (!PySequence_Check(strlist2)) { - PyErr_Format(PyExc_TypeError, - "%s second argument must be a Sequence", name); - return r; - } - - strseq1 = PySequence_Fast(strlist1, name); - strseq2 = PySequence_Fast(strlist2, name); - - n1 = PySequence_Fast_GET_SIZE(strseq1); - n2 = PySequence_Fast_GET_SIZE(strseq2); - *lensum = n1 + n2; - if (n1 == 0) { - Py_DECREF(strseq1); - Py_DECREF(strseq2); - return (double)n2; - } - if (n2 == 0) { - Py_DECREF(strseq1); - Py_DECREF(strseq2); - return (double)n1; - } - - stringtype1 = extract_stringlist(strseq1, name, n1, &sizes1, &strings1); - Py_DECREF(strseq1); - if (stringtype1 < 0) { - Py_DECREF(strseq2); - return r; - } - stringtype2 = extract_stringlist(strseq2, name, n2, &sizes2, &strings2); - Py_DECREF(strseq2); - if (stringtype2 < 0) { - free(sizes1); - free(strings1); - return r; - } - - if (stringtype1 != stringtype2) { - PyErr_Format(PyExc_TypeError, - "%s both sequences must consist of items of the same type", - name); - } - else if (stringtype1 == 0) { - r = foo.s(n1, sizes1, strings1, n2, sizes2, strings2); - if (r < 0.0) - PyErr_NoMemory(); - } - else if (stringtype1 == 1) { - r = foo.u(n1, sizes1, strings1, n2, sizes2, strings2); - if (r < 0.0) - PyErr_NoMemory(); - } - else - PyErr_Format(PyExc_SystemError, "%s internal error", name); - - free(strings1); - free(strings2); - free(sizes1); - free(sizes2); - return r; -} - -static inline LevEditType -string_to_edittype(PyObject *string) -{ - const char *s; - size_t i, len; - - for (i = 0; i < N_OPCODE_NAMES; i++) { - if (string == opcode_names[i].pystring) - return i; - } - - /* With Python >= 2.2, we shouldn't get here, except when the strings are - * not Strings but subtypes. */ -#ifdef LEV_PYTHON3 - /* For Python 3, the string is an unicode object; use CompareWithAsciiString */ - if (!PyUnicode_Check(string)) { - return LEV_EDIT_LAST; - } - - for (i = 0; i < N_OPCODE_NAMES; i++) { - if (PyUnicode_CompareWithASCIIString(string, opcode_names[i].cstring) == 0) { - return i; - } - } - -#else - - if (!PyString_Check(string)) - return LEV_EDIT_LAST; - - s = PyString_AS_STRING(string); - len = PyString_GET_SIZE(string); - for (i = 0; i < N_OPCODE_NAMES; i++) { - if (len == opcode_names[i].len - && memcmp(s, opcode_names[i].cstring, len) == 0) { - return i; - } - } -#endif - - return LEV_EDIT_LAST; -} - -static LevEditOp* -extract_editops(PyObject *list) -{ - LevEditOp *ops; - size_t i; - LevEditType type; - size_t n = PyList_GET_SIZE(list); - - ops = (LevEditOp*)safe_malloc(n, sizeof(LevEditOp)); - if (!ops) - return (LevEditOp*)PyErr_NoMemory(); - for (i = 0; i < n; i++) { - PyObject *item; - PyObject *tuple = PyList_GET_ITEM(list, i); - - if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 3) { - free(ops); - return NULL; - } - item = PyTuple_GET_ITEM(tuple, 0); - if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) { - free(ops); - return NULL; - } - ops[i].type = type; - item = PyTuple_GET_ITEM(tuple, 1); - if (!PyInt_Check(item)) { - free(ops); - return NULL; - } - ops[i].spos = (size_t)PyInt_AS_LONG(item); - item = PyTuple_GET_ITEM(tuple, 2); - if (!PyInt_Check(item)) { - free(ops); - return NULL; - } - ops[i].dpos = (size_t)PyInt_AS_LONG(item); - } - return ops; -} - -static LevOpCode* -extract_opcodes(PyObject *list) -{ - LevOpCode *bops; - size_t i; - LevEditType type; - size_t nb = PyList_GET_SIZE(list); - - bops = (LevOpCode*)safe_malloc(nb, sizeof(LevOpCode)); - if (!bops) - return (LevOpCode*)PyErr_NoMemory(); - for (i = 0; i < nb; i++) { - PyObject *item; - PyObject *tuple = PyList_GET_ITEM(list, i); - - if (!PyTuple_Check(tuple) || PyTuple_GET_SIZE(tuple) != 5) { - free(bops); - return NULL; - } - - item = PyTuple_GET_ITEM(tuple, 0); - if ((type = string_to_edittype(item)) == LEV_EDIT_LAST) { - free(bops); - return NULL; - } - bops[i].type = type; - - item = PyTuple_GET_ITEM(tuple, 1); - if (!PyInt_Check(item)) { - free(bops); - return NULL; - } - bops[i].sbeg = (size_t)PyInt_AS_LONG(item); - - item = PyTuple_GET_ITEM(tuple, 2); - if (!PyInt_Check(item)) { - free(bops); - return NULL; - } - bops[i].send = (size_t)PyInt_AS_LONG(item); - - item = PyTuple_GET_ITEM(tuple, 3); - if (!PyInt_Check(item)) { - free(bops); - return NULL; - } - bops[i].dbeg = (size_t)PyInt_AS_LONG(item); - - item = PyTuple_GET_ITEM(tuple, 4); - if (!PyInt_Check(item)) { - free(bops); - return NULL; - } - bops[i].dend = (size_t)PyInt_AS_LONG(item); - } - return bops; -} - -static PyObject* -editops_to_tuple_list(size_t n, LevEditOp *ops) -{ - PyObject *list; - size_t i; - - list = PyList_New(n); - for (i = 0; i < n; i++, ops++) { - PyObject *tuple = PyTuple_New(3); - PyObject *is = opcode_names[ops->type].pystring; - Py_INCREF(is); - PyTuple_SET_ITEM(tuple, 0, is); - PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)ops->spos)); - PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)ops->dpos)); - PyList_SET_ITEM(list, i, tuple); - } - - return list; -} - -static PyObject* -matching_blocks_to_tuple_list(size_t len1, size_t len2, - size_t nmb, LevMatchingBlock *mblocks) -{ - PyObject *list, *tuple; - size_t i; - - list = PyList_New(nmb + 1); - for (i = 0; i < nmb; i++, mblocks++) { - tuple = PyTuple_New(3); - PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)mblocks->spos)); - PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)mblocks->dpos)); - PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)mblocks->len)); - PyList_SET_ITEM(list, i, tuple); - } - tuple = PyTuple_New(3); - PyTuple_SET_ITEM(tuple, 0, PyInt_FromLong((long)len1)); - PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)len2)); - PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)0)); - PyList_SET_ITEM(list, nmb, tuple); - - return list; -} - -static size_t -get_length_of_anything(PyObject *object) -{ - if (PyInt_Check(object)) { - long int len = PyInt_AS_LONG(object); - if (len < 0) - len = -1; - return (size_t)len; - } - if (PySequence_Check(object)) - return PySequence_Length(object); - return (size_t)-1; -} - -static PyObject* -editops_py(PyObject *self, PyObject *args) -{ - PyObject *arg1, *arg2, *arg3 = NULL; - PyObject *oplist; - size_t len1, len2, n; - LevEditOp *ops; - LevOpCode *bops; - - if (!PyArg_UnpackTuple(args, PYARGCFIX("editops"), 2, 3, - &arg1, &arg2, &arg3)) { - return NULL; - } - - /* convert: we were called (bops, s1, s2) */ - if (arg3) { - if (!PyList_Check(arg1)) { - PyErr_Format(PyExc_ValueError, - "editops first argument must be a List of edit operations"); - return NULL; - } - n = PyList_GET_SIZE(arg1); - if (!n) { - Py_INCREF(arg1); - return arg1; - } - len1 = get_length_of_anything(arg2); - len2 = get_length_of_anything(arg3); - if (len1 == (size_t)-1 || len2 == (size_t)-1) { - PyErr_Format(PyExc_ValueError, - "editops second and third argument must specify sizes"); - return NULL; - } - - if ((bops = extract_opcodes(arg1)) != NULL) { - if (lev_opcodes_check_errors(len1, len2, n, bops)) { - PyErr_Format(PyExc_ValueError, - "editops edit operation list is invalid"); - free(bops); - return NULL; - } - ops = lev_opcodes_to_editops(n, bops, &n, 0); /* XXX: different n's! */ - if (!ops && n) { - free(bops); - return PyErr_NoMemory(); - } - oplist = editops_to_tuple_list(n, ops); - free(ops); - free(bops); - return oplist; - } - if ((ops = extract_editops(arg1)) != NULL) { - if (lev_editops_check_errors(len1, len2, n, ops)) { - PyErr_Format(PyExc_ValueError, - "editops edit operation list is invalid"); - free(ops); - return NULL; - } - free(ops); - Py_INCREF(arg1); /* editops -> editops is identity */ - return arg1; - } - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "editops first argument must be a List of edit operations"); - return NULL; - } - - /* find editops: we were called (s1, s2) */ - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2; - - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - ops = lev_editops_find(len1, string1, len2, string2, &n); - } - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2; - - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - ops = lev_u_editops_find(len1, string1, len2, string2, &n); - } - else { - PyErr_Format(PyExc_TypeError, - "editops expected two Strings or two Unicodes"); - return NULL; - } - if (!ops && n) - return PyErr_NoMemory(); - oplist = editops_to_tuple_list(n, ops); - free(ops); - return oplist; -} - -static PyObject* -opcodes_to_tuple_list(size_t nb, LevOpCode *bops) -{ - PyObject *list; - size_t i; - - list = PyList_New(nb); - for (i = 0; i < nb; i++, bops++) { - PyObject *tuple = PyTuple_New(5); - PyObject *is = opcode_names[bops->type].pystring; - Py_INCREF(is); - PyTuple_SET_ITEM(tuple, 0, is); - PyTuple_SET_ITEM(tuple, 1, PyInt_FromLong((long)bops->sbeg)); - PyTuple_SET_ITEM(tuple, 2, PyInt_FromLong((long)bops->send)); - PyTuple_SET_ITEM(tuple, 3, PyInt_FromLong((long)bops->dbeg)); - PyTuple_SET_ITEM(tuple, 4, PyInt_FromLong((long)bops->dend)); - PyList_SET_ITEM(list, i, tuple); - } - - return list; -} - -static PyObject* -opcodes_py(PyObject *self, PyObject *args) -{ - PyObject *arg1, *arg2, *arg3 = NULL; - PyObject *oplist; - size_t len1, len2, n, nb; - LevEditOp *ops; - LevOpCode *bops; - - if (!PyArg_UnpackTuple(args, PYARGCFIX("opcodes"), 2, 3, - &arg1, &arg2, &arg3)) - return NULL; - - /* convert: we were called (ops, s1, s2) */ - if (arg3) { - if (!PyList_Check(arg1)) { - PyErr_Format(PyExc_TypeError, - "opcodes first argument must be a List of edit operations"); - return NULL; - } - n = PyList_GET_SIZE(arg1); - len1 = get_length_of_anything(arg2); - len2 = get_length_of_anything(arg3); - if (len1 == (size_t)-1 || len2 == (size_t)-1) { - PyErr_Format(PyExc_ValueError, - "opcodes second and third argument must specify sizes"); - return NULL; - } - - if ((ops = extract_editops(arg1)) != NULL) { - if (lev_editops_check_errors(len1, len2, n, ops)) { - PyErr_Format(PyExc_ValueError, - "opcodes edit operation list is invalid"); - free(ops); - return NULL; - } - bops = lev_editops_to_opcodes(n, ops, &n, len1, len2); /* XXX: n != n */ - if (!bops && n) { - free(ops); - return PyErr_NoMemory(); - } - oplist = opcodes_to_tuple_list(n, bops); - free(bops); - free(ops); - return oplist; - } - if ((bops = extract_opcodes(arg1)) != NULL) { - if (lev_opcodes_check_errors(len1, len2, n, bops)) { - PyErr_Format(PyExc_ValueError, - "opcodes edit operation list is invalid"); - free(bops); - return NULL; - } - free(bops); - Py_INCREF(arg1); /* opcodes -> opcodes is identity */ - return arg1; - } - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "opcodes first argument must be a List of edit operations"); - return NULL; - } - - /* find opcodes: we were called (s1, s2) */ - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2; - - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - ops = lev_editops_find(len1, string1, len2, string2, &n); - } - else if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2; - - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - ops = lev_u_editops_find(len1, string1, len2, string2, &n); - } - else { - PyErr_Format(PyExc_TypeError, - "opcodes expected two Strings or two Unicodes"); - return NULL; - } - if (!ops && n) - return PyErr_NoMemory(); - bops = lev_editops_to_opcodes(n, ops, &nb, len1, len2); - free(ops); - if (!bops && nb) - return PyErr_NoMemory(); - oplist = opcodes_to_tuple_list(nb, bops); - free(bops); - return oplist; -} - -static PyObject* -inverse_py(PyObject *self, PyObject *args) -{ - PyObject *list, *result; - size_t n; - LevEditOp *ops; - LevOpCode *bops; - - if (!PyArg_UnpackTuple(args, PYARGCFIX("inverse"), 1, 1, &list) - || !PyList_Check(list)) - return NULL; - - n = PyList_GET_SIZE(list); - if (!n) { - Py_INCREF(list); - return list; - } - if ((ops = extract_editops(list)) != NULL) { - lev_editops_invert(n, ops); - result = editops_to_tuple_list(n, ops); - free(ops); - return result; - } - if ((bops = extract_opcodes(list)) != NULL) { - lev_opcodes_invert(n, bops); - result = opcodes_to_tuple_list(n, bops); - free(bops); - return result; - } - - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "inverse expected a list of edit operations"); - return NULL; -} - -static PyObject* -apply_edit_py(PyObject *self, PyObject *args) -{ - PyObject *list, *result, *arg1, *arg2; - size_t n, len, len1, len2; - LevEditOp *ops; - LevOpCode *bops; - - if (!PyArg_UnpackTuple(args, PYARGCFIX("apply_edit"), 3, 3, - &list, &arg1, &arg2)) - return NULL; - - if (!PyList_Check(list)) { - PyErr_Format(PyExc_TypeError, - "apply_edit first argument must be a List of edit operations"); - return NULL; - } - n = PyList_GET_SIZE(list); - - if (PyObject_TypeCheck(arg1, &PyString_Type) - && PyObject_TypeCheck(arg2, &PyString_Type)) { - lev_byte *string1, *string2, *s; - - if (!n) { - Py_INCREF(arg1); - return arg1; - } - len1 = PyString_GET_SIZE(arg1); - len2 = PyString_GET_SIZE(arg2); - string1 = PyString_AS_STRING(arg1); - string2 = PyString_AS_STRING(arg2); - - if ((ops = extract_editops(list)) != NULL) { - if (lev_editops_check_errors(len1, len2, n, ops)) { - PyErr_Format(PyExc_ValueError, - "apply_edit edit operations are invalid or inapplicable"); - free(ops); - return NULL; - } - s = lev_editops_apply(len1, string1, len2, string2, - n, ops, &len); - free(ops); - if (!s && len) - return PyErr_NoMemory(); - result = PyString_FromStringAndSize(s, len); - free(s); - return result; - } - if ((bops = extract_opcodes(list)) != NULL) { - if (lev_opcodes_check_errors(len1, len2, n, bops)) { - PyErr_Format(PyExc_ValueError, - "apply_edit edit operations are invalid or inapplicable"); - free(bops); - return NULL; - } - s = lev_opcodes_apply(len1, string1, len2, string2, - n, bops, &len); - free(bops); - if (!s && len) - return PyErr_NoMemory(); - result = PyString_FromStringAndSize(s, len); - free(s); - return result; - } - - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "apply_edit first argument must be " - "a list of edit operations"); - return NULL; - } - if (PyObject_TypeCheck(arg1, &PyUnicode_Type) - && PyObject_TypeCheck(arg2, &PyUnicode_Type)) { - Py_UNICODE *string1, *string2, *s; - - if (!n) { - Py_INCREF(arg1); - return arg1; - } - len1 = PyUnicode_GET_SIZE(arg1); - len2 = PyUnicode_GET_SIZE(arg2); - string1 = PyUnicode_AS_UNICODE(arg1); - string2 = PyUnicode_AS_UNICODE(arg2); - - if ((ops = extract_editops(list)) != NULL) { - if (lev_editops_check_errors(len1, len2, n, ops)) { - PyErr_Format(PyExc_ValueError, - "apply_edit edit operations are invalid or inapplicable"); - free(ops); - return NULL; - } - s = lev_u_editops_apply(len1, string1, len2, string2, - n, ops, &len); - free(ops); - if (!s && len) - return PyErr_NoMemory(); - result = PyUnicode_FromUnicode(s, len); - free(s); - return result; - } - if ((bops = extract_opcodes(list)) != NULL) { - if (lev_opcodes_check_errors(len1, len2, n, bops)) { - PyErr_Format(PyExc_ValueError, - "apply_edit edit operations are invalid or inapplicable"); - free(bops); - return NULL; - } - s = lev_u_opcodes_apply(len1, string1, len2, string2, - n, bops, &len); - free(bops); - if (!s && len) - return PyErr_NoMemory(); - result = PyUnicode_FromUnicode(s, len); - free(s); - return result; - } - - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "apply_edit first argument must be " - "a list of edit operations"); - return NULL; - } - - PyErr_Format(PyExc_TypeError, - "apply_edit expected two Strings or two Unicodes"); - return NULL; -} - -static PyObject* -matching_blocks_py(PyObject *self, PyObject *args) -{ - PyObject *list, *arg1, *arg2, *result; - size_t n, nmb, len1, len2; - LevEditOp *ops; - LevOpCode *bops; - LevMatchingBlock *mblocks; - - if (!PyArg_UnpackTuple(args, PYARGCFIX("matching_blocks"), 3, 3, - &list, &arg1, &arg2) - || !PyList_Check(list)) - return NULL; - - if (!PyList_Check(list)) { - PyErr_Format(PyExc_TypeError, - "matching_blocks first argument must be " - "a List of edit operations"); - return NULL; - } - n = PyList_GET_SIZE(list); - len1 = get_length_of_anything(arg1); - len2 = get_length_of_anything(arg2); - if (len1 == (size_t)-1 || len2 == (size_t)-1) { - PyErr_Format(PyExc_ValueError, - "matching_blocks second and third argument " - "must specify sizes"); - return NULL; - } - - if ((ops = extract_editops(list)) != NULL) { - if (lev_editops_check_errors(len1, len2, n, ops)) { - PyErr_Format(PyExc_ValueError, - "apply_edit edit operations are invalid or inapplicable"); - free(ops); - return NULL; - } - mblocks = lev_editops_matching_blocks(len1, len2, n, ops, &nmb); - free(ops); - if (!mblocks && nmb) - return PyErr_NoMemory(); - result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks); - free(mblocks); - return result; - } - if ((bops = extract_opcodes(list)) != NULL) { - if (lev_opcodes_check_errors(len1, len2, n, bops)) { - PyErr_Format(PyExc_ValueError, - "apply_edit edit operations are invalid or inapplicable"); - free(bops); - return NULL; - } - mblocks = lev_opcodes_matching_blocks(len1, len2, n, bops, &nmb); - free(bops); - if (!mblocks && nmb) - return PyErr_NoMemory(); - result = matching_blocks_to_tuple_list(len1, len2, nmb, mblocks); - free(mblocks); - return result; - } - - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "inverse expected a list of edit operations"); - return NULL; -} - -static PyObject* -subtract_edit_py(PyObject *self, PyObject *args) -{ - PyObject *list, *sub, *result; - size_t n, ns, nr; - LevEditOp *ops, *osub, *orem; - - if (!PyArg_UnpackTuple(args, PYARGCFIX("subtract_edit"), 2, 2, &list, &sub) - || !PyList_Check(list)) - return NULL; - - ns = PyList_GET_SIZE(sub); - if (!ns) { - Py_INCREF(list); - return list; - } - n = PyList_GET_SIZE(list); - if (!n) { - PyErr_Format(PyExc_ValueError, - "subtract_edit subsequence is not a subsequence " - "or is invalid"); - return NULL; - } - - if ((ops = extract_editops(list)) != NULL) { - if ((osub = extract_editops(sub)) != NULL) { - orem = lev_editops_subtract(n, ops, ns, osub, &nr); - free(ops); - free(osub); - - if (!orem && nr == -1) { - PyErr_Format(PyExc_ValueError, - "subtract_edit subsequence is not a subsequence " - "or is invalid"); - return NULL; - } - result = editops_to_tuple_list(nr, orem); - free(orem); - - return result; - } - free(ops); - } - - if (!PyErr_Occurred()) - PyErr_Format(PyExc_TypeError, - "subtract_edit expected two lists of edit operations"); - return NULL; -} - - -PY_MOD_INIT_FUNC_DEF(_levenshtein) -{ -#ifdef LEV_PYTHON3 - PyObject *module; -#endif - size_t i; - - PY_INIT_MOD(module, "_levenshtein", Levenshtein_DESC, methods) - /* create intern strings for edit operation names */ - if (opcode_names[0].pystring) - abort(); - for (i = 0; i < N_OPCODE_NAMES; i++) { -#ifdef LEV_PYTHON3 - opcode_names[i].pystring - = PyUnicode_InternFromString(opcode_names[i].cstring); -#else - opcode_names[i].pystring - = PyString_InternFromString(opcode_names[i].cstring); -#endif - opcode_names[i].len = strlen(opcode_names[i].cstring); - } - lev_init_rng(0); -#ifdef LEV_PYTHON3 - return module; -#endif -} -/* }}} */ -#endif /* not NO_PYTHON */ - -/**************************************************************************** - * - * C (i.e. executive) part - * - ****************************************************************************/ - -/**************************************************************************** - * - * Taus113 - * - ****************************************************************************/ -/* {{{ */ - -/* - * Based on Tausworthe random generator implementation rng/taus113.c - * from the GNU Scientific Library (http://sources.redhat.com/gsl/) - * which has the notice - * Copyright (C) 2002 Atakan Gurkan - * Based on the file taus.c which has the notice - * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough - */ - -static inline unsigned long -taus113_get(taus113_state_t *state) -{ - unsigned long b1, b2, b3, b4; - - b1 = ((((state->z1 << 6UL) & TAUS113_MASK) ^ state->z1) >> 13UL); - state->z1 = ((((state->z1 & 4294967294UL) << 18UL) & TAUS113_MASK) ^ b1); - - b2 = ((((state->z2 << 2UL) & TAUS113_MASK) ^ state->z2) >> 27UL); - state->z2 = ((((state->z2 & 4294967288UL) << 2UL) & TAUS113_MASK) ^ b2); - - b3 = ((((state->z3 << 13UL) & TAUS113_MASK) ^ state->z3) >> 21UL); - state->z3 = ((((state->z3 & 4294967280UL) << 7UL) & TAUS113_MASK) ^ b3); - - b4 = ((((state->z4 << 3UL) & TAUS113_MASK) ^ state->z4) >> 12UL); - state->z4 = ((((state->z4 & 4294967168UL) << 13UL) & TAUS113_MASK) ^ b4); - - return (state->z1 ^ state->z2 ^ state->z3 ^ state->z4); - -} - -static void -taus113_set(taus113_state_t *state, - unsigned long int s) -{ - if (!s) - s = 1UL; /* default seed is 1 */ - - state->z1 = TAUS113_LCG(s); - if (state->z1 < 2UL) - state->z1 += 2UL; - - state->z2 = TAUS113_LCG(state->z1); - if (state->z2 < 8UL) - state->z2 += 8UL; - - state->z3 = TAUS113_LCG(state->z2); - if (state->z3 < 16UL) - state->z3 += 16UL; - - state->z4 = TAUS113_LCG(state->z3); - if (state->z4 < 128UL) - state->z4 += 128UL; - - /* Calling RNG ten times to satify recurrence condition */ - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); - taus113_get(state); -} - - -/** - * lev_init_rng: - * @seed: A seed. If zero, a default value (currently 1) is used instead. - * - * Initializes the random generator used by some Levenshtein functions. - * - * This does NOT happen automatically when these functions are used. - **/ -_LEV_STATIC_PY void -lev_init_rng(unsigned long int seed) -{ - static taus113_state_t state; - - taus113_set(&state, seed); -} -/* }}} */ - -/**************************************************************************** - * - * Basic stuff, Levenshtein distance - * - ****************************************************************************/ -/* {{{ */ - -/** - * lev_edit_distance: - * @len1: The length of @string1. - * @string1: A sequence of bytes of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A sequence of bytes of length @len2, may contain NUL characters. - * @xcost: If nonzero, the replace operation has weight 2, otherwise all - * edit operations have equal weights of 1. - * - * Computes Levenshtein edit distance of two strings. - * - * Returns: The edit distance. - **/ -_LEV_STATIC_PY size_t -lev_edit_distance(size_t len1, const lev_byte *string1, - size_t len2, const lev_byte *string2, - int xcost) -{ - size_t i; - size_t *row; /* we only need to keep one row of costs */ - size_t *end; - size_t half; - - /* strip common prefix */ - while (len1 > 0 && len2 > 0 && *string1 == *string2) { - len1--; - len2--; - string1++; - string2++; - } - - /* strip common suffix */ - while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { - len1--; - len2--; - } - - /* catch trivial cases */ - if (len1 == 0) - return len2; - if (len2 == 0) - return len1; - - /* make the inner cycle (i.e. string2) the longer one */ - if (len1 > len2) { - size_t nx = len1; - const lev_byte *sx = string1; - len1 = len2; - len2 = nx; - string1 = string2; - string2 = sx; - } - /* check len1 == 1 separately */ - if (len1 == 1) { - if (xcost) - return len2 + 1 - 2*(memchr(string2, *string1, len2) != NULL); - else - return len2 - (memchr(string2, *string1, len2) != NULL); - } - len1++; - len2++; - half = len1 >> 1; - - /* initalize first row */ - row = (size_t*)safe_malloc(len2, sizeof(size_t)); - if (!row) - return (size_t)(-1); - end = row + len2 - 1; - for (i = 0; i < len2 - (xcost ? 0 : half); i++) - row[i] = i; - - /* go through the matrix and compute the costs. yes, this is an extremely - * obfuscated version, but also extremely memory-conservative and relatively - * fast. */ - if (xcost) { - for (i = 1; i < len1; i++) { - size_t *p = row + 1; - const lev_byte char1 = string1[i - 1]; - const lev_byte *char2p = string2; - size_t D = i; - size_t x = i; - while (p <= end) { - if (char1 == *(char2p++)) - x = --D; - else - x++; - D = *p; - D++; - if (x > D) - x = D; - *(p++) = x; - } - } - } - else { - /* in this case we don't have to scan two corner triangles (of size len1/2) - * in the matrix because no best path can go throught them. note this - * breaks when len1 == len2 == 2 so the memchr() special case above is - * necessary */ - row[0] = len1 - half - 1; - for (i = 1; i < len1; i++) { - size_t *p; - const lev_byte char1 = string1[i - 1]; - const lev_byte *char2p; - size_t D, x; - /* skip the upper triangle */ - if (i >= len1 - half) { - size_t offset = i - (len1 - half); - size_t c3; - - char2p = string2 + offset; - p = row + offset; - c3 = *(p++) + (char1 != *(char2p++)); - x = *p; - x++; - D = x; - if (x > c3) - x = c3; - *(p++) = x; - } - else { - p = row + 1; - char2p = string2; - D = x = i; - } - /* skip the lower triangle */ - if (i <= half + 1) - end = row + len2 + i - half - 2; - /* main */ - while (p <= end) { - size_t c3 = --D + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - D = *p; - D++; - if (x > D) - x = D; - *(p++) = x; - } - /* lower triangle sentinel */ - if (i <= half) { - size_t c3 = --D + (char1 != *char2p); - x++; - if (x > c3) - x = c3; - *p = x; - } - } - } - - i = *end; - free(row); - return i; -} - -_LEV_STATIC_PY double -lev_edit_distance_sod(size_t len, const lev_byte *string, - size_t n, const size_t *lengths, - const lev_byte *strings[], - const double *weights, - int xcost) -{ - size_t i, d; - double sum = 0.0; - - for (i = 0; i < n; i++) { - d = lev_edit_distance(len, string, lengths[i], strings[i], xcost); - if (d == (size_t)-1) - return -1.0; - sum += weights[i]*d; - } - return sum; -} - -/** - * lev_u_edit_distance: - * @len1: The length of @string1. - * @string1: A sequence of Unicode characters of length @len1, may contain NUL - * characters. - * @len2: The length of @string2. - * @string2: A sequence of Unicode characters of length @len2, may contain NUL - * characters. - * @xcost: If nonzero, the replace operation has weight 2, otherwise all - * edit operations have equal weights of 1. - * - * Computes Levenshtein edit distance of two Unicode strings. - * - * Returns: The edit distance. - **/ -_LEV_STATIC_PY size_t -lev_u_edit_distance(size_t len1, const lev_wchar *string1, - size_t len2, const lev_wchar *string2, - int xcost) -{ - size_t i; - size_t *row; /* we only need to keep one row of costs */ - size_t *end; - size_t half; - - /* strip common prefix */ - while (len1 > 0 && len2 > 0 && *string1 == *string2) { - len1--; - len2--; - string1++; - string2++; - } - - /* strip common suffix */ - while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { - len1--; - len2--; - } - - /* catch trivial cases */ - if (len1 == 0) - return len2; - if (len2 == 0) - return len1; - - /* make the inner cycle (i.e. string2) the longer one */ - if (len1 > len2) { - size_t nx = len1; - const lev_wchar *sx = string1; - len1 = len2; - len2 = nx; - string1 = string2; - string2 = sx; - } - /* check len1 == 1 separately */ - if (len1 == 1) { - lev_wchar z = *string1; - const lev_wchar *p = string2; - for (i = len2; i; i--) { - if (*(p++) == z) - return len2 - 1; - } - return len2 + (xcost != 0); - } - len1++; - len2++; - half = len1 >> 1; - - /* initalize first row */ - row = (size_t*)safe_malloc(len2, sizeof(size_t)); - if (!row) - return (size_t)(-1); - end = row + len2 - 1; - for (i = 0; i < len2 - (xcost ? 0 : half); i++) - row[i] = i; - - /* go through the matrix and compute the costs. yes, this is an extremely - * obfuscated version, but also extremely memory-conservative and relatively - * fast. */ - if (xcost) { - for (i = 1; i < len1; i++) { - size_t *p = row + 1; - const lev_wchar char1 = string1[i - 1]; - const lev_wchar *char2p = string2; - size_t D = i - 1; - size_t x = i; - while (p <= end) { - if (char1 == *(char2p++)) - x = D; - else - x++; - D = *p; - if (x > D + 1) - x = D + 1; - *(p++) = x; - } - } - } - else { - /* in this case we don't have to scan two corner triangles (of size len1/2) - * in the matrix because no best path can go throught them. note this - * breaks when len1 == len2 == 2 so the memchr() special case above is - * necessary */ - row[0] = len1 - half - 1; - for (i = 1; i < len1; i++) { - size_t *p; - const lev_wchar char1 = string1[i - 1]; - const lev_wchar *char2p; - size_t D, x; - /* skip the upper triangle */ - if (i >= len1 - half) { - size_t offset = i - (len1 - half); - size_t c3; - - char2p = string2 + offset; - p = row + offset; - c3 = *(p++) + (char1 != *(char2p++)); - x = *p; - x++; - D = x; - if (x > c3) - x = c3; - *(p++) = x; - } - else { - p = row + 1; - char2p = string2; - D = x = i; - } - /* skip the lower triangle */ - if (i <= half + 1) - end = row + len2 + i - half - 2; - /* main */ - while (p <= end) { - size_t c3 = --D + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - D = *p; - D++; - if (x > D) - x = D; - *(p++) = x; - } - /* lower triangle sentinel */ - if (i <= half) { - size_t c3 = --D + (char1 != *char2p); - x++; - if (x > c3) - x = c3; - *p = x; - } - } - } - - i = *end; - free(row); - return i; -} - -_LEV_STATIC_PY double -lev_u_edit_distance_sod(size_t len, const lev_wchar *string, - size_t n, const size_t *lengths, - const lev_wchar *strings[], - const double *weights, - int xcost) -{ - size_t i, d; - double sum = 0.0; - - for (i = 0; i < n; i++) { - d = lev_u_edit_distance(len, string, lengths[i], strings[i], xcost); - if (d == (size_t)-1) - return -1.0; - sum += weights[i]*d; - } - return sum; -} -/* }}} */ - - -/**************************************************************************** - * - * Other simple distances: Hamming, Jaro, Jaro-Winkler - * - ****************************************************************************/ -/* {{{ */ -/** - * lev_hamming_distance: - * @len: The length of @string1 and @string2. - * @string1: A sequence of bytes of length @len1, may contain NUL characters. - * @string2: A sequence of bytes of length @len2, may contain NUL characters. - * - * Computes Hamming distance of two strings. - * - * The strings must have the same length. - * - * Returns: The Hamming distance. - **/ -_LEV_STATIC_PY size_t -lev_hamming_distance(size_t len, - const lev_byte *string1, - const lev_byte *string2) -{ - size_t dist, i; - - dist = 0; - for (i = len; i; i--) { - if (*(string1++) != *(string2++)) - dist++; - } - - return dist; -} - -/** - * lev_u_hamming_distance: - * @len: The length of @string1 and @string2. - * @string1: A sequence of Unicode characters of length @len1, may contain NUL - * characters. - * @string2: A sequence of Unicode characters of length @len2, may contain NUL - * characters. - * - * Computes Hamming distance of two strings. - * - * The strings must have the same length. - * - * Returns: The Hamming distance. - **/ -_LEV_STATIC_PY size_t -lev_u_hamming_distance(size_t len, - const lev_wchar *string1, - const lev_wchar *string2) -{ - size_t dist, i; - - dist = 0; - for (i = len; i; i--) { - if (*(string1++) != *(string2++)) - dist++; - } - - return dist; -} - -/** - * lev_jaro_ratio: - * @len1: The length of @string1. - * @string1: A sequence of bytes of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A sequence of bytes of length @len2, may contain NUL characters. - * - * Computes Jaro string similarity metric of two strings. - * - * Returns: The Jaro metric of @string1 and @string2. - **/ -_LEV_STATIC_PY double -lev_jaro_ratio(size_t len1, const lev_byte *string1, - size_t len2, const lev_byte *string2) -{ - size_t i, j, halflen, trans, match, to; - size_t *idx; - double md; - - if (len1 == 0 || len2 == 0) { - if (len1 == 0 && len2 == 0) - return 1.0; - return 0.0; - } - /* make len1 always shorter (or equally long) */ - if (len1 > len2) { - const lev_byte *b; - - b = string1; - string1 = string2; - string2 = b; - - i = len1; - len1 = len2; - len2 = i; - } - - halflen = (len1 + 1)/2; - idx = (size_t*)calloc(len1, sizeof(size_t)); - if (!idx) - return -1.0; - - /* The literature about Jaro metric is confusing as the method of assigment - * of common characters is nowhere specified. There are several possible - * deterministic mutual assignments of common characters of two strings. - * We use earliest-position method, which is however suboptimal (e.g., it - * gives two transpositions in jaro("Jaro", "Joaro") because of assigment - * of the first `o'). No reasonable algorithm for the optimal one is - * currently known to me. */ - match = 0; - /* the part with allowed range overlapping left */ - for (i = 0; i < halflen; i++) { - for (j = 0; j <= i+halflen; j++) { - if (string1[j] == string2[i] && !idx[j]) { - match++; - idx[j] = match; - break; - } - } - } - /* the part with allowed range overlapping right */ - to = len1+halflen < len2 ? len1+halflen : len2; - for (i = halflen; i < to; i++) { - for (j = i - halflen; j < len1; j++) { - if (string1[j] == string2[i] && !idx[j]) { - match++; - idx[j] = match; - break; - } - } - } - if (!match) { - free(idx); - return 0.0; - } - /* count transpositions */ - i = 0; - trans = 0; - for (j = 0; j < len1; j++) { - if (idx[j]) { - i++; - if (idx[j] != i) - trans++; - } - } - free(idx); - - md = (double)match; - return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0; -} - -/** - * lev_u_jaro_ratio: - * @len1: The length of @string1. - * @string1: A sequence of Unicode characters of length @len1, may contain NUL - * characters. - * @len2: The length of @string2. - * @string2: A sequence of Unicode characters of length @len2, may contain NUL - * characters. - * - * Computes Jaro string similarity metric of two Unicode strings. - * - * Returns: The Jaro metric of @string1 and @string2. - **/ -_LEV_STATIC_PY double -lev_u_jaro_ratio(size_t len1, const lev_wchar *string1, - size_t len2, const lev_wchar *string2) -{ - size_t i, j, halflen, trans, match, to; - size_t *idx; - double md; - - if (len1 == 0 || len2 == 0) { - if (len1 == 0 && len2 == 0) - return 1.0; - return 0.0; - } - /* make len1 always shorter (or equally long) */ - if (len1 > len2) { - const lev_wchar *b; - - b = string1; - string1 = string2; - string2 = b; - - i = len1; - len1 = len2; - len2 = i; - } - - halflen = (len1 + 1)/2; - idx = (size_t*)calloc(len1, sizeof(size_t)); - if (!idx) - return -1.0; - - match = 0; - /* the part with allowed range overlapping left */ - for (i = 0; i < halflen; i++) { - for (j = 0; j <= i+halflen; j++) { - if (string1[j] == string2[i] && !idx[j]) { - match++; - idx[j] = match; - break; - } - } - } - /* the part with allowed range overlapping right */ - to = len1+halflen < len2 ? len1+halflen : len2; - for (i = halflen; i < to; i++) { - for (j = i - halflen; j < len1; j++) { - if (string1[j] == string2[i] && !idx[j]) { - match++; - idx[j] = match; - break; - } - } - } - if (!match) { - free(idx); - return 0.0; - } - /* count transpositions */ - i = 0; - trans = 0; - for (j = 0; j < len1; j++) { - if (idx[j]) { - i++; - if (idx[j] != i) - trans++; - } - } - free(idx); - - md = (double)match; - return (md/len1 + md/len2 + 1.0 - trans/md/2.0)/3.0; -} - -/** - * lev_jaro_winkler_ratio: - * @len1: The length of @string1. - * @string1: A sequence of bytes of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A sequence of bytes of length @len2, may contain NUL characters. - * @pfweight: Prefix weight, i.e., how much weight should be given to a - * common prefix. - * - * Computes Jaro-Winkler string similarity metric of two strings. - * - * The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the - * length of common prefix. - * - * Returns: The Jaro-Winkler metric of @string1 and @string2. - **/ -_LEV_STATIC_PY double -lev_jaro_winkler_ratio(size_t len1, const lev_byte *string1, - size_t len2, const lev_byte *string2, - double pfweight) -{ - double j; - size_t p, m; - - j = lev_jaro_ratio(len1, string1, len2, string2); - m = len1 < len2 ? len1 : len2; - for (p = 0; p < m; p++) { - if (string1[p] != string2[p]) - break; - } - j += (1.0 - j)*p*pfweight; - return j > 1.0 ? 1.0 : j; -} - -/** - * lev_u_jaro_winkler_ratio: - * @len1: The length of @string1. - * @string1: A sequence of Unicode characters of length @len1, may contain NUL - * characters. - * @len2: The length of @string2. - * @string2: A sequence of Unicode characters of length @len2, may contain NUL - * characters. - * @pfweight: Prefix weight, i.e., how much weight should be given to a - * common prefix. - * - * Computes Jaro-Winkler string similarity metric of two Unicode strings. - * - * The formula is J+@pfweight*P*(1-J), where J is Jaro metric and P is the - * length of common prefix. - * - * Returns: The Jaro-Winkler metric of @string1 and @string2. - **/ -_LEV_STATIC_PY double -lev_u_jaro_winkler_ratio(size_t len1, const lev_wchar *string1, - size_t len2, const lev_wchar *string2, - double pfweight) -{ - double j; - size_t p, m; - - j = lev_u_jaro_ratio(len1, string1, len2, string2); - m = len1 < len2 ? len1 : len2; - for (p = 0; p < m; p++) { - if (string1[p] != string2[p]) - break; - } - j += (1.0 - j)*p*pfweight; - return j > 1.0 ? 1.0 : j; -} -/* }}} */ - -/**************************************************************************** - * - * Generalized medians, the greedy algorithm, and greedy improvements - * - ****************************************************************************/ -/* {{{ */ - -/* compute the sets of symbols each string contains, and the set of symbols - * in any of them (symset). meanwhile, count how many different symbols - * there are (used below for symlist). */ -static lev_byte* -make_symlist(size_t n, const size_t *lengths, - const lev_byte *strings[], size_t *symlistlen) -{ - short int *symset; /* indexed by ALL symbols, contains 1 for symbols - present in the strings, zero for others */ - size_t i, j; - lev_byte *symlist; - - symset = calloc(0x100, sizeof(short int)); - if (!symset) { - *symlistlen = (size_t)(-1); - return NULL; - } - *symlistlen = 0; - for (i = 0; i < n; i++) { - const lev_byte *stri = strings[i]; - for (j = 0; j < lengths[i]; j++) { - int c = stri[j]; - if (!symset[c]) { - (*symlistlen)++; - symset[c] = 1; - } - } - } - if (!*symlistlen) { - free(symset); - return NULL; - } - - /* create dense symbol table, so we can easily iterate over only characters - * present in the strings */ - { - size_t pos = 0; - symlist = (lev_byte*)safe_malloc((*symlistlen), sizeof(lev_byte)); - if (!symlist) { - *symlistlen = (size_t)(-1); - free(symset); - return NULL; - } - for (j = 0; j < 0x100; j++) { - if (symset[j]) - symlist[pos++] = (lev_byte)j; - } - } - free(symset); - - return symlist; -} - -/** - * lev_greedy_median: - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * @medlength: Where the length of the median should be stored. - * - * Finds a generalized median string of @strings using the greedy algorithm. - * - * Note it's considerably more efficient to give a string with weight 2 - * than to store two identical strings in @strings (with weights 1). - * - * Returns: The generalized median, as a newly allocated string; its length - * is stored in @medlength. - **/ -_LEV_STATIC_PY lev_byte* -lev_greedy_median(size_t n, const size_t *lengths, - const lev_byte *strings[], - const double *weights, - size_t *medlength) -{ - size_t i; /* usually iterates over strings (n) */ - size_t j; /* usually iterates over characters */ - size_t len; /* usually iterates over the approximate median string */ - lev_byte *symlist; /* list of symbols present in the strings, - we iterate over it insead of set of all - existing symbols */ - size_t symlistlen; /* length of symlist */ - size_t maxlen; /* maximum input string length */ - size_t stoplen; /* maximum tried median string length -- this is slightly - higher than maxlen, because the median string may be - longer than any of the input strings */ - size_t **rows; /* Levenshtein matrix rows for each string, we need to keep - only one previous row to construct the current one */ - size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, - shared among all strings */ - lev_byte *median; /* the resulting approximate median string */ - double *mediandist; /* the total distance of the best median string of - given length. warning! mediandist[0] is total - distance for empty string, while median[] itself - is normally zero-based */ - size_t bestlen; /* the best approximate median string length */ - - /* find all symbols */ - symlist = make_symlist(n, lengths, strings, &symlistlen); - if (!symlist) { - *medlength = 0; - if (symlistlen != 0) - return NULL; - else - return calloc(1, sizeof(lev_byte)); - } - - /* allocate and initialize per-string matrix rows and a common work buffer */ - rows = (size_t**)safe_malloc(n, sizeof(size_t*)); - if (!rows) { - free(symlist); - return NULL; - } - maxlen = 0; - for (i = 0; i < n; i++) { - size_t *ri; - size_t leni = lengths[i]; - if (leni > maxlen) - maxlen = leni; - ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); - if (!ri) { - for (j = 0; j < i; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - for (j = 0; j <= leni; j++) - ri[j] = j; - } - stoplen = 2*maxlen + 1; - row = (size_t*)safe_malloc((stoplen + 1), sizeof(size_t)); - if (!row) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - - /* compute final cost of string of length 0 (empty string may be also - * a valid answer) */ - median = (lev_byte*)safe_malloc(stoplen, sizeof(lev_byte)); - if (!median) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(row); - free(symlist); - return NULL; - } - mediandist = (double*)safe_malloc((stoplen + 1), sizeof(double)); - if (!mediandist) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(row); - free(symlist); - free(median); - return NULL; - } - mediandist[0] = 0.0; - for (i = 0; i < n; i++) - mediandist[0] += lengths[i]*weights[i]; - - /* build up the approximate median string symbol by symbol - * XXX: we actually exit on break below, but on the same condition */ - for (len = 1; len <= stoplen; len++) { - lev_byte symbol; - double minminsum = LEV_INFINITY; - row[0] = len; - /* iterate over all symbols we may want to add */ - for (j = 0; j < symlistlen; j++) { - double totaldist = 0.0; - double minsum = 0.0; - symbol = symlist[j]; - /* sum Levenshtein distances from all the strings, with given weights */ - for (i = 0; i < n; i++) { - const lev_byte *stri = strings[i]; - size_t *p = rows[i]; - size_t leni = lengths[i]; - size_t *end = rows[i] + leni; - size_t min = len; - size_t x = len; /* == row[0] */ - /* compute how another row of Levenshtein matrix would look for median - * string with this symbol added */ - while (p < end) { - size_t D = *(p++) + (symbol != *(stri++)); - x++; - if (x > D) - x = D; - if (x > *p + 1) - x = *p + 1; - if (x < min) - min = x; - } - minsum += min*weights[i]; - totaldist += x*weights[i]; - } - /* is this symbol better than all the others? */ - if (minsum < minminsum) { - minminsum = minsum; - mediandist[len] = totaldist; - median[len - 1] = symbol; - } - } - /* stop the iteration if we no longer need to recompute the matrix rows - * or when we are over maxlen and adding more characters doesn't seem - * useful */ - if (len == stoplen - || (len > maxlen && mediandist[len] > mediandist[len - 1])) { - stoplen = len; - break; - } - /* now the best symbol is known, so recompute all matrix rows for this - * one */ - symbol = median[len - 1]; - for (i = 0; i < n; i++) { - const lev_byte *stri = strings[i]; - size_t *oldrow = rows[i]; - size_t leni = lengths[i]; - size_t k; - /* compute a row of Levenshtein matrix */ - for (k = 1; k <= leni; k++) { - size_t c1 = oldrow[k] + 1; - size_t c2 = row[k - 1] + 1; - size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); - row[k] = c2 > c3 ? c3 : c2; - if (row[k] > c1) - row[k] = c1; - } - memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); - } - } - - /* find the string with minimum total distance */ - bestlen = 0; - for (len = 1; len <= stoplen; len++) { - if (mediandist[len] < mediandist[bestlen]) - bestlen = len; - } - - /* clean up */ - for (i = 0; i < n; i++) - free(rows[i]); - free(rows); - free(row); - free(symlist); - free(mediandist); - - /* return result */ - { - lev_byte *result = (lev_byte*)safe_malloc(bestlen, sizeof(lev_byte)); - if (!result) { - free(median); - return NULL; - } - memcpy(result, median, bestlen*sizeof(lev_byte)); - free(median); - *medlength = bestlen; - return result; - } -} - -/* - * Knowing the distance matrices up to some row, finish the distance - * computations. - * - * string1, len1 are already shortened. - */ -static double -finish_distance_computations(size_t len1, lev_byte *string1, - size_t n, const size_t *lengths, - const lev_byte **strings, - const double *weights, size_t **rows, - size_t *row) -{ - size_t *end; - size_t i, j; - size_t offset; /* row[0]; offset + len1 give together real len of string1 */ - double distsum = 0.0; /* sum of distances */ - - /* catch trivia case */ - if (len1 == 0) { - for (j = 0; j < n; j++) - distsum += rows[j][lengths[j]]*weights[j]; - return distsum; - } - - /* iterate through the strings and sum the distances */ - for (j = 0; j < n; j++) { - size_t *rowi = rows[j]; /* current row */ - size_t leni = lengths[j]; /* current length */ - size_t len = len1; /* temporary len1 for suffix stripping */ - const lev_byte *stringi = strings[j]; /* current string */ - - /* strip common suffix (prefix CAN'T be stripped) */ - while (len && leni && stringi[leni-1] == string1[len-1]) { - len--; - leni--; - } - - /* catch trivial cases */ - if (len == 0) { - distsum += rowi[leni]*weights[j]; - continue; - } - offset = rowi[0]; - if (leni == 0) { - distsum += (offset + len)*weights[j]; - continue; - } - - /* complete the matrix */ - memcpy(row, rowi, (leni + 1)*sizeof(size_t)); - end = row + leni; - - for (i = 1; i <= len; i++) { - size_t *p = row + 1; - const lev_byte char1 = string1[i - 1]; - const lev_byte *char2p = stringi; - size_t D, x; - - D = x = i + offset; - while (p <= end) { - size_t c3 = --D + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - D = *p; - D++; - if (x > D) - x = D; - *(p++) = x; - } - } - distsum += weights[j]*(*end); - } - - return distsum; -} - -/** - * lev_median_improve: - * @len: The length of @s. - * @s: The approximate generalized median string to be improved. - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * @medlength: Where the new length of the median should be stored. - * - * Tries to make @s a better generalized median string of @strings with - * small perturbations. - * - * It never returns a string with larger SOD than @s; in the worst case, a - * string identical to @s is returned. - * - * Returns: The improved generalized median, as a newly allocated string; its - * length is stored in @medlength. - **/ -_LEV_STATIC_PY lev_byte* -lev_median_improve(size_t len, const lev_byte *s, - size_t n, const size_t *lengths, - const lev_byte *strings[], - const double *weights, - size_t *medlength) -{ - size_t i; /* usually iterates over strings (n) */ - size_t j; /* usually iterates over characters */ - size_t pos; /* the position in the approximate median string we are - trying to change */ - lev_byte *symlist; /* list of symbols present in the strings, - we iterate over it insead of set of all - existing symbols */ - size_t symlistlen; /* length of symlist */ - size_t maxlen; /* maximum input string length */ - size_t stoplen; /* maximum tried median string length -- this is slightly - higher than maxlen, because the median string may be - longer than any of the input strings */ - size_t **rows; /* Levenshtein matrix rows for each string, we need to keep - only one previous row to construct the current one */ - size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, - shared among all strings */ - lev_byte *median; /* the resulting approximate median string */ - size_t medlen; /* the current approximate median string length */ - double minminsum; /* the current total distance sum */ - - /* find all symbols */ - symlist = make_symlist(n, lengths, strings, &symlistlen); - if (!symlist) { - *medlength = 0; - if (symlistlen != 0) - return NULL; - else - return calloc(1, sizeof(lev_byte)); - } - - /* allocate and initialize per-string matrix rows and a common work buffer */ - rows = (size_t**)safe_malloc(n, sizeof(size_t*)); - if (!rows) { - free(symlist); - return NULL; - } - maxlen = 0; - for (i = 0; i < n; i++) { - size_t *ri; - size_t leni = lengths[i]; - if (leni > maxlen) - maxlen = leni; - ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); - if (!ri) { - for (j = 0; j < i; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - for (j = 0; j <= leni; j++) - ri[j] = j; - } - stoplen = 2*maxlen + 1; - row = (size_t*)safe_malloc((stoplen + 2), sizeof(size_t)); - if (!row) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - - /* initialize median to given string */ - median = (lev_byte*)safe_malloc((stoplen+1), sizeof(lev_byte)); - if (!median) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(row); - free(symlist); - return NULL; - } - median++; /* we need -1st element for insertions a pos 0 */ - medlen = len; - memcpy(median, s, (medlen)*sizeof(lev_byte)); - minminsum = finish_distance_computations(medlen, median, - n, lengths, strings, - weights, rows, row); - - /* sequentially try perturbations on all positions */ - for (pos = 0; pos <= medlen; ) { - lev_byte orig_symbol, symbol; - LevEditType operation; - double sum; - - symbol = median[pos]; - operation = LEV_EDIT_KEEP; - /* IF pos < medlength: FOREACH symbol: try to replace the symbol - * at pos, if some lower the total distance, chooste the best */ - if (pos < medlen) { - orig_symbol = median[pos]; - for (j = 0; j < symlistlen; j++) { - if (symlist[j] == orig_symbol) - continue; - median[pos] = symlist[j]; - sum = finish_distance_computations(medlen - pos, median + pos, - n, lengths, strings, - weights, rows, row); - if (sum < minminsum) { - minminsum = sum; - symbol = symlist[j]; - operation = LEV_EDIT_REPLACE; - } - } - median[pos] = orig_symbol; - } - /* FOREACH symbol: try to add it at pos, if some lower the total - * distance, chooste the best (increase medlength) - * We simulate insertion by replacing the character at pos-1 */ - orig_symbol = *(median + pos - 1); - for (j = 0; j < symlistlen; j++) { - *(median + pos - 1) = symlist[j]; - sum = finish_distance_computations(medlen - pos + 1, median + pos - 1, - n, lengths, strings, - weights, rows, row); - if (sum < minminsum) { - minminsum = sum; - symbol = symlist[j]; - operation = LEV_EDIT_INSERT; - } - } - *(median + pos - 1) = orig_symbol; - /* IF pos < medlength: try to delete the symbol at pos, if it lowers - * the total distance remember it (decrease medlength) */ - if (pos < medlen) { - sum = finish_distance_computations(medlen - pos - 1, median + pos + 1, - n, lengths, strings, - weights, rows, row); - if (sum < minminsum) { - minminsum = sum; - operation = LEV_EDIT_DELETE; - } - } - /* actually perform the operation */ - switch (operation) { - case LEV_EDIT_REPLACE: - median[pos] = symbol; - break; - - case LEV_EDIT_INSERT: - memmove(median+pos+1, median+pos, - (medlen - pos)*sizeof(lev_byte)); - median[pos] = symbol; - medlen++; - break; - - case LEV_EDIT_DELETE: - memmove(median+pos, median + pos+1, - (medlen - pos-1)*sizeof(lev_byte)); - medlen--; - break; - - default: - break; - } - assert(medlen <= stoplen); - /* now the result is known, so recompute all matrix rows and move on */ - if (operation != LEV_EDIT_DELETE) { - symbol = median[pos]; - row[0] = pos + 1; - for (i = 0; i < n; i++) { - const lev_byte *stri = strings[i]; - size_t *oldrow = rows[i]; - size_t leni = lengths[i]; - size_t k; - /* compute a row of Levenshtein matrix */ - for (k = 1; k <= leni; k++) { - size_t c1 = oldrow[k] + 1; - size_t c2 = row[k - 1] + 1; - size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); - row[k] = c2 > c3 ? c3 : c2; - if (row[k] > c1) - row[k] = c1; - } - memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); - } - pos++; - } - } - - /* clean up */ - for (i = 0; i < n; i++) - free(rows[i]); - free(rows); - free(row); - free(symlist); - - /* return result */ - { - lev_byte *result = (lev_byte*)safe_malloc(medlen, sizeof(lev_byte)); - if (!result) { - free(median); - return NULL; - } - *medlength = medlen; - memcpy(result, median, medlen*sizeof(lev_byte)); - median--; - free(median); - return result; - } -} - -/* used internally in make_usymlist */ -typedef struct _HItem HItem; -struct _HItem { - lev_wchar c; - HItem *n; -}; - -/* free usmylist hash - * this is a separate function because we need it in more than one place */ -static void -free_usymlist_hash(HItem *symmap) -{ - size_t j; - - for (j = 0; j < 0x100; j++) { - HItem *p = symmap + j; - if (p->n == symmap || p->n == NULL) - continue; - p = p->n; - while (p) { - HItem *q = p; - p = p->n; - free(q); - } - } - free(symmap); -} - -/* compute the sets of symbols each string contains, and the set of symbols - * in any of them (symset). meanwhile, count how many different symbols - * there are (used below for symlist). */ -static lev_wchar* -make_usymlist(size_t n, const size_t *lengths, - const lev_wchar *strings[], size_t *symlistlen) -{ - lev_wchar *symlist; - size_t i, j; - HItem *symmap; - - j = 0; - for (i = 0; i < n; i++) - j += lengths[i]; - - *symlistlen = 0; - if (j == 0) - return NULL; - - /* find all symbols, use a kind of hash for storage */ - symmap = (HItem*)safe_malloc(0x100, sizeof(HItem)); - if (!symmap) { - *symlistlen = (size_t)(-1); - return NULL; - } - /* this is an ugly memory allocation avoiding hack: most hash elements - * will probably contain none or one symbols only so, when p->n is equal - * to symmap, it means there're no symbols yet, afters insterting the - * first one, p->n becomes normally NULL and then it behaves like an - * usual singly linked list */ - for (i = 0; i < 0x100; i++) - symmap[i].n = symmap; - for (i = 0; i < n; i++) { - const lev_wchar *stri = strings[i]; - for (j = 0; j < lengths[i]; j++) { - int c = stri[j]; - int key = (c + (c >> 7)) & 0xff; - HItem *p = symmap + key; - if (p->n == symmap) { - p->c = c; - p->n = NULL; - (*symlistlen)++; - continue; - } - while (p->c != c && p->n != NULL) - p = p->n; - if (p->c != c) { - p->n = (HItem*)malloc(sizeof(HItem)); - if (!p->n) { - free_usymlist_hash(symmap); - *symlistlen = (size_t)(-1); - return NULL; - } - p = p->n; - p->n = NULL; - p->c = c; - (*symlistlen)++; - } - } - } - /* create dense symbol table, so we can easily iterate over only characters - * present in the strings */ - { - size_t pos = 0; - symlist = (lev_wchar*)safe_malloc((*symlistlen), sizeof(lev_wchar)); - if (!symlist) { - free_usymlist_hash(symmap); - *symlistlen = (size_t)(-1); - return NULL; - } - for (j = 0; j < 0x100; j++) { - HItem *p = symmap + j; - while (p != NULL && p->n != symmap) { - symlist[pos++] = p->c; - p = p->n; - } - } - } - - /* free memory */ - free_usymlist_hash(symmap); - - return symlist; -} - -/** - * lev_u_greedy_median: - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * @medlength: Where the length of the median should be stored. - * - * Finds a generalized median string of @strings using the greedy algorithm. - * - * Note it's considerably more efficient to give a string with weight 2 - * than to store two identical strings in @strings (with weights 1). - * - * Returns: The generalized median, as a newly allocated string; its length - * is stored in @medlength. - **/ -_LEV_STATIC_PY lev_wchar* -lev_u_greedy_median(size_t n, const size_t *lengths, - const lev_wchar *strings[], - const double *weights, - size_t *medlength) -{ - size_t i; /* usually iterates over strings (n) */ - size_t j; /* usually iterates over characters */ - size_t len; /* usually iterates over the approximate median string */ - lev_wchar *symlist; /* list of symbols present in the strings, - we iterate over it insead of set of all - existing symbols */ - size_t symlistlen; /* length of symlistle */ - size_t maxlen; /* maximum input string length */ - size_t stoplen; /* maximum tried median string length -- this is slightly - higher than maxlen, because the median string may be - longer than any of the input strings */ - size_t **rows; /* Levenshtein matrix rows for each string, we need to keep - only one previous row to construct the current one */ - size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, - shared among all strings */ - lev_wchar *median; /* the resulting approximate median string */ - double *mediandist; /* the total distance of the best median string of - given length. warning! mediandist[0] is total - distance for empty string, while median[] itself - is normally zero-based */ - size_t bestlen; /* the best approximate median string length */ - - /* find all symbols */ - symlist = make_usymlist(n, lengths, strings, &symlistlen); - if (!symlist) { - *medlength = 0; - if (symlistlen != 0) - return NULL; - else - return calloc(1, sizeof(lev_wchar)); - } - - /* allocate and initialize per-string matrix rows and a common work buffer */ - rows = (size_t**)safe_malloc(n, sizeof(size_t*)); - if (!rows) { - free(symlist); - return NULL; - } - maxlen = 0; - for (i = 0; i < n; i++) { - size_t *ri; - size_t leni = lengths[i]; - if (leni > maxlen) - maxlen = leni; - ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); - if (!ri) { - for (j = 0; j < i; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - for (j = 0; j <= leni; j++) - ri[j] = j; - } - stoplen = 2*maxlen + 1; - row = (size_t*)safe_malloc((stoplen + 1), sizeof(size_t)); - if (!row) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - - /* compute final cost of string of length 0 (empty string may be also - * a valid answer) */ - median = (lev_wchar*)safe_malloc(stoplen, sizeof(lev_wchar)); - if (!median) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(row); - free(symlist); - return NULL; - } - mediandist = (double*)safe_malloc((stoplen + 1), sizeof(double)); - if (!mediandist) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(row); - free(symlist); - free(median); - return NULL; - } - mediandist[0] = 0.0; - for (i = 0; i < n; i++) - mediandist[0] += lengths[i]*weights[i]; - - /* build up the approximate median string symbol by symbol - * XXX: we actually exit on break below, but on the same condition */ - for (len = 1; len <= stoplen; len++) { - lev_wchar symbol; - double minminsum = LEV_INFINITY; - row[0] = len; - /* iterate over all symbols we may want to add */ - for (j = 0; j < symlistlen; j++) { - double totaldist = 0.0; - double minsum = 0.0; - symbol = symlist[j]; - /* sum Levenshtein distances from all the strings, with given weights */ - for (i = 0; i < n; i++) { - const lev_wchar *stri = strings[i]; - size_t *p = rows[i]; - size_t leni = lengths[i]; - size_t *end = rows[i] + leni; - size_t min = len; - size_t x = len; /* == row[0] */ - /* compute how another row of Levenshtein matrix would look for median - * string with this symbol added */ - while (p < end) { - size_t D = *(p++) + (symbol != *(stri++)); - x++; - if (x > D) - x = D; - if (x > *p + 1) - x = *p + 1; - if (x < min) - min = x; - } - minsum += min*weights[i]; - totaldist += x*weights[i]; - } - /* is this symbol better than all the others? */ - if (minsum < minminsum) { - minminsum = minsum; - mediandist[len] = totaldist; - median[len - 1] = symbol; - } - } - /* stop the iteration if we no longer need to recompute the matrix rows - * or when we are over maxlen and adding more characters doesn't seem - * useful */ - if (len == stoplen - || (len > maxlen && mediandist[len] > mediandist[len - 1])) { - stoplen = len; - break; - } - /* now the best symbol is known, so recompute all matrix rows for this - * one */ - symbol = median[len - 1]; - for (i = 0; i < n; i++) { - const lev_wchar *stri = strings[i]; - size_t *oldrow = rows[i]; - size_t leni = lengths[i]; - size_t k; - /* compute a row of Levenshtein matrix */ - for (k = 1; k <= leni; k++) { - size_t c1 = oldrow[k] + 1; - size_t c2 = row[k - 1] + 1; - size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); - row[k] = c2 > c3 ? c3 : c2; - if (row[k] > c1) - row[k] = c1; - } - memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); - } - } - - /* find the string with minimum total distance */ - bestlen = 0; - for (len = 1; len <= stoplen; len++) { - if (mediandist[len] < mediandist[bestlen]) - bestlen = len; - } - - /* clean up */ - for (i = 0; i < n; i++) - free(rows[i]); - free(rows); - free(row); - free(symlist); - free(mediandist); - - /* return result */ - { - lev_wchar *result = (lev_wchar*)safe_malloc(bestlen, sizeof(lev_wchar)); - if (!result) { - free(median); - return NULL; - } - memcpy(result, median, bestlen*sizeof(lev_wchar)); - free(median); - *medlength = bestlen; - return result; - } -} - -/* - * Knowing the distance matrices up to some row, finish the distance - * computations. - * - * string1, len1 are already shortened. - */ -static double -finish_udistance_computations(size_t len1, lev_wchar *string1, - size_t n, const size_t *lengths, - const lev_wchar **strings, - const double *weights, size_t **rows, - size_t *row) -{ - size_t *end; - size_t i, j; - size_t offset; /* row[0]; offset + len1 give together real len of string1 */ - double distsum = 0.0; /* sum of distances */ - - /* catch trivia case */ - if (len1 == 0) { - for (j = 0; j < n; j++) - distsum += rows[j][lengths[j]]*weights[j]; - return distsum; - } - - /* iterate through the strings and sum the distances */ - for (j = 0; j < n; j++) { - size_t *rowi = rows[j]; /* current row */ - size_t leni = lengths[j]; /* current length */ - size_t len = len1; /* temporary len1 for suffix stripping */ - const lev_wchar *stringi = strings[j]; /* current string */ - - /* strip common suffix (prefix CAN'T be stripped) */ - while (len && leni && stringi[leni-1] == string1[len-1]) { - len--; - leni--; - } - - /* catch trivial cases */ - if (len == 0) { - distsum += rowi[leni]*weights[j]; - continue; - } - offset = rowi[0]; - if (leni == 0) { - distsum += (offset + len)*weights[j]; - continue; - } - - /* complete the matrix */ - memcpy(row, rowi, (leni + 1)*sizeof(size_t)); - end = row + leni; - - for (i = 1; i <= len; i++) { - size_t *p = row + 1; - const lev_wchar char1 = string1[i - 1]; - const lev_wchar *char2p = stringi; - size_t D, x; - - D = x = i + offset; - while (p <= end) { - size_t c3 = --D + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - D = *p; - D++; - if (x > D) - x = D; - *(p++) = x; - } - } - distsum += weights[j]*(*end); - } - - return distsum; -} - -/** - * lev_u_median_improve: - * @len: The length of @s. - * @s: The approximate generalized median string to be improved. - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * @medlength: Where the new length of the median should be stored. - * - * Tries to make @s a better generalized median string of @strings with - * small perturbations. - * - * It never returns a string with larger SOD than @s; in the worst case, a - * string identical to @s is returned. - * - * Returns: The improved generalized median, as a newly allocated string; its - * length is stored in @medlength. - **/ -_LEV_STATIC_PY lev_wchar* -lev_u_median_improve(size_t len, const lev_wchar *s, - size_t n, const size_t *lengths, - const lev_wchar *strings[], - const double *weights, - size_t *medlength) -{ - size_t i; /* usually iterates over strings (n) */ - size_t j; /* usually iterates over characters */ - size_t pos; /* the position in the approximate median string we are - trying to change */ - lev_wchar *symlist; /* list of symbols present in the strings, - we iterate over it insead of set of all - existing symbols */ - size_t symlistlen; /* length of symlist */ - size_t maxlen; /* maximum input string length */ - size_t stoplen; /* maximum tried median string length -- this is slightly - higher than maxlen, because the median string may be - longer than any of the input strings */ - size_t **rows; /* Levenshtein matrix rows for each string, we need to keep - only one previous row to construct the current one */ - size_t *row; /* a scratch buffer for new Levenshtein matrix row computation, - shared among all strings */ - lev_wchar *median; /* the resulting approximate median string */ - size_t medlen; /* the current approximate median string length */ - double minminsum; /* the current total distance sum */ - - /* find all symbols */ - symlist = make_usymlist(n, lengths, strings, &symlistlen); - if (!symlist) { - *medlength = 0; - if (symlistlen != 0) - return NULL; - else - return calloc(1, sizeof(lev_wchar)); - } - - /* allocate and initialize per-string matrix rows and a common work buffer */ - rows = (size_t**)safe_malloc(n, sizeof(size_t*)); - if (!rows) { - free(symlist); - return NULL; - } - maxlen = 0; - for (i = 0; i < n; i++) { - size_t *ri; - size_t leni = lengths[i]; - if (leni > maxlen) - maxlen = leni; - ri = rows[i] = (size_t*)safe_malloc((leni + 1), sizeof(size_t)); - if (!ri) { - for (j = 0; j < i; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - for (j = 0; j <= leni; j++) - ri[j] = j; - } - stoplen = 2*maxlen + 1; - row = (size_t*)safe_malloc((stoplen + 2), sizeof(size_t)); - if (!row) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(symlist); - return NULL; - } - - /* initialize median to given string */ - median = (lev_wchar*)safe_malloc((stoplen+1), sizeof(lev_wchar)); - if (!median) { - for (j = 0; j < n; j++) - free(rows[j]); - free(rows); - free(row); - free(symlist); - return NULL; - } - median++; /* we need -1st element for insertions a pos 0 */ - medlen = len; - memcpy(median, s, (medlen)*sizeof(lev_wchar)); - minminsum = finish_udistance_computations(medlen, median, - n, lengths, strings, - weights, rows, row); - - /* sequentially try perturbations on all positions */ - for (pos = 0; pos <= medlen; ) { - lev_wchar orig_symbol, symbol; - LevEditType operation; - double sum; - - symbol = median[pos]; - operation = LEV_EDIT_KEEP; - /* IF pos < medlength: FOREACH symbol: try to replace the symbol - * at pos, if some lower the total distance, chooste the best */ - if (pos < medlen) { - orig_symbol = median[pos]; - for (j = 0; j < symlistlen; j++) { - if (symlist[j] == orig_symbol) - continue; - median[pos] = symlist[j]; - sum = finish_udistance_computations(medlen - pos, median + pos, - n, lengths, strings, - weights, rows, row); - if (sum < minminsum) { - minminsum = sum; - symbol = symlist[j]; - operation = LEV_EDIT_REPLACE; - } - } - median[pos] = orig_symbol; - } - /* FOREACH symbol: try to add it at pos, if some lower the total - * distance, chooste the best (increase medlength) - * We simulate insertion by replacing the character at pos-1 */ - orig_symbol = *(median + pos - 1); - for (j = 0; j < symlistlen; j++) { - *(median + pos - 1) = symlist[j]; - sum = finish_udistance_computations(medlen - pos + 1, median + pos - 1, - n, lengths, strings, - weights, rows, row); - if (sum < minminsum) { - minminsum = sum; - symbol = symlist[j]; - operation = LEV_EDIT_INSERT; - } - } - *(median + pos - 1) = orig_symbol; - /* IF pos < medlength: try to delete the symbol at pos, if it lowers - * the total distance remember it (decrease medlength) */ - if (pos < medlen) { - sum = finish_udistance_computations(medlen - pos - 1, median + pos + 1, - n, lengths, strings, - weights, rows, row); - if (sum < minminsum) { - minminsum = sum; - operation = LEV_EDIT_DELETE; - } - } - /* actually perform the operation */ - switch (operation) { - case LEV_EDIT_REPLACE: - median[pos] = symbol; - break; - - case LEV_EDIT_INSERT: - memmove(median+pos+1, median+pos, - (medlen - pos)*sizeof(lev_wchar)); - median[pos] = symbol; - medlen++; - break; - - case LEV_EDIT_DELETE: - memmove(median+pos, median + pos+1, - (medlen - pos-1)*sizeof(lev_wchar)); - medlen--; - break; - - default: - break; - } - assert(medlen <= stoplen); - /* now the result is known, so recompute all matrix rows and move on */ - if (operation != LEV_EDIT_DELETE) { - symbol = median[pos]; - row[0] = pos + 1; - for (i = 0; i < n; i++) { - const lev_wchar *stri = strings[i]; - size_t *oldrow = rows[i]; - size_t leni = lengths[i]; - size_t k; - /* compute a row of Levenshtein matrix */ - for (k = 1; k <= leni; k++) { - size_t c1 = oldrow[k] + 1; - size_t c2 = row[k - 1] + 1; - size_t c3 = oldrow[k - 1] + (symbol != stri[k - 1]); - row[k] = c2 > c3 ? c3 : c2; - if (row[k] > c1) - row[k] = c1; - } - memcpy(oldrow, row, (leni + 1)*sizeof(size_t)); - } - pos++; - } - } - - /* clean up */ - for (i = 0; i < n; i++) - free(rows[i]); - free(rows); - free(row); - free(symlist); - - /* return result */ - { - lev_wchar *result = (lev_wchar*)safe_malloc(medlen, sizeof(lev_wchar)); - if (!result) { - free(median); - return NULL; - } - *medlength = medlen; - memcpy(result, median, medlen*sizeof(lev_wchar)); - median--; - free(median); - return result; - } -} -/* }}} */ - -/**************************************************************************** - * - * Quick (voting) medians - * - ****************************************************************************/ -/* {{{ */ - -/* compute the sets of symbols each string contains, and the set of symbols - * in any of them (symset). meanwhile, count how many different symbols - * there are (used below for symlist). - * the symset is passed as an argument to avoid its allocation and - * deallocation when it's used in the caller too */ -static lev_byte* -make_symlistset(size_t n, const size_t *lengths, - const lev_byte *strings[], size_t *symlistlen, - double *symset) -{ - size_t i, j; - lev_byte *symlist; - - if (!symset) { - *symlistlen = (size_t)(-1); - return NULL; - } - memset(symset, 0, 0x100*sizeof(double)); /* XXX: needs IEEE doubles?! */ - *symlistlen = 0; - for (i = 0; i < n; i++) { - const lev_byte *stri = strings[i]; - for (j = 0; j < lengths[i]; j++) { - int c = stri[j]; - if (!symset[c]) { - (*symlistlen)++; - symset[c] = 1.0; - } - } - } - if (!*symlistlen) - return NULL; - - /* create dense symbol table, so we can easily iterate over only characters - * present in the strings */ - { - size_t pos = 0; - symlist = (lev_byte*)safe_malloc((*symlistlen), sizeof(lev_byte)); - if (!symlist) { - *symlistlen = (size_t)(-1); - return NULL; - } - for (j = 0; j < 0x100; j++) { - if (symset[j]) - symlist[pos++] = (lev_byte)j; - } - } - - return symlist; -} - -_LEV_STATIC_PY lev_byte* -lev_quick_median(size_t n, - const size_t *lengths, - const lev_byte *strings[], - const double *weights, - size_t *medlength) -{ - size_t symlistlen, len, i, j, k; - lev_byte *symlist; - lev_byte *median; /* the resulting string */ - double *symset; - double ml, wl; - - /* first check whether the result would be an empty string - * and compute resulting string length */ - ml = wl = 0.0; - for (i = 0; i < n; i++) { - ml += lengths[i]*weights[i]; - wl += weights[i]; - } - if (wl == 0.0) - return (lev_byte*)calloc(1, sizeof(lev_byte)); - ml = floor(ml/wl + 0.499999); - *medlength = len = ml; - if (!len) - return (lev_byte*)calloc(1, sizeof(lev_byte)); - median = (lev_byte*)safe_malloc(len, sizeof(lev_byte)); - if (!median) - return NULL; - - /* find the symbol set; - * now an empty symbol set is really a failure */ - symset = (double*)calloc(0x100, sizeof(double)); - if (!symset) { - free(median); - return NULL; - } - symlist = make_symlistset(n, lengths, strings, &symlistlen, symset); - if (!symlist) { - free(median); - free(symset); - return NULL; - } - - for (j = 0; j < len; j++) { - /* clear the symbol probabilities */ - if (symlistlen < 32) { - for (i = 0; i < symlistlen; i++) - symset[symlist[i]] = 0.0; - } - else - memset(symset, 0, 0x100*sizeof(double)); - - /* let all strings vote */ - for (i = 0; i < n; i++) { - const lev_byte *stri = strings[i]; - double weighti = weights[i]; - size_t lengthi = lengths[i]; - double start = lengthi/ml*j; - double end = start + lengthi/ml; - size_t istart = floor(start); - size_t iend = ceil(end); - - /* rounding errors can overflow the buffer */ - if (iend > lengthi) - iend = lengthi; - - for (k = istart+1; k < iend; k++) - symset[stri[k]] += weighti; - symset[stri[istart]] += weighti*(1+istart - start); - symset[stri[iend-1]] -= weighti*(iend - end); - } - - /* find the elected symbol */ - k = symlist[0]; - for (i = 1; i < symlistlen; i++) { - if (symset[symlist[i]] > symset[k]) - k = symlist[i]; - } - median[j] = k; - } - - free(symset); - free(symlist); - - return median; -} - -/* used internally in make_usymlistset */ -typedef struct _HQItem HQItem; -struct _HQItem { - lev_wchar c; - double s; - HQItem *n; -}; - -/* free usmylistset hash - * this is a separate function because we need it in more than one place */ -static void -free_usymlistset_hash(HQItem *symmap) -{ - size_t j; - - for (j = 0; j < 0x100; j++) { - HQItem *p = symmap + j; - if (p->n == symmap || p->n == NULL) - continue; - p = p->n; - while (p) { - HQItem *q = p; - p = p->n; - free(q); - } - } - free(symmap); -} - -/* compute the sets of symbols each string contains, and the set of symbols - * in any of them (symset). meanwhile, count how many different symbols - * there are (used below for symlist). - * the symset is passed as an argument to avoid its allocation and - * deallocation when it's used in the caller too */ -static lev_wchar* -make_usymlistset(size_t n, const size_t *lengths, - const lev_wchar *strings[], size_t *symlistlen, - HQItem *symmap) -{ - lev_wchar *symlist; - size_t i, j; - - j = 0; - for (i = 0; i < n; i++) - j += lengths[i]; - - *symlistlen = 0; - if (j == 0) - return NULL; - - /* this is an ugly memory allocation avoiding hack: most hash elements - * will probably contain none or one symbols only so, when p->n is equal - * to symmap, it means there're no symbols yet, afters insterting the - * first one, p->n becomes normally NULL and then it behaves like an - * usual singly linked list */ - for (i = 0; i < 0x100; i++) - symmap[i].n = symmap; - for (i = 0; i < n; i++) { - const lev_wchar *stri = strings[i]; - for (j = 0; j < lengths[i]; j++) { - int c = stri[j]; - int key = (c + (c >> 7)) & 0xff; - HQItem *p = symmap + key; - if (p->n == symmap) { - p->c = c; - p->n = NULL; - (*symlistlen)++; - continue; - } - while (p->c != c && p->n != NULL) - p = p->n; - if (p->c != c) { - p->n = (HQItem*)malloc(sizeof(HQItem)); - if (!p->n) { - *symlistlen = (size_t)(-1); - return NULL; - } - p = p->n; - p->n = NULL; - p->c = c; - (*symlistlen)++; - } - } - } - /* create dense symbol table, so we can easily iterate over only characters - * present in the strings */ - { - size_t pos = 0; - symlist = (lev_wchar*)safe_malloc((*symlistlen), sizeof(lev_wchar)); - if (!symlist) { - *symlistlen = (size_t)(-1); - return NULL; - } - for (j = 0; j < 0x100; j++) { - HQItem *p = symmap + j; - while (p != NULL && p->n != symmap) { - symlist[pos++] = p->c; - p = p->n; - } - } - } - - return symlist; -} - -_LEV_STATIC_PY lev_wchar* -lev_u_quick_median(size_t n, - const size_t *lengths, - const lev_wchar *strings[], - const double *weights, - size_t *medlength) -{ - size_t symlistlen, len, i, j, k; - lev_wchar *symlist; - lev_wchar *median; /* the resulting string */ - HQItem *symmap; - double ml, wl; - - /* first check whether the result would be an empty string - * and compute resulting string length */ - ml = wl = 0.0; - for (i = 0; i < n; i++) { - ml += lengths[i]*weights[i]; - wl += weights[i]; - } - if (wl == 0.0) - return (lev_wchar*)calloc(1, sizeof(lev_wchar)); - ml = floor(ml/wl + 0.499999); - *medlength = len = ml; - if (!len) - return (lev_wchar*)calloc(1, sizeof(lev_wchar)); - median = (lev_wchar*)safe_malloc(len, sizeof(lev_wchar)); - if (!median) - return NULL; - - /* find the symbol set; - * now an empty symbol set is really a failure */ - symmap = (HQItem*)safe_malloc(0x100, sizeof(HQItem)); - if (!symmap) { - free(median); - return NULL; - } - symlist = make_usymlistset(n, lengths, strings, &symlistlen, symmap); - if (!symlist) { - free(median); - free_usymlistset_hash(symmap); - return NULL; - } - - for (j = 0; j < len; j++) { - /* clear the symbol probabilities */ - for (i = 0; i < 0x100; i++) { - HQItem *p = symmap + i; - if (p->n == symmap) - continue; - while (p) { - p->s = 0.0; - p = p->n; - } - } - - /* let all strings vote */ - for (i = 0; i < n; i++) { - const lev_wchar *stri = strings[i]; - double weighti = weights[i]; - size_t lengthi = lengths[i]; - double start = lengthi/ml*j; - double end = start + lengthi/ml; - size_t istart = floor(start); - size_t iend = ceil(end); - - /* rounding errors can overflow the buffer */ - if (iend > lengthi) - iend = lengthi; - - /* the inner part, including the complete last character */ - for (k = istart+1; k < iend; k++) { - int c = stri[k]; - int key = (c + (c >> 7)) & 0xff; - HQItem *p = symmap + key; - while (p->c != c) - p = p->n; - p->s += weighti; - } - /* the initial fraction */ - { - int c = stri[istart]; - int key = (c + (c >> 7)) & 0xff; - HQItem *p = symmap + key; - while (p->c != c) - p = p->n; - p->s += weighti*(1+istart - start); - } - /* subtract what we counted from the last character but doesn't - * actually belong here. - * this strategy works also when istart+1 == iend (i.e., everything - * happens inside a one character) */ - { - int c = stri[iend-1]; - int key = (c + (c >> 7)) & 0xff; - HQItem *p = symmap + key; - while (p->c != c) - p = p->n; - p->s -= weighti*(iend - end); - } - } - - /* find the elected symbol */ - { - HQItem *max = NULL; - - for (i = 0; i < 0x100; i++) { - HQItem *p = symmap + i; - if (p->n == symmap) - continue; - while (p) { - if (!max || p->s > max->s) - max = p; - p = p->n; - } - } - median[j] = max->c; - } - } - - free_usymlistset_hash(symmap); - free(symlist); - - return median; -} -/* }}} */ - -/**************************************************************************** - * - * Set medians - * - ****************************************************************************/ -/* {{{ */ - -/** - * lev_set_median_index: - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * - * Finds the median string of a string set @strings. - * - * Returns: An index in @strings pointing to the set median, -1 in case of - * failure. - **/ -_LEV_STATIC_PY size_t -lev_set_median_index(size_t n, const size_t *lengths, - const lev_byte *strings[], - const double *weights) -{ - size_t minidx = 0; - double mindist = LEV_INFINITY; - size_t i; - long int *distances; - - distances = (long int*)safe_malloc((n*(n - 1)/2), sizeof(long int)); - if (!distances) - return (size_t)-1; - - memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); /* XXX */ - for (i = 0; i < n; i++) { - size_t j = 0; - double dist = 0.0; - const lev_byte *stri = strings[i]; - size_t leni = lengths[i]; - /* below diagonal */ - while (j < i && dist < mindist) { - size_t dindex = (i - 1)*(i - 2)/2 + j; - long int d; - if (distances[dindex] >= 0) - d = distances[dindex]; - else { - d = lev_edit_distance(lengths[j], strings[j], leni, stri, 0); - if (d < 0) { - free(distances); - return (size_t)-1; - } - } - dist += weights[j]*d; - j++; - } - j++; /* no need to compare item with itself */ - /* above diagonal */ - while (j < n && dist < mindist) { - size_t dindex = (j - 1)*(j - 2)/2 + i; - distances[dindex] = lev_edit_distance(lengths[j], strings[j], - leni, stri, 0); - if (distances[dindex] < 0) { - free(distances); - return (size_t)-1; - } - dist += weights[j]*distances[dindex]; - j++; - } - - if (dist < mindist) { - mindist = dist; - minidx = i; - } - } - - free(distances); - return minidx; -} - -/** - * lev_u_set_median_index: - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * - * Finds the median string of a string set @strings. - * - * Returns: An index in @strings pointing to the set median, -1 in case of - * failure. - **/ -_LEV_STATIC_PY size_t -lev_u_set_median_index(size_t n, const size_t *lengths, - const lev_wchar *strings[], - const double *weights) -{ - size_t minidx = 0; - double mindist = LEV_INFINITY; - size_t i; - long int *distances; - - distances = (long int*)safe_malloc((n*(n - 1)/2), sizeof(long int)); - if (!distances) - return (size_t)-1; - - memset(distances, 0xff, (n*(n - 1)/2)*sizeof(long int)); /* XXX */ - for (i = 0; i < n; i++) { - size_t j = 0; - double dist = 0.0; - const lev_wchar *stri = strings[i]; - size_t leni = lengths[i]; - /* below diagonal */ - while (j < i && dist < mindist) { - size_t dindex = (i - 1)*(i - 2)/2 + j; - long int d; - if (distances[dindex] >= 0) - d = distances[dindex]; - else { - d = lev_u_edit_distance(lengths[j], strings[j], leni, stri, 0); - if (d < 0) { - free(distances); - return (size_t)-1; - } - } - dist += weights[j]*d; - j++; - } - j++; /* no need to compare item with itself */ - /* above diagonal */ - while (j < n && dist < mindist) { - size_t dindex = (j - 1)*(j - 2)/2 + i; - distances[dindex] = lev_u_edit_distance(lengths[j], strings[j], - leni, stri, 0); - if (distances[dindex] < 0) { - free(distances); - return (size_t)-1; - } - dist += weights[j]*distances[dindex]; - j++; - } - - if (dist < mindist) { - mindist = dist; - minidx = i; - } - } - - free(distances); - return minidx; -} - -/** - * lev_set_median: - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * @medlength: Where the length of the median string should be stored. - * - * Finds the median string of a string set @strings. - * - * Returns: The set median as a newly allocate string, its length is stored - * in @medlength. %NULL in the case of failure. - **/ -_LEV_STATIC_PY lev_byte* -lev_set_median(size_t n, const size_t *lengths, - const lev_byte *strings[], - const double *weights, - size_t *medlength) -{ - size_t minidx = lev_set_median_index(n, lengths, strings, weights); - lev_byte *result; - - if (minidx == (size_t)-1) - return NULL; - - *medlength = lengths[minidx]; - if (!lengths[minidx]) - return (lev_byte*)calloc(1, sizeof(lev_byte)); - - result = (lev_byte*)safe_malloc(lengths[minidx], sizeof(lev_byte)); - if (!result) - return NULL; - return memcpy(result, strings[minidx], lengths[minidx]*sizeof(lev_byte)); -} - -/** - * lev_u_set_median: - * @n: The size of @lengths, @strings, and @weights. - * @lengths: The lengths of @strings. - * @strings: An array of strings, that may contain NUL characters. - * @weights: The string weights (they behave exactly as multiplicities, though - * any positive value is allowed, not just integers). - * @medlength: Where the length of the median string should be stored. - * - * Finds the median string of a string set @strings. - * - * Returns: The set median as a newly allocate string, its length is stored - * in @medlength. %NULL in the case of failure. - **/ -_LEV_STATIC_PY lev_wchar* -lev_u_set_median(size_t n, const size_t *lengths, - const lev_wchar *strings[], - const double *weights, - size_t *medlength) -{ - size_t minidx = lev_u_set_median_index(n, lengths, strings, weights); - lev_wchar *result; - - if (minidx == (size_t)-1) - return NULL; - - *medlength = lengths[minidx]; - if (!lengths[minidx]) - return (lev_wchar*)calloc(1, sizeof(lev_wchar)); - - result = (lev_wchar*)safe_malloc(lengths[minidx], sizeof(lev_wchar)); - if (!result) - return NULL; - return memcpy(result, strings[minidx], lengths[minidx]*sizeof(lev_wchar)); -} -/* }}} */ - -/**************************************************************************** - * - * Set, sequence distances - * - ****************************************************************************/ -/* {{{ */ - -/** - * lev_edit_seq_distance: - * @n1: The length of @lengths1 and @strings1. - * @lengths1: The lengths of strings in @strings1. - * @strings1: An array of strings that may contain NUL characters. - * @n2: The length of @lengths2 and @strings2. - * @lengths2: The lengths of strings in @strings2. - * @strings2: An array of strings that may contain NUL characters. - * - * Finds the distance between string sequences @strings1 and @strings2. - * - * In other words, this is a double-Levenshtein algorithm. - * - * The cost of string replace operation is based on string similarity: it's - * zero for identical strings and 2 for completely unsimilar strings. - * - * Returns: The distance of the two sequences. - **/ -_LEV_STATIC_PY double -lev_edit_seq_distance(size_t n1, const size_t *lengths1, - const lev_byte *strings1[], - size_t n2, const size_t *lengths2, - const lev_byte *strings2[]) -{ - size_t i; - double *row; /* we only need to keep one row of costs */ - double *end; - - /* strip common prefix */ - while (n1 > 0 && n2 > 0 - && *lengths1 == *lengths2 - && memcmp(*strings1, *strings2, - *lengths1*sizeof(lev_byte)) == 0) { - n1--; - n2--; - strings1++; - strings2++; - lengths1++; - lengths2++; - } - - /* strip common suffix */ - while (n1 > 0 && n2 > 0 - && lengths1[n1-1] == lengths2[n2-1] - && memcmp(strings1[n1-1], strings2[n2-1], - lengths1[n1-1]*sizeof(lev_byte)) == 0) { - n1--; - n2--; - } - - /* catch trivial cases */ - if (n1 == 0) - return n2; - if (n2 == 0) - return n1; - - /* make the inner cycle (i.e. strings2) the longer one */ - if (n1 > n2) { - size_t nx = n1; - const size_t *lx = lengths1; - const lev_byte **sx = strings1; - n1 = n2; - n2 = nx; - lengths1 = lengths2; - lengths2 = lx; - strings1 = strings2; - strings2 = sx; - } - n1++; - n2++; - - /* initalize first row */ - row = (double*)safe_malloc(n2, sizeof(double)); - if (!row) - return -1.0; - end = row + n2 - 1; - for (i = 0; i < n2; i++) - row[i] = (double)i; - - /* go through the matrix and compute the costs. yes, this is an extremely - * obfuscated version, but also extremely memory-conservative and relatively - * fast. */ - for (i = 1; i < n1; i++) { - double *p = row + 1; - const lev_byte *str1 = strings1[i - 1]; - const size_t len1 = lengths1[i - 1]; - const lev_byte **str2p = strings2; - const size_t *len2p = lengths2; - double D = i - 1.0; - double x = i; - while (p <= end) { - size_t l = len1 + *len2p; - double q; - if (l == 0) - q = D; - else { - size_t d = lev_edit_distance(len1, str1, *(len2p++), *(str2p++), 1); - if (d == (size_t)(-1)) { - free(row); - return -1.0; - } - q = D + 2.0/l*d; - } - x += 1.0; - if (x > q) - x = q; - D = *p; - if (x > D + 1.0) - x = D + 1.0; - *(p++) = x; - } - } - - { - double q = *end; - free(row); - return q; - } -} - -/** - * lev_u_edit_seq_distance: - * @n1: The length of @lengths1 and @strings1. - * @lengths1: The lengths of strings in @strings1. - * @strings1: An array of strings that may contain NUL characters. - * @n2: The length of @lengths2 and @strings2. - * @lengths2: The lengths of strings in @strings2. - * @strings2: An array of strings that may contain NUL characters. - * - * Finds the distance between string sequences @strings1 and @strings2. - * - * In other words, this is a double-Levenshtein algorithm. - * - * The cost of string replace operation is based on string similarity: it's - * zero for identical strings and 2 for completely unsimilar strings. - * - * Returns: The distance of the two sequences. - **/ -_LEV_STATIC_PY double -lev_u_edit_seq_distance(size_t n1, const size_t *lengths1, - const lev_wchar *strings1[], - size_t n2, const size_t *lengths2, - const lev_wchar *strings2[]) -{ - size_t i; - double *row; /* we only need to keep one row of costs */ - double *end; - - /* strip common prefix */ - while (n1 > 0 && n2 > 0 - && *lengths1 == *lengths2 - && memcmp(*strings1, *strings2, - *lengths1*sizeof(lev_wchar)) == 0) { - n1--; - n2--; - strings1++; - strings2++; - lengths1++; - lengths2++; - } - - /* strip common suffix */ - while (n1 > 0 && n2 > 0 - && lengths1[n1-1] == lengths2[n2-1] - && memcmp(strings1[n1-1], strings2[n2-1], - lengths1[n1-1]*sizeof(lev_wchar)) == 0) { - n1--; - n2--; - } - - /* catch trivial cases */ - if (n1 == 0) - return (double)n2; - if (n2 == 0) - return (double)n1; - - /* make the inner cycle (i.e. strings2) the longer one */ - if (n1 > n2) { - size_t nx = n1; - const size_t *lx = lengths1; - const lev_wchar **sx = strings1; - n1 = n2; - n2 = nx; - lengths1 = lengths2; - lengths2 = lx; - strings1 = strings2; - strings2 = sx; - } - n1++; - n2++; - - /* initalize first row */ - row = (double*)safe_malloc(n2, sizeof(double)); - if (!row) - return -1.0; - end = row + n2 - 1; - for (i = 0; i < n2; i++) - row[i] = (double)i; - - /* go through the matrix and compute the costs. yes, this is an extremely - * obfuscated version, but also extremely memory-conservative and relatively - * fast. */ - for (i = 1; i < n1; i++) { - double *p = row + 1; - const lev_wchar *str1 = strings1[i - 1]; - const size_t len1 = lengths1[i - 1]; - const lev_wchar **str2p = strings2; - const size_t *len2p = lengths2; - double D = i - 1.0; - double x = i; - while (p <= end) { - size_t l = len1 + *len2p; - double q; - if (l == 0) - q = D; - else { - size_t d = lev_u_edit_distance(len1, str1, *(len2p++), *(str2p++), 1); - if (d == (size_t)(-1)) { - free(row); - return -1.0; - } - q = D + 2.0/l*d; - } - x += 1.0; - if (x > q) - x = q; - D = *p; - if (x > D + 1.0) - x = D + 1.0; - *(p++) = x; - } - } - - { - double q = *end; - free(row); - return q; - } -} - -/** - * lev_set_distance: - * @n1: The length of @lengths1 and @strings1. - * @lengths1: The lengths of strings in @strings1. - * @strings1: An array of strings that may contain NUL characters. - * @n2: The length of @lengths2 and @strings2. - * @lengths2: The lengths of strings in @strings2. - * @strings2: An array of strings that may contain NUL characters. - * - * Finds the distance between string sets @strings1 and @strings2. - * - * The difference from lev_edit_seq_distance() is that order doesn't matter. - * The optimal association of @strings1 and @strings2 is found first and - * the similarity is computed for that. - * - * Uses sequential Munkers-Blackman algorithm. - * - * Returns: The distance of the two sets. - **/ -_LEV_STATIC_PY double -lev_set_distance(size_t n1, const size_t *lengths1, - const lev_byte *strings1[], - size_t n2, const size_t *lengths2, - const lev_byte *strings2[]) -{ - double *dists; /* the (modified) distance matrix, indexed [row*n1 + col] */ - double *r; - size_t i, j; - size_t *map; - double sum; - - /* catch trivial cases */ - if (n1 == 0) - return (double)n2; - if (n2 == 0) - return (double)n1; - - /* make the number of columns (n1) smaller than the number of rows */ - if (n1 > n2) { - size_t nx = n1; - const size_t *lx = lengths1; - const lev_byte **sx = strings1; - n1 = n2; - n2 = nx; - lengths1 = lengths2; - lengths2 = lx; - strings1 = strings2; - strings2 = sx; - } - - /* compute distances from each to each */ - r = dists = (double*)safe_malloc_3(n1, n2, sizeof(double)); - if (!r) - return -1.0; - for (i = 0; i < n2; i++) { - size_t len2 = lengths2[i]; - const lev_byte *str2 = strings2[i]; - const size_t *len1p = lengths1; - const lev_byte **str1p = strings1; - for (j = 0; j < n1; j++) { - size_t l = len2 + *len1p; - if (l == 0) - *(r++) = 0.0; - else { - size_t d = lev_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1); - if (d == (size_t)(-1)) { - free(r); - return -1.0; - } - *(r++) = (double)d/l; - } - } - } - - /* find the optimal mapping between the two sets */ - map = munkers_blackman(n1, n2, dists); - if (!map) - return -1.0; - - /* sum the set distance */ - sum = n2 - n1; - for (j = 0; j < n1; j++) { - size_t l; - i = map[j]; - l = lengths1[j] + lengths2[i]; - if (l > 0) { - size_t d = lev_edit_distance(lengths1[j], strings1[j], - lengths2[i], strings2[i], 1); - if (d == (size_t)(-1)) { - free(map); - return -1.0; - } - sum += 2.0*d/l; - } - } - free(map); - - return sum; -} - -/** - * lev_u_set_distance: - * @n1: The length of @lengths1 and @strings1. - * @lengths1: The lengths of strings in @strings1. - * @strings1: An array of strings that may contain NUL characters. - * @n2: The length of @lengths2 and @strings2. - * @lengths2: The lengths of strings in @strings2. - * @strings2: An array of strings that may contain NUL characters. - * - * Finds the distance between string sets @strings1 and @strings2. - * - * The difference from lev_u_edit_seq_distance() is that order doesn't matter. - * The optimal association of @strings1 and @strings2 is found first and - * the similarity is computed for that. - * - * Uses sequential Munkers-Blackman algorithm. - * - * Returns: The distance of the two sets. - **/ -_LEV_STATIC_PY double -lev_u_set_distance(size_t n1, const size_t *lengths1, - const lev_wchar *strings1[], - size_t n2, const size_t *lengths2, - const lev_wchar *strings2[]) -{ - double *dists; /* the (modified) distance matrix, indexed [row*n1 + col] */ - double *r; - size_t i, j; - size_t *map; - double sum; - - /* catch trivial cases */ - if (n1 == 0) - return (double)n2; - if (n2 == 0) - return (double)n1; - - /* make the number of columns (n1) smaller than the number of rows */ - if (n1 > n2) { - size_t nx = n1; - const size_t *lx = lengths1; - const lev_wchar **sx = strings1; - n1 = n2; - n2 = nx; - lengths1 = lengths2; - lengths2 = lx; - strings1 = strings2; - strings2 = sx; - } - - /* compute distances from each to each */ - r = dists = (double*)safe_malloc_3(n1, n2, sizeof(double)); - if (!r) - return -1.0; - for (i = 0; i < n2; i++) { - size_t len2 = lengths2[i]; - const lev_wchar *str2 = strings2[i]; - const size_t *len1p = lengths1; - const lev_wchar **str1p = strings1; - for (j = 0; j < n1; j++) { - size_t l = len2 + *len1p; - if (l == 0) - *(r++) = 0.0; - else { - size_t d = lev_u_edit_distance(len2, str2, *(len1p++), *(str1p)++, 1); - if (d == (size_t)(-1)) { - free(r); - return -1.0; - } - *(r++) = (double)d/l; - } - } - } - - /* find the optimal mapping between the two sets */ - map = munkers_blackman(n1, n2, dists); - if (!map) - return -1.0; - - /* sum the set distance */ - sum = n2 - n1; - for (j = 0; j < n1; j++) { - size_t l; - i = map[j]; - l = lengths1[j] + lengths2[i]; - if (l > 0) { - size_t d = lev_u_edit_distance(lengths1[j], strings1[j], - lengths2[i], strings2[i], 1); - if (d == (size_t)(-1)) { - free(map); - return -1.0; - } - sum += 2.0*d/l; - } - } - free(map); - - return sum; -} - -/* - * Munkers-Blackman algorithm. - */ -static size_t* -munkers_blackman(size_t n1, size_t n2, double *dists) -{ - size_t i, j; - size_t *covc, *covr; /* 1 if column/row is covered */ - /* these contain 1-based indices, so we can use zero as `none' - * zstarr: column of a z* in given row - * zstarc: row of a z* in given column - * zprimer: column of a z' in given row */ - size_t *zstarr, *zstarc, *zprimer; - - /* allocate memory */ - covc = calloc(n1, sizeof(size_t)); - if (!covc) - return NULL; - zstarc = calloc(n1, sizeof(size_t)); - if (!zstarc) { - free(covc); - return NULL; - } - covr = calloc(n2, sizeof(size_t)); - if (!covr) { - free(zstarc); - free(covc); - return NULL; - } - zstarr = calloc(n2, sizeof(size_t)); - if (!zstarr) { - free(covr); - free(zstarc); - free(covc); - return NULL; - } - zprimer = calloc(n2, sizeof(size_t)); - if (!zprimer) { - free(zstarr); - free(covr); - free(zstarc); - free(covc); - return NULL; - } - - /* step 0 (subtract minimal distance) and step 1 (find zeroes) */ - for (j = 0; j < n1; j++) { - size_t minidx = 0; - double *col = dists + j; - double min = *col; - double *p = col + n1; - for (i = 1; i < n2; i++) { - if (min > *p) { - minidx = i; - min = *p; - } - p += n1; - } - /* subtract */ - p = col; - for (i = 0; i < n2; i++) { - *p -= min; - if (*p < LEV_EPSILON) - *p = 0.0; - p += n1; - } - /* star the zero, if possible */ - if (!zstarc[j] && !zstarr[minidx]) { - zstarc[j] = minidx + 1; - zstarr[minidx] = j + 1; - } - else { - /* otherwise try to find some other */ - p = col; - for (i = 0; i < n2; i++) { - if (i != minidx && *p == 0.0 - && !zstarc[j] && !zstarr[i]) { - zstarc[j] = i + 1; - zstarr[i] = j + 1; - break; - } - p += n1; - } - } - } - - /* main */ - while (1) { - /* step 2 (cover columns containing z*) */ - { - size_t nc = 0; - for (j = 0; j < n1; j++) { - if (zstarc[j]) { - covc[j] = 1; - nc++; - } - } - if (nc == n1) - break; - } - - /* step 3 (find uncovered zeroes) */ - while (1) { - step_3: - /* search uncovered matrix entries */ - for (j = 0; j < n1; j++) { - double *p = dists + j; - if (covc[j]) - continue; - for (i = 0; i < n2; i++) { - if (!covr[i] && *p == 0.0) { - /* when a zero is found, prime it */ - zprimer[i] = j + 1; - if (zstarr[i]) { - /* if there's a z* in the same row, - * uncover the column, cover the row and redo */ - covr[i] = 1; - covc[zstarr[i] - 1] = 0; - goto step_3; - } - /* if there's no z*, - * we are at the end of our path an can convert z' - * to z* */ - goto step_4; - } - p += n1; - } - } - - /* step 5 (new zero manufacturer) - * we can't get here, unless no zero is found at all */ - { - /* find the smallest uncovered entry */ - double min = LEV_INFINITY; - for (j = 0; j < n1; j++) { - double *p = dists + j; - if (covc[j]) - continue; - for (i = 0; i < n2; i++) { - if (!covr[i] && min > *p) { - min = *p; - } - p += n1; - } - } - /* add it to all covered rows */ - for (i = 0; i < n2; i++) { - double *p = dists + i*n1; - if (!covr[i]) - continue; - for (j = 0; j < n1; j++) - *(p++) += min; - } - /* subtract if from all uncovered columns */ - for (j = 0; j < n1; j++) { - double *p = dists + j; - if (covc[j]) - continue; - for (i = 0; i < n2; i++) { - *p -= min; - if (*p < LEV_EPSILON) - *p = 0.0; - p += n1; - } - } - } - } - - /* step 4 (increment the number of z*) - * i is the row number (we get it from step 3) */ - step_4: - i++; - do { - size_t x = i; - - i--; - j = zprimer[i] - 1; /* move to z' in the same row */ - zstarr[i] = j + 1; /* mark it as z* in row buffer */ - i = zstarc[j]; /* move to z* in the same column */ - zstarc[j] = x; /* mark the z' as being new z* */ - } while (i); - memset(zprimer, 0, n2*sizeof(size_t)); - memset(covr, 0, n2*sizeof(size_t)); - memset(covc, 0, n1*sizeof(size_t)); - } - - free(dists); - free(covc); - free(covr); - /*free(zstarc); this is the result */ - free(zstarr); - free(zprimer); - - for (j = 0; j < n1; j++) - zstarc[j]--; - return zstarc; -} -/* }}} */ - -/**************************************************************************** - * - * Editops and other difflib-like stuff. - * - ****************************************************************************/ -/* {{{ */ - -/** - * lev_editops_check_errors: - * @len1: The length of an eventual @ops source string. - * @len2: The length of an eventual @ops destination string. - * @n: The length of @ops. - * @ops: An array of elementary edit operations. - * - * Checks whether @ops is consistent and applicable as a partial edit from a - * string of length @len1 to a string of length @len2. - * - * Returns: Zero if @ops seems OK, a nonzero error code otherwise. - **/ -_LEV_STATIC_PY int -lev_editops_check_errors(size_t len1, size_t len2, - size_t n, const LevEditOp *ops) -{ - const LevEditOp *o; - size_t i; - - if (!n) - return LEV_EDIT_ERR_OK; - - /* check bounds */ - o = ops; - for (i = n; i; i--, o++) { - if (o->type >= LEV_EDIT_LAST) - return LEV_EDIT_ERR_TYPE; - if (o->spos > len1 || o->dpos > len2) - return LEV_EDIT_ERR_OUT; - if (o->spos == len1 && o->type != LEV_EDIT_INSERT) - return LEV_EDIT_ERR_OUT; - if (o->dpos == len2 && o->type != LEV_EDIT_DELETE) - return LEV_EDIT_ERR_OUT; - } - - /* check ordering */ - o = ops + 1; - for (i = n - 1; i; i--, o++, ops++) { - if (o->spos < ops->spos || o->dpos < ops->dpos) - return LEV_EDIT_ERR_ORDER; - } - - return LEV_EDIT_ERR_OK; -} - -/** - * lev_opcodes_check_errors: - * @len1: The length of an eventual @ops source string. - * @len2: The length of an eventual @ops destination string. - * @nb: The length of @bops. - * @bops: An array of difflib block edit operation codes. - * - * Checks whether @bops is consistent and applicable as an edit from a - * string of length @len1 to a string of length @len2. - * - * Returns: Zero if @bops seems OK, a nonzero error code otherwise. - **/ -_LEV_STATIC_PY int -lev_opcodes_check_errors(size_t len1, size_t len2, - size_t nb, const LevOpCode *bops) -{ - const LevOpCode *b; - size_t i; - - if (!nb) - return 1; - - /* check completenes */ - if (bops->sbeg || bops->dbeg - || bops[nb - 1].send != len1 || bops[nb - 1].dend != len2) - return LEV_EDIT_ERR_SPAN; - - /* check bounds and block consistency */ - b = bops; - for (i = nb; i; i--, b++) { - if (b->send > len1 || b->dend > len2) - return LEV_EDIT_ERR_OUT; - switch (b->type) { - case LEV_EDIT_KEEP: - case LEV_EDIT_REPLACE: - if (b->dend - b->dbeg != b->send - b->sbeg || b->dend == b->dbeg) - return LEV_EDIT_ERR_BLOCK; - break; - - case LEV_EDIT_INSERT: - if (b->dend - b->dbeg == 0 || b->send - b->sbeg != 0) - return LEV_EDIT_ERR_BLOCK; - break; - - case LEV_EDIT_DELETE: - if (b->send - b->sbeg == 0 || b->dend - b->dbeg != 0) - return LEV_EDIT_ERR_BLOCK; - break; - - default: - return LEV_EDIT_ERR_TYPE; - break; - } - } - - /* check ordering */ - b = bops + 1; - for (i = nb - 1; i; i--, b++, bops++) { - if (b->sbeg != bops->send || b->dbeg != bops->dend) - return LEV_EDIT_ERR_ORDER; - } - - return LEV_EDIT_ERR_OK; -} - -/** - * lev_editops_invert: - * @n: The length of @ops. - * @ops: An array of elementary edit operations. - * - * Inverts the sense of @ops. It is modified in place. - * - * In other words, @ops becomes a valid partial edit for the original source - * and destination strings with their roles exchanged. - **/ -_LEV_STATIC_PY void -lev_editops_invert(size_t n, LevEditOp *ops) -{ - size_t i; - - for (i = n; i; i--, ops++) { - size_t z; - - z = ops->dpos; - ops->dpos = ops->spos; - ops->spos = z; - if (ops->type & 2) - ops->type ^= 1; - } -} - -/** - * lev_editops_apply: - * @len1: The length of @string1. - * @string1: A string of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A string of length @len2, may contain NUL characters. - * @n: The size of @ops. - * @ops: An array of elementary edit operations. - * @len: Where the size of the resulting string should be stored. - * - * Applies a partial edit @ops from @string1 to @string2. - * - * NB: @ops is not checked for applicability. - * - * Returns: The result of the partial edit as a newly allocated string, its - * length is stored in @len. - **/ -_LEV_STATIC_PY lev_byte* -lev_editops_apply(size_t len1, const lev_byte *string1, - __attribute__((unused)) size_t len2, const lev_byte *string2, - size_t n, const LevEditOp *ops, - size_t *len) -{ - lev_byte *dst, *dpos; /* destination string */ - const lev_byte *spos; /* source string position */ - size_t i, j; - - /* this looks too complex for such a simple task, but note ops is not - * a complete edit sequence, we have to be able to apply anything anywhere */ - dpos = dst = (lev_byte*)safe_malloc((n + len1), sizeof(lev_byte)); - if (!dst) { - *len = (size_t)(-1); - return NULL; - } - spos = string1; - for (i = n; i; i--, ops++) { - /* XXX: this fine with gcc internal memcpy, but when memcpy is - * actually a function, it may be pretty slow */ - j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP); - if (j) { - memcpy(dpos, spos, j*sizeof(lev_byte)); - spos += j; - dpos += j; - } - switch (ops->type) { - case LEV_EDIT_DELETE: - spos++; - break; - - case LEV_EDIT_REPLACE: - spos++; - case LEV_EDIT_INSERT: - *(dpos++) = string2[ops->dpos]; - break; - - default: - break; - } - } - j = len1 - (spos - string1); - if (j) { - memcpy(dpos, spos, j*sizeof(lev_byte)); - spos += j; - dpos += j; - } - - *len = dpos - dst; - /* possible realloc failure is detected with *len != 0 */ - return realloc(dst, *len*sizeof(lev_byte)); -} - -/** - * lev_u_editops_apply: - * @len1: The length of @string1. - * @string1: A string of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A string of length @len2, may contain NUL characters. - * @n: The size of @ops. - * @ops: An array of elementary edit operations. - * @len: Where the size of the resulting string should be stored. - * - * Applies a partial edit @ops from @string1 to @string2. - * - * NB: @ops is not checked for applicability. - * - * Returns: The result of the partial edit as a newly allocated string, its - * length is stored in @len. - **/ -_LEV_STATIC_PY lev_wchar* -lev_u_editops_apply(size_t len1, const lev_wchar *string1, - __attribute__((unused)) size_t len2, - const lev_wchar *string2, - size_t n, const LevEditOp *ops, - size_t *len) -{ - lev_wchar *dst, *dpos; /* destination string */ - const lev_wchar *spos; /* source string position */ - size_t i, j; - - /* this looks too complex for such a simple task, but note ops is not - * a complete edit sequence, we have to be able to apply anything anywhere */ - dpos = dst = (lev_wchar*)safe_malloc((n + len1), sizeof(lev_wchar)); - if (!dst) { - *len = (size_t)(-1); - return NULL; - } - spos = string1; - for (i = n; i; i--, ops++) { - /* XXX: this is fine with gcc internal memcpy, but when memcpy is - * actually a function, it may be pretty slow */ - j = ops->spos - (spos - string1) + (ops->type == LEV_EDIT_KEEP); - if (j) { - memcpy(dpos, spos, j*sizeof(lev_wchar)); - spos += j; - dpos += j; - } - switch (ops->type) { - case LEV_EDIT_DELETE: - spos++; - break; - - case LEV_EDIT_REPLACE: - spos++; - case LEV_EDIT_INSERT: - *(dpos++) = string2[ops->dpos]; - break; - - default: - break; - } - } - j = len1 - (spos - string1); - if (j) { - memcpy(dpos, spos, j*sizeof(lev_wchar)); - spos += j; - dpos += j; - } - - *len = dpos - dst; - /* possible realloc failure is detected with *len != 0 */ - return realloc(dst, *len*sizeof(lev_wchar)); -} - -/** - * editops_from_cost_matrix: - * @len1: The length of @string1. - * @string1: A string of length @len1, may contain NUL characters. - * @o1: The offset where the matrix starts from the start of @string1. - * @len2: The length of @string2. - * @string2: A string of length @len2, may contain NUL characters. - * @o2: The offset where the matrix starts from the start of @string2. - * @matrix: The cost matrix. - * @n: Where the number of edit operations should be stored. - * - * Reconstructs the optimal edit sequence from the cost matrix @matrix. - * - * The matrix is freed. - * - * Returns: The optimal edit sequence, as a newly allocated array of - * elementary edit operations, it length is stored in @n. - **/ -static LevEditOp* -editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1, - size_t len2, const lev_byte *string2, size_t off2, - size_t *matrix, size_t *n) -{ - size_t *p; - size_t i, j, pos; - LevEditOp *ops; - int dir = 0; - - pos = *n = matrix[len1*len2 - 1]; - if (!*n) { - free(matrix); - return NULL; - } - ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); - if (!ops) { - free(matrix); - *n = (size_t)(-1); - return NULL; - } - i = len1 - 1; - j = len2 - 1; - p = matrix + len1*len2 - 1; - while (i || j) { - /* prefer contiuning in the same direction */ - if (dir < 0 && j && *p == *(p - 1) + 1) { - pos--; - ops[pos].type = LEV_EDIT_INSERT; - ops[pos].spos = i + off1; - ops[pos].dpos = --j + off2; - p--; - continue; - } - if (dir > 0 && i && *p == *(p - len2) + 1) { - pos--; - ops[pos].type = LEV_EDIT_DELETE; - ops[pos].spos = --i + off1; - ops[pos].dpos = j + off2; - p -= len2; - continue; - } - if (i && j && *p == *(p - len2 - 1) - && string1[i - 1] == string2[j - 1]) { - /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */ - i--; - j--; - p -= len2 + 1; - dir = 0; - continue; - } - if (i && j && *p == *(p - len2 - 1) + 1) { - pos--; - ops[pos].type = LEV_EDIT_REPLACE; - ops[pos].spos = --i + off1; - ops[pos].dpos = --j + off2; - p -= len2 + 1; - dir = 0; - continue; - } - /* we cant't turn directly from -1 to 1, in this case it would be better - * to go diagonally, but check it (dir == 0) */ - if (dir == 0 && j && *p == *(p - 1) + 1) { - pos--; - ops[pos].type = LEV_EDIT_INSERT; - ops[pos].spos = i + off1; - ops[pos].dpos = --j + off2; - p--; - dir = -1; - continue; - } - if (dir == 0 && i && *p == *(p - len2) + 1) { - pos--; - ops[pos].type = LEV_EDIT_DELETE; - ops[pos].spos = --i + off1; - ops[pos].dpos = j + off2; - p -= len2; - dir = 1; - continue; - } - /* coredump right now, later might be too late ;-) */ - assert("lost in the cost matrix" == NULL); - } - free(matrix); - - return ops; -} - -/** - * lev_editops_find: - * @len1: The length of @string1. - * @string1: A string of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A string of length @len2, may contain NUL characters. - * @n: Where the number of edit operations should be stored. - * - * Find an optimal edit sequence from @string1 to @string2. - * - * When there's more than one optimal sequence, a one is arbitrarily (though - * deterministically) chosen. - * - * Returns: The optimal edit sequence, as a newly allocated array of - * elementary edit operations, it length is stored in @n. - * It is normalized, i.e., keep operations are not included. - **/ -_LEV_STATIC_PY LevEditOp* -lev_editops_find(size_t len1, const lev_byte *string1, - size_t len2, const lev_byte *string2, - size_t *n) -{ - size_t len1o, len2o; - size_t i; - size_t *matrix; /* cost matrix */ - - /* strip common prefix */ - len1o = 0; - while (len1 > 0 && len2 > 0 && *string1 == *string2) { - len1--; - len2--; - string1++; - string2++; - len1o++; - } - len2o = len1o; - - /* strip common suffix */ - while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { - len1--; - len2--; - } - len1++; - len2++; - - /* initialize first row and column */ - matrix = (size_t*)safe_malloc_3(len1, len2, sizeof(size_t)); - if (!matrix) { - *n = (size_t)(-1); - return NULL; - } - for (i = 0; i < len2; i++) - matrix[i] = i; - for (i = 1; i < len1; i++) - matrix[len2*i] = i; - - /* find the costs and fill the matrix */ - for (i = 1; i < len1; i++) { - size_t *prev = matrix + (i - 1)*len2; - size_t *p = matrix + i*len2; - size_t *end = p + len2 - 1; - const lev_byte char1 = string1[i - 1]; - const lev_byte *char2p = string2; - size_t x = i; - p++; - while (p <= end) { - size_t c3 = *(prev++) + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - c3 = *prev + 1; - if (x > c3) - x = c3; - *(p++) = x; - } - } - - /* find the way back */ - return editops_from_cost_matrix(len1, string1, len1o, - len2, string2, len2o, - matrix, n); -} - -/** - * ueditops_from_cost_matrix: - * @len1: The length of @string1. - * @string1: A string of length @len1, may contain NUL characters. - * @o1: The offset where the matrix starts from the start of @string1. - * @len2: The length of @string2. - * @string2: A string of length @len2, may contain NUL characters. - * @o2: The offset where the matrix starts from the start of @string2. - * @matrix: The cost matrix. - * @n: Where the number of edit operations should be stored. - * - * Reconstructs the optimal edit sequence from the cost matrix @matrix. - * - * The matrix is freed. - * - * Returns: The optimal edit sequence, as a newly allocated array of - * elementary edit operations, it length is stored in @n. - **/ -static LevEditOp* -ueditops_from_cost_matrix(size_t len1, const lev_wchar *string1, size_t o1, - size_t len2, const lev_wchar *string2, size_t o2, - size_t *matrix, size_t *n) -{ - size_t *p; - size_t i, j, pos; - LevEditOp *ops; - int dir = 0; - - pos = *n = matrix[len1*len2 - 1]; - if (!*n) { - free(matrix); - return NULL; - } - ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); - if (!ops) { - free(matrix); - *n = (size_t)(-1); - return NULL; - } - i = len1 - 1; - j = len2 - 1; - p = matrix + len1*len2 - 1; - while (i || j) { - /* prefer contiuning in the same direction */ - if (dir < 0 && j && *p == *(p - 1) + 1) { - pos--; - ops[pos].type = LEV_EDIT_INSERT; - ops[pos].spos = i + o1; - ops[pos].dpos = --j + o2; - p--; - continue; - } - if (dir > 0 && i && *p == *(p - len2) + 1) { - pos--; - ops[pos].type = LEV_EDIT_DELETE; - ops[pos].spos = --i + o1; - ops[pos].dpos = j + o2; - p -= len2; - continue; - } - if (i && j && *p == *(p - len2 - 1) - && string1[i - 1] == string2[j - 1]) { - /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */ - i--; - j--; - p -= len2 + 1; - dir = 0; - continue; - } - if (i && j && *p == *(p - len2 - 1) + 1) { - pos--; - ops[pos].type = LEV_EDIT_REPLACE; - ops[pos].spos = --i + o1; - ops[pos].dpos = --j + o2; - p -= len2 + 1; - dir = 0; - continue; - } - /* we cant't turn directly from -1 to 1, in this case it would be better - * to go diagonally, but check it (dir == 0) */ - if (dir == 0 && j && *p == *(p - 1) + 1) { - pos--; - ops[pos].type = LEV_EDIT_INSERT; - ops[pos].spos = i + o1; - ops[pos].dpos = --j + o2; - p--; - dir = -1; - continue; - } - if (dir == 0 && i && *p == *(p - len2) + 1) { - pos--; - ops[pos].type = LEV_EDIT_DELETE; - ops[pos].spos = --i + o1; - ops[pos].dpos = j + o2; - p -= len2; - dir = 1; - continue; - } - /* coredump right now, later might be too late ;-) */ - assert("lost in the cost matrix" == NULL); - } - free(matrix); - - return ops; -} - -/** - * lev_u_editops_find: - * @len1: The length of @string1. - * @string1: A string of length @len1, may contain NUL characters. - * @len2: The length of @string2. - * @string2: A string of length @len2, may contain NUL characters. - * @n: Where the number of edit operations should be stored. - * - * Find an optimal edit sequence from @string1 to @string2. - * - * When there's more than one optimal sequence, a one is arbitrarily (though - * deterministically) chosen. - * - * Returns: The optimal edit sequence, as a newly allocated array of - * elementary edit operations, it length is stored in @n. - * It is normalized, i.e., keep operations are not included. - **/ -_LEV_STATIC_PY LevEditOp* -lev_u_editops_find(size_t len1, const lev_wchar *string1, - size_t len2, const lev_wchar *string2, - size_t *n) -{ - size_t len1o, len2o; - size_t i; - size_t *matrix; /* cost matrix */ - - /* strip common prefix */ - len1o = 0; - while (len1 > 0 && len2 > 0 && *string1 == *string2) { - len1--; - len2--; - string1++; - string2++; - len1o++; - } - len2o = len1o; - - /* strip common suffix */ - while (len1 > 0 && len2 > 0 && string1[len1-1] == string2[len2-1]) { - len1--; - len2--; - } - len1++; - len2++; - - /* initalize first row and column */ - matrix = (size_t*)safe_malloc_3(len1, len2, sizeof(size_t)); - if (!matrix) { - *n = (size_t)(-1); - return NULL; - } - for (i = 0; i < len2; i++) - matrix[i] = i; - for (i = 1; i < len1; i++) - matrix[len2*i] = i; - - /* find the costs and fill the matrix */ - for (i = 1; i < len1; i++) { - size_t *prev = matrix + (i - 1)*len2; - size_t *p = matrix + i*len2; - size_t *end = p + len2 - 1; - const lev_wchar char1 = string1[i - 1]; - const lev_wchar *char2p = string2; - size_t x = i; - p++; - while (p <= end) { - size_t c3 = *(prev++) + (char1 != *(char2p++)); - x++; - if (x > c3) - x = c3; - c3 = *prev + 1; - if (x > c3) - x = c3; - *(p++) = x; - } - } - - /* find the way back */ - return ueditops_from_cost_matrix(len1, string1, len1o, - len2, string2, len2o, - matrix, n); -} - -/** - * lev_opcodes_to_editops: - * @nb: The length of @bops. - * @bops: An array of difflib block edit operation codes. - * @n: Where the number of edit operations should be stored. - * @keepkeep: If nonzero, keep operations will be included. Otherwise the - * result will be normalized, i.e. without any keep operations. - * - * Converts difflib block operation codes to elementary edit operations. - * - * Returns: The converted edit operatiosn, as a newly allocated array; its - * size is stored in @n. - **/ -_LEV_STATIC_PY LevEditOp* -lev_opcodes_to_editops(size_t nb, const LevOpCode *bops, - size_t *n, int keepkeep) -{ - size_t i; - const LevOpCode *b; - LevEditOp *ops, *o; - - /* compute the number of atomic operations */ - *n = 0; - if (!nb) - return NULL; - - b = bops; - if (keepkeep) { - for (i = nb; i; i--, b++) { - size_t sd = b->send - b->sbeg; - size_t dd = b->dend - b->dbeg; - *n += (sd > dd ? sd : dd); - } - } - else { - for (i = nb; i; i--, b++) { - size_t sd = b->send - b->sbeg; - size_t dd = b->dend - b->dbeg; - *n += (b->type != LEV_EDIT_KEEP ? (sd > dd ? sd : dd) : 0); - } - } - - /* convert */ - o = ops = (LevEditOp*)safe_malloc((*n), sizeof(LevEditOp)); - if (!ops) { - *n = (size_t)(-1); - return NULL; - } - b = bops; - for (i = nb; i; i--, b++) { - size_t j; - - switch (b->type) { - case LEV_EDIT_KEEP: - if (keepkeep) { - for (j = 0; j < b->send - b->sbeg; j++, o++) { - o->type = LEV_EDIT_KEEP; - o->spos = b->sbeg + j; - o->dpos = b->dbeg + j; - } - } - break; - - case LEV_EDIT_REPLACE: - for (j = 0; j < b->send - b->sbeg; j++, o++) { - o->type = LEV_EDIT_REPLACE; - o->spos = b->sbeg + j; - o->dpos = b->dbeg + j; - } - break; - - case LEV_EDIT_DELETE: - for (j = 0; j < b->send - b->sbeg; j++, o++) { - o->type = LEV_EDIT_DELETE; - o->spos = b->sbeg + j; - o->dpos = b->dbeg; - } - break; - - case LEV_EDIT_INSERT: - for (j = 0; j < b->dend - b->dbeg; j++, o++) { - o->type = LEV_EDIT_INSERT; - o->spos = b->sbeg; - o->dpos = b->dbeg + j; - } - break; - - default: - break; - } - } - assert((size_t)(o - ops) == *n); - - return ops; -} - -/** - * lev_editops_to_opcodes: - * @n: The size of @ops. - * @ops: An array of elementary edit operations. - * @nb: Where the number of difflib block operation codes should be stored. - * @len1: The length of the source string. - * @len2: The length of the destination string. - * - * Converts elementary edit operations to difflib block operation codes. - * - * Note the string lengths are necessary since difflib doesn't allow omitting - * keep operations. - * - * Returns: The converted block operation codes, as a newly allocated array; - * its length is stored in @nb. - **/ -_LEV_STATIC_PY LevOpCode* -lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb, - size_t len1, size_t len2) -{ - size_t nbl, i, spos, dpos; - const LevEditOp *o; - LevOpCode *bops, *b; - LevEditType type; - - /* compute the number of blocks */ - nbl = 0; - o = ops; - spos = dpos = 0; - type = LEV_EDIT_KEEP; - for (i = n; i; ) { - /* simply pretend there are no keep blocks */ - while (o->type == LEV_EDIT_KEEP && --i) - o++; - if (!i) - break; - if (spos < o->spos || dpos < o->dpos) { - nbl++; - spos = o->spos; - dpos = o->dpos; - } - nbl++; - type = o->type; - switch (type) { - case LEV_EDIT_REPLACE: - do { - spos++; - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_DELETE: - do { - spos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_INSERT: - do { - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - default: - break; - } - } - if (spos < len1 || dpos < len2) - nbl++; - - /* convert */ - b = bops = (LevOpCode*)safe_malloc(nbl, sizeof(LevOpCode)); - if (!bops) { - *nb = (size_t)(-1); - return NULL; - } - o = ops; - spos = dpos = 0; - type = LEV_EDIT_KEEP; - for (i = n; i; ) { - /* simply pretend there are no keep blocks */ - while (o->type == LEV_EDIT_KEEP && --i) - o++; - if (!i) - break; - b->sbeg = spos; - b->dbeg = dpos; - if (spos < o->spos || dpos < o->dpos) { - b->type = LEV_EDIT_KEEP; - spos = b->send = o->spos; - dpos = b->dend = o->dpos; - b++; - b->sbeg = spos; - b->dbeg = dpos; - } - type = o->type; - switch (type) { - case LEV_EDIT_REPLACE: - do { - spos++; - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_DELETE: - do { - spos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_INSERT: - do { - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - default: - break; - } - b->type = type; - b->send = spos; - b->dend = dpos; - b++; - } - if (spos < len1 || dpos < len2) { - assert(len1 - spos == len2 - dpos); - b->type = LEV_EDIT_KEEP; - b->sbeg = spos; - b->dbeg = dpos; - b->send = len1; - b->dend = len2; - b++; - } - assert((size_t)(b - bops) == nbl); - - *nb = nbl; - return bops; -} - -/** - * lev_opcodes_apply: - * @len1: The length of the source string. - * @string1: A string of length @len1, may contain NUL characters. - * @len2: The length of the destination string. - * @string2: A string of length @len2, may contain NUL characters. - * @nb: The length of @bops. - * @bops: An array of difflib block edit operation codes. - * @len: Where the size of the resulting string should be stored. - * - * Applies a sequence of difflib block operations to a string. - * - * NB: @bops is not checked for applicability. - * - * Returns: The result of the edit as a newly allocated string, its length - * is stored in @len. - **/ -_LEV_STATIC_PY lev_byte* -lev_opcodes_apply(size_t len1, const lev_byte *string1, - size_t len2, const lev_byte *string2, - size_t nb, const LevOpCode *bops, - size_t *len) -{ - lev_byte *dst, *dpos; /* destination string */ - const lev_byte *spos; /* source string position */ - size_t i; - - /* this looks too complex for such a simple task, but note ops is not - * a complete edit sequence, we have to be able to apply anything anywhere */ - dpos = dst = (lev_byte*)safe_malloc((len1 + len2), sizeof(lev_byte)); - if (!dst) { - *len = (size_t)(-1); - return NULL; - } - spos = string1; - for (i = nb; i; i--, bops++) { - switch (bops->type) { - case LEV_EDIT_INSERT: - case LEV_EDIT_REPLACE: - memcpy(dpos, string2 + bops->dbeg, - (bops->dend - bops->dbeg)*sizeof(lev_byte)); - break; - - case LEV_EDIT_KEEP: - memcpy(dpos, string1 + bops->sbeg, - (bops->send - bops->sbeg)*sizeof(lev_byte)); - break; - - default: - break; - } - spos += bops->send - bops->sbeg; - dpos += bops->dend - bops->dbeg; - } - - *len = dpos - dst; - /* possible realloc failure is detected with *len != 0 */ - return realloc(dst, *len*sizeof(lev_byte)); -} - -/** - * lev_u_opcodes_apply: - * @len1: The length of the source string. - * @string1: A string of length @len1, may contain NUL characters. - * @len2: The length of the destination string. - * @string2: A string of length @len2, may contain NUL characters. - * @nb: The length of @bops. - * @bops: An array of difflib block edit operation codes. - * @len: Where the size of the resulting string should be stored. - * - * Applies a sequence of difflib block operations to a string. - * - * NB: @bops is not checked for applicability. - * - * Returns: The result of the edit as a newly allocated string, its length - * is stored in @len. - **/ -_LEV_STATIC_PY lev_wchar* -lev_u_opcodes_apply(size_t len1, const lev_wchar *string1, - size_t len2, const lev_wchar *string2, - size_t nb, const LevOpCode *bops, - size_t *len) -{ - lev_wchar *dst, *dpos; /* destination string */ - const lev_wchar *spos; /* source string position */ - size_t i; - - /* this looks too complex for such a simple task, but note ops is not - * a complete edit sequence, we have to be able to apply anything anywhere */ - dpos = dst = (lev_wchar*)safe_malloc((len1 + len2), sizeof(lev_wchar)); - if (!dst) { - *len = (size_t)(-1); - return NULL; - } - spos = string1; - for (i = nb; i; i--, bops++) { - switch (bops->type) { - case LEV_EDIT_INSERT: - case LEV_EDIT_REPLACE: - memcpy(dpos, string2 + bops->dbeg, - (bops->dend - bops->dbeg)*sizeof(lev_wchar)); - break; - - case LEV_EDIT_KEEP: - memcpy(dpos, string1 + bops->sbeg, - (bops->send - bops->sbeg)*sizeof(lev_wchar)); - break; - - default: - break; - } - spos += bops->send - bops->sbeg; - dpos += bops->dend - bops->dbeg; - } - - *len = dpos - dst; - /* possible realloc failure is detected with *len != 0 */ - return realloc(dst, *len*sizeof(lev_wchar)); -} - -/** - * lev_opcodes_invert: - * @nb: The length of @ops. - * @bops: An array of difflib block edit operation codes. - * - * Inverts the sense of @ops. It is modified in place. - * - * In other words, @ops becomes a partial edit for the original source - * and destination strings with their roles exchanged. - **/ -_LEV_STATIC_PY void -lev_opcodes_invert(size_t nb, LevOpCode *bops) -{ - size_t i; - - for (i = nb; i; i--, bops++) { - size_t z; - - z = bops->dbeg; - bops->dbeg = bops->sbeg; - bops->sbeg = z; - z = bops->dend; - bops->dend = bops->send; - bops->send = z; - if (bops->type & 2) - bops->type ^= 1; - } -} - -/** - * lev_editops_matching_blocks: - * @len1: The length of the source string. - * @len2: The length of the destination string. - * @n: The size of @ops. - * @ops: An array of elementary edit operations. - * @nmblocks: Where the number of matching block should be stored. - * - * Computes the matching block corresponding to an optimal edit @ops. - * - * Returns: The matching blocks as a newly allocated array, it length is - * stored in @nmblocks. - **/ -_LEV_STATIC_PY LevMatchingBlock* -lev_editops_matching_blocks(size_t len1, - size_t len2, - size_t n, - const LevEditOp *ops, - size_t *nmblocks) -{ - size_t nmb, i, spos, dpos; - LevEditType type; - const LevEditOp *o; - LevMatchingBlock *mblocks, *mb; - - /* compute the number of matching blocks */ - nmb = 0; - o = ops; - spos = dpos = 0; - type = LEV_EDIT_KEEP; - for (i = n; i; ) { - /* simply pretend there are no keep blocks */ - while (o->type == LEV_EDIT_KEEP && --i) - o++; - if (!i) - break; - if (spos < o->spos || dpos < o->dpos) { - nmb++; - spos = o->spos; - dpos = o->dpos; - } - type = o->type; - switch (type) { - case LEV_EDIT_REPLACE: - do { - spos++; - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_DELETE: - do { - spos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_INSERT: - do { - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - default: - break; - } - } - if (spos < len1 || dpos < len2) - nmb++; - - /* fill the info */ - mb = mblocks = (LevMatchingBlock*)safe_malloc(nmb, sizeof(LevOpCode)); - if (!mblocks) { - *nmblocks = (size_t)(-1); - return NULL; - } - o = ops; - spos = dpos = 0; - type = LEV_EDIT_KEEP; - for (i = n; i; ) { - /* simply pretend there are no keep blocks */ - while (o->type == LEV_EDIT_KEEP && --i) - o++; - if (!i) - break; - if (spos < o->spos || dpos < o->dpos) { - mb->spos = spos; - mb->dpos = dpos; - mb->len = o->spos - spos; - spos = o->spos; - dpos = o->dpos; - mb++; - } - type = o->type; - switch (type) { - case LEV_EDIT_REPLACE: - do { - spos++; - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_DELETE: - do { - spos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - case LEV_EDIT_INSERT: - do { - dpos++; - i--; - o++; - } while (i && o->type == type && spos == o->spos && dpos == o->dpos); - break; - - default: - break; - } - } - if (spos < len1 || dpos < len2) { - assert(len1 - spos == len2 - dpos); - mb->spos = spos; - mb->dpos = dpos; - mb->len = len1 - spos; - mb++; - } - assert((size_t)(mb - mblocks) == nmb); - - *nmblocks = nmb; - return mblocks; -} - -/** - * lev_opcodes_matching_blocks: - * @len1: The length of the source string. - * @len2: The length of the destination string. - * @nb: The size of @bops. - * @bops: An array of difflib block edit operation codes. - * @nmblocks: Where the number of matching block should be stored. - * - * Computes the matching block corresponding to an optimal edit @bops. - * - * Returns: The matching blocks as a newly allocated array, it length is - * stored in @nmblocks. - **/ -_LEV_STATIC_PY LevMatchingBlock* -lev_opcodes_matching_blocks(size_t len1, - __attribute__((unused)) size_t len2, - size_t nb, - const LevOpCode *bops, - size_t *nmblocks) -{ - size_t nmb, i; - const LevOpCode *b; - LevMatchingBlock *mblocks, *mb; - - /* compute the number of matching blocks */ - nmb = 0; - b = bops; - for (i = nb; i; i--, b++) { - if (b->type == LEV_EDIT_KEEP) { - nmb++; - /* adjacent KEEP blocks -- we never produce it, but... */ - while (i && b->type == LEV_EDIT_KEEP) { - i--; - b++; - } - if (!i) - break; - } - } - - /* convert */ - mb = mblocks = (LevMatchingBlock*)safe_malloc(nmb, sizeof(LevOpCode)); - if (!mblocks) { - *nmblocks = (size_t)(-1); - return NULL; - } - b = bops; - for (i = nb; i; i--, b++) { - if (b->type == LEV_EDIT_KEEP) { - mb->spos = b->sbeg; - mb->dpos = b->dbeg; - /* adjacent KEEP blocks -- we never produce it, but... */ - while (i && b->type == LEV_EDIT_KEEP) { - i--; - b++; - } - if (!i) { - mb->len = len1 - mb->spos; - mb++; - break; - } - mb->len = b->sbeg - mb->spos; - mb++; - } - } - assert((size_t)(mb - mblocks) == nmb); - - *nmblocks = nmb; - return mblocks; -} - -/** - * lev_editops_total_cost: - * @n: Tne size of @ops. - * @ops: An array of elementary edit operations. - * - * Computes the total cost of operations in @ops. - * - * The costs of elementary operations are all 1. - * - * Returns: The total cost. - **/ -_LEV_STATIC_PY size_t -lev_editops_total_cost(size_t n, - const LevEditOp *ops) -{ - size_t i, sum = 0; - - for (i = n; i; i--, ops++) - sum += !!ops->type; - - return sum; -} - -/** - * lev_editops_normalize: - * @n: The size of @ops. - * @ops: An array of elementary edit operations. - * @nnorm: Where to store then length of the normalized array. - * - * Normalizes a list of edit operations to contain no keep operations. - * - * Note it returns %NULL for an empty array. - * - * Returns: A newly allocated array of normalized edit operations, its length - * is stored to @nnorm. - **/ -_LEV_STATIC_PY LevEditOp* -lev_editops_normalize(size_t n, - const LevEditOp *ops, - size_t *nnorm) -{ - size_t nx, i; - const LevEditOp *o; - LevEditOp *opsnorm, *on; - - if (!n || !ops) { - *nnorm = 0; - return NULL; - } - - nx = 0; - o = ops; - for (i = n; i; i--, o++) - nx += (o->type == LEV_EDIT_KEEP); - - *nnorm = n - nx; - if (!*nnorm) - return NULL; - - opsnorm = on = (LevEditOp*)safe_malloc((n - nx), sizeof(LevEditOp)); - o = ops; - for (i = n; i; i--, o++) { - if (o->type == LEV_EDIT_KEEP) - continue; - memcpy(on++, o, sizeof(LevEditOp)); - } - - return opsnorm; -} - -/** - * lev_opcodes_total_cost: - * @nb: Tne size of @bops. - * @bops: An array of difflib block operation codes. - * - * Computes the total cost of operations in @bops. - * - * The costs of elementary operations are all 1. - * - * Returns: The total cost. - **/ -_LEV_STATIC_PY size_t -lev_opcodes_total_cost(size_t nb, - const LevOpCode *bops) -{ - size_t i, sum = 0; - - for (i = nb; i; i--, bops++) { - switch (bops->type) { - case LEV_EDIT_REPLACE: - case LEV_EDIT_DELETE: - sum += (bops->send - bops->sbeg); - break; - - case LEV_EDIT_INSERT: - sum += (bops->dend - bops->dbeg); - break; - - default: - break; - } - } - - return sum; -} - -/** - * lev_editops_subtract: - * @n: The size of @ops. - * @ops: An array of elementary edit operations. - * @ns: The size of @sub. - * @sub: A subsequence (ordered subset) of @ops - * @nrem: Where to store then length of the remainder array. - * - * Subtracts a subsequence of elementary edit operations from a sequence. - * - * The remainder is a sequence that, applied to result of application of @sub, - * gives the same final result as application of @ops to original string. - * - * Returns: A newly allocated array of normalized edit operations, its length - * is stored to @nrem. It is always normalized, i.e, without any - * keep operations. On failure, %NULL is returned. - **/ -_LEV_STATIC_PY LevEditOp* -lev_editops_subtract(size_t n, - const LevEditOp *ops, - size_t ns, - const LevEditOp *sub, - size_t *nrem) -{ - static const int shifts[] = { 0, 0, 1, -1 }; - LevEditOp *rem; - size_t i, j, nr, nn; - int shift; - - /* compute remainder size */ - *nrem = -1; - nr = nn = 0; - for (i = 0; i < n; i++) { - if (ops[i].type != LEV_EDIT_KEEP) - nr++; - } - for (i = 0; i < ns; i++) { - if (sub[i].type != LEV_EDIT_KEEP) - nn++; - } - if (nn > nr) - return NULL; - nr -= nn; - - - /* subtract */ - /* we could simply return NULL when nr == 0, but then it would be possible - * to subtract *any* sequence of the right length to get an empty sequence - * -- clrealy incorrectly; so we have to scan the list to check */ - rem = nr ? (LevEditOp*)safe_malloc(nr, sizeof(LevEditOp)) : NULL; - j = nn = shift = 0; - for (i = 0; i < ns; i++) { - while ((ops[j].spos != sub[i].spos - || ops[j].dpos != sub[i].dpos - || ops[j].type != sub[i].type) - && j < n) { - if (ops[j].type != LEV_EDIT_KEEP) { - rem[nn] = ops[j]; - rem[nn].spos += shift; - nn++; - } - j++; - } - if (j == n) { - free(rem); - return NULL; - } - - shift += shifts[sub[i].type]; - j++; - } - - while (j < n) { - if (ops[j].type != LEV_EDIT_KEEP) { - rem[nn] = ops[j]; - rem[nn].spos += shift; - nn++; - } - j++; - } - assert(nn == nr); - - *nrem = nr; - return rem; -} -/* }}} */ - -/**************************************************************************** - * - * Weighted mean strings - * - ****************************************************************************/ - -/* -_LEV_STATIC_PY lev_byte* -lev_weighted_average(size_t len1, - const lev_byte* string1, - size_t len2, - const lev_byte* string2, - size_t n, - const LevEditOp *ops, - LevAveragingType avtype, - size_t total_cost, - double alpha, - size_t *len) -{ - return NULL; -} -*/ -