sanchit-gandhi HF staff commited on
Commit
8a02493
1 Parent(s): 1a0847d
Files changed (5) hide show
  1. Makefile +9 -0
  2. app.py +96 -0
  3. assets/large-v2.csv +0 -0
  4. diff_match_patch.py +2019 -0
  5. requirements.txt +3 -0
Makefile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ check_dirs := .
2
+
3
+ quality:
4
+ black --check $(check_dirs)
5
+ ruff $(check_dirs)
6
+
7
+ style:
8
+ black $(check_dirs)
9
+ ruff $(check_dirs) --fix
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import unicodedata
3
+ import diff_match_patch as dmp_module
4
+ from enum import Enum
5
+ import gradio as gr
6
+ from datasets import load_dataset
7
+ import pandas as pd
8
+ from jiwer import process_words, wer_default
9
+
10
+
11
+ class Action(Enum):
12
+ INSERTION = 1
13
+ DELETION = -1
14
+ EQUAL = 0
15
+
16
+
17
+ def compare_string(text1: str, text2: str) -> list:
18
+ text1_normalized = unicodedata.normalize("NFKC", text1)
19
+ text2_normalized = unicodedata.normalize("NFKC", text2)
20
+
21
+ dmp = dmp_module.diff_match_patch()
22
+ diff = dmp.diff_main(text1_normalized, text2_normalized)
23
+ dmp.diff_cleanupSemantic(diff)
24
+
25
+ return diff
26
+
27
+
28
+ def style_text(diff):
29
+ fullText = ""
30
+ for action, text in diff:
31
+ if action == Action.INSERTION.value:
32
+ fullText += f"<span style='background-color:Lightgreen'>{text}</span>"
33
+ elif action == Action.DELETION.value:
34
+ fullText += f"<span style='background-color:#FFCCCB'><s>{text}</s></span>"
35
+ elif action == Action.EQUAL.value:
36
+ fullText += f"{text}"
37
+ else:
38
+ raise Exception("Not Implemented")
39
+ fullText = fullText.replace("](", "]\(").replace("~", "\~")
40
+ return fullText
41
+
42
+
43
+ dataset = load_dataset("distil-whisper/tedlium-long-form", split="validation")
44
+
45
+ csv = pd.read_csv("assets/large-v2.csv")
46
+
47
+ norm_target = csv["Norm Target"]
48
+ norm_pred = csv["Norm Pred"]
49
+
50
+ norm_target = [norm_target[i] for i in range(len(norm_target))]
51
+ norm_pred = [norm_pred[i] for i in range(len(norm_pred))]
52
+
53
+ target_dtype = np.int16
54
+ max_range = np.iinfo(target_dtype).max
55
+
56
+
57
+ def get_visualisation(idx):
58
+ audio = dataset[idx]["audio"]
59
+ array = (audio["array"] * max_range).astype(np.int16)
60
+ sampling_rate = audio["sampling_rate"]
61
+
62
+ text1 = norm_target[idx]
63
+ text2 = norm_pred[idx]
64
+
65
+ wer_output = process_words(text1, text2, wer_default, wer_default)
66
+ wer_percentage = 100 * wer_output.wer
67
+ num_insertions = wer_output.insertions
68
+
69
+ rel_length = len(text2.split()) / len(text1.split())
70
+
71
+ diff = compare_string(text1, text2)
72
+ full_text = style_text(diff)
73
+
74
+ return (sampling_rate, array), wer_percentage, num_insertions, rel_length, full_text
75
+
76
+
77
+ if __name__ == "__main__":
78
+ with gr.Blocks() as demo:
79
+ slider = gr.Slider(
80
+ minimum=0, maximum=len(norm_target), step=1, label="Dataset sample"
81
+ )
82
+ btn = gr.Button("Analyse")
83
+ audio_out = gr.Audio(label="Audio input")
84
+ with gr.Row():
85
+ wer = gr.Number(label="WER")
86
+ insertions = gr.Number(label="Insertions")
87
+ relative_length = gr.Number(label="Relative length of target / reference")
88
+ text_out = gr.Markdown(label="Text difference")
89
+
90
+ btn.click(
91
+ fn=get_visualisation,
92
+ inputs=slider,
93
+ outputs=[audio_out, wer, insertions, relative_length, text_out],
94
+ )
95
+
96
+ demo.launch()
assets/large-v2.csv ADDED
The diff for this file is too large to render. See raw diff
 
diff_match_patch.py ADDED
@@ -0,0 +1,2019 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+
3
+ """Diff Match and Patch
4
+ Copyright 2018 The diff-match-patch Authors.
5
+ https://github.com/google/diff-match-patch
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.
18
+ """
19
+
20
+ """Functions for diff, match and patch.
21
+
22
+ Computes the difference between two texts to create a patch.
23
+ Applies the patch onto another text, allowing for errors.
24
+ """
25
+
26
+ __author__ = "fraser@google.com (Neil Fraser)"
27
+
28
+ import re
29
+ import sys
30
+ import time
31
+ import urllib.parse
32
+
33
+
34
+ class diff_match_patch:
35
+ """Class containing the diff, match and patch methods.
36
+
37
+ Also contains the behaviour settings.
38
+ """
39
+
40
+ def __init__(self):
41
+ """Inits a diff_match_patch object with default settings.
42
+ Redefine these in your program to override the defaults.
43
+ """
44
+
45
+ # Number of seconds to map a diff before giving up (0 for infinity).
46
+ self.Diff_Timeout = 1.0
47
+ # Cost of an empty edit operation in terms of edit characters.
48
+ self.Diff_EditCost = 4
49
+ # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
50
+ self.Match_Threshold = 0.5
51
+ # How far to search for a match (0 = exact location, 1000+ = broad match).
52
+ # A match this many characters away from the expected location will add
53
+ # 1.0 to the score (0.0 is a perfect match).
54
+ self.Match_Distance = 1000
55
+ # When deleting a large block of text (over ~64 characters), how close do
56
+ # the contents have to be to match the expected contents. (0.0 = perfection,
57
+ # 1.0 = very loose). Note that Match_Threshold controls how closely the
58
+ # end points of a delete need to match.
59
+ self.Patch_DeleteThreshold = 0.5
60
+ # Chunk size for context length.
61
+ self.Patch_Margin = 4
62
+
63
+ # The number of bits in an int.
64
+ # Python has no maximum, thus to disable patch splitting set to 0.
65
+ # However to avoid long patches in certain pathological cases, use 32.
66
+ # Multiple short patches (using native ints) are much faster than long ones.
67
+ self.Match_MaxBits = 32
68
+
69
+ # DIFF FUNCTIONS
70
+
71
+ # The data structure representing a diff is an array of tuples:
72
+ # [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
73
+ # which means: delete "Hello", add "Goodbye" and keep " world."
74
+ DIFF_DELETE = -1
75
+ DIFF_INSERT = 1
76
+ DIFF_EQUAL = 0
77
+
78
+ def diff_main(self, text1, text2, checklines=True, deadline=None):
79
+ """Find the differences between two texts. Simplifies the problem by
80
+ stripping any common prefix or suffix off the texts before diffing.
81
+
82
+ Args:
83
+ text1: Old string to be diffed.
84
+ text2: New string to be diffed.
85
+ checklines: Optional speedup flag. If present and false, then don't run
86
+ a line-level diff first to identify the changed areas.
87
+ Defaults to true, which does a faster, slightly less optimal diff.
88
+ deadline: Optional time when the diff should be complete by. Used
89
+ internally for recursive calls. Users should set DiffTimeout instead.
90
+
91
+ Returns:
92
+ Array of changes.
93
+ """
94
+ # Set a deadline by which time the diff must be complete.
95
+ if deadline is None:
96
+ # Unlike in most languages, Python counts time in seconds.
97
+ if self.Diff_Timeout <= 0:
98
+ deadline = sys.maxsize
99
+ else:
100
+ deadline = time.time() + self.Diff_Timeout
101
+
102
+ # Check for null inputs.
103
+ if text1 is None or text2 is None:
104
+ raise ValueError("Null inputs. (diff_main)")
105
+
106
+ # Check for equality (speedup).
107
+ if text1 == text2:
108
+ if text1:
109
+ return [(self.DIFF_EQUAL, text1)]
110
+ return []
111
+
112
+ # Trim off common prefix (speedup).
113
+ commonlength = self.diff_commonPrefix(text1, text2)
114
+ commonprefix = text1[:commonlength]
115
+ text1 = text1[commonlength:]
116
+ text2 = text2[commonlength:]
117
+
118
+ # Trim off common suffix (speedup).
119
+ commonlength = self.diff_commonSuffix(text1, text2)
120
+ if commonlength == 0:
121
+ commonsuffix = ""
122
+ else:
123
+ commonsuffix = text1[-commonlength:]
124
+ text1 = text1[:-commonlength]
125
+ text2 = text2[:-commonlength]
126
+
127
+ # Compute the diff on the middle block.
128
+ diffs = self.diff_compute(text1, text2, checklines, deadline)
129
+
130
+ # Restore the prefix and suffix.
131
+ if commonprefix:
132
+ diffs[:0] = [(self.DIFF_EQUAL, commonprefix)]
133
+ if commonsuffix:
134
+ diffs.append((self.DIFF_EQUAL, commonsuffix))
135
+ self.diff_cleanupMerge(diffs)
136
+ return diffs
137
+
138
+ def diff_compute(self, text1, text2, checklines, deadline):
139
+ """Find the differences between two texts. Assumes that the texts do not
140
+ have any common prefix or suffix.
141
+
142
+ Args:
143
+ text1: Old string to be diffed.
144
+ text2: New string to be diffed.
145
+ checklines: Speedup flag. If false, then don't run a line-level diff
146
+ first to identify the changed areas.
147
+ If true, then run a faster, slightly less optimal diff.
148
+ deadline: Time when the diff should be complete by.
149
+
150
+ Returns:
151
+ Array of changes.
152
+ """
153
+ if not text1:
154
+ # Just add some text (speedup).
155
+ return [(self.DIFF_INSERT, text2)]
156
+
157
+ if not text2:
158
+ # Just delete some text (speedup).
159
+ return [(self.DIFF_DELETE, text1)]
160
+
161
+ if len(text1) > len(text2):
162
+ (longtext, shorttext) = (text1, text2)
163
+ else:
164
+ (shorttext, longtext) = (text1, text2)
165
+ i = longtext.find(shorttext)
166
+ if i != -1:
167
+ # Shorter text is inside the longer text (speedup).
168
+ diffs = [
169
+ (self.DIFF_INSERT, longtext[:i]),
170
+ (self.DIFF_EQUAL, shorttext),
171
+ (self.DIFF_INSERT, longtext[i + len(shorttext) :]),
172
+ ]
173
+ # Swap insertions for deletions if diff is reversed.
174
+ if len(text1) > len(text2):
175
+ diffs[0] = (self.DIFF_DELETE, diffs[0][1])
176
+ diffs[2] = (self.DIFF_DELETE, diffs[2][1])
177
+ return diffs
178
+
179
+ if len(shorttext) == 1:
180
+ # Single character string.
181
+ # After the previous speedup, the character can't be an equality.
182
+ return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
183
+
184
+ # Check to see if the problem can be split in two.
185
+ hm = self.diff_halfMatch(text1, text2)
186
+ if hm:
187
+ # A half-match was found, sort out the return data.
188
+ (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
189
+ # Send both pairs off for separate processing.
190
+ diffs_a = self.diff_main(text1_a, text2_a, checklines, deadline)
191
+ diffs_b = self.diff_main(text1_b, text2_b, checklines, deadline)
192
+ # Merge the results.
193
+ return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b
194
+
195
+ if checklines and len(text1) > 100 and len(text2) > 100:
196
+ return self.diff_lineMode(text1, text2, deadline)
197
+
198
+ return self.diff_bisect(text1, text2, deadline)
199
+
200
+ def diff_lineMode(self, text1, text2, deadline):
201
+ """Do a quick line-level diff on both strings, then rediff the parts for
202
+ greater accuracy.
203
+ This speedup can produce non-minimal diffs.
204
+
205
+ Args:
206
+ text1: Old string to be diffed.
207
+ text2: New string to be diffed.
208
+ deadline: Time when the diff should be complete by.
209
+
210
+ Returns:
211
+ Array of changes.
212
+ """
213
+
214
+ # Scan the text on a line-by-line basis first.
215
+ (text1, text2, linearray) = self.diff_linesToChars(text1, text2)
216
+
217
+ diffs = self.diff_main(text1, text2, False, deadline)
218
+
219
+ # Convert the diff back to original text.
220
+ self.diff_charsToLines(diffs, linearray)
221
+ # Eliminate freak matches (e.g. blank lines)
222
+ self.diff_cleanupSemantic(diffs)
223
+
224
+ # Rediff any replacement blocks, this time character-by-character.
225
+ # Add a dummy entry at the end.
226
+ diffs.append((self.DIFF_EQUAL, ""))
227
+ pointer = 0
228
+ count_delete = 0
229
+ count_insert = 0
230
+ text_delete = ""
231
+ text_insert = ""
232
+ while pointer < len(diffs):
233
+ if diffs[pointer][0] == self.DIFF_INSERT:
234
+ count_insert += 1
235
+ text_insert += diffs[pointer][1]
236
+ elif diffs[pointer][0] == self.DIFF_DELETE:
237
+ count_delete += 1
238
+ text_delete += diffs[pointer][1]
239
+ elif diffs[pointer][0] == self.DIFF_EQUAL:
240
+ # Upon reaching an equality, check for prior redundancies.
241
+ if count_delete >= 1 and count_insert >= 1:
242
+ # Delete the offending records and add the merged ones.
243
+ subDiff = self.diff_main(text_delete, text_insert, False, deadline)
244
+ diffs[pointer - count_delete - count_insert : pointer] = subDiff
245
+ pointer = pointer - count_delete - count_insert + len(subDiff)
246
+ count_insert = 0
247
+ count_delete = 0
248
+ text_delete = ""
249
+ text_insert = ""
250
+
251
+ pointer += 1
252
+
253
+ diffs.pop() # Remove the dummy entry at the end.
254
+
255
+ return diffs
256
+
257
+ def diff_bisect(self, text1, text2, deadline):
258
+ """Find the 'middle snake' of a diff, split the problem in two
259
+ and return the recursively constructed diff.
260
+ See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations.
261
+
262
+ Args:
263
+ text1: Old string to be diffed.
264
+ text2: New string to be diffed.
265
+ deadline: Time at which to bail if not yet complete.
266
+
267
+ Returns:
268
+ Array of diff tuples.
269
+ """
270
+
271
+ # Cache the text lengths to prevent multiple calls.
272
+ text1_length = len(text1)
273
+ text2_length = len(text2)
274
+ max_d = (text1_length + text2_length + 1) // 2
275
+ v_offset = max_d
276
+ v_length = 2 * max_d
277
+ v1 = [-1] * v_length
278
+ v1[v_offset + 1] = 0
279
+ v2 = v1[:]
280
+ delta = text1_length - text2_length
281
+ # If the total number of characters is odd, then the front path will
282
+ # collide with the reverse path.
283
+ front = delta % 2 != 0
284
+ # Offsets for start and end of k loop.
285
+ # Prevents mapping of space beyond the grid.
286
+ k1start = 0
287
+ k1end = 0
288
+ k2start = 0
289
+ k2end = 0
290
+ for d in range(max_d):
291
+ # Bail out if deadline is reached.
292
+ if time.time() > deadline:
293
+ break
294
+
295
+ # Walk the front path one step.
296
+ for k1 in range(-d + k1start, d + 1 - k1end, 2):
297
+ k1_offset = v_offset + k1
298
+ if k1 == -d or (k1 != d and v1[k1_offset - 1] < v1[k1_offset + 1]):
299
+ x1 = v1[k1_offset + 1]
300
+ else:
301
+ x1 = v1[k1_offset - 1] + 1
302
+ y1 = x1 - k1
303
+ while (
304
+ x1 < text1_length and y1 < text2_length and text1[x1] == text2[y1]
305
+ ):
306
+ x1 += 1
307
+ y1 += 1
308
+ v1[k1_offset] = x1
309
+ if x1 > text1_length:
310
+ # Ran off the right of the graph.
311
+ k1end += 2
312
+ elif y1 > text2_length:
313
+ # Ran off the bottom of the graph.
314
+ k1start += 2
315
+ elif front:
316
+ k2_offset = v_offset + delta - k1
317
+ if k2_offset >= 0 and k2_offset < v_length and v2[k2_offset] != -1:
318
+ # Mirror x2 onto top-left coordinate system.
319
+ x2 = text1_length - v2[k2_offset]
320
+ if x1 >= x2:
321
+ # Overlap detected.
322
+ return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
323
+
324
+ # Walk the reverse path one step.
325
+ for k2 in range(-d + k2start, d + 1 - k2end, 2):
326
+ k2_offset = v_offset + k2
327
+ if k2 == -d or (k2 != d and v2[k2_offset - 1] < v2[k2_offset + 1]):
328
+ x2 = v2[k2_offset + 1]
329
+ else:
330
+ x2 = v2[k2_offset - 1] + 1
331
+ y2 = x2 - k2
332
+ while (
333
+ x2 < text1_length
334
+ and y2 < text2_length
335
+ and text1[-x2 - 1] == text2[-y2 - 1]
336
+ ):
337
+ x2 += 1
338
+ y2 += 1
339
+ v2[k2_offset] = x2
340
+ if x2 > text1_length:
341
+ # Ran off the left of the graph.
342
+ k2end += 2
343
+ elif y2 > text2_length:
344
+ # Ran off the top of the graph.
345
+ k2start += 2
346
+ elif not front:
347
+ k1_offset = v_offset + delta - k2
348
+ if k1_offset >= 0 and k1_offset < v_length and v1[k1_offset] != -1:
349
+ x1 = v1[k1_offset]
350
+ y1 = v_offset + x1 - k1_offset
351
+ # Mirror x2 onto top-left coordinate system.
352
+ x2 = text1_length - x2
353
+ if x1 >= x2:
354
+ # Overlap detected.
355
+ return self.diff_bisectSplit(text1, text2, x1, y1, deadline)
356
+
357
+ # Diff took too long and hit the deadline or
358
+ # number of diffs equals number of characters, no commonality at all.
359
+ return [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
360
+
361
+ def diff_bisectSplit(self, text1, text2, x, y, deadline):
362
+ """Given the location of the 'middle snake', split the diff in two parts
363
+ and recurse.
364
+
365
+ Args:
366
+ text1: Old string to be diffed.
367
+ text2: New string to be diffed.
368
+ x: Index of split point in text1.
369
+ y: Index of split point in text2.
370
+ deadline: Time at which to bail if not yet complete.
371
+
372
+ Returns:
373
+ Array of diff tuples.
374
+ """
375
+ text1a = text1[:x]
376
+ text2a = text2[:y]
377
+ text1b = text1[x:]
378
+ text2b = text2[y:]
379
+
380
+ # Compute both diffs serially.
381
+ diffs = self.diff_main(text1a, text2a, False, deadline)
382
+ diffsb = self.diff_main(text1b, text2b, False, deadline)
383
+
384
+ return diffs + diffsb
385
+
386
+ def diff_linesToChars(self, text1, text2):
387
+ """Split two texts into an array of strings. Reduce the texts to a string
388
+ of hashes where each Unicode character represents one line.
389
+
390
+ Args:
391
+ text1: First string.
392
+ text2: Second string.
393
+
394
+ Returns:
395
+ Three element tuple, containing the encoded text1, the encoded text2 and
396
+ the array of unique strings. The zeroth element of the array of unique
397
+ strings is intentionally blank.
398
+ """
399
+ lineArray = [] # e.g. lineArray[4] == "Hello\n"
400
+ lineHash = {} # e.g. lineHash["Hello\n"] == 4
401
+
402
+ # "\x00" is a valid character, but various debuggers don't like it.
403
+ # So we'll insert a junk entry to avoid generating a null character.
404
+ lineArray.append("")
405
+
406
+ def diff_linesToCharsMunge(text):
407
+ """Split a text into an array of strings. Reduce the texts to a string
408
+ of hashes where each Unicode character represents one line.
409
+ Modifies linearray and linehash through being a closure.
410
+
411
+ Args:
412
+ text: String to encode.
413
+
414
+ Returns:
415
+ Encoded string.
416
+ """
417
+ chars = []
418
+ # Walk the text, pulling out a substring for each line.
419
+ # text.split('\n') would would temporarily double our memory footprint.
420
+ # Modifying text would create many large strings to garbage collect.
421
+ lineStart = 0
422
+ lineEnd = -1
423
+ while lineEnd < len(text) - 1:
424
+ lineEnd = text.find("\n", lineStart)
425
+ if lineEnd == -1:
426
+ lineEnd = len(text) - 1
427
+ line = text[lineStart : lineEnd + 1]
428
+
429
+ if line in lineHash:
430
+ chars.append(chr(lineHash[line]))
431
+ else:
432
+ if len(lineArray) == maxLines:
433
+ # Bail out at 1114111 because chr(1114112) throws.
434
+ line = text[lineStart:]
435
+ lineEnd = len(text)
436
+ lineArray.append(line)
437
+ lineHash[line] = len(lineArray) - 1
438
+ chars.append(chr(len(lineArray) - 1))
439
+ lineStart = lineEnd + 1
440
+ return "".join(chars)
441
+
442
+ # Allocate 2/3rds of the space for text1, the rest for text2.
443
+ maxLines = 666666
444
+ chars1 = diff_linesToCharsMunge(text1)
445
+ maxLines = 1114111
446
+ chars2 = diff_linesToCharsMunge(text2)
447
+ return (chars1, chars2, lineArray)
448
+
449
+ def diff_charsToLines(self, diffs, lineArray):
450
+ """Rehydrate the text in a diff from a string of line hashes to real lines
451
+ of text.
452
+
453
+ Args:
454
+ diffs: Array of diff tuples.
455
+ lineArray: Array of unique strings.
456
+ """
457
+ for i in range(len(diffs)):
458
+ text = []
459
+ for char in diffs[i][1]:
460
+ text.append(lineArray[ord(char)])
461
+ diffs[i] = (diffs[i][0], "".join(text))
462
+
463
+ def diff_commonPrefix(self, text1, text2):
464
+ """Determine the common prefix of two strings.
465
+
466
+ Args:
467
+ text1: First string.
468
+ text2: Second string.
469
+
470
+ Returns:
471
+ The number of characters common to the start of each string.
472
+ """
473
+ # Quick check for common null cases.
474
+ if not text1 or not text2 or text1[0] != text2[0]:
475
+ return 0
476
+ # Binary search.
477
+ # Performance analysis: https://neil.fraser.name/news/2007/10/09/
478
+ pointermin = 0
479
+ pointermax = min(len(text1), len(text2))
480
+ pointermid = pointermax
481
+ pointerstart = 0
482
+ while pointermin < pointermid:
483
+ if text1[pointerstart:pointermid] == text2[pointerstart:pointermid]:
484
+ pointermin = pointermid
485
+ pointerstart = pointermin
486
+ else:
487
+ pointermax = pointermid
488
+ pointermid = (pointermax - pointermin) // 2 + pointermin
489
+ return pointermid
490
+
491
+ def diff_commonSuffix(self, text1, text2):
492
+ """Determine the common suffix of two strings.
493
+
494
+ Args:
495
+ text1: First string.
496
+ text2: Second string.
497
+
498
+ Returns:
499
+ The number of characters common to the end of each string.
500
+ """
501
+ # Quick check for common null cases.
502
+ if not text1 or not text2 or text1[-1] != text2[-1]:
503
+ return 0
504
+ # Binary search.
505
+ # Performance analysis: https://neil.fraser.name/news/2007/10/09/
506
+ pointermin = 0
507
+ pointermax = min(len(text1), len(text2))
508
+ pointermid = pointermax
509
+ pointerend = 0
510
+ while pointermin < pointermid:
511
+ if (
512
+ text1[-pointermid : len(text1) - pointerend]
513
+ == text2[-pointermid : len(text2) - pointerend]
514
+ ):
515
+ pointermin = pointermid
516
+ pointerend = pointermin
517
+ else:
518
+ pointermax = pointermid
519
+ pointermid = (pointermax - pointermin) // 2 + pointermin
520
+ return pointermid
521
+
522
+ def diff_commonOverlap(self, text1, text2):
523
+ """Determine if the suffix of one string is the prefix of another.
524
+
525
+ Args:
526
+ text1 First string.
527
+ text2 Second string.
528
+
529
+ Returns:
530
+ The number of characters common to the end of the first
531
+ string and the start of the second string.
532
+ """
533
+ # Cache the text lengths to prevent multiple calls.
534
+ text1_length = len(text1)
535
+ text2_length = len(text2)
536
+ # Eliminate the null case.
537
+ if text1_length == 0 or text2_length == 0:
538
+ return 0
539
+ # Truncate the longer string.
540
+ if text1_length > text2_length:
541
+ text1 = text1[-text2_length:]
542
+ elif text1_length < text2_length:
543
+ text2 = text2[:text1_length]
544
+ text_length = min(text1_length, text2_length)
545
+ # Quick check for the worst case.
546
+ if text1 == text2:
547
+ return text_length
548
+
549
+ # Start by looking for a single character match
550
+ # and increase length until no match is found.
551
+ # Performance analysis: https://neil.fraser.name/news/2010/11/04/
552
+ best = 0
553
+ length = 1
554
+ while True:
555
+ pattern = text1[-length:]
556
+ found = text2.find(pattern)
557
+ if found == -1:
558
+ return best
559
+ length += found
560
+ if found == 0 or text1[-length:] == text2[:length]:
561
+ best = length
562
+ length += 1
563
+
564
+ def diff_halfMatch(self, text1, text2):
565
+ """Do the two texts share a substring which is at least half the length of
566
+ the longer text?
567
+ This speedup can produce non-minimal diffs.
568
+
569
+ Args:
570
+ text1: First string.
571
+ text2: Second string.
572
+
573
+ Returns:
574
+ Five element Array, containing the prefix of text1, the suffix of text1,
575
+ the prefix of text2, the suffix of text2 and the common middle. Or None
576
+ if there was no match.
577
+ """
578
+ if self.Diff_Timeout <= 0:
579
+ # Don't risk returning a non-optimal diff if we have unlimited time.
580
+ return None
581
+ if len(text1) > len(text2):
582
+ (longtext, shorttext) = (text1, text2)
583
+ else:
584
+ (shorttext, longtext) = (text1, text2)
585
+ if len(longtext) < 4 or len(shorttext) * 2 < len(longtext):
586
+ return None # Pointless.
587
+
588
+ def diff_halfMatchI(longtext, shorttext, i):
589
+ """Does a substring of shorttext exist within longtext such that the
590
+ substring is at least half the length of longtext?
591
+ Closure, but does not reference any external variables.
592
+
593
+ Args:
594
+ longtext: Longer string.
595
+ shorttext: Shorter string.
596
+ i: Start index of quarter length substring within longtext.
597
+
598
+ Returns:
599
+ Five element Array, containing the prefix of longtext, the suffix of
600
+ longtext, the prefix of shorttext, the suffix of shorttext and the
601
+ common middle. Or None if there was no match.
602
+ """
603
+ seed = longtext[i : i + len(longtext) // 4]
604
+ best_common = ""
605
+ j = shorttext.find(seed)
606
+ while j != -1:
607
+ prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:])
608
+ suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j])
609
+ if len(best_common) < suffixLength + prefixLength:
610
+ best_common = (
611
+ shorttext[j - suffixLength : j]
612
+ + shorttext[j : j + prefixLength]
613
+ )
614
+ best_longtext_a = longtext[: i - suffixLength]
615
+ best_longtext_b = longtext[i + prefixLength :]
616
+ best_shorttext_a = shorttext[: j - suffixLength]
617
+ best_shorttext_b = shorttext[j + prefixLength :]
618
+ j = shorttext.find(seed, j + 1)
619
+
620
+ if len(best_common) * 2 >= len(longtext):
621
+ return (
622
+ best_longtext_a,
623
+ best_longtext_b,
624
+ best_shorttext_a,
625
+ best_shorttext_b,
626
+ best_common,
627
+ )
628
+ else:
629
+ return None
630
+
631
+ # First check if the second quarter is the seed for a half-match.
632
+ hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) // 4)
633
+ # Check again based on the third quarter.
634
+ hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) // 2)
635
+ if not hm1 and not hm2:
636
+ return None
637
+ elif not hm2:
638
+ hm = hm1
639
+ elif not hm1:
640
+ hm = hm2
641
+ else:
642
+ # Both matched. Select the longest.
643
+ if len(hm1[4]) > len(hm2[4]):
644
+ hm = hm1
645
+ else:
646
+ hm = hm2
647
+
648
+ # A half-match was found, sort out the return data.
649
+ if len(text1) > len(text2):
650
+ (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
651
+ else:
652
+ (text2_a, text2_b, text1_a, text1_b, mid_common) = hm
653
+ return (text1_a, text1_b, text2_a, text2_b, mid_common)
654
+
655
+ def diff_cleanupSemantic(self, diffs):
656
+ """Reduce the number of edits by eliminating semantically trivial
657
+ equalities.
658
+
659
+ Args:
660
+ diffs: Array of diff tuples.
661
+ """
662
+ changes = False
663
+ equalities = [] # Stack of indices where equalities are found.
664
+ lastEquality = None # Always equal to diffs[equalities[-1]][1]
665
+ pointer = 0 # Index of current position.
666
+ # Number of chars that changed prior to the equality.
667
+ length_insertions1, length_deletions1 = 0, 0
668
+ # Number of chars that changed after the equality.
669
+ length_insertions2, length_deletions2 = 0, 0
670
+ while pointer < len(diffs):
671
+ if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
672
+ equalities.append(pointer)
673
+ length_insertions1, length_insertions2 = length_insertions2, 0
674
+ length_deletions1, length_deletions2 = length_deletions2, 0
675
+ lastEquality = diffs[pointer][1]
676
+ else: # An insertion or deletion.
677
+ if diffs[pointer][0] == self.DIFF_INSERT:
678
+ length_insertions2 += len(diffs[pointer][1])
679
+ else:
680
+ length_deletions2 += len(diffs[pointer][1])
681
+ # Eliminate an equality that is smaller or equal to the edits on both
682
+ # sides of it.
683
+ if (
684
+ lastEquality
685
+ and (
686
+ len(lastEquality) <= max(length_insertions1, length_deletions1)
687
+ )
688
+ and (
689
+ len(lastEquality) <= max(length_insertions2, length_deletions2)
690
+ )
691
+ ):
692
+ # Duplicate record.
693
+ diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality))
694
+ # Change second copy to insert.
695
+ diffs[equalities[-1] + 1] = (
696
+ self.DIFF_INSERT,
697
+ diffs[equalities[-1] + 1][1],
698
+ )
699
+ # Throw away the equality we just deleted.
700
+ equalities.pop()
701
+ # Throw away the previous equality (it needs to be reevaluated).
702
+ if len(equalities):
703
+ equalities.pop()
704
+ if len(equalities):
705
+ pointer = equalities[-1]
706
+ else:
707
+ pointer = -1
708
+ # Reset the counters.
709
+ length_insertions1, length_deletions1 = 0, 0
710
+ length_insertions2, length_deletions2 = 0, 0
711
+ lastEquality = None
712
+ changes = True
713
+ pointer += 1
714
+
715
+ # Normalize the diff.
716
+ if changes:
717
+ self.diff_cleanupMerge(diffs)
718
+ self.diff_cleanupSemanticLossless(diffs)
719
+
720
+ # Find any overlaps between deletions and insertions.
721
+ # e.g: <del>abcxxx</del><ins>xxxdef</ins>
722
+ # -> <del>abc</del>xxx<ins>def</ins>
723
+ # e.g: <del>xxxabc</del><ins>defxxx</ins>
724
+ # -> <ins>def</ins>xxx<del>abc</del>
725
+ # Only extract an overlap if it is as big as the edit ahead or behind it.
726
+ pointer = 1
727
+ while pointer < len(diffs):
728
+ if (
729
+ diffs[pointer - 1][0] == self.DIFF_DELETE
730
+ and diffs[pointer][0] == self.DIFF_INSERT
731
+ ):
732
+ deletion = diffs[pointer - 1][1]
733
+ insertion = diffs[pointer][1]
734
+ overlap_length1 = self.diff_commonOverlap(deletion, insertion)
735
+ overlap_length2 = self.diff_commonOverlap(insertion, deletion)
736
+ if overlap_length1 >= overlap_length2:
737
+ if (
738
+ overlap_length1 >= len(deletion) / 2.0
739
+ or overlap_length1 >= len(insertion) / 2.0
740
+ ):
741
+ # Overlap found. Insert an equality and trim the surrounding edits.
742
+ diffs.insert(
743
+ pointer, (self.DIFF_EQUAL, insertion[:overlap_length1])
744
+ )
745
+ diffs[pointer - 1] = (
746
+ self.DIFF_DELETE,
747
+ deletion[: len(deletion) - overlap_length1],
748
+ )
749
+ diffs[pointer + 1] = (
750
+ self.DIFF_INSERT,
751
+ insertion[overlap_length1:],
752
+ )
753
+ pointer += 1
754
+ else:
755
+ if (
756
+ overlap_length2 >= len(deletion) / 2.0
757
+ or overlap_length2 >= len(insertion) / 2.0
758
+ ):
759
+ # Reverse overlap found.
760
+ # Insert an equality and swap and trim the surrounding edits.
761
+ diffs.insert(
762
+ pointer, (self.DIFF_EQUAL, deletion[:overlap_length2])
763
+ )
764
+ diffs[pointer - 1] = (
765
+ self.DIFF_INSERT,
766
+ insertion[: len(insertion) - overlap_length2],
767
+ )
768
+ diffs[pointer + 1] = (
769
+ self.DIFF_DELETE,
770
+ deletion[overlap_length2:],
771
+ )
772
+ pointer += 1
773
+ pointer += 1
774
+ pointer += 1
775
+
776
+ def diff_cleanupSemanticLossless(self, diffs):
777
+ """Look for single edits surrounded on both sides by equalities
778
+ which can be shifted sideways to align the edit to a word boundary.
779
+ e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.
780
+
781
+ Args:
782
+ diffs: Array of diff tuples.
783
+ """
784
+
785
+ def diff_cleanupSemanticScore(one, two):
786
+ """Given two strings, compute a score representing whether the
787
+ internal boundary falls on logical boundaries.
788
+ Scores range from 6 (best) to 0 (worst).
789
+ Closure, but does not reference any external variables.
790
+
791
+ Args:
792
+ one: First string.
793
+ two: Second string.
794
+
795
+ Returns:
796
+ The score.
797
+ """
798
+ if not one or not two:
799
+ # Edges are the best.
800
+ return 6
801
+
802
+ # Each port of this function behaves slightly differently due to
803
+ # subtle differences in each language's definition of things like
804
+ # 'whitespace'. Since this function's purpose is largely cosmetic,
805
+ # the choice has been made to use each language's native features
806
+ # rather than force total conformity.
807
+ char1 = one[-1]
808
+ char2 = two[0]
809
+ nonAlphaNumeric1 = not char1.isalnum()
810
+ nonAlphaNumeric2 = not char2.isalnum()
811
+ whitespace1 = nonAlphaNumeric1 and char1.isspace()
812
+ whitespace2 = nonAlphaNumeric2 and char2.isspace()
813
+ lineBreak1 = whitespace1 and (char1 == "\r" or char1 == "\n")
814
+ lineBreak2 = whitespace2 and (char2 == "\r" or char2 == "\n")
815
+ blankLine1 = lineBreak1 and self.BLANKLINEEND.search(one)
816
+ blankLine2 = lineBreak2 and self.BLANKLINESTART.match(two)
817
+
818
+ if blankLine1 or blankLine2:
819
+ # Five points for blank lines.
820
+ return 5
821
+ elif lineBreak1 or lineBreak2:
822
+ # Four points for line breaks.
823
+ return 4
824
+ elif nonAlphaNumeric1 and not whitespace1 and whitespace2:
825
+ # Three points for end of sentences.
826
+ return 3
827
+ elif whitespace1 or whitespace2:
828
+ # Two points for whitespace.
829
+ return 2
830
+ elif nonAlphaNumeric1 or nonAlphaNumeric2:
831
+ # One point for non-alphanumeric.
832
+ return 1
833
+ return 0
834
+
835
+ pointer = 1
836
+ # Intentionally ignore the first and last element (don't need checking).
837
+ while pointer < len(diffs) - 1:
838
+ if (
839
+ diffs[pointer - 1][0] == self.DIFF_EQUAL
840
+ and diffs[pointer + 1][0] == self.DIFF_EQUAL
841
+ ):
842
+ # This is a single edit surrounded by equalities.
843
+ equality1 = diffs[pointer - 1][1]
844
+ edit = diffs[pointer][1]
845
+ equality2 = diffs[pointer + 1][1]
846
+
847
+ # First, shift the edit as far left as possible.
848
+ commonOffset = self.diff_commonSuffix(equality1, edit)
849
+ if commonOffset:
850
+ commonString = edit[-commonOffset:]
851
+ equality1 = equality1[:-commonOffset]
852
+ edit = commonString + edit[:-commonOffset]
853
+ equality2 = commonString + equality2
854
+
855
+ # Second, step character by character right, looking for the best fit.
856
+ bestEquality1 = equality1
857
+ bestEdit = edit
858
+ bestEquality2 = equality2
859
+ bestScore = diff_cleanupSemanticScore(
860
+ equality1, edit
861
+ ) + diff_cleanupSemanticScore(edit, equality2)
862
+ while edit and equality2 and edit[0] == equality2[0]:
863
+ equality1 += edit[0]
864
+ edit = edit[1:] + equality2[0]
865
+ equality2 = equality2[1:]
866
+ score = diff_cleanupSemanticScore(
867
+ equality1, edit
868
+ ) + diff_cleanupSemanticScore(edit, equality2)
869
+ # The >= encourages trailing rather than leading whitespace on edits.
870
+ if score >= bestScore:
871
+ bestScore = score
872
+ bestEquality1 = equality1
873
+ bestEdit = edit
874
+ bestEquality2 = equality2
875
+
876
+ if diffs[pointer - 1][1] != bestEquality1:
877
+ # We have an improvement, save it back to the diff.
878
+ if bestEquality1:
879
+ diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
880
+ else:
881
+ del diffs[pointer - 1]
882
+ pointer -= 1
883
+ diffs[pointer] = (diffs[pointer][0], bestEdit)
884
+ if bestEquality2:
885
+ diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
886
+ else:
887
+ del diffs[pointer + 1]
888
+ pointer -= 1
889
+ pointer += 1
890
+
891
+ # Define some regex patterns for matching boundaries.
892
+ BLANKLINEEND = re.compile(r"\n\r?\n$")
893
+ BLANKLINESTART = re.compile(r"^\r?\n\r?\n")
894
+
895
+ def diff_cleanupEfficiency(self, diffs):
896
+ """Reduce the number of edits by eliminating operationally trivial
897
+ equalities.
898
+
899
+ Args:
900
+ diffs: Array of diff tuples.
901
+ """
902
+ changes = False
903
+ equalities = [] # Stack of indices where equalities are found.
904
+ lastEquality = None # Always equal to diffs[equalities[-1]][1]
905
+ pointer = 0 # Index of current position.
906
+ pre_ins = False # Is there an insertion operation before the last equality.
907
+ pre_del = False # Is there a deletion operation before the last equality.
908
+ post_ins = False # Is there an insertion operation after the last equality.
909
+ post_del = False # Is there a deletion operation after the last equality.
910
+ while pointer < len(diffs):
911
+ if diffs[pointer][0] == self.DIFF_EQUAL: # Equality found.
912
+ if len(diffs[pointer][1]) < self.Diff_EditCost and (
913
+ post_ins or post_del
914
+ ):
915
+ # Candidate found.
916
+ equalities.append(pointer)
917
+ pre_ins = post_ins
918
+ pre_del = post_del
919
+ lastEquality = diffs[pointer][1]
920
+ else:
921
+ # Not a candidate, and can never become one.
922
+ equalities = []
923
+ lastEquality = None
924
+
925
+ post_ins = post_del = False
926
+ else: # An insertion or deletion.
927
+ if diffs[pointer][0] == self.DIFF_DELETE:
928
+ post_del = True
929
+ else:
930
+ post_ins = True
931
+
932
+ # Five types to be split:
933
+ # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
934
+ # <ins>A</ins>X<ins>C</ins><del>D</del>
935
+ # <ins>A</ins><del>B</del>X<ins>C</ins>
936
+ # <ins>A</del>X<ins>C</ins><del>D</del>
937
+ # <ins>A</ins><del>B</del>X<del>C</del>
938
+
939
+ if lastEquality and (
940
+ (pre_ins and pre_del and post_ins and post_del)
941
+ or (
942
+ (len(lastEquality) < self.Diff_EditCost / 2)
943
+ and (pre_ins + pre_del + post_ins + post_del) == 3
944
+ )
945
+ ):
946
+ # Duplicate record.
947
+ diffs.insert(equalities[-1], (self.DIFF_DELETE, lastEquality))
948
+ # Change second copy to insert.
949
+ diffs[equalities[-1] + 1] = (
950
+ self.DIFF_INSERT,
951
+ diffs[equalities[-1] + 1][1],
952
+ )
953
+ equalities.pop() # Throw away the equality we just deleted.
954
+ lastEquality = None
955
+ if pre_ins and pre_del:
956
+ # No changes made which could affect previous entry, keep going.
957
+ post_ins = post_del = True
958
+ equalities = []
959
+ else:
960
+ if len(equalities):
961
+ equalities.pop() # Throw away the previous equality.
962
+ if len(equalities):
963
+ pointer = equalities[-1]
964
+ else:
965
+ pointer = -1
966
+ post_ins = post_del = False
967
+ changes = True
968
+ pointer += 1
969
+
970
+ if changes:
971
+ self.diff_cleanupMerge(diffs)
972
+
973
+ def diff_cleanupMerge(self, diffs):
974
+ """Reorder and merge like edit sections. Merge equalities.
975
+ Any edit section can move as long as it doesn't cross an equality.
976
+
977
+ Args:
978
+ diffs: Array of diff tuples.
979
+ """
980
+ diffs.append((self.DIFF_EQUAL, "")) # Add a dummy entry at the end.
981
+ pointer = 0
982
+ count_delete = 0
983
+ count_insert = 0
984
+ text_delete = ""
985
+ text_insert = ""
986
+ while pointer < len(diffs):
987
+ if diffs[pointer][0] == self.DIFF_INSERT:
988
+ count_insert += 1
989
+ text_insert += diffs[pointer][1]
990
+ pointer += 1
991
+ elif diffs[pointer][0] == self.DIFF_DELETE:
992
+ count_delete += 1
993
+ text_delete += diffs[pointer][1]
994
+ pointer += 1
995
+ elif diffs[pointer][0] == self.DIFF_EQUAL:
996
+ # Upon reaching an equality, check for prior redundancies.
997
+ if count_delete + count_insert > 1:
998
+ if count_delete != 0 and count_insert != 0:
999
+ # Factor out any common prefixies.
1000
+ commonlength = self.diff_commonPrefix(text_insert, text_delete)
1001
+ if commonlength != 0:
1002
+ x = pointer - count_delete - count_insert - 1
1003
+ if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
1004
+ diffs[x] = (
1005
+ diffs[x][0],
1006
+ diffs[x][1] + text_insert[:commonlength],
1007
+ )
1008
+ else:
1009
+ diffs.insert(
1010
+ 0, (self.DIFF_EQUAL, text_insert[:commonlength])
1011
+ )
1012
+ pointer += 1
1013
+ text_insert = text_insert[commonlength:]
1014
+ text_delete = text_delete[commonlength:]
1015
+ # Factor out any common suffixies.
1016
+ commonlength = self.diff_commonSuffix(text_insert, text_delete)
1017
+ if commonlength != 0:
1018
+ diffs[pointer] = (
1019
+ diffs[pointer][0],
1020
+ text_insert[-commonlength:] + diffs[pointer][1],
1021
+ )
1022
+ text_insert = text_insert[:-commonlength]
1023
+ text_delete = text_delete[:-commonlength]
1024
+ # Delete the offending records and add the merged ones.
1025
+ new_ops = []
1026
+ if len(text_delete) != 0:
1027
+ new_ops.append((self.DIFF_DELETE, text_delete))
1028
+ if len(text_insert) != 0:
1029
+ new_ops.append((self.DIFF_INSERT, text_insert))
1030
+ pointer -= count_delete + count_insert
1031
+ diffs[pointer : pointer + count_delete + count_insert] = new_ops
1032
+ pointer += len(new_ops) + 1
1033
+ elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
1034
+ # Merge this equality with the previous one.
1035
+ diffs[pointer - 1] = (
1036
+ diffs[pointer - 1][0],
1037
+ diffs[pointer - 1][1] + diffs[pointer][1],
1038
+ )
1039
+ del diffs[pointer]
1040
+ else:
1041
+ pointer += 1
1042
+
1043
+ count_insert = 0
1044
+ count_delete = 0
1045
+ text_delete = ""
1046
+ text_insert = ""
1047
+
1048
+ if diffs[-1][1] == "":
1049
+ diffs.pop() # Remove the dummy entry at the end.
1050
+
1051
+ # Second pass: look for single edits surrounded on both sides by equalities
1052
+ # which can be shifted sideways to eliminate an equality.
1053
+ # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
1054
+ changes = False
1055
+ pointer = 1
1056
+ # Intentionally ignore the first and last element (don't need checking).
1057
+ while pointer < len(diffs) - 1:
1058
+ if (
1059
+ diffs[pointer - 1][0] == self.DIFF_EQUAL
1060
+ and diffs[pointer + 1][0] == self.DIFF_EQUAL
1061
+ ):
1062
+ # This is a single edit surrounded by equalities.
1063
+ if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
1064
+ # Shift the edit over the previous equality.
1065
+ if diffs[pointer - 1][1] != "":
1066
+ diffs[pointer] = (
1067
+ diffs[pointer][0],
1068
+ diffs[pointer - 1][1]
1069
+ + diffs[pointer][1][: -len(diffs[pointer - 1][1])],
1070
+ )
1071
+ diffs[pointer + 1] = (
1072
+ diffs[pointer + 1][0],
1073
+ diffs[pointer - 1][1] + diffs[pointer + 1][1],
1074
+ )
1075
+ del diffs[pointer - 1]
1076
+ changes = True
1077
+ elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
1078
+ # Shift the edit over the next equality.
1079
+ diffs[pointer - 1] = (
1080
+ diffs[pointer - 1][0],
1081
+ diffs[pointer - 1][1] + diffs[pointer + 1][1],
1082
+ )
1083
+ diffs[pointer] = (
1084
+ diffs[pointer][0],
1085
+ diffs[pointer][1][len(diffs[pointer + 1][1]) :]
1086
+ + diffs[pointer + 1][1],
1087
+ )
1088
+ del diffs[pointer + 1]
1089
+ changes = True
1090
+ pointer += 1
1091
+
1092
+ # If shifts were made, the diff needs reordering and another shift sweep.
1093
+ if changes:
1094
+ self.diff_cleanupMerge(diffs)
1095
+
1096
+ def diff_xIndex(self, diffs, loc):
1097
+ """loc is a location in text1, compute and return the equivalent location
1098
+ in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8
1099
+
1100
+ Args:
1101
+ diffs: Array of diff tuples.
1102
+ loc: Location within text1.
1103
+
1104
+ Returns:
1105
+ Location within text2.
1106
+ """
1107
+ chars1 = 0
1108
+ chars2 = 0
1109
+ last_chars1 = 0
1110
+ last_chars2 = 0
1111
+ for x in range(len(diffs)):
1112
+ (op, text) = diffs[x]
1113
+ if op != self.DIFF_INSERT: # Equality or deletion.
1114
+ chars1 += len(text)
1115
+ if op != self.DIFF_DELETE: # Equality or insertion.
1116
+ chars2 += len(text)
1117
+ if chars1 > loc: # Overshot the location.
1118
+ break
1119
+ last_chars1 = chars1
1120
+ last_chars2 = chars2
1121
+
1122
+ if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE:
1123
+ # The location was deleted.
1124
+ return last_chars2
1125
+ # Add the remaining len(character).
1126
+ return last_chars2 + (loc - last_chars1)
1127
+
1128
+ def diff_prettyHtml(self, diffs):
1129
+ """Convert a diff array into a pretty HTML report.
1130
+
1131
+ Args:
1132
+ diffs: Array of diff tuples.
1133
+
1134
+ Returns:
1135
+ HTML representation.
1136
+ """
1137
+ html = []
1138
+ for op, data in diffs:
1139
+ text = (
1140
+ data.replace("&", "&amp;")
1141
+ .replace("<", "&lt;")
1142
+ .replace(">", "&gt;")
1143
+ .replace("\n", "&para;<br>")
1144
+ )
1145
+ if op == self.DIFF_INSERT:
1146
+ html.append('<ins Makefile="background:#e6ffe6;">%s</ins>' % text)
1147
+ elif op == self.DIFF_DELETE:
1148
+ html.append('<del Makefile="background:#ffe6e6;">%s</del>' % text)
1149
+ elif op == self.DIFF_EQUAL:
1150
+ html.append("<span>%s</span>" % text)
1151
+ return "".join(html)
1152
+
1153
+ def diff_text1(self, diffs):
1154
+ """Compute and return the source text (all equalities and deletions).
1155
+
1156
+ Args:
1157
+ diffs: Array of diff tuples.
1158
+
1159
+ Returns:
1160
+ Source text.
1161
+ """
1162
+ text = []
1163
+ for op, data in diffs:
1164
+ if op != self.DIFF_INSERT:
1165
+ text.append(data)
1166
+ return "".join(text)
1167
+
1168
+ def diff_text2(self, diffs):
1169
+ """Compute and return the destination text (all equalities and insertions).
1170
+
1171
+ Args:
1172
+ diffs: Array of diff tuples.
1173
+
1174
+ Returns:
1175
+ Destination text.
1176
+ """
1177
+ text = []
1178
+ for op, data in diffs:
1179
+ if op != self.DIFF_DELETE:
1180
+ text.append(data)
1181
+ return "".join(text)
1182
+
1183
+ def diff_levenshtein(self, diffs):
1184
+ """Compute the Levenshtein distance; the number of inserted, deleted or
1185
+ substituted characters.
1186
+
1187
+ Args:
1188
+ diffs: Array of diff tuples.
1189
+
1190
+ Returns:
1191
+ Number of changes.
1192
+ """
1193
+ levenshtein = 0
1194
+ insertions = 0
1195
+ deletions = 0
1196
+ for op, data in diffs:
1197
+ if op == self.DIFF_INSERT:
1198
+ insertions += len(data)
1199
+ elif op == self.DIFF_DELETE:
1200
+ deletions += len(data)
1201
+ elif op == self.DIFF_EQUAL:
1202
+ # A deletion and an insertion is one substitution.
1203
+ levenshtein += max(insertions, deletions)
1204
+ insertions = 0
1205
+ deletions = 0
1206
+ levenshtein += max(insertions, deletions)
1207
+ return levenshtein
1208
+
1209
+ def diff_toDelta(self, diffs):
1210
+ """Crush the diff into an encoded string which describes the operations
1211
+ required to transform text1 into text2.
1212
+ E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'.
1213
+ Operations are tab-separated. Inserted text is escaped using %xx notation.
1214
+
1215
+ Args:
1216
+ diffs: Array of diff tuples.
1217
+
1218
+ Returns:
1219
+ Delta text.
1220
+ """
1221
+ text = []
1222
+ for op, data in diffs:
1223
+ if op == self.DIFF_INSERT:
1224
+ # High ascii will raise UnicodeDecodeError. Use Unicode instead.
1225
+ data = data.encode("utf-8")
1226
+ text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
1227
+ elif op == self.DIFF_DELETE:
1228
+ text.append("-%d" % len(data))
1229
+ elif op == self.DIFF_EQUAL:
1230
+ text.append("=%d" % len(data))
1231
+ return "\t".join(text)
1232
+
1233
+ def diff_fromDelta(self, text1, delta):
1234
+ """Given the original text1, and an encoded string which describes the
1235
+ operations required to transform text1 into text2, compute the full diff.
1236
+
1237
+ Args:
1238
+ text1: Source string for the diff.
1239
+ delta: Delta text.
1240
+
1241
+ Returns:
1242
+ Array of diff tuples.
1243
+
1244
+ Raises:
1245
+ ValueError: If invalid input.
1246
+ """
1247
+ diffs = []
1248
+ pointer = 0 # Cursor in text1
1249
+ tokens = delta.split("\t")
1250
+ for token in tokens:
1251
+ if token == "":
1252
+ # Blank tokens are ok (from a trailing \t).
1253
+ continue
1254
+ # Each token begins with a one character parameter which specifies the
1255
+ # operation of this token (delete, insert, equality).
1256
+ param = token[1:]
1257
+ if token[0] == "+":
1258
+ param = urllib.parse.unquote(param)
1259
+ diffs.append((self.DIFF_INSERT, param))
1260
+ elif token[0] == "-" or token[0] == "=":
1261
+ try:
1262
+ n = int(param)
1263
+ except ValueError:
1264
+ raise ValueError("Invalid number in diff_fromDelta: " + param)
1265
+ if n < 0:
1266
+ raise ValueError("Negative number in diff_fromDelta: " + param)
1267
+ text = text1[pointer : pointer + n]
1268
+ pointer += n
1269
+ if token[0] == "=":
1270
+ diffs.append((self.DIFF_EQUAL, text))
1271
+ else:
1272
+ diffs.append((self.DIFF_DELETE, text))
1273
+ else:
1274
+ # Anything else is an error.
1275
+ raise ValueError(
1276
+ "Invalid diff operation in diff_fromDelta: " + token[0]
1277
+ )
1278
+ if pointer != len(text1):
1279
+ raise ValueError(
1280
+ "Delta length (%d) does not equal source text length (%d)."
1281
+ % (pointer, len(text1))
1282
+ )
1283
+ return diffs
1284
+
1285
+ # MATCH FUNCTIONS
1286
+
1287
+ def match_main(self, text, pattern, loc):
1288
+ """Locate the best instance of 'pattern' in 'text' near 'loc'.
1289
+
1290
+ Args:
1291
+ text: The text to search.
1292
+ pattern: The pattern to search for.
1293
+ loc: The location to search around.
1294
+
1295
+ Returns:
1296
+ Best match index or -1.
1297
+ """
1298
+ # Check for null inputs.
1299
+ if text is None or pattern is None:
1300
+ raise ValueError("Null inputs. (match_main)")
1301
+
1302
+ loc = max(0, min(loc, len(text)))
1303
+ if text == pattern:
1304
+ # Shortcut (potentially not guaranteed by the algorithm)
1305
+ return 0
1306
+ elif not text:
1307
+ # Nothing to match.
1308
+ return -1
1309
+ elif text[loc : loc + len(pattern)] == pattern:
1310
+ # Perfect match at the perfect spot! (Includes case of null pattern)
1311
+ return loc
1312
+ else:
1313
+ # Do a fuzzy compare.
1314
+ match = self.match_bitap(text, pattern, loc)
1315
+ return match
1316
+
1317
+ def match_bitap(self, text, pattern, loc):
1318
+ """Locate the best instance of 'pattern' in 'text' near 'loc' using the
1319
+ Bitap algorithm.
1320
+
1321
+ Args:
1322
+ text: The text to search.
1323
+ pattern: The pattern to search for.
1324
+ loc: The location to search around.
1325
+
1326
+ Returns:
1327
+ Best match index or -1.
1328
+ """
1329
+ # Python doesn't have a maxint limit, so ignore this check.
1330
+ # if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
1331
+ # raise ValueError("Pattern too long for this application.")
1332
+
1333
+ # Initialise the alphabet.
1334
+ s = self.match_alphabet(pattern)
1335
+
1336
+ def match_bitapScore(e, x):
1337
+ """Compute and return the score for a match with e errors and x location.
1338
+ Accesses loc and pattern through being a closure.
1339
+
1340
+ Args:
1341
+ e: Number of errors in match.
1342
+ x: Location of match.
1343
+
1344
+ Returns:
1345
+ Overall score for match (0.0 = good, 1.0 = bad).
1346
+ """
1347
+ accuracy = float(e) / len(pattern)
1348
+ proximity = abs(loc - x)
1349
+ if not self.Match_Distance:
1350
+ # Dodge divide by zero error.
1351
+ return proximity and 1.0 or accuracy
1352
+ return accuracy + (proximity / float(self.Match_Distance))
1353
+
1354
+ # Highest score beyond which we give up.
1355
+ score_threshold = self.Match_Threshold
1356
+ # Is there a nearby exact match? (speedup)
1357
+ best_loc = text.find(pattern, loc)
1358
+ if best_loc != -1:
1359
+ score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1360
+ # What about in the other direction? (speedup)
1361
+ best_loc = text.rfind(pattern, loc + len(pattern))
1362
+ if best_loc != -1:
1363
+ score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
1364
+
1365
+ # Initialise the bit arrays.
1366
+ matchmask = 1 << (len(pattern) - 1)
1367
+ best_loc = -1
1368
+
1369
+ bin_max = len(pattern) + len(text)
1370
+ # Empty initialization added to appease pychecker.
1371
+ last_rd = None
1372
+ for d in range(len(pattern)):
1373
+ # Scan for the best match each iteration allows for one more error.
1374
+ # Run a binary search to determine how far from 'loc' we can stray at
1375
+ # this error level.
1376
+ bin_min = 0
1377
+ bin_mid = bin_max
1378
+ while bin_min < bin_mid:
1379
+ if match_bitapScore(d, loc + bin_mid) <= score_threshold:
1380
+ bin_min = bin_mid
1381
+ else:
1382
+ bin_max = bin_mid
1383
+ bin_mid = (bin_max - bin_min) // 2 + bin_min
1384
+
1385
+ # Use the result from this iteration as the maximum for the next.
1386
+ bin_max = bin_mid
1387
+ start = max(1, loc - bin_mid + 1)
1388
+ finish = min(loc + bin_mid, len(text)) + len(pattern)
1389
+
1390
+ rd = [0] * (finish + 2)
1391
+ rd[finish + 1] = (1 << d) - 1
1392
+ for j in range(finish, start - 1, -1):
1393
+ if len(text) <= j - 1:
1394
+ # Out of range.
1395
+ charMatch = 0
1396
+ else:
1397
+ charMatch = s.get(text[j - 1], 0)
1398
+ if d == 0: # First pass: exact match.
1399
+ rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
1400
+ else: # Subsequent passes: fuzzy match.
1401
+ rd[j] = (
1402
+ (((rd[j + 1] << 1) | 1) & charMatch)
1403
+ | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
1404
+ | last_rd[j + 1]
1405
+ )
1406
+ if rd[j] & matchmask:
1407
+ score = match_bitapScore(d, j - 1)
1408
+ # This match will almost certainly be better than any existing match.
1409
+ # But check anyway.
1410
+ if score <= score_threshold:
1411
+ # Told you so.
1412
+ score_threshold = score
1413
+ best_loc = j - 1
1414
+ if best_loc > loc:
1415
+ # When passing loc, don't exceed our current distance from loc.
1416
+ start = max(1, 2 * loc - best_loc)
1417
+ else:
1418
+ # Already passed loc, downhill from here on in.
1419
+ break
1420
+ # No hope for a (better) match at greater error levels.
1421
+ if match_bitapScore(d + 1, loc) > score_threshold:
1422
+ break
1423
+ last_rd = rd
1424
+ return best_loc
1425
+
1426
+ def match_alphabet(self, pattern):
1427
+ """Initialise the alphabet for the Bitap algorithm.
1428
+
1429
+ Args:
1430
+ pattern: The text to encode.
1431
+
1432
+ Returns:
1433
+ Hash of character locations.
1434
+ """
1435
+ s = {}
1436
+ for char in pattern:
1437
+ s[char] = 0
1438
+ for i in range(len(pattern)):
1439
+ s[pattern[i]] |= 1 << (len(pattern) - i - 1)
1440
+ return s
1441
+
1442
+ # PATCH FUNCTIONS
1443
+
1444
+ def patch_addContext(self, patch, text):
1445
+ """Increase the context until it is unique,
1446
+ but don't let the pattern expand beyond Match_MaxBits.
1447
+
1448
+ Args:
1449
+ patch: The patch to grow.
1450
+ text: Source text.
1451
+ """
1452
+ if len(text) == 0:
1453
+ return
1454
+ pattern = text[patch.start2 : patch.start2 + patch.length1]
1455
+ padding = 0
1456
+
1457
+ # Look for the first and last matches of pattern in text. If two different
1458
+ # matches are found, increase the pattern length.
1459
+ while text.find(pattern) != text.rfind(pattern) and (
1460
+ self.Match_MaxBits == 0
1461
+ or len(pattern) < self.Match_MaxBits - self.Patch_Margin - self.Patch_Margin
1462
+ ):
1463
+ padding += self.Patch_Margin
1464
+ pattern = text[
1465
+ max(0, patch.start2 - padding) : patch.start2 + patch.length1 + padding
1466
+ ]
1467
+ # Add one chunk for good luck.
1468
+ padding += self.Patch_Margin
1469
+
1470
+ # Add the prefix.
1471
+ prefix = text[max(0, patch.start2 - padding) : patch.start2]
1472
+ if prefix:
1473
+ patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)]
1474
+ # Add the suffix.
1475
+ suffix = text[
1476
+ patch.start2 + patch.length1 : patch.start2 + patch.length1 + padding
1477
+ ]
1478
+ if suffix:
1479
+ patch.diffs.append((self.DIFF_EQUAL, suffix))
1480
+
1481
+ # Roll back the start points.
1482
+ patch.start1 -= len(prefix)
1483
+ patch.start2 -= len(prefix)
1484
+ # Extend lengths.
1485
+ patch.length1 += len(prefix) + len(suffix)
1486
+ patch.length2 += len(prefix) + len(suffix)
1487
+
1488
+ def patch_make(self, a, b=None, c=None):
1489
+ """Compute a list of patches to turn text1 into text2.
1490
+ Use diffs if provided, otherwise compute it ourselves.
1491
+ There are four ways to call this function, depending on what data is
1492
+ available to the caller:
1493
+ Method 1:
1494
+ a = text1, b = text2
1495
+ Method 2:
1496
+ a = diffs
1497
+ Method 3 (optimal):
1498
+ a = text1, b = diffs
1499
+ Method 4 (deprecated, use method 3):
1500
+ a = text1, b = text2, c = diffs
1501
+
1502
+ Args:
1503
+ a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
1504
+ text2 (method 2).
1505
+ b: text2 (methods 1,4) or Array of diff tuples for text1 to
1506
+ text2 (method 3) or undefined (method 2).
1507
+ c: Array of diff tuples for text1 to text2 (method 4) or
1508
+ undefined (methods 1,2,3).
1509
+
1510
+ Returns:
1511
+ Array of Patch objects.
1512
+ """
1513
+ text1 = None
1514
+ diffs = None
1515
+ if isinstance(a, str) and isinstance(b, str) and c is None:
1516
+ # Method 1: text1, text2
1517
+ # Compute diffs from text1 and text2.
1518
+ text1 = a
1519
+ diffs = self.diff_main(text1, b, True)
1520
+ if len(diffs) > 2:
1521
+ self.diff_cleanupSemantic(diffs)
1522
+ self.diff_cleanupEfficiency(diffs)
1523
+ elif isinstance(a, list) and b is None and c is None:
1524
+ # Method 2: diffs
1525
+ # Compute text1 from diffs.
1526
+ diffs = a
1527
+ text1 = self.diff_text1(diffs)
1528
+ elif isinstance(a, str) and isinstance(b, list) and c is None:
1529
+ # Method 3: text1, diffs
1530
+ text1 = a
1531
+ diffs = b
1532
+ elif isinstance(a, str) and isinstance(b, str) and isinstance(c, list):
1533
+ # Method 4: text1, text2, diffs
1534
+ # text2 is not used.
1535
+ text1 = a
1536
+ diffs = c
1537
+ else:
1538
+ raise ValueError("Unknown call format to patch_make.")
1539
+
1540
+ if not diffs:
1541
+ return [] # Get rid of the None case.
1542
+ patches = []
1543
+ patch = patch_obj()
1544
+ char_count1 = 0 # Number of characters into the text1 string.
1545
+ char_count2 = 0 # Number of characters into the text2 string.
1546
+ prepatch_text = text1 # Recreate the patches to determine context info.
1547
+ postpatch_text = text1
1548
+ for x in range(len(diffs)):
1549
+ (diff_type, diff_text) = diffs[x]
1550
+ if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
1551
+ # A new patch starts here.
1552
+ patch.start1 = char_count1
1553
+ patch.start2 = char_count2
1554
+ if diff_type == self.DIFF_INSERT:
1555
+ # Insertion
1556
+ patch.diffs.append(diffs[x])
1557
+ patch.length2 += len(diff_text)
1558
+ postpatch_text = (
1559
+ postpatch_text[:char_count2]
1560
+ + diff_text
1561
+ + postpatch_text[char_count2:]
1562
+ )
1563
+ elif diff_type == self.DIFF_DELETE:
1564
+ # Deletion.
1565
+ patch.length1 += len(diff_text)
1566
+ patch.diffs.append(diffs[x])
1567
+ postpatch_text = (
1568
+ postpatch_text[:char_count2]
1569
+ + postpatch_text[char_count2 + len(diff_text) :]
1570
+ )
1571
+ elif (
1572
+ diff_type == self.DIFF_EQUAL
1573
+ and len(diff_text) <= 2 * self.Patch_Margin
1574
+ and len(patch.diffs) != 0
1575
+ and len(diffs) != x + 1
1576
+ ):
1577
+ # Small equality inside a patch.
1578
+ patch.diffs.append(diffs[x])
1579
+ patch.length1 += len(diff_text)
1580
+ patch.length2 += len(diff_text)
1581
+
1582
+ if diff_type == self.DIFF_EQUAL and len(diff_text) >= 2 * self.Patch_Margin:
1583
+ # Time for a new patch.
1584
+ if len(patch.diffs) != 0:
1585
+ self.patch_addContext(patch, prepatch_text)
1586
+ patches.append(patch)
1587
+ patch = patch_obj()
1588
+ # Unlike Unidiff, our patch lists have a rolling context.
1589
+ # https://github.com/google/diff-match-patch/wiki/Unidiff
1590
+ # Update prepatch text & pos to reflect the application of the
1591
+ # just completed patch.
1592
+ prepatch_text = postpatch_text
1593
+ char_count1 = char_count2
1594
+
1595
+ # Update the current character count.
1596
+ if diff_type != self.DIFF_INSERT:
1597
+ char_count1 += len(diff_text)
1598
+ if diff_type != self.DIFF_DELETE:
1599
+ char_count2 += len(diff_text)
1600
+
1601
+ # Pick up the leftover patch if not empty.
1602
+ if len(patch.diffs) != 0:
1603
+ self.patch_addContext(patch, prepatch_text)
1604
+ patches.append(patch)
1605
+ return patches
1606
+
1607
+ def patch_deepCopy(self, patches):
1608
+ """Given an array of patches, return another array that is identical.
1609
+
1610
+ Args:
1611
+ patches: Array of Patch objects.
1612
+
1613
+ Returns:
1614
+ Array of Patch objects.
1615
+ """
1616
+ patchesCopy = []
1617
+ for patch in patches:
1618
+ patchCopy = patch_obj()
1619
+ # No need to deep copy the tuples since they are immutable.
1620
+ patchCopy.diffs = patch.diffs[:]
1621
+ patchCopy.start1 = patch.start1
1622
+ patchCopy.start2 = patch.start2
1623
+ patchCopy.length1 = patch.length1
1624
+ patchCopy.length2 = patch.length2
1625
+ patchesCopy.append(patchCopy)
1626
+ return patchesCopy
1627
+
1628
+ def patch_apply(self, patches, text):
1629
+ """Merge a set of patches onto the text. Return a patched text, as well
1630
+ as a list of true/false values indicating which patches were applied.
1631
+
1632
+ Args:
1633
+ patches: Array of Patch objects.
1634
+ text: Old text.
1635
+
1636
+ Returns:
1637
+ Two element Array, containing the new text and an array of boolean values.
1638
+ """
1639
+ if not patches:
1640
+ return (text, [])
1641
+
1642
+ # Deep copy the patches so that no changes are made to originals.
1643
+ patches = self.patch_deepCopy(patches)
1644
+
1645
+ nullPadding = self.patch_addPadding(patches)
1646
+ text = nullPadding + text + nullPadding
1647
+ self.patch_splitMax(patches)
1648
+
1649
+ # delta keeps track of the offset between the expected and actual location
1650
+ # of the previous patch. If there are patches expected at positions 10 and
1651
+ # 20, but the first patch was found at 12, delta is 2 and the second patch
1652
+ # has an effective expected position of 22.
1653
+ delta = 0
1654
+ results = []
1655
+ for patch in patches:
1656
+ expected_loc = patch.start2 + delta
1657
+ text1 = self.diff_text1(patch.diffs)
1658
+ end_loc = -1
1659
+ if len(text1) > self.Match_MaxBits:
1660
+ # patch_splitMax will only provide an oversized pattern in the case of
1661
+ # a monster delete.
1662
+ start_loc = self.match_main(
1663
+ text, text1[: self.Match_MaxBits], expected_loc
1664
+ )
1665
+ if start_loc != -1:
1666
+ end_loc = self.match_main(
1667
+ text,
1668
+ text1[-self.Match_MaxBits :],
1669
+ expected_loc + len(text1) - self.Match_MaxBits,
1670
+ )
1671
+ if end_loc == -1 or start_loc >= end_loc:
1672
+ # Can't find valid trailing context. Drop this patch.
1673
+ start_loc = -1
1674
+ else:
1675
+ start_loc = self.match_main(text, text1, expected_loc)
1676
+ if start_loc == -1:
1677
+ # No match found. :(
1678
+ results.append(False)
1679
+ # Subtract the delta for this failed patch from subsequent patches.
1680
+ delta -= patch.length2 - patch.length1
1681
+ else:
1682
+ # Found a match. :)
1683
+ results.append(True)
1684
+ delta = start_loc - expected_loc
1685
+ if end_loc == -1:
1686
+ text2 = text[start_loc : start_loc + len(text1)]
1687
+ else:
1688
+ text2 = text[start_loc : end_loc + self.Match_MaxBits]
1689
+ if text1 == text2:
1690
+ # Perfect match, just shove the replacement text in.
1691
+ text = (
1692
+ text[:start_loc]
1693
+ + self.diff_text2(patch.diffs)
1694
+ + text[start_loc + len(text1) :]
1695
+ )
1696
+ else:
1697
+ # Imperfect match.
1698
+ # Run a diff to get a framework of equivalent indices.
1699
+ diffs = self.diff_main(text1, text2, False)
1700
+ if (
1701
+ len(text1) > self.Match_MaxBits
1702
+ and self.diff_levenshtein(diffs) / float(len(text1))
1703
+ > self.Patch_DeleteThreshold
1704
+ ):
1705
+ # The end points match, but the content is unacceptably bad.
1706
+ results[-1] = False
1707
+ else:
1708
+ self.diff_cleanupSemanticLossless(diffs)
1709
+ index1 = 0
1710
+ for op, data in patch.diffs:
1711
+ if op != self.DIFF_EQUAL:
1712
+ index2 = self.diff_xIndex(diffs, index1)
1713
+ if op == self.DIFF_INSERT: # Insertion
1714
+ text = (
1715
+ text[: start_loc + index2]
1716
+ + data
1717
+ + text[start_loc + index2 :]
1718
+ )
1719
+ elif op == self.DIFF_DELETE: # Deletion
1720
+ text = (
1721
+ text[: start_loc + index2]
1722
+ + text[
1723
+ start_loc
1724
+ + self.diff_xIndex(diffs, index1 + len(data)) :
1725
+ ]
1726
+ )
1727
+ if op != self.DIFF_DELETE:
1728
+ index1 += len(data)
1729
+ # Strip the padding off.
1730
+ text = text[len(nullPadding) : -len(nullPadding)]
1731
+ return (text, results)
1732
+
1733
+ def patch_addPadding(self, patches):
1734
+ """Add some padding on text start and end so that edges can match
1735
+ something. Intended to be called only from within patch_apply.
1736
+
1737
+ Args:
1738
+ patches: Array of Patch objects.
1739
+
1740
+ Returns:
1741
+ The padding string added to each side.
1742
+ """
1743
+ paddingLength = self.Patch_Margin
1744
+ nullPadding = ""
1745
+ for x in range(1, paddingLength + 1):
1746
+ nullPadding += chr(x)
1747
+
1748
+ # Bump all the patches forward.
1749
+ for patch in patches:
1750
+ patch.start1 += paddingLength
1751
+ patch.start2 += paddingLength
1752
+
1753
+ # Add some padding on start of first diff.
1754
+ patch = patches[0]
1755
+ diffs = patch.diffs
1756
+ if not diffs or diffs[0][0] != self.DIFF_EQUAL:
1757
+ # Add nullPadding equality.
1758
+ diffs.insert(0, (self.DIFF_EQUAL, nullPadding))
1759
+ patch.start1 -= paddingLength # Should be 0.
1760
+ patch.start2 -= paddingLength # Should be 0.
1761
+ patch.length1 += paddingLength
1762
+ patch.length2 += paddingLength
1763
+ elif paddingLength > len(diffs[0][1]):
1764
+ # Grow first equality.
1765
+ extraLength = paddingLength - len(diffs[0][1])
1766
+ newText = nullPadding[len(diffs[0][1]) :] + diffs[0][1]
1767
+ diffs[0] = (diffs[0][0], newText)
1768
+ patch.start1 -= extraLength
1769
+ patch.start2 -= extraLength
1770
+ patch.length1 += extraLength
1771
+ patch.length2 += extraLength
1772
+
1773
+ # Add some padding on end of last diff.
1774
+ patch = patches[-1]
1775
+ diffs = patch.diffs
1776
+ if not diffs or diffs[-1][0] != self.DIFF_EQUAL:
1777
+ # Add nullPadding equality.
1778
+ diffs.append((self.DIFF_EQUAL, nullPadding))
1779
+ patch.length1 += paddingLength
1780
+ patch.length2 += paddingLength
1781
+ elif paddingLength > len(diffs[-1][1]):
1782
+ # Grow last equality.
1783
+ extraLength = paddingLength - len(diffs[-1][1])
1784
+ newText = diffs[-1][1] + nullPadding[:extraLength]
1785
+ diffs[-1] = (diffs[-1][0], newText)
1786
+ patch.length1 += extraLength
1787
+ patch.length2 += extraLength
1788
+
1789
+ return nullPadding
1790
+
1791
+ def patch_splitMax(self, patches):
1792
+ """Look through the patches and break up any which are longer than the
1793
+ maximum limit of the match algorithm.
1794
+ Intended to be called only from within patch_apply.
1795
+
1796
+ Args:
1797
+ patches: Array of Patch objects.
1798
+ """
1799
+ patch_size = self.Match_MaxBits
1800
+ if patch_size == 0:
1801
+ # Python has the option of not splitting strings due to its ability
1802
+ # to handle integers of arbitrary precision.
1803
+ return
1804
+ for x in range(len(patches)):
1805
+ if patches[x].length1 <= patch_size:
1806
+ continue
1807
+ bigpatch = patches[x]
1808
+ # Remove the big old patch.
1809
+ del patches[x]
1810
+ x -= 1
1811
+ start1 = bigpatch.start1
1812
+ start2 = bigpatch.start2
1813
+ precontext = ""
1814
+ while len(bigpatch.diffs) != 0:
1815
+ # Create one of several smaller patches.
1816
+ patch = patch_obj()
1817
+ empty = True
1818
+ patch.start1 = start1 - len(precontext)
1819
+ patch.start2 = start2 - len(precontext)
1820
+ if precontext:
1821
+ patch.length1 = patch.length2 = len(precontext)
1822
+ patch.diffs.append((self.DIFF_EQUAL, precontext))
1823
+
1824
+ while (
1825
+ len(bigpatch.diffs) != 0
1826
+ and patch.length1 < patch_size - self.Patch_Margin
1827
+ ):
1828
+ (diff_type, diff_text) = bigpatch.diffs[0]
1829
+ if diff_type == self.DIFF_INSERT:
1830
+ # Insertions are harmless.
1831
+ patch.length2 += len(diff_text)
1832
+ start2 += len(diff_text)
1833
+ patch.diffs.append(bigpatch.diffs.pop(0))
1834
+ empty = False
1835
+ elif (
1836
+ diff_type == self.DIFF_DELETE
1837
+ and len(patch.diffs) == 1
1838
+ and patch.diffs[0][0] == self.DIFF_EQUAL
1839
+ and len(diff_text) > 2 * patch_size
1840
+ ):
1841
+ # This is a large deletion. Let it pass in one chunk.
1842
+ patch.length1 += len(diff_text)
1843
+ start1 += len(diff_text)
1844
+ empty = False
1845
+ patch.diffs.append((diff_type, diff_text))
1846
+ del bigpatch.diffs[0]
1847
+ else:
1848
+ # Deletion or equality. Only take as much as we can stomach.
1849
+ diff_text = diff_text[
1850
+ : patch_size - patch.length1 - self.Patch_Margin
1851
+ ]
1852
+ patch.length1 += len(diff_text)
1853
+ start1 += len(diff_text)
1854
+ if diff_type == self.DIFF_EQUAL:
1855
+ patch.length2 += len(diff_text)
1856
+ start2 += len(diff_text)
1857
+ else:
1858
+ empty = False
1859
+
1860
+ patch.diffs.append((diff_type, diff_text))
1861
+ if diff_text == bigpatch.diffs[0][1]:
1862
+ del bigpatch.diffs[0]
1863
+ else:
1864
+ bigpatch.diffs[0] = (
1865
+ bigpatch.diffs[0][0],
1866
+ bigpatch.diffs[0][1][len(diff_text) :],
1867
+ )
1868
+
1869
+ # Compute the head context for the next patch.
1870
+ precontext = self.diff_text2(patch.diffs)
1871
+ precontext = precontext[-self.Patch_Margin :]
1872
+ # Append the end context for this patch.
1873
+ postcontext = self.diff_text1(bigpatch.diffs)[: self.Patch_Margin]
1874
+ if postcontext:
1875
+ patch.length1 += len(postcontext)
1876
+ patch.length2 += len(postcontext)
1877
+ if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
1878
+ patch.diffs[-1] = (
1879
+ self.DIFF_EQUAL,
1880
+ patch.diffs[-1][1] + postcontext,
1881
+ )
1882
+ else:
1883
+ patch.diffs.append((self.DIFF_EQUAL, postcontext))
1884
+
1885
+ if not empty:
1886
+ x += 1
1887
+ patches.insert(x, patch)
1888
+
1889
+ def patch_toText(self, patches):
1890
+ """Take a list of patches and return a textual representation.
1891
+
1892
+ Args:
1893
+ patches: Array of Patch objects.
1894
+
1895
+ Returns:
1896
+ Text representation of patches.
1897
+ """
1898
+ text = []
1899
+ for patch in patches:
1900
+ text.append(str(patch))
1901
+ return "".join(text)
1902
+
1903
+ def patch_fromText(self, textline):
1904
+ """Parse a textual representation of patches and return a list of patch
1905
+ objects.
1906
+
1907
+ Args:
1908
+ textline: Text representation of patches.
1909
+
1910
+ Returns:
1911
+ Array of Patch objects.
1912
+
1913
+ Raises:
1914
+ ValueError: If invalid input.
1915
+ """
1916
+ patches = []
1917
+ if not textline:
1918
+ return patches
1919
+ text = textline.split("\n")
1920
+ while len(text) != 0:
1921
+ m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0])
1922
+ if not m:
1923
+ raise ValueError("Invalid patch string: " + text[0])
1924
+ patch = patch_obj()
1925
+ patches.append(patch)
1926
+ patch.start1 = int(m.group(1))
1927
+ if m.group(2) == "":
1928
+ patch.start1 -= 1
1929
+ patch.length1 = 1
1930
+ elif m.group(2) == "0":
1931
+ patch.length1 = 0
1932
+ else:
1933
+ patch.start1 -= 1
1934
+ patch.length1 = int(m.group(2))
1935
+
1936
+ patch.start2 = int(m.group(3))
1937
+ if m.group(4) == "":
1938
+ patch.start2 -= 1
1939
+ patch.length2 = 1
1940
+ elif m.group(4) == "0":
1941
+ patch.length2 = 0
1942
+ else:
1943
+ patch.start2 -= 1
1944
+ patch.length2 = int(m.group(4))
1945
+
1946
+ del text[0]
1947
+
1948
+ while len(text) != 0:
1949
+ if text[0]:
1950
+ sign = text[0][0]
1951
+ else:
1952
+ sign = ""
1953
+ line = urllib.parse.unquote(text[0][1:])
1954
+ if sign == "+":
1955
+ # Insertion.
1956
+ patch.diffs.append((self.DIFF_INSERT, line))
1957
+ elif sign == "-":
1958
+ # Deletion.
1959
+ patch.diffs.append((self.DIFF_DELETE, line))
1960
+ elif sign == " ":
1961
+ # Minor equality.
1962
+ patch.diffs.append((self.DIFF_EQUAL, line))
1963
+ elif sign == "@":
1964
+ # Start of next patch.
1965
+ break
1966
+ elif sign == "":
1967
+ # Blank line? Whatever.
1968
+ pass
1969
+ else:
1970
+ # WTF?
1971
+ raise ValueError("Invalid patch mode: '%s'\n%s" % (sign, line))
1972
+ del text[0]
1973
+ return patches
1974
+
1975
+
1976
+ class patch_obj:
1977
+ """Class representing one patch operation."""
1978
+
1979
+ def __init__(self):
1980
+ """Initializes with an empty list of diffs."""
1981
+ self.diffs = []
1982
+ self.start1 = None
1983
+ self.start2 = None
1984
+ self.length1 = 0
1985
+ self.length2 = 0
1986
+
1987
+ def __str__(self):
1988
+ """Emulate GNU diff's format.
1989
+ Header: @@ -382,8 +481,9 @@
1990
+ Indices are printed as 1-based, not 0-based.
1991
+
1992
+ Returns:
1993
+ The GNU diff string.
1994
+ """
1995
+ if self.length1 == 0:
1996
+ coords1 = str(self.start1) + ",0"
1997
+ elif self.length1 == 1:
1998
+ coords1 = str(self.start1 + 1)
1999
+ else:
2000
+ coords1 = str(self.start1 + 1) + "," + str(self.length1)
2001
+ if self.length2 == 0:
2002
+ coords2 = str(self.start2) + ",0"
2003
+ elif self.length2 == 1:
2004
+ coords2 = str(self.start2 + 1)
2005
+ else:
2006
+ coords2 = str(self.start2 + 1) + "," + str(self.length2)
2007
+ text = ["@@ -", coords1, " +", coords2, " @@\n"]
2008
+ # Escape the body of the patch with %xx notation.
2009
+ for op, data in self.diffs:
2010
+ if op == diff_match_patch.DIFF_INSERT:
2011
+ text.append("+")
2012
+ elif op == diff_match_patch.DIFF_DELETE:
2013
+ text.append("-")
2014
+ elif op == diff_match_patch.DIFF_EQUAL:
2015
+ text.append(" ")
2016
+ # High ascii will raise UnicodeDecodeError. Use Unicode instead.
2017
+ data = data.encode("utf-8")
2018
+ text.append(urllib.parse.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
2019
+ return "".join(text)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pandas
2
+ datasets[audio]
3
+ jiwer