m-ric HF staff committed on
Commit
6d80cb7
•
1 Parent(s): fad02a2

Update overlap.py

Browse files
Files changed (1) hide show
  1. overlap.py +39 -0
overlap.py CHANGED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Utilities to get overlap between strings
2
+
3
def get_overlap_length(left: str, right: str):
    """Find the longest suffix of ``left`` that is also a prefix of ``right``.

    Args:
        left: String whose end is compared.
        right: String whose start is compared.

    Returns:
        A tuple ``(overlap_length, overlap)``. When the two strings do not
        overlap (or either one is empty) the result is ``(0, "")``.
    """
    max_length = min(len(left), len(right))
    # Scan from the longest candidate down so the first hit is the answer.
    # The upper bound is inclusive so that a complete overlap (one string
    # entirely contained at the junction) is detected; the lower bound is 1
    # because left[-0:] would be the whole string, not an empty suffix.
    for length in range(max_length, 0, -1):
        candidate = right[:length]
        if left.endswith(candidate):
            return length, candidate
    return 0, ""
10
+
11
def get_overlap_list(strings):
    """
    Compute the overlap between each string and the one that follows it.

    Returns a list of (overlap_length, overlap) tuples, one per pair of
    neighbouring strings, in input order. The list is empty when the input
    holds fewer than two strings.
    """
    return [
        get_overlap_length(strings[position - 1], strings[position])
        for position in range(1, len(strings))
    ]
19
+
20
def unoverlap_list(strings):
    """
    Split a sequence of overlapping strings into non-overlapping segments.

    Returns a list of (content, is_overlap) tuples. Each input string
    contributes one (content, False) entry holding the string with the parts
    it shares with its neighbours trimmed off. Every non-empty overlap between
    two neighbouring strings is emitted once, as (overlap, True), right before
    the entry of the second string of the pair.
    """
    pair_overlaps = get_overlap_list(strings)
    last_position = len(strings) - 1
    segments = []
    for position, text in enumerate(strings):
        trim_start = 0
        if position > 0:
            previous_length, previous_overlap = pair_overlaps[position - 1]
            # The shared part is emitted once, ahead of the current string.
            if len(previous_overlap) > 0:
                segments.append((previous_overlap, True))
            trim_start = previous_length

        # The last string has no right-hand neighbour, so nothing to trim.
        trim_end = pair_overlaps[position][0] if position < last_position else 0

        segments.append((text[trim_start:len(text) - trim_end], False))
    return segments