aemin committed on
Commit
092c3fc
1 Parent(s): 2329791

Upload _highlight.py

Browse files
Files changed (1) hide show
  1. _highlight.py +92 -0
_highlight.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from rich.console import Console
3
+ from rich.highlighter import RegexHighlighter
4
+ from typing import Tuple, List
5
+
6
+
7
class NullHighlighter(RegexHighlighter):
    """A rich highlighter that applies no styling at all.

    NOTE(review): the original docstring ("anything that looks like an
    email") was copied from the rich documentation examples and did not
    describe this class — it is a no-op highlighter.
    """

    # Empty base style plus an empty pattern list entry -> rich highlights nothing.
    base_style = ""
    highlights = [r""]
12
+
13
+
14
def highlight_document(doc: str,
                       keywords: List[Tuple[str, float]]) -> str:
    """Highlight keywords in a document.

    Arguments:
        doc: The document for which to extract keywords/keyphrases
        keywords: the top n keywords for a document with their respective
                  distances to the input document

    Returns:
        highlighted_text: The document with additional tags to highlight
                          keywords according to the rich package
    """
    # Robustness fix: the original called max() on an empty sequence and
    # raised ValueError when no keywords were extracted. Return the
    # document untouched instead.
    if not keywords:
        return doc

    keywords_only = [keyword for keyword, _ in keywords]
    max_len = max(len(keyword.split(" ")) for keyword in keywords_only)

    # Single-word keywords take the cheap token-by-token path; any
    # multi-word keyphrase requires the sliding-window n-gram matcher.
    if max_len == 1:
        return _highlight_one_gram(doc, keywords_only)
    return _highlight_n_gram(doc, keywords_only)
35
+
36
+
37
+ def _highlight_one_gram(doc: str,
38
+ keywords: List[str]) -> str:
39
+ """ Highlight 1-gram keywords in a document
40
+ Arguments:
41
+ doc: The document for which to extract keywords/keyphrases
42
+ keywords: the top n keywords for a document
43
+ Returns:
44
+ highlighted_text: The document with additional tags to highlight keywords
45
+ according to the rich package
46
+ """
47
+ tokens = re.sub(r' +', ' ', doc.replace("\n", " ")).split(" ")
48
+
49
+ highlighted_text = " ".join([f'<span style="background-color: #FFFF00">{token}</span>'
50
+ if token.lower() in keywords
51
+ else f"{token}"
52
+ for token in tokens]).strip()
53
+
54
+
55
+ return highlighted_text
56
+
57
+
58
+ def _highlight_n_gram(doc: str,
59
+ keywords: List[str]) -> str:
60
+ """ Highlight n-gram keywords in a document
61
+ Arguments:
62
+ doc: The document for which to extract keywords/keyphrases
63
+ keywords: the top n keywords for a document
64
+ Returns:
65
+ highlighted_text: The document with additional tags to highlight keywords
66
+ according to the rich package
67
+ """
68
+ max_len = max([len(token.split(" ")) for token in keywords])
69
+ tokens = re.sub(r' +', ' ', doc.replace("\n", " ")).strip().split(" ")
70
+ n_gram_tokens = [[" ".join(tokens[i: i + max_len][0: j + 1]) for j in range(max_len)] for i, _ in enumerate(tokens)]
71
+ highlighted_text = []
72
+ skip = False
73
+
74
+ for n_grams in n_gram_tokens:
75
+ candidate = False
76
+
77
+ if not skip:
78
+ for index, n_gram in enumerate(n_grams):
79
+
80
+ if n_gram.lower() in keywords:
81
+ candidate = f'<span style="background-color: #FFFF00">{n_gram}</span>' + n_grams[-1].split(n_gram)[-1]
82
+ skip = index + 1
83
+
84
+ if not candidate:
85
+ candidate = n_grams[0]
86
+
87
+ highlighted_text.append(candidate)
88
+
89
+ else:
90
+ skip = skip - 1
91
+ highlighted_text = " ".join(highlighted_text)
92
+ return highlighted_text