aemin committed on
Commit
092c3fc
1 Parent(s): 2329791

Upload _highlight.py

Browse files
Files changed (1) hide show
  1. _highlight.py +92 -0
_highlight.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from rich.console import Console
3
+ from rich.highlighter import RegexHighlighter
4
+ from typing import Tuple, List
5
+
6
+
7
class NullHighlighter(RegexHighlighter):
    """A rich highlighter that applies no styling at all.

    NOTE(review): the original docstring ("anything that looks like an
    email") was copied from the rich documentation examples and did not
    describe this class — it is a no-op highlighter.
    """

    # Empty base style plus an empty pattern list entry -> rich highlights nothing.
    base_style = ""
    highlights = [r""]
12
+
13
+
14
def highlight_document(doc: str,
                       keywords: List[Tuple[str, float]]) -> str:
    """Highlight keywords in a document.

    Arguments:
        doc: The document for which to extract keywords/keyphrases
        keywords: the top n keywords for a document with their respective
                  distances to the input document

    Returns:
        highlighted_text: The document with additional tags to highlight
                          keywords according to the rich package
    """
    # Robustness fix: the original called max() on an empty sequence and
    # raised ValueError when no keywords were extracted. Return the
    # document untouched instead.
    if not keywords:
        return doc

    keywords_only = [keyword for keyword, _ in keywords]
    max_len = max(len(keyword.split(" ")) for keyword in keywords_only)

    # Single-word keywords take the cheap token-by-token path; any
    # multi-word keyphrase requires the sliding-window n-gram matcher.
    if max_len == 1:
        return _highlight_one_gram(doc, keywords_only)
    return _highlight_n_gram(doc, keywords_only)
35
+
36
+
37
+ def _highlight_one_gram(doc: str,
38
+ keywords: List[str]) -> str:
39
+ """ Highlight 1-gram keywords in a document
40
+ Arguments:
41
+ doc: The document for which to extract keywords/keyphrases
42
+ keywords: the top n keywords for a document
43
+ Returns:
44
+ highlighted_text: The document with additional tags to highlight keywords
45
+ according to the rich package
46
+ """
47
+ tokens = re.sub(r' +', ' ', doc.replace("\n", " ")).split(" ")
48
+
49
+ highlighted_text = " ".join([f'<span style="background-color: #FFFF00">{token}</span>'
50
+ if token.lower() in keywords
51
+ else f"{token}"
52
+ for token in tokens]).strip()
53
+
54
+
55
+ return highlighted_text
56
+
57
+
58
+ def _highlight_n_gram(doc: str,
59
+ keywords: List[str]) -> str:
60
+ """ Highlight n-gram keywords in a document
61
+ Arguments:
62
+ doc: The document for which to extract keywords/keyphrases
63
+ keywords: the top n keywords for a document
64
+ Returns:
65
+ highlighted_text: The document with additional tags to highlight keywords
66
+ according to the rich package
67
+ """
68
+ max_len = max([len(token.split(" ")) for token in keywords])
69
+ tokens = re.sub(r' +', ' ', doc.replace("\n", " ")).strip().split(" ")
70
+ n_gram_tokens = [[" ".join(tokens[i: i + max_len][0: j + 1]) for j in range(max_len)] for i, _ in enumerate(tokens)]
71
+ highlighted_text = []
72
+ skip = False
73
+
74
+ for n_grams in n_gram_tokens:
75
+ candidate = False
76
+
77
+ if not skip:
78
+ for index, n_gram in enumerate(n_grams):
79
+
80
+ if n_gram.lower() in keywords:
81
+ candidate = f'<span style="background-color: #FFFF00">{n_gram}</span>' + n_grams[-1].split(n_gram)[-1]
82
+ skip = index + 1
83
+
84
+ if not candidate:
85
+ candidate = n_grams[0]
86
+
87
+ highlighted_text.append(candidate)
88
+
89
+ else:
90
+ skip = skip - 1
91
+ highlighted_text = " ".join(highlighted_text)
92
+ return highlighted_text