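"""Keyword highlighting helpers.

Wraps the keywords/keyphrases found for a document in HTML <span> tags so
they can be rendered with a yellow highlight.
"""
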
import re

from rich.console import Console
from rich.highlighter import RegexHighlighter
from typing import Tuple, List


class NullHighlighter(RegexHighlighter):
    """A highlighter that applies no styling."""

    base_style = ""
    highlights = [r""]


def highlight_document(doc: str,
                       keywords: List[Tuple[str, float]]) -> str:
    """Highlight keywords in a document.

    Arguments:
        doc: The document for which to extract keywords/keyphrases
        keywords: The top n keywords for a document with their respective
                  distances to the input document

    Returns:
        highlighted_text: The document with HTML tags added to highlight
                          the keywords
    """
    keywords_only = [keyword for keyword, _ in keywords]
    max_len = max(len(token.split(" ")) for token in keywords_only)

    if max_len == 1:
        highlighted_text = _highlight_one_gram(doc, keywords_only)
    else:
        highlighted_text = _highlight_n_gram(doc, keywords_only)

    return highlighted_text


def _highlight_one_gram(doc: str,
                        keywords: List[str]) -> str:
    """Highlight 1-gram keywords in a document.

    Arguments:
        doc: The document for which to extract keywords/keyphrases
        keywords: The top n keywords for a document

    Returns:
        highlighted_text: The document with HTML tags added to highlight
                          the keywords
    """
    # Collapse newlines and repeated spaces before splitting on single spaces
    tokens = re.sub(r" +", " ", doc.replace("\n", " ")).split(" ")

    # Wrap every token that matches a keyword (case-insensitive) in a highlight span
    highlighted_text = " ".join(
        [
            f'<span style="background-color: #FFFF00">{token}</span>'
            if token.lower() in keywords
            else token
            for token in tokens
        ]
    ).strip()
    return highlighted_text


def _highlight_n_gram(doc: str,
                      keywords: List[str]) -> str:
    """Highlight n-gram keywords in a document.

    Arguments:
        doc: The document for which to extract keywords/keyphrases
        keywords: The top n keywords for a document

    Returns:
        highlighted_text: The document with HTML tags added to highlight
                          the keywords
    """
    max_len = max(len(token.split(" ")) for token in keywords)
    tokens = re.sub(r" +", " ", doc.replace("\n", " ")).strip().split(" ")
    # For every position i, build all n-grams of length 1..max_len starting at i
    n_gram_tokens = [
        [" ".join(tokens[i: i + max_len][0: j + 1]) for j in range(max_len)]
        for i, _ in enumerate(tokens)
    ]
    highlighted_text = []
    skip = 0

    for n_grams in n_gram_tokens:
        candidate = False
        if not skip:
            # The longest matching keyword wins: longer matches overwrite candidate
            for n_gram in n_grams:
                if n_gram.lower() in keywords:
                    # Highlight the match and append the rest of the window unhighlighted
                    candidate = (
                        f'<span style="background-color: #FFFF00">{n_gram}</span>'
                        + n_grams[-1].split(n_gram)[-1]
                    )
                    # The candidate covers every token in the window, so skip
                    # the positions of the remaining window tokens
                    skip = len(n_grams[-1].split(" ")) - 1
            if not candidate:
                candidate = n_grams[0]
            highlighted_text.append(candidate)
        else:
            skip -= 1

    highlighted_text = " ".join(highlighted_text)
    return highlighted_text
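

# Minimal usage sketch (not part of the module's API): the sample document and
# the keyword/distance pairs below are made up for illustration; in practice
# the keywords would come from a keyword-extraction model.
if __name__ == "__main__":
    sample_doc = "Supervised learning is the machine learning task of learning a function."

    # One-word keywords exercise _highlight_one_gram
    print(highlight_document(sample_doc, [("supervised", 0.66), ("learning", 0.52)]))

    # A multi-word keyword makes highlight_document use _highlight_n_gram instead
    print(highlight_document(sample_doc, [("machine learning", 0.71), ("supervised", 0.66)]))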