Spaces:
Build error
Build error
| """ | |
| | **Abbreviation** | **Description** | | |
| |------------------|-----------------| | |
| | O | Outside of a named entity | |
| | B-MIS | Beginning of a miscellaneous entity right after another miscellaneous entity | |
| | I-MIS | Miscellaneous entity | |
| | B-PER | Beginning of a person’s name right after another person’s name | |
| | I-PER | Person’s name | |
| | B-ORG | Beginning of an organization right after another organization | |
| | I-ORG | Organization | |
| | B-LOC | Beginning of a location right after another location | |
| | I-LOC | Location | |
| """ | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
| from enum import Enum | |
class DictKey(Enum):
    """Names of the fields looked up in each NER result dictionary.

    Each member's value is the literal string used as the dictionary key, so
    ``DictKey.WORD.value`` is the key under which an entity's text is stored.
    """

    ENTITY = "entity"
    SCORE = "score"
    INDEX = "index"
    WORD = "word"
    START = "start"
    END = "end"
class NER:
    """Run named entity recognition over a text and expose the entities found.

    The model is loaded and the text is analysed in the constructor; the
    results are then available through the attributes ``results``,
    ``all_entities`` and ``unique_entities``. Call ``entity_markdown()`` to
    populate ``markdown`` and ``markdown_text``.
    """

    def __init__(self, text_to_analyse):
        """
        The Constructor for the Named Entity Recognition class.

        :param text_to_analyse: The text in which to find named entities.
        :raises ValueError: If the pipeline could not be created.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
        self.model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
        # NOTE(review): newer transformers releases appear to prefer
        # `aggregation_strategy` over `grouped_entities` - confirm against the
        # installed version before changing.
        self.nlp = pipeline("ner", model=self.model, tokenizer=self.tokenizer, grouped_entities=True)
        if self.nlp is None:
            raise ValueError("Unable to load pipeline from DSLIM BERT model")
        self.text_to_analyse = text_to_analyse
        self.results = self.nlp(text_to_analyse)
        self.all_entities = self.get_list_of_entities()
        # From here on the instance attribute shadows the method of the same
        # name: `ner.unique_entities` is the list, not a callable. Kept as-is
        # because callers read it as an attribute.
        self.unique_entities = self.unique_entities()
        self.markdown = None
        self.markdown_text = None

    def get_entity_value(self, key: "DictKey", item_index: int):
        """
        Extracts the value for a specific key (as an Enum) from a specific dictionary item in the list.

        :param key: DictKey Enum representing the key for which the value is required.
        :param item_index: Index of the item in the list to process. Negative
            indices count from the end, as with normal list indexing.
        :return: Value for the given key in the specified dictionary item, or None if key is not found.
        :raises ValueError: If item_index is outside the results list.
        """
        try:
            item = self.results[item_index]
        except IndexError:
            # Covers both ends of the range; the original upper-bound-only
            # check let a too-negative index escape as a raw IndexError.
            raise ValueError("The supplied list index is out of bounds") from None
        return item.get(key.value)

    def get_list_of_entities(self):
        """
        Returns a list of all entities in the original text, in the order they appear. There may be repeated
        entities in this list.

        :return: A list of all entities in the original text.
        """
        # One entry per result dictionary: the value stored under the 'word' key.
        return [item.get(DictKey.WORD.value) for item in self.results]

    @staticmethod
    def _highlight_entities(text, entities):
        """
        Wrap every occurrence of each entity in ``text`` in a red ``<span>``.

        :param text: The text to decorate.
        :param entities: Iterable of entity strings; duplicates, empty strings and None are ignored.
        :return: ``text`` with each entity occurrence wrapped exactly once.
        """
        import re  # local import so the file's import block need not change

        distinct = [entity for entity in dict.fromkeys(entities) if entity]
        if not distinct:
            return text
        # Longest first, so that "New York" wins over "York" where both match.
        distinct.sort(key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(entity) for entity in distinct))
        # A single pass over the text: already-inserted markup is never
        # rescanned, so spans cannot nest or be corrupted by later entities.
        return pattern.sub(
            lambda match: f'<span style = "color:red;">{match.group(0)}</span>',
            text,
        )

    def entity_markdown(self):
        """
        Convert the analysed text to markdown and colour every entity occurrence red.

        Stores the highlighted text in ``self.markdown`` and a copy with
        markdown hard line breaks in ``self.markdown_text``. Returns nothing.
        """
        self.markdown = self._highlight_entities(self.text_to_analyse, self.get_list_of_entities())
        # Two spaces at the end of a line are what markdown needs for a new line.
        self.markdown_text = self.markdown.replace('\n', '  \n')

    def unique_entities(self):
        """
        Return a list of all unique entities in the original text.

        :return: A list of unique entities, in order of first appearance.
        """
        # dict keys are unique and keep insertion order.
        return list(dict.fromkeys(self.all_entities))