Update generic_ner.py
Browse files- generic_ner.py +41 -30
generic_ner.py
CHANGED
@@ -540,6 +540,7 @@ def remove_trailing_stopwords(entities):
|
|
540 |
rOffset = entity.get("rOffset", original_len)
|
541 |
|
542 |
# Remove stopwords and punctuation from the beginning
|
|
|
543 |
while entity_text and (
|
544 |
entity_text.split()[0].lower() in stop_words
|
545 |
or entity_text[0] in punctuation
|
@@ -561,36 +562,48 @@ def remove_trailing_stopwords(entities):
|
|
561 |
print(
|
562 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
563 |
)
|
|
|
564 |
|
|
|
565 |
# Remove stopwords and punctuation from the end
|
566 |
-
|
567 |
-
|
568 |
-
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
|
|
|
|
593 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
594 |
# Skip certain entities based on rules
|
595 |
if entity_text in string.punctuation:
|
596 |
if DEBUG:
|
@@ -669,13 +682,11 @@ def remove_trailing_stopwords(entities):
|
|
669 |
entities.remove(entity)
|
670 |
else:
|
671 |
new_entities.append(entity)
|
672 |
-
|
673 |
-
new_entities.append(entity)
|
674 |
if DEBUG:
|
675 |
print(f"Remained entities: {len(new_entities)}")
|
676 |
return new_entities
|
677 |
|
678 |
-
|
679 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
680 |
|
681 |
def _sanitize_parameters(self, **kwargs):
|
|
|
540 |
rOffset = entity.get("rOffset", original_len)
|
541 |
|
542 |
# Remove stopwords and punctuation from the beginning
|
543 |
+
i = 0
|
544 |
while entity_text and (
|
545 |
entity_text.split()[0].lower() in stop_words
|
546 |
or entity_text[0] in punctuation
|
|
|
562 |
print(
|
563 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
564 |
)
|
565 |
+
i += 1
|
566 |
|
567 |
+
i = 0
|
568 |
# Remove stopwords and punctuation from the end
|
569 |
+
iteration = 0
|
570 |
+
max_iterations = len(entity_text) # Prevent infinite loops
|
571 |
+
|
572 |
+
while entity_text and iteration < max_iterations:
|
573 |
+
# Check if the last word is a stopword or the last character is punctuation
|
574 |
+
last_word = entity_text.split()[-1] if entity_text.split() else ""
|
575 |
+
last_char = entity_text[-1]
|
576 |
+
|
577 |
+
if last_word.lower() in stop_words:
|
578 |
+
# Remove trailing stopword and adjust rOffset
|
579 |
+
stopword_len = len(last_word) + 1 # Include space before stopword
|
580 |
+
entity_text = entity_text[:-stopword_len].rstrip()
|
581 |
+
rOffset -= stopword_len
|
582 |
+
if DEBUG:
|
583 |
+
print(
|
584 |
+
f"Removed trailing stopword from entity: {entity_text} (rOffset={rOffset})"
|
585 |
+
)
|
586 |
+
|
587 |
+
elif last_char in punctuation:
|
588 |
+
# Remove trailing punctuation and adjust rOffset
|
589 |
+
entity_text = entity_text[:-1].rstrip()
|
590 |
+
rOffset -= 1
|
591 |
+
if DEBUG:
|
592 |
+
print(
|
593 |
+
f"Removed trailing punctuation from entity: {entity_text} (rOffset={rOffset})"
|
594 |
+
)
|
595 |
+
else:
|
596 |
+
# Exit loop if neither stopwords nor punctuation are found
|
597 |
+
break
|
598 |
|
599 |
+
iteration += 1
|
600 |
+
# print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
|
601 |
+
|
602 |
+
if len(entity_text.strip()) == 1:
|
603 |
+
entities.remove(entity)
|
604 |
+
if DEBUG:
|
605 |
+
print(f"Skipping entity: {entity_text}")
|
606 |
+
continue
|
607 |
# Skip certain entities based on rules
|
608 |
if entity_text in string.punctuation:
|
609 |
if DEBUG:
|
|
|
682 |
entities.remove(entity)
|
683 |
else:
|
684 |
new_entities.append(entity)
|
685 |
+
|
|
|
686 |
if DEBUG:
|
687 |
print(f"Remained entities: {len(new_entities)}")
|
688 |
return new_entities
|
689 |
|
|
|
690 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
691 |
|
692 |
def _sanitize_parameters(self, **kwargs):
|