import re
from datasets import load_dataset
from deepmultilingualpunctuation import PunctuationModel
from multiprocess import set_start_method
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
import nltk
import spacy
# from rpunct import RestorePuncts
# rpunct = RestorePuncts()
model = PunctuationModel()
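
# Illustrative (not executed): the model only inserts punctuation; casing is
# left untouched, which is why the truecasing helpers below are needed.
# Output is model-dependent, but roughly:
#   model.restore_punctuation("my name is clara and i live in berkeley")
#   -> "my name is clara and i live in berkeley."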
ds = load_dataset("ylacombe/mls-eng-tags", split="train", num_proc=16)
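# Assumption: each row carries a raw, punctuation-free transcript in its "text"
# column, the only field read by repunctuation_apply_simple below.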

def truecasing_by_pos(input_text):
    # break the input text into sentences
    sent_texts = sent_tokenize(input_text)
    full_text = ""
    for sent_text in sent_texts:
        # tokenize the sentence into words
        words = word_tokenize(sent_text)
        # apply POS tagging to the lowercased words
        tagged_words = pos_tag([word.lower() for word in words])
        # capitalize proper nouns based on their POS tags
        capitalized_words = [w.capitalize() if t in ("NNP", "NNPS") else w for (w, t) in tagged_words]
        # capitalize the first word of the sentence
        capitalized_words[0] = capitalized_words[0].capitalize()
        # join the capitalized words and separate sentences with a space
        full_text += " ".join(capitalized_words).strip() + " "
    return full_text.strip()
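
# Quick sanity check (illustrative; the exact casing depends on the NLTK tagger,
# and punctuation stays space-separated because nothing rejoins it here):
#   truecasing_by_pos("i saw john in paris. it was raining.")
#   -> roughly "I saw John in Paris . It was raining ."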

def true_case(text):
    # Split the text into sentences
    sentences = nltk.sent_tokenize(text)
    true_cased_sentences = []
    for sentence in sentences:
        # Tokenize the sentence and POS-tag the tokens
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        # Capitalize the first word of the sentence and NNP/NNPS tokens
        for i, (word, tag) in enumerate(tagged):
            if i == 0 or tag in ("NNP", "NNPS"):
                tagged[i] = (word.capitalize(), tag)
        # Join the tokens back into a string
        true_cased_sentence = " ".join(word for word, tag in tagged)
        # Remove the space between punctuation and the preceding word
        true_cased_sentence = re.sub(r"(\w) (\W)", r"\1\2", true_cased_sentence)
        true_cased_sentences.append(true_cased_sentence)
    # Join the processed sentences back into a single string
    return " ".join(true_cased_sentences)
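
# Illustrative usage (tagger-dependent); note the regex re-attaches punctuation:
#   true_case("is this paris? i think so.")
#   -> roughly "Is this paris? I think so."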

# Run spaCy on GPU 2 (adjust gpu_id to the available device)
spacy.require_gpu(gpu_id=2)
# Load the transformer-based English pipeline
# (requires `python -m spacy download en_core_web_trf`)
nlp = spacy.load("en_core_web_trf")
from spacy.util import compile_infix_regex

def custom_tokenizer(nlp):
    # Add an infix pattern matching a whole hyphenated compound
    # (e.g. "state-of-the-art") so it is emitted as a single token instead of
    # being split at every hyphen by the default hyphen infix.
    infixes = nlp.Defaults.infixes + [r"\w+(?:-\w+)+"]
    infix_regex = compile_infix_regex(infixes)
    # Update the existing tokenizer in place so its prefix/suffix rules and
    # tokenizer exceptions are preserved.
    nlp.tokenizer.infix_finditer = infix_regex.finditer
    return nlp.tokenizer

# Use the custom tokenizer
nlp.tokenizer = custom_tokenizer(nlp)
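
# Sanity check (illustrative): hyphenated compounds now survive tokenization,
# e.g. [t.text for t in nlp("a state-of-the-art result")] should contain
# "state-of-the-art" as one token rather than five.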

def true_case_spacy(text):
    # Process the text with the spaCy pipeline
    doc = nlp(text)
    true_cased_sentences = []
    # Iterate over the sentences in the Doc
    for sent in doc.sents:
        processed_tokens = []
        # Capitalize the first word of each sentence and every proper noun
        for i, token in enumerate(sent):
            if i == 0 or token.pos_ == "PROPN":
                processed_tokens.append(token.text.capitalize())
            else:
                processed_tokens.append(token.text)
        # Join the tokens back into a string
        processed_sentence = " ".join(processed_tokens)
        # Remove the space between punctuation and the preceding word
        processed_sentence = re.sub(r"(\w) (\W)", r"\1\2", processed_sentence)
        true_cased_sentences.append(processed_sentence)
    # Join the processed sentences back into a single string
    return " ".join(true_cased_sentences)
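
# Illustrative usage (output depends on the en_core_web_trf predictions):
#   true_case_spacy("he moved to new york in may.")
#   -> roughly "He moved to New York in May."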

def repunctuation_apply_simple(batch):
    # Restore punctuation first, then truecase the repunctuated text
    repunct_sample = model.restore_punctuation(batch["text"])
    batch["repunct_text"] = true_case_spacy(repunct_sample)
    return batch
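
# Illustrative end-to-end flow for one row (outputs are model-dependent):
#   {"text": "he was born in boston in march"}
#   -> restore_punctuation -> "he was born in boston in march."
#   -> true_case_spacy     -> repunct_text = "He was born in Boston in March."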

if __name__ == "__main__":
    set_start_method("spawn")
    # map() is called without batched=True, so the function receives one
    # example at a time and batch_size has no effect here
    repunct_ds = ds.map(repunctuation_apply_simple, batch_size=1, num_proc=14)
    repunct_ds.push_to_hub("reach-vb/mls-eng-tags-spacy-v2", split="train")