""" This module contains the functions to get PoS tags using Spacy and return a Markdown table """ from .alignment_mappers import get_alignment_mapping, select_model from flair.models import SequenceTagger from flair.data import Sentence import spacy from spacy.cli import download download("en_core_web_sm") import en_core_web_sm import nltk nltk.download('punkt') nltk.download('averaged_perceptron_tagger') from textblob import TextBlob def get_spacy_postag_dict(target=""): ''' Get spacy pos tags ''' nlp = en_core_web_sm.load() target_tokenized = nlp(target) spacy_postag_dict = dict((token.text, token.tag_) for token in target_tokenized) return spacy_postag_dict def get_nltk_postag_dict(target=""): ''' Get nltk pos tags ''' target_tokenized = nltk.tokenize.word_tokenize(target) nltk_postag_dict = dict((key, value) for key, value in nltk.pos_tag(target_tokenized)) return nltk_postag_dict def get_flair_postag_dict(target=""): ''' Get flair pos tags ''' tagger = SequenceTagger.load("pos") target_tokenized = Sentence(target) tagger.predict(target_tokenized) flair_postag_dict = dict((token.text, token.tag) for token in target_tokenized) return flair_postag_dict def get_textblob_postag_dict(target=""): ''' Get textblob pos tags ''' blob = TextBlob(target) textblob_postag_dict = dict(blob.tags) return textblob_postag_dict def get_postag( get_postag_dict, source="", target="", model_name="musfiqdehan/bn-en-word-aligner"): """Get Spacy PoS Tags and return a Markdown table""" sent_src, sent_tgt, align_words = get_alignment_mapping( source=source, target=target, model_name=model_name ) postag_dict = get_postag_dict(target=target) mapped_sent_src = [] html_table = '''
Bangla | English | PoS Tags | ''' for i, j in sorted(align_words): punc = r"""!()-[]{}ред;:'"\,<>./?@#$%^&*_~""" if sent_src[i] in punc or sent_tgt[j] in punc: mapped_sent_src.append(sent_src[i]) html_table += f'''
---|---|---|
{sent_src[i]} | {sent_tgt[j]} | PUNC |
{sent_src[i]} | {sent_tgt[j]} | {postag_dict[sent_tgt[j]]} |
{word} | N/A | UNK |