# -*- coding: utf-8 -*-
"""utils(2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1snWVRieogxGIRp-UsTCZWjLM5ir5KQxB
"""
import re
import nltk
import torch
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
# Keep the negation words, since they carry meaning for classification
stopword_list.remove('no')
stopword_list.remove('not')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
tokenizer_B = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU if no GPU is available
# Use the wordnet module to look up a word's part of speech (POS)
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,   # adjective
                "N": wordnet.NOUN,  # noun
                "V": wordnet.VERB,  # verb
                "R": wordnet.ADV}   # adverb
    return tag_dict.get(tag, wordnet.NOUN)
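# Illustrative example (not from the original notebook): get_wordnet_pos("running")
# returns wordnet.VERB when the tagger labels it VBG; any tag outside the mapping
# falls back to wordnet.NOUN.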
# Preprocessing function
def pre_data(data):
    # Lowercase and strip surrounding whitespace
    df2 = data.lower().strip()
    # Tokenize with TweetTokenizer
    df_token = tokenizer.tokenize(df2)
    # Drop @mention tokens
    df_IDdel = []
    for word in df_token:
        if '@' not in word:
            df_IDdel.append(word)
    # Join the tokens back into a sentence
    df_IDdel_sen = ' '.join(df_IDdel)
    # Replace non-alphabetic characters with spaces
    df_eng = re.sub("[^a-zA-Z]", " ", df_IDdel_sen)
    # Trim letters repeated three or more times down to two
    df_rep_list = []
    for i, e in enumerate(df_eng):
        if i > 1 and e == df_eng[i - 2] and e == df_eng[i - 1]:
            df_rep_list.append('')
        else:
            df_rep_list.append(e)
    df_rep = ''.join(df_rep_list)
    # Collapse consecutive whitespace
    df_rep = re.sub(r'\s+', ' ', df_rep)
    # Lemmatize each token using its POS tag
    df_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(df_rep)]
    # Remove stopwords
    df_clean = [w for w in df_lemma if w not in stopword_list]
    if len(df_clean) == 0:
        df_clean = 'NC'  # NC = "No Category": placeholder returned when nothing survives preprocessing
    else:
        df_clean = ' '.join(df_clean)
    return df_clean
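# A minimal usage sketch; the sample tweet below is made up for illustration.
# Mentions and non-alphabetic characters are removed, long letter runs are trimmed
# to two, tokens are lemmatized, and stopwords (except 'no'/'not') are dropped:
# >>> pre_data("@user Sooooo happy, not sad at all!!!")
# 'soo happy not sad'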
# Convert raw sentences into BERT model inputs
def convert_input_data(sentences):
    # Split each sentence into tokens with the BERT tokenizer
    tokenized_texts = [tokenizer_B.tokenize(sent) for sent in sentences]
    # Maximum sequence length for the input tokens
    MAX_LEN = 80
    # Convert tokens to their numeric indices
    input_ids = [tokenizer_B.convert_tokens_to_ids(x) for x in tokenized_texts]
    # Truncate each sentence to MAX_LEN and fill the rest with 0 padding
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    # Build the attention masks
    attention_masks = []
    # Set the mask to 1 for real tokens and 0 for padding;
    # BERT then skips attention over the padded positions, which improves speed
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    # Convert the data to PyTorch tensors
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    return inputs, masks
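# A minimal sketch of the conversion step; the sentences are illustrative only.
# With MAX_LEN = 80, both returned tensors have shape (len(sentences), 80):
# inputs, masks = convert_input_data(["no service at all", "great battery life"])
# inputs.shape  -> torch.Size([2, 80])
# masks.shape   -> torch.Size([2, 80])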
# Run inference on a list of sentences with a trained model
def test_sentences(sentences, load_model):
    # Convert the sentences into model inputs
    inputs, masks = convert_input_data(sentences)
    # Move the data to the configured device (GPU if available)
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
    # Disable gradient computation
    with torch.no_grad():
        # Forward pass
        outputs = load_model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)
    # Take the logits
    logits = outputs[0]
    # Move the results back to the CPU as a numpy array
    logits = logits.detach().cpu().numpy()
    return logits
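# A minimal end-to-end sketch, assuming a fine-tuned BertForSequenceClassification
# checkpoint; the model name, number of labels, and sample sentence are assumptions
# and not part of this file:
# from transformers import BertForSequenceClassification
# model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
# model.to(device)
# model.eval()
# logits = test_sentences([pre_data("@user the battery died after an hour")], model)
# preds = np.argmax(logits, axis=1)  # predicted class index per sentence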