# -*- coding: utf-8 -*-
"""utils(2).ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1snWVRieogxGIRp-UsTCZWjLM5ir5KQxB
"""
import re
import nltk
import torch
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
# Keep the negation words, since they carry meaning for classification
stopword_list.remove('no')
stopword_list.remove('not')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
tokenizer_B = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU if no GPU is available
# Use the wordnet module to look up a word's part of speech (POS)
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,   # adjective
                "N": wordnet.NOUN,  # noun
                "V": wordnet.VERB,  # verb
                "R": wordnet.ADV}   # adverb
    return tag_dict.get(tag, wordnet.NOUN)
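# Illustrative example (not from the original notebook): get_wordnet_pos("running")
# returns wordnet.VERB when the tagger labels it VBG; any tag outside the mapping
# falls back to wordnet.NOUN.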
# Preprocessing function
def pre_data(data):
    # Lowercase and strip surrounding whitespace
    df2 = data.lower().strip()
    # Tokenize with TweetTokenizer
    df_token = tokenizer.tokenize(df2)
    # Drop @mention tokens
    df_IDdel = []
    for word in df_token:
        if '@' not in word:
            df_IDdel.append(word)
    # Join the tokens back into a sentence
    df_IDdel_sen = ' '.join(df_IDdel)
    # Replace non-alphabetic characters with spaces
    df_eng = re.sub("[^a-zA-Z]", " ", df_IDdel_sen)
    # Trim letters repeated three or more times down to two
    df_rep_list = []
    for i, e in enumerate(df_eng):
        if i > 1 and e == df_eng[i - 2] and e == df_eng[i - 1]:
            df_rep_list.append('')
        else:
            df_rep_list.append(e)
    df_rep = ''.join(df_rep_list)
    # Collapse consecutive whitespace
    df_rep = re.sub(r'\s+', ' ', df_rep)
    # Lemmatize each token using its POS tag
    df_lemma = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(df_rep)]
    # Remove stopwords
    df_clean = [w for w in df_lemma if w not in stopword_list]
    if len(df_clean) == 0:
        df_clean = 'NC'  # NC = "No Category": placeholder returned when nothing survives preprocessing
    else:
        df_clean = ' '.join(df_clean)
    return df_clean
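# A minimal usage sketch; the sample tweet below is made up for illustration.
# Mentions and non-alphabetic characters are removed, long letter runs are trimmed
# to two, tokens are lemmatized, and stopwords (except 'no'/'not') are dropped:
# >>> pre_data("@user Sooooo happy, not sad at all!!!")
# 'soo happy not sad'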
# Convert raw sentences into BERT model inputs
def convert_input_data(sentences):
    # Split each sentence into tokens with the BERT tokenizer
    tokenized_texts = [tokenizer_B.tokenize(sent) for sent in sentences]
    # Maximum sequence length for the input tokens
    MAX_LEN = 80
    # Convert tokens to their numeric indices
    input_ids = [tokenizer_B.convert_tokens_to_ids(x) for x in tokenized_texts]
    # Truncate each sentence to MAX_LEN and fill the rest with 0 padding
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    # Build the attention masks
    attention_masks = []
    # Set the mask to 1 for real tokens and 0 for padding;
    # BERT then skips attention over the padded positions, which improves speed
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)
    # Convert the data to PyTorch tensors
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)
    return inputs, masks
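# A minimal sketch of the conversion step; the sentences are illustrative only.
# With MAX_LEN = 80, both returned tensors have shape (len(sentences), 80):
# inputs, masks = convert_input_data(["no service at all", "great battery life"])
# inputs.shape  -> torch.Size([2, 80])
# masks.shape   -> torch.Size([2, 80])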
# Run inference on a list of sentences with a trained model
def test_sentences(sentences, load_model):
    # Convert the sentences into model inputs
    inputs, masks = convert_input_data(sentences)
    # Move the data to the configured device (GPU if available)
    b_input_ids = inputs.to(device)
    b_input_mask = masks.to(device)
    # Disable gradient computation
    with torch.no_grad():
        # Forward pass
        outputs = load_model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask)
    # Take the logits
    logits = outputs[0]
    # Move the results back to the CPU as a numpy array
    logits = logits.detach().cpu().numpy()
    return logits
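# A minimal end-to-end sketch, assuming a fine-tuned BertForSequenceClassification
# checkpoint; the model name, number of labels, and sample sentence are assumptions
# and not part of this file:
# from transformers import BertForSequenceClassification
# model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
# model.to(device)
# model.eval()
# logits = test_sentences([pre_data("@user the battery died after an hour")], model)
# preds = np.argmax(logits, axis=1)  # predicted class index per sentence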