Spaces:

strongpear
/

Vietnamese-aspect-detection

Paused

App Files Files Community

Vietnamese-aspect-detection / data_preprocessing.py

strongpear

create data_preprocessing.py

0beb932 about 1 year ago

raw

history blame

5.58 kB

	# -*- coding: utf-8 -*-
	"""
	Created on Fri Jul 28 08:29:31 2023

	@author: ASUS
	"""
	import pandas as pd
	import os
	import glob
	import re

	import unicodedata2
	from underthesea import word_tokenize

	# Directory that holds the raw CSV files.
	path = 'raw_data/'
	# Paths of every entry directly inside `path` whose name matches "*.csv";
	# read_csv_file() iterates over this list.
	files = glob.glob(os.path.join(path, "*.csv"))

	def read_csv_file(file):
	    """Load every CSV listed in the module-level ``files`` into one DataFrame.

	    Rows whose ``comments`` cell has fewer than 10 space-separated tokens are
	    discarded, the per-file frames are concatenated, the ``Unnamed: 0``
	    column (if present) is removed and exact duplicate rows are dropped.

	    Args:
	        file: Unused. The function has always read the module-level
	            ``files`` list rather than this argument; the parameter is kept
	            so existing callers keep working.

	    Returns:
	        pandas.DataFrame: the filtered, de-duplicated rows of all files, or
	        an empty DataFrame when ``files`` is empty.
	    """
	    frames = []
	    for csv_path in files:
	        df = pd.read_csv(csv_path)
	        # astype(str) lets missing comments (NaN) count as one token and be
	        # filtered out instead of raising AttributeError on ``.split``.
	        token_counts = df['comments'].astype(str).str.split(" ").str.len()
	        frames.append(df[token_counts >= 10])

	    if not frames:
	        return pd.DataFrame()

	    # Concatenate once instead of growing a frame inside the loop.
	    raw_df = pd.concat(frames, ignore_index=True)
	    # Not every CSV carries the saved-index column, so do not fail without it.
	    raw_df = raw_df.drop(columns=['Unnamed: 0'], errors='ignore')
	    return raw_df.drop_duplicates()

	def remove_xem_them(text):
	    """Return ``text`` with every "Xem thêm" / "xem thêm" marker deleted.

	    Only these two capitalisations are removed; nothing is inserted in
	    their place.
	    """
	    for marker in ("Xem thêm", "xem thêm"):
	        text = text.replace(marker, "")
	    return text

	# Strip emoji and other symbol characters.
	def remove_emojis(text):
	    """Replace each run of symbol characters in ``text`` with one space.

	    The character class below is deliberately broad: besides the emoji
	    blocks it spans U+24C2..U+1F251 and everything above U+FFFF, so
	    CJK ideographs and similar scripts are removed as well.
	    """
	    char_ranges = (
	        "\U0001F600-\U0001F64F",  # emoticons
	        "\U0001F300-\U0001F5FF",  # symbols & pictographs
	        "\U0001F680-\U0001F6FF",  # transport & map symbols
	        "\U0001F1E0-\U0001F1FF",  # regional-indicator (flag) letters
	        "\U00002500-\U00002BEF",
	        "\U00002702-\U000027B0",
	        "\U00002702-\U000027B0",
	        "\U000024C2-\U0001F251",
	        "\U0001f926-\U0001f937",
	        "\U00010000-\U0010ffff",
	        "\u2640-\u2642",
	        "\u2600-\u2B55",
	        "\u200d",  # zero-width joiner
	        "\u23cf",
	        "\u23e9",
	        "\u231a",
	        "\ufe0f",  # variation selector-16
	        "\u3030",
	    )
	    symbol_run = re.compile("[" + "".join(char_ranges) + "]+", re.UNICODE)
	    return symbol_run.sub(' ', text)

	def remove_hastag(text):
	    """Remove hashtags (one or more ``#`` followed by word characters).

	    A single whitespace character directly after the hashtag is removed
	    with it, so no double space is left behind. Hashtags of any length are
	    handled, including one that ends the text.

	    Args:
	        text: input string.

	    Returns:
	        str: ``text`` without hashtags.
	    """
	    # ``\w+`` (not a single ``\w``) so whole tags such as "#ngon" match, and
	    # ``(?:\s|$)`` so a tag at the very end of the text is removed too.
	    return re.sub(r'#+\w+(?:\s|$)', '', text)

	def remove_stopwords(text):
	    """Drop every whitespace-separated token of ``text`` that is a stopword.

	    Stopwords are read from ``vietnamese-stopwords.txt`` (UTF-8, one entry
	    per line) in the current working directory on every call.

	    Args:
	        text: input string.

	    Returns:
	        str: the remaining tokens joined by single spaces.

	    Raises:
	        OSError: if the stopword file cannot be opened.
	    """
	    # ``with`` guarantees the file handle is closed; a set gives O(1) lookups.
	    with open('vietnamese-stopwords.txt', encoding='utf8') as f:
	        stopwords = {line.rstrip('\n') for line in f}

	    return ' '.join(token for token in text.split() if token not in stopwords)

	# Separate words from punctuation glued in front of them.
	def format_punctuation(text):
	    """Blank out punctuation marks that are attached to the start of a word.

	    Wherever a run of the listed punctuation marks is immediately followed
	    by word characters, the first mark of that run is noted; every
	    occurrence of each noted mark in ``text`` is then replaced by a space.
	    """
	    glued = re.compile(r'(([\!\"\#\$\%\&\,\.\-\_\+\:\;\?\^\•])+)(\w+)')
	    leading_marks = {hit.group()[0] for hit in glued.finditer(text + " ")}
	    for mark in leading_marks:
	        text = text.replace(mark, ' ')
	    return text

	# Remove punctuation characters.
	def remove_punctuation(text):
	    """Return ``text`` with every character listed in ``punc`` deleted.

	    Args:
	        text: input string.

	    Returns:
	        str: ``text`` without ASCII punctuation and the extra typographic
	        symbols listed below.
	    """
	    # The pipe is written plainly: "\|" is an invalid escape sequence in a
	    # normal string literal. The backslash itself is still covered by "\\".
	    punc = "'!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~‘’“”•…‼‼‼⁃₫√≧≦–"
	    # One C-level pass instead of scanning ``punc`` for each character.
	    return text.translate(str.maketrans('', '', punc))

	def format_price(text):
	    """Rewrite shorthand prices as Vietnamese currency tokens.

	    * ``<n>k``                   -> ``<n> nghìn_đồng``
	    * ``<n>tr`` / ``<n>m``       -> ``<n> triệu_đồng``
	    * ``<n>tr <d>`` / ``<n>m <d>`` -> ``<n>_triệu <t> nghìn_đồng``, where the
	      digits ``<d>`` are read as a decimal fraction of a million
	      ("5tr 2" -> 200, "5tr 25" -> 250, "5tr 05" -> 50).
	    * "nghìn đồng" / "triệu đồng" -> "nghìn_đồng" / "triệu_đồng"

	    The ``k`` unit must not be followed by a word character; ``tr`` / ``m``
	    must be followed by a space or the end of the text.

	    Args:
	        text: input string.

	    Returns:
	        str: ``text`` with the price shorthands expanded.
	    """
	    def _thousands(match):
	        # Whitespace between the number and "k" is kept as it was.
	        return match.group(1) + match.group(2) + " nghìn_đồng"

	    def _millions(match):
	        whole, fraction = match.group(1), match.group(2)
	        if not fraction:
	            return whole + " triệu_đồng"
	        # Pad to three digits so the fraction is scaled to thousands;
	        # int() then drops any leading zero ("05" -> "050" -> 50).
	        thousands = int(fraction.ljust(3, "0"))
	        return whole + "_triệu " + str(thousands) + " nghìn_đồng"

	    # re.sub rewrites exactly the matched spans, so look-alikes such as
	    # "50kg" are left alone and matches at the end of the text are handled.
	    text = re.sub(r'([0-9]+)(\s*)k(?!\w)', _thousands, text)
	    # A real alternation between the two units ("tr" or "m").
	    text = re.sub(r'([0-9]+)\s*(?:tr|m)(?: ([0-9]+)|(?= |$))', _millions, text)

	    text = text.replace("nghìn đồng", "nghìn_đồng")
	    text = text.replace("triệu đồng", "triệu_đồng")

	    return text

	def format_price_v2(text):
	    """Join a number and the currency unit after it into a single token.

	    ``<n> triệu_đồng``, ``<n> nghìn_đồng`` and ``<n> nghìn`` (any amount of
	    whitespace in between) become ``<n>_triệu_đồng``, ``<n>_nghìn_đồng``
	    and ``<n>_nghìn``. A number written directly against the unit is left
	    unchanged.

	    Args:
	        text: input string.

	    Returns:
	        str: ``text`` with number/unit pairs joined by an underscore.
	    """
	    # Unescaped "|" so the three units are real alternatives; the longer
	    # "nghìn_đồng" is listed before its prefix "nghìn".
	    pattern = re.compile(r'([0-9]+)(\s*)(triệu_đồng|nghìn_đồng|nghìn)')
	    return pattern.sub(lambda m: "_".join(m.group().split()), text)

	def clean_text(text):
	    """Lower-case ``text`` and expand abbreviations / fix common typos.

	    Two kinds of replacement are applied:

	    * substring replacements (quotes, newlines, ".000", "vnd", "vnđ"), which
	      must also fire when glued to a number such as "50.000vnd";
	    * whole-word replacements (abbreviations and typo fixes), which fire
	      only when the key is not adjacent to another word character, so
	      "ni" is expanded but "mini" is left intact, and words at the start
	      or end of the text or next to punctuation are handled too.

	    Args:
	        text: input string.

	    Returns:
	        str: the cleaned, lower-cased text.
	    """
	    text = text.lower()

	    substring_fixes = {"\"": "", "'": "", "\n": " ", ".000": " nghìn", "vnd": " đồng", "vnđ": " đồng"}
	    for old, new in substring_fixes.items():
	        text = text.replace(old, new)

	    word_fixes = {
	        "cty": "công ty", "k": "không", "h": "giờ", "ko": "không", "cf": "cà phê", "cofe": "cà phê",
	        "coffee": "cà phê", "cofee": "cà phê", "cafe": "cà phê", "cafee": "cà phê",
	        "j": "gì", "r": "rồi", "đc": "được", "dc": "được", "pv": "phục vụ", "pvu": "phục vụ", "pvụ": "phục vụ",
	        "nv": "nhân viên", "nvien": "nhân viên", "nviên": "nhân viên", "b": "bạn", "m": "mình", "ng": "người",
	        "cx": "cũng", "oder": "order", "ita": "ít",
	        "vaie": "vải", "chie": "chỉ", "cb": "chuẩn bị", "nc": "nước", "khoog": "không", "bânh": "bánh",
	        "lug": "lung", "nhiêm": "nhiên", "nguời": "người", "ntn": "như thế này", "nuớc": "nước",
	        "lẫu": "lẩu", "dẻ": "rẻ", "siu": "siêu", "ni": "này",
	    }
	    # The look-arounds require a non-word character (or the text edge) on
	    # both sides, so a key never matches inside a longer word and the order
	    # of the alternatives does not matter.
	    word_pattern = re.compile(
	        r"(?<!\w)(?:" + "|".join(re.escape(word) for word in word_fixes) + r")(?!\w)"
	    )
	    return word_pattern.sub(lambda m: word_fixes[m.group()], text)

	def normalize_format(text):
	    """Return ``text`` converted to Unicode normalization form NFC."""
	    nfc_text = unicodedata2.normalize('NFC', text)
	    return nfc_text

	def word_segment(text):
	    """Segment ``text`` into words with underthesea's ``word_tokenize``.

	    Args:
	        text: input string.

	    Returns:
	        str: the result of ``word_tokenize(text, format='text')``, or the
	        sentinel string "Lỗi" (Vietnamese for "error") when the tokenizer
	        raises.
	    """
	    try:
	        return word_tokenize(text, format='text')
	    except Exception:
	        # Best-effort: keep returning the sentinel callers already expect,
	        # but let KeyboardInterrupt / SystemExit propagate.
	        return "Lỗi"