Spaces:

Linhz
/

Vietnames-MNER

Paused

App Files Files Community

Vietnames-MNER / thunghiemxuly.py

Linhz

Upload 3 files

4c741a4 verified 2 months ago

raw

history blame contribute delete

No virus

4.33 kB

	import re
	import os
	from Model.NER.VLSP2021.Predict_Ner import ViTagger,normalize_text
	def process_text(text: str) -> str:
		"""Normalize the whitespace of ``text``.

		Leading and trailing whitespace is stripped, and every internal run of
		whitespace (spaces, tabs, newlines) is collapsed into one space.

		Args:
			text: Raw input text.

		Returns:
			The text on a single line with single-space separators.
		"""
		# Strip first so no leading/trailing space survives the substitution.
		return re.sub(r'\s+', ' ', text.strip())

	# Sample input for process_text: a multi-paragraph Vietnamese passage.
	# The call that actually runs process_text on it is commented out below.
	text = """
	Trang Footballogue vừa đăng tải đoạn video được cho là quay ở phòng tập thể dục của CLB Al Nassr vào hôm 7/8. Trong đoạn video đó, C.Ronaldo vẫn miệt mài tập luyện một mình, dù cho cả đội đã ra về từ lâu.

	Tờ báo này bình luận: "Khi tất cả các đồng đội ở Al Nassr ra về, C.Ronaldo vẫn miệt mài tập luyện. Kỷ luật của CR7 thật đáng ngưỡng mộ khi cầu thủ này đã có trong tay mọi thứ".

	Trên trang Twitter, những người hâm mộ đã bày tỏ sự thán phục sự chăm chỉ và chuyên nghiệp của C.Ronaldo. Dưới đây là một vài dòng bình luận:

	"C.Ronaldo là biểu tượng của sự tận hiến trong bóng đá".

	"Ở tuổi 38, khi nhiều cầu thủ treo giày, C.Ronaldo vẫn miệt mài tập luyện. Bạn sẽ không tìm cầu thủ thứ hai trong lịch sử như vậy".
	"""

	# processed_text = process_text(text)
	# print(processed_text)

	# Entity tag names, stored as a flat list of 43 strings. Despite the "LABEL2ID"
	# name this is a list, not a dict; presumably a tag's position is its id
	# (verify against the model code that consumes it).
	# NOTE(review): 'PERSON' and 'PERSONTYPE' each appear twice, so list.index()
	# can only ever return the first position for them — confirm the duplicates
	# are intentional before relying on a tag -> index lookup.
	LABEL2ID_VLSP2021 = ['O', 'LOCATION-GPE', 'QUANTITY-NUM', 'EVENT-CUL', 'DATETIME', 'PERSONTYPE', 'PERSON', 'QUANTITY-PER', 'ORGANIZATION', 'LOCATION-GEO', 'LOCATION-STRUC', 'PRODUCT-COM', 'DATETIME-DATE', 'QUANTITY-DIM', 'PRODUCT', 'QUANTITY', 'DATETIME-DURATION', 'PERSON', 'QUANTITY-CUR', 'DATETIME-TIME', 'QUANTITY-TEM', 'DATETIME-TIMERANGE', 'EVENT-GAMESHOW', 'QUANTITY-AGE', 'QUANTITY-ORD', 'PRODUCT-LEGAL', 'PERSONTYPE', 'LOCATION', 'ORGANIZATION-MED', 'URL', 'PHONENUMBER', 'ORGANIZATION-SPORTS', 'EVENT-SPORT', 'SKILL', 'EVENT-NATURAL', 'ADDRESS', 'IP', 'EMAIL', 'ORGANIZATION-STOCK', 'DATETIME-SET', 'PRODUCT-AWARD', 'MISCELLANEOUS', 'LOCATION-GPE-GEO']
	# print(len(LABEL2ID_VLSP2021))

	def save_uploaded_image(image, directory):
		"""Write an uploaded image's bytes into ``directory`` under its own file name.

		Args:
			image: Upload object exposing a ``name`` attribute (the file name) and
				a ``getbuffer()`` method returning the bytes-like file content.
			directory: Destination folder. It is created, including missing
				parents, when it does not exist yet; an empty string means the
				current working directory.

		Raises:
			OSError: If the directory cannot be created or the file cannot be
				written.
		"""
		# exist_ok avoids the check-then-create race of exists() + makedirs();
		# the guard skips makedirs(''), which would raise.
		if directory:
			os.makedirs(directory, exist_ok=True)

		# The name comes from an upload, so keep only its final component: a name
		# such as "../x.jpg" must not be able to write outside ``directory``.
		file_path = os.path.join(directory, os.path.basename(image.name))
		with open(file_path, "wb") as f:
			f.write(image.getbuffer())
	# def convert_text_to_txt(text,file_path):
	# # Gộp các dòng văn bản thành một đoạn văn
	# paragraph = text.replace('\n', ' ')
	#
	# # Sử dụng biểu thức chính quy để tách từ và dấu câu
	# words_list = re.findall(r'\w+\|[.,]', paragraph)
	# with open(file_path, 'w', encoding='utf-8') as file:
	# for word in words_list:
	# file.write(word + '\n')
	# return words_list



	# # Văn bản mẫu
	# text = """Toi ten la Minh"""
	# # Sử dụng hàm để chuyển đổi văn bản
	# sa='E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/list.txt'
	# convert_text_to_txt(text ,sa)

	def add_string_to_txt(string, file_path):
		"""Prepend an ``IMGID:<name>`` header line to an existing UTF-8 text file.

		Args:
			string: Image file name, e.g. ``'namngo.jpg'``. Its extension is
				removed to form the id written in the header.
			file_path: Path of the text file to rewrite in place. It must
				already exist.

		Raises:
			FileNotFoundError: If ``file_path`` does not exist.
		"""
		# splitext removes only the final extension, so 'my.photo.jpg' yields
		# 'my.photo'; split('.')[0] would have truncated it to 'my'.
		file_name = os.path.splitext(string)[0]

		with open(file_path, 'r', encoding='utf-8') as file:
			lines = file.readlines()

		# The header becomes the first line; the existing content follows unchanged.
		lines.insert(0, f"IMGID:{file_name}\n")

		with open(file_path, 'w', encoding='utf-8') as file:
			file.writelines(lines)

	# string= 'namngo.jpg'
	# add_string_to_txt(string, sa)
	# # In kết quả


	import os
	import re

	def convert_text_to_txt(text, file_path):
		"""Tokenize ``text`` and write one token per line to ``file_path``.

		A token is either a run of word characters or a single ``.`` / ``,``.
		All other characters (spaces, other punctuation) act as separators and
		are dropped.

		Args:
			text: Input text; it may span several lines.
			file_path: Output file, written as UTF-8 and overwritten if present.
				Missing parent directories are created.

		Returns:
			The list of tokens, in the order they were written.
		"""
		# Merge the lines of text into a single paragraph.
		paragraph = text.replace('\n', ' ')

		# The alternation must be an unescaped '|': with r'\w+\|[.,]' the pattern
		# demanded a literal pipe character and matched nothing in ordinary text.
		words_list = re.findall(r'\w+|[.,]', paragraph)

		# dirname() is '' for a bare file name, and makedirs('') raises, so only
		# create the directory when the path actually has one.
		directory = os.path.dirname(file_path)
		if directory:
			os.makedirs(directory, exist_ok=True)

		with open(file_path, 'w', encoding='utf-8') as file:
			file.writelines(f"{word}\n" for word in words_list)

		return words_list

	# Example usage
	# text = "This is some example text."
	# output_file_path = 'E:/demo_datn/pythonProject1/Model/MultimodelNER/VLSP2016/Filetxt/output.txt'
	# convert_text_to_txt(text, output_file_path)