| |
| |
| |
| |
| |
| |
| |
| |
| |
| """ |
| Prediction script for Vietnamese Word Segmentation. |
| |
| Uses underthesea regex_tokenize to split text into syllables, |
| then applies CRF model at syllable level to decide word boundaries. |
| |
| Usage: |
| uv run scripts/predict_word_segmentation.py "Trên thế giới, giá vàng đang giao dịch" |
| echo "Text here" | uv run scripts/predict_word_segmentation.py - |
| """ |
|
|
| import sys |
| from pathlib import Path |
|
|
| import click |
| import pycrfsuite |
| from underthesea.pipeline.word_tokenize.regex_tokenize import tokenize as regex_tokenize |
|
|
|
|
def get_syllable_at(syllables, position, offset):
    """Return the syllable at position + offset, or a boundary marker.

    Indices before the start of the sequence yield "__BOS__"; indices past
    the end yield "__EOS__".
    """
    target = position + offset
    if 0 <= target < len(syllables):
        return syllables[target]
    return "__BOS__" if target < 0 else "__EOS__"
|
|
|
|
def is_punct(s):
    """Return True when *s* is a single non-alphanumeric character.

    Multi-character strings are never treated as punctuation.
    """
    if len(s) != 1:
        return False
    return not s.isalnum()
|
|
|
|
def load_dictionary(path):
    """Load a word dictionary from *path* (one word per line).

    Blank lines are skipped; surrounding whitespace is stripped from each
    entry. Returns a set of words.
    """
    with open(path, encoding="utf-8") as handle:
        return {entry for line in handle if (entry := line.strip())}
|
|
|
|
def extract_syllable_features(syllables, position, dictionary=None):
    """Extract CRF features for the syllable at *position*.

    Produces a dict of feature-name -> value covering: the current
    syllable's surface form and shape, a context window of two syllables on
    each side (with "__BOS__"/"__EOS__" at the edges), bigram/trigram
    conjunctions, and — when *dictionary* is given — longest-n-gram
    dictionary-membership features.

    NOTE(review): these templates must match the ones used when the CRF
    model was trained, or the loaded weights will not apply — confirm
    against the training script before changing anything here.
    """
    features = {}

    # Current syllable. Boundary markers keep their literal value and get
    # neutral ("False"/"0") shape features instead of string-method results.
    s0 = get_syllable_at(syllables, position, 0)
    is_boundary = s0 in ("__BOS__", "__EOS__")

    features["S[0]"] = s0
    features["S[0].lower"] = s0.lower() if not is_boundary else s0
    features["S[0].istitle"] = str(s0.istitle()) if not is_boundary else "False"
    features["S[0].isupper"] = str(s0.isupper()) if not is_boundary else "False"
    features["S[0].isdigit"] = str(s0.isdigit()) if not is_boundary else "False"
    features["S[0].ispunct"] = str(is_punct(s0)) if not is_boundary else "False"
    features["S[0].len"] = str(len(s0)) if not is_boundary else "0"
    features["S[0].prefix2"] = s0[:2] if not is_boundary and len(s0) >= 2 else s0
    features["S[0].suffix2"] = s0[-2:] if not is_boundary and len(s0) >= 2 else s0

    # Left context: previous two syllables.
    s_1 = get_syllable_at(syllables, position, -1)
    s_2 = get_syllable_at(syllables, position, -2)
    features["S[-1]"] = s_1
    features["S[-1].lower"] = s_1.lower() if s_1 not in ("__BOS__", "__EOS__") else s_1
    features["S[-2]"] = s_2
    features["S[-2].lower"] = s_2.lower() if s_2 not in ("__BOS__", "__EOS__") else s_2

    # Right context: next two syllables.
    s1 = get_syllable_at(syllables, position, 1)
    s2 = get_syllable_at(syllables, position, 2)
    features["S[1]"] = s1
    features["S[1].lower"] = s1.lower() if s1 not in ("__BOS__", "__EOS__") else s1
    features["S[2]"] = s2
    features["S[2].lower"] = s2.lower() if s2 not in ("__BOS__", "__EOS__") else s2

    # Bigram conjunctions around the current syllable.
    features["S[-1,0]"] = f"{s_1}|{s0}"
    features["S[0,1]"] = f"{s0}|{s1}"

    # Trigram conjunction centered on the current syllable.
    features["S[-1,0,1]"] = f"{s_1}|{s0}|{s1}"

    # Dictionary features: record the longest lowercased n-gram (2..5
    # syllables) found in the dictionary that ends, respectively starts,
    # at the current syllable; "0" when no n-gram matches.
    if dictionary is not None:
        n = len(syllables)

        if position >= 1:
            match = ""
            # length is capped at min(5, position + 1) by the range bound.
            for length in range(2, min(6, position + 2)):
                start = position - length + 1
                # start >= 0 always holds given the range bound above;
                # kept as a defensive guard.
                if start >= 0:
                    ngram = " ".join(syllables[start:position + 1]).lower()
                    if ngram in dictionary:
                        match = ngram
            features["S[-1,0].in_dict"] = match if match else "0"

        if position < n - 1:
            match = ""
            # length is capped at min(5, n - position) by the range bound.
            for length in range(2, min(6, n - position + 1)):
                ngram = " ".join(syllables[position:position + length]).lower()
                if ngram in dictionary:
                    match = ngram
            features["S[0,1].in_dict"] = match if match else "0"

    return features
|
|
|
|
def sentence_to_syllable_features(syllables, dictionary=None):
    """Build the CRF input for a sentence.

    Returns one list of "key=value" feature strings per syllable, in
    sentence order.
    """
    rows = []
    for index in range(len(syllables)):
        feature_map = extract_syllable_features(syllables, index, dictionary)
        rows.append([f"{key}={value}" for key, value in feature_map.items()])
    return rows
|
|
|
|
def labels_to_words(syllables, labels):
    """Reassemble words from a syllable sequence and its BIO labels.

    A "B" label starts a new word; any other label continues the current
    one. Syllables within a word are joined by single spaces.
    """
    words = []
    buffer = []

    for syllable, tag in zip(syllables, labels):
        if tag == "B" and buffer:
            words.append(" ".join(buffer))
            buffer = []
        buffer.append(syllable)

    if buffer:
        words.append(" ".join(buffer))

    return words
|
|
|
|
def segment_text(text, tagger, dictionary=None):
    """
    Full pipeline: regex tokenize -> CRF segment -> output words.

    Returns the segmented sentence with words separated by spaces and the
    syllables of a multi-syllable word joined by underscores, e.g.
    "Tr\u00ean th\u1ebf_gi\u1edbi , gi\u00e1 v\u00e0ng ...". Returns "" for input that tokenizes
    to nothing.
    """
    # Split raw text into syllable tokens (underthesea regex tokenizer).
    syllables = regex_tokenize(text)
    if not syllables:
        return ""

    # Feature extraction, then CRF tagging at syllable level.
    X = sentence_to_syllable_features(syllables, dictionary)
    labels = tagger.tag(X)

    # Merge syllables back into words according to the B/I labels.
    words = labels_to_words(syllables, labels)

    # BUG FIX: the previous replace() chain
    #   "_".join(words).replace(" ", "_").replace("_", " ").replace(" ", " _ ")
    # first erased every underscore and then inserted " _ " at *every*
    # syllable boundary, discarding the segmentation the CRF produced.
    # Join words with spaces and intra-word syllables with underscores,
    # matching segment_text_formatted(use_underscore=True).
    return " ".join(word.replace(" ", "_") for word in words)
|
|
|
|
def segment_text_formatted(text, tagger, use_underscore=True, dictionary=None):
    """
    Full pipeline with formatted output.

    Tokenizes *text* into syllables, tags them with the CRF *tagger*, and
    joins the resulting words with spaces. With use_underscore=True the
    syllables inside a word are joined by "_"; otherwise their internal
    spaces are kept. Returns "" for input that tokenizes to nothing.
    """
    syllables = regex_tokenize(text)
    if not syllables:
        return ""

    feature_rows = sentence_to_syllable_features(syllables, dictionary)
    words = labels_to_words(syllables, tagger.tag(feature_rows))

    if not use_underscore:
        return " ".join(words)
    return " ".join(word.replace(" ", "_") for word in words)
|
|
|
|
@click.command()
@click.argument("text", required=False)
@click.option(
    "--model", "-m",
    default="word_segmenter.crfsuite",
    help="Path to CRF model file",
    show_default=True,
)
@click.option(
    "--underscore/--no-underscore",
    default=True,
    help="Use underscore to join compound word syllables",
)
def main(text, model, underscore):
    """Segment Vietnamese text into words."""
    # Read from stdin when TEXT is "-" or omitted, so the script supports
    # both argument and pipe usage (see module docstring).
    if text == "-" or text is None:
        text = sys.stdin.read().strip()

    if not text:
        click.echo("No input text provided", err=True)
        return

    # Model loading: a ".crf" file is loaded with the underthesea_core
    # backend (the flat vs. nested import layout varies between package
    # versions, hence the ImportError fallback); any other path is opened
    # as a pycrfsuite model.
    if model.endswith(".crf"):
        try:
            from underthesea_core import CRFModel, CRFTagger
        except ImportError:
            from underthesea_core.underthesea_core import CRFModel, CRFTagger
        crf_model = CRFModel.load(model)
        tagger = CRFTagger.from_model(crf_model)
    else:
        tagger = pycrfsuite.Tagger()
        tagger.open(model)

    # Optional dictionary features: use a dictionary.txt sitting next to
    # the model file, if present.
    model_dir = Path(model).parent
    dict_path = model_dir / "dictionary.txt"
    dictionary = load_dictionary(dict_path) if dict_path.exists() else None
    if dictionary:
        # Report on stderr so stdout stays clean for the segmented output.
        click.echo(f"Dictionary: {len(dictionary)} words", err=True)

    # Segment and print line by line, skipping blank lines.
    for line in text.split("\n"):
        if line.strip():
            result = segment_text_formatted(line, tagger, use_underscore=underscore, dictionary=dictionary)
            click.echo(result)
|
|
|
|
if __name__ == "__main__":
    # click parses sys.argv and dispatches to main() when run as a script.
    main()
|
|