Spaces:

krislette
/

kataklassifer

Sleeping

App Files Files Community

kataklassifer / scripts /predict.py

krislette

Initial commit

caf26c9 about 1 month ago

raw

history blame contribute delete

5.15 kB

	"""
	Loads the saved model artifacts and predicts the origin language of one or
	more katakana loanwords provided by the user.

	The model, vectorizer, and label encoder must already exist in models/.
	Run scripts/train.py first if they are not there yet.

	Usage (interactive prompt):
	python scripts/predict.py

	Usage (pass words directly as arguments):
	python scripts/predict.py テレビ コーヒー アルバイト

	Output example:
	テレビ -> English
	コーヒー -> Dutch
	アルバイト -> German
	"""

	import sys
	import os
	import re

	# Allow imports from the project root regardless of where the script is called from
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

	import joblib

	# Directory holding the saved model artifacts. This is a relative path, so it
	# is resolved against the current working directory, not this script's location.
	MODEL_DIR = "models"

	# Matches strings made entirely of katakana characters (same rule as loader.py)
	# The class covers U+30A0-U+30FF (the Unicode Katakana block, which includes the
	# long-vowel mark U+30FC and the middle dot U+30FB); half-width katakana are
	# outside this range. NOTE: when used with .match(), the trailing "$" also
	# matches just before a single final newline.
	KATAKANA_PATTERN = re.compile(r"^[\u30A0-\u30FF]+$")


	def load_artifacts():
	    """
	    Read the model, vectorizer, and label encoder back from MODEL_DIR.

	    All three expected files are checked up front. If any are absent, the
	    missing paths are printed together with a hint to run scripts/train.py,
	    and the process exits with status 1 rather than surfacing a raw
	    FileNotFoundError.

	    Returns:
	        A (model, vectorizer, label_encoder) tuple, each item being whatever
	        joblib.load returns for the corresponding file.
	    """
	    # Order matters: it is the order in which the artifacts are returned.
	    artifact_paths = [
	        os.path.join(MODEL_DIR, filename)
	        for filename in ("model.joblib", "vectorizer.joblib", "encoder.joblib")
	    ]

	    not_found = [path for path in artifact_paths if not os.path.exists(path)]
	    if not_found:
	        print("\n[predict] Error: the following model files were not found:")
	        for path in not_found:
	            print(f" {path}")
	        print("\n Run scripts/train.py first to generate them.\n")
	        sys.exit(1)

	    model, vectorizer, label_encoder = (joblib.load(path) for path in artifact_paths)
	    return model, vectorizer, label_encoder


	def predict(
	    words: list[str], model, vectorizer, label_encoder
	) -> list[tuple[str, str]]:
	    """
	    Predict the origin language for each katakana word.

	    Words that are not pure katakana are never passed to the model (their
	    features would be meaningless); they are reported with a fixed
	    "(invalid — not pure katakana)" label instead.

	    Args:
	        words: Strings to classify, in the order they should be reported.
	        model: Fitted LinearSVC (only its ``predict`` method is used).
	        vectorizer: Fitted TfidfVectorizer (only ``transform`` is used).
	        label_encoder: Fitted LabelEncoder (only ``inverse_transform`` is used).

	    Returns:
	        List of (word, prediction) tuples, one per input word and in input
	        order. Invalid words get "(invalid — not pure katakana)" as their
	        prediction so the output table stays aligned.
	    """
	    invalid_label = "(invalid — not pure katakana)"

	    # fullmatch() rather than match(): the pattern's trailing "$" also matches
	    # just before a final newline, so match() would let "テレビ\n" through.
	    valid_words = [w for w in words if KATAKANA_PATTERN.fullmatch(w)]

	    word_to_pred = {}
	    if valid_words:
	        # Vectorize all valid words in one batch for efficiency
	        X = vectorizer.transform(valid_words)
	        predictions = label_encoder.inverse_transform(model.predict(X))
	        word_to_pred = dict(zip(valid_words, predictions))

	    # Only valid words are keys of word_to_pred, so a missing key means the
	    # word failed validation.
	    return [(word, word_to_pred.get(word, invalid_label)) for word in words]


	def _display_width(text: str) -> int:
	    """Return how many terminal columns *text* is expected to occupy."""
	    # Function-scope import keeps this block self-contained.
	    import unicodedata

	    columns = 0
	    for ch in text:
	        if unicodedata.combining(ch):
	            # Combining marks are drawn on top of the previous character.
	            continue
	        # East Asian Wide (W) and Fullwidth (F) characters, which include
	        # katakana, take two columns in a terminal; everything else takes one.
	        columns += 2 if unicodedata.east_asian_width(ch) in ("W", "F") else 1
	    return columns


	def print_results(results: list[tuple[str, str]]):
	    """
	    Print predictions as an aligned two-column table.

	    Padding is computed from terminal display width rather than ``len()``,
	    because katakana occupy two columns each; padding by code-point count
	    misaligns rows that mix katakana with narrower (e.g. ASCII) input. The
	    header text is also included when sizing the first column so short
	    words do not leave the header overhanging.

	    Args:
	        results: (word, prediction) tuples to display. Nothing is printed
	            when the list is empty.
	    """
	    if not results:
	        return

	    header = "Katakana"
	    column_width = 2 + max(
	        _display_width(header), *(_display_width(word) for word, _ in results)
	    )

	    def pad(text: str) -> str:
	        # Left-justify *text* to column_width display columns.
	        return text + " " * (column_width - _display_width(text))

	    print()
	    print(f" {pad(header)} Predicted Origin Language")
	    print(f' {"-" * column_width} {"-" * 28}')
	    for word, prediction in results:
	        print(f" {pad(word)} {prediction}")
	    print()


	def interactive_mode(model, vectorizer, label_encoder):
	    """
	    Prompt repeatedly for katakana words and print a prediction table.

	    Each input line is split on whitespace, so several words can be
	    classified at once. Blank lines are ignored. The loop ends when the
	    user types "quit", "exit", or "q" (case-insensitive), or on Ctrl+C /
	    end-of-input.

	    Args:
	        model: Fitted classifier, forwarded to predict().
	        vectorizer: Fitted vectorizer, forwarded to predict().
	        label_encoder: Fitted label encoder, forwarded to predict().
	    """
	    exit_commands = ("quit", "exit", "q")

	    print("\n[predict] Gairaigo Origin Classifier — interactive mode")
	    print(' Enter katakana words (space-separated). Type "quit" to exit.\n')

	    while True:
	        try:
	            line = input(" >> ").strip()
	        except (KeyboardInterrupt, EOFError):
	            # Leading newline moves off the interrupted prompt line.
	            print("\n じゃあね！\n")
	            return

	        if not line:
	            continue
	        if line.lower() in exit_commands:
	            print(" じゃあね！\n")
	            return

	        print_results(predict(line.split(), model, vectorizer, label_encoder))


	def main():
	    """
	    Entry point: load the artifacts, then classify CLI words or go interactive.

	    Any command-line arguments after the script name are treated as the
	    words to classify; the results are printed once and the function
	    returns. With no arguments, the interactive prompt is started instead.
	    """
	    model, vectorizer, label_encoder = load_artifacts()
	    print(f"\n[predict] Model loaded. Known classes: {list(label_encoder.classes_)}")

	    cli_words = sys.argv[1:]
	    if not cli_words:
	        # No arguments, launch interactive prompt
	        interactive_mode(model, vectorizer, label_encoder)
	        return

	    # Words were passed as CLI arguments: classify them once and finish
	    print_results(predict(cli_words, model, vectorizer, label_encoder))


	# Run the command-line interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()