Spaces:

mikeee
/

radiobee-aligner

Build error

radiobee-aligner / radiobee /lists2cmat.py

freemt

Update before sent-align

4c04f50 over 2 years ago

No virus

3.07 kB

	"""Convert two lists of str (texts) to correlation matrix."""
	# pylint: disable=too-many-arguments, too-many-locals, unused-import

	from typing import Dict, Iterable, List, Optional, Union # noqa

	import numpy as np
	from textacy.representations import Vectorizer
	from fastlid import fastlid

	from radiobee.en2zh_tokens import en2zh_tokens
	from radiobee.insert_spaces import insert_spaces
	from radiobee.gen_model import gen_model
	from radiobee.smatrix import smatrix


	# fmt: off
	def lists2cmat(
	    text1: Union[str, Iterable[str]],
	    text2: Union[str, Iterable[str]],
	    lang1: Optional[str] = None,
	    lang2: Optional[str] = None,
	    model: Optional[Vectorizer] = None,
	    tf_type: str = "linear",
	    idf_type: Optional[str] = "smooth",
	    dl_type: Optional[str] = None,
	    norm: Optional[str] = "l2",
	    min_df: Union[int, float] = 1,
	    max_df: Union[int, float] = 1.0,
	    max_n_terms: Optional[int] = None,
	    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
	) -> np.ndarray:
	    # fmt: on
	    """Convert two lists of texts to a correlation matrix (cmat).

	    Each text is tokenized according to its language, then both token
	    lists are handed to ``smatrix`` and the result is wrapped in a
	    numpy array.

	    Args:
	        text1: first text, a single str or an iterable of str
	            (refer to smatrix).
	        text2: second text, a single str or an iterable of str
	            (refer to smatrix).
	        lang1: optional language code of text1; detected with fastlid
	            (restricted to "en"/"zh") when None.
	        lang2: optional language code of text2; detected with fastlid
	            (restricted to "en"/"zh") when None.
	        model: optional model; generated from the tokens of text1 via
	            ``gen_model`` when None.
	        tf_type: term frequency type, forwarded to smatrix.
	        idf_type: idf type, forwarded to smatrix.
	        dl_type: doc length type, forwarded to smatrix.
	        norm: norm, forwarded to smatrix.
	        min_df: min doc frequency, forwarded to smatrix.
	        max_df: max doc frequency, forwarded to smatrix.
	        max_n_terms: max number of terms, forwarded to smatrix.
	        vocabulary_terms: vocabulary, forwarded to smatrix.

	    Returns:
	        The result of ``smatrix`` converted with ``np.array``.
	    """
	    # Materialize the inputs once: they are iterated twice below (language
	    # detection, then tokenization), which would silently yield nothing on
	    # the second pass for a generator or any other one-shot iterable.
	    text1 = [text1] if isinstance(text1, str) else list(text1)
	    text2 = [text2] if isinstance(text2, str) else list(text2)

	    if lang1 is None or lang2 is None:
	        # fastlid.set_languages is module-level state: narrow it to en/zh
	        # only for the duration of the detection and always put the
	        # previous value back, even if detection raises.
	        saved_languages = fastlid.set_languages
	        fastlid.set_languages = ["en", "zh"]
	        try:
	            if lang1 is None:
	                lang1, _ = fastlid(" ".join(text1))
	            if lang2 is None:
	                lang2, _ = fastlid(" ".join(text2))
	        finally:
	            fastlid.set_languages = saved_languages

	    def zh_tokens(textzh):
	        """Split each Chinese text into tokens after inserting spaces."""
	        return [insert_spaces(elm).split() for elm in textzh]

	    lang_pair = (lang1, lang2)
	    if lang_pair == ("zh", "en"):
	        vec1 = zh_tokens(text1)
	        vec2 = en2zh_tokens(text2)
	    elif lang_pair == ("zh", "zh"):
	        vec1 = zh_tokens(text1)
	        vec2 = zh_tokens(text2)
	    elif lang_pair == ("en", "en"):
	        vec1 = en2zh_tokens(text1)
	        vec2 = en2zh_tokens(text2)
	    else:
	        # ("en", "zh") and, as before, every other combination of codes
	        # falls through to the en -> zh treatment.
	        vec1 = en2zh_tokens(text1)
	        vec2 = zh_tokens(text2)

	    if model is None:
	        # The default model is built from the first text's tokens only.
	        model = gen_model(vec1)

	    cmat = smatrix(
	        vec1,
	        vec2,
	        model=model,
	        tf_type=tf_type,
	        idf_type=idf_type,
	        dl_type=dl_type,
	        norm=norm,
	        min_df=min_df,
	        max_df=max_df,
	        max_n_terms=max_n_terms,
	        vocabulary_terms=vocabulary_terms,
	    )

	    return np.array(cmat)