import re

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def get_max_word_length(target_languages: list[str]) -> int:
    """Return the largest safe chunk size, in words, for a set of targets.

    The per-language caps below apply to the Helsinki-NLP OPUS-MT models;
    the most restrictive cap among the requested languages wins, and
    languages without an entry fall back to a generous default.
    """
    helsinki_word_limits = {
        "el": 50,
        "et": 50,
        "fi": 50,
        "fr": 40,
        "sv": 140,
        "hu": 50,
        "lt": 50,
        "sk": 140,
        "bg": 50,
        "cs": 140,
        "da": 140,
        "de": 150,
    }

    # Default cap used when none of the targets has a known limit.
    max_word_length = 700

    # Take the minimum so every requested language stays within its limit.
    for lang in target_languages:
        if lang in helsinki_word_limits:
            max_word_length = min(max_word_length, helsinki_word_limits[lang])

    return max_word_length
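
# Usage sketch (the language codes are illustrative inputs; the expected
# values follow directly from the table above):
#
#     >>> get_max_word_length(["fr", "de"])
#     40
#     >>> get_max_word_length(["xx"])  # unknown code -> default cap
#     700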
def chunk_text(text: str, safe_word_limit: int) -> list[str]:
    """Split text into chunks of at most safe_word_limit words each,
    breaking on sentence boundaries whenever possible."""
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        word_count = len(sentence.split())

        # A sentence longer than the limit cannot be kept whole: flush the
        # chunk in progress, then hard-split the sentence into fixed-size
        # windows of words.
        if word_count > safe_word_limit:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            words = sentence.split()
            for i in range(0, len(words), safe_word_limit):
                chunks.append(' '.join(words[i:i + safe_word_limit]))
            continue

        # Append the sentence to the chunk in progress if it fits;
        # otherwise close that chunk and start a new one.
        if current_word_count + word_count <= safe_word_limit:
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count

    # Flush the final, partially filled chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
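
# A minimal end-to-end sketch, assuming the chunks are meant to feed a
# Helsinki-NLP OPUS-MT checkpoint via the transformers imports above. The
# model name, target languages, and sample text are illustrative
# assumptions, not values taken from the rest of this module.
if __name__ == "__main__":
    targets = ["fr", "de"]
    limit = get_max_word_length(targets)  # 40 for this pair
    sample = (
        "The first sentence is short. The second sentence is also short! "
        "Does the chunker keep sentences together whenever they fit?"
    )
    chunks = chunk_text(sample, limit)

    # Translate each chunk independently so no input exceeds the cap
    # (assumed checkpoint: Helsinki-NLP/opus-mt-en-fr).
    model_name = "Helsinki-NLP/opus-mt-en-fr"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(chunks, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))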