import re

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


def get_max_word_length(target_languages: list[str]) -> int:
    """Return the largest safe chunk size, in words, for a set of targets.

    The per-language caps below apply to the Helsinki-NLP OPUS-MT models;
    the most restrictive cap among the requested languages wins, and
    languages without an entry fall back to a generous default.
    """
    helsinki_word_limits = {
        "el": 50,
        "et": 50,
        "fi": 50,
        "fr": 40,
        "sv": 140,
        "hu": 50,
        "lt": 50,
        "sk": 140,
        "bg": 50,
        "cs": 140,
        "da": 140,
        "de": 150,
    }

    # Default cap used when none of the targets has a known limit.
    max_word_length = 700

    # Take the minimum so every requested language stays within its limit.
    for lang in target_languages:
        if lang in helsinki_word_limits:
            max_word_length = min(max_word_length, helsinki_word_limits[lang])

    return max_word_length
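
# Usage sketch (the language codes are illustrative inputs; the expected
# values follow directly from the table above):
#
#     >>> get_max_word_length(["fr", "de"])
#     40
#     >>> get_max_word_length(["xx"])  # unknown code -> default cap
#     700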
def chunk_text(text: str, safe_word_limit: int) -> list[str]:
    """Split text into chunks of at most safe_word_limit words each,
    breaking on sentence boundaries whenever possible."""
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        word_count = len(sentence.split())

        # A sentence longer than the limit cannot be kept whole: flush the
        # chunk in progress, then hard-split the sentence into fixed-size
        # windows of words.
        if word_count > safe_word_limit:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            words = sentence.split()
            for i in range(0, len(words), safe_word_limit):
                chunks.append(' '.join(words[i:i + safe_word_limit]))
            continue

        # Append the sentence to the chunk in progress if it fits;
        # otherwise close that chunk and start a new one.
        if current_word_count + word_count <= safe_word_limit:
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count

    # Flush the final, partially filled chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
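
# A minimal end-to-end sketch, assuming the chunks are meant to feed a
# Helsinki-NLP OPUS-MT checkpoint via the transformers imports above. The
# model name, target languages, and sample text are illustrative
# assumptions, not values taken from the rest of this module.
if __name__ == "__main__":
    targets = ["fr", "de"]
    limit = get_max_word_length(targets)  # 40 for this pair
    sample = (
        "The first sentence is short. The second sentence is also short! "
        "Does the chunker keep sentences together whenever they fit?"
    )
    chunks = chunk_text(sample, limit)

    # Translate each chunk independently so no input exceeds the cap
    # (assumed checkpoint: Helsinki-NLP/opus-mt-en-fr).
    model_name = "Helsinki-NLP/opus-mt-en-fr"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    inputs = tokenizer(chunks, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs)
    print(tokenizer.batch_decode(outputs, skip_special_tokens=True))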