import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def get_max_word_length(target_languages: list[str]) -> int:
    """Return the strictest per-chunk word limit for the given target languages.

    Languages in the table below use their Helsinki-NLP limit; anything
    else falls back to a generous 700-word default.
    """
    # Safe word limits per target language for Helsinki-NLP models.
    helsinki_word_limits = {
        "el": 50,
        "et": 50,
        "fi": 50,
        "fr": 40,
        "sv": 140,
        "hu": 50,
        "lt": 50,
        "sk": 140,
        "bg": 50,
        "cs": 140,
        "da": 140,
        "de": 150,
    }

    max_word_length = 700  # Default for non-Helsinki languages

    # Keep the most restrictive limit among the requested targets.
    for lang in target_languages:
        if lang in helsinki_word_limits:
            max_word_length = min(max_word_length, helsinki_word_limits[lang])

    return max_word_length
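
# Example (illustrative, using values from the table above): with targets
# ["fr", "sv"] the French limit of 40 words is the strictest, so
# get_max_word_length(["fr", "sv"]) returns 40; an unlisted target such as
# "es" falls back to the 700-word default.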

def chunk_text(text: str, safe_word_limit: int) -> list[str]:
    """Split text into chunks of at most safe_word_limit words each.

    Chunks break on sentence boundaries where possible; a single sentence
    longer than the limit is hard-split into fixed-size runs of words.
    """
    # Split on whitespace that follows sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        word_count = len(sentence.split())

        # If sentence is longer than the safe word limit by itself, split it
        if word_count > safe_word_limit:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_word_count = 0
            words = sentence.split()
            for i in range(0, len(words), safe_word_limit):
                chunks.append(' '.join(words[i:i+safe_word_limit]))
            continue

        # Otherwise, see if it fits in the current chunk
        if current_word_count + word_count <= safe_word_limit:
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            # Start a new chunk
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
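
# --- Usage sketch (illustrative; an assumption about how this module is
# driven, not part of the helpers above). It wires the two functions into
# the transformers imports at the top of the file. The checkpoint
# "Helsinki-NLP/opus-mt-en-fr" is a real Hugging Face Hub model; loading it
# downloads the weights on first run and requires the sentencepiece package.
if __name__ == "__main__":
    text = (
        "This module chunks long documents before machine translation. "
        "Chunking on sentence boundaries keeps each request under the "
        "per-language word limit. Any sentences left over become the "
        "final chunk."
    )

    targets = ["fr", "sv"]
    limit = get_max_word_length(targets)  # 40 here, driven by "fr"
    chunks = chunk_text(text, limit)
    print(f"limit={limit}, chunks={len(chunks)}")

    # Translate chunk by chunk and stitch the results back together.
    checkpoint = "Helsinki-NLP/opus-mt-en-fr"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    translated = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)
        translated.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    print(" ".join(translated))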