asasasaasasa commited on
Commit
da8d2e4
·
0 Parent(s):
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .devcontainer/devcontainer.json +33 -0
  2. .gitattributes +35 -0
  3. .gitignore +12 -0
  4. .idea/Tilmash_Translator.iml +7 -0
  5. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. .idea/misc.xml +7 -0
  7. .idea/vcs.xml +6 -0
  8. .idea/workspace.xml +0 -0
  9. Dockerfile +50 -0
  10. LICENSE.txt +21 -0
  11. README.md +115 -0
  12. check_gpu.py +69 -0
  13. config.py +22 -0
  14. main.py +345 -0
  15. models/nltk_resources.py +21 -0
  16. nltk_data/tokenizers/punkt_tab.zip +3 -0
  17. nltk_data/tokenizers/punkt_tab/README +98 -0
  18. nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
  19. nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
  20. nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
  21. nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +54 -0
  22. nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +211 -0
  23. nltk_data/tokenizers/punkt_tab/danish/collocations.tab +101 -0
  24. nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
  25. nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +64 -0
  26. nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +99 -0
  27. nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +37 -0
  28. nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
  29. nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +54 -0
  30. nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +156 -0
  31. nltk_data/tokenizers/punkt_tab/english/collocations.tab +37 -0
  32. nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
  33. nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +39 -0
  34. nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +48 -0
  35. nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +100 -0
  36. nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
  37. nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +89 -0
  38. nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +81 -0
  39. nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +167 -0
  40. nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
  41. nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +86 -0
  42. nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +61 -0
  43. nltk_data/tokenizers/punkt_tab/french/collocations.tab +18 -0
  44. nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
  45. nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +48 -0
  46. nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +71 -0
  47. nltk_data/tokenizers/punkt_tab/german/collocations.tab +28 -0
  48. nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
  49. nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +107 -0
  50. nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +100 -0
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Python 3",
3
+ // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
4
+ "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
5
+ "customizations": {
6
+ "codespaces": {
7
+ "openFiles": [
8
+ "README.md",
9
+ "main.py"
10
+ ]
11
+ },
12
+ "vscode": {
13
+ "settings": {},
14
+ "extensions": [
15
+ "ms-python.python",
16
+ "ms-python.vscode-pylance"
17
+ ]
18
+ }
19
+ },
20
+ "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
21
+ "postAttachCommand": {
22
+ "server": "streamlit run main.py --server.enableCORS false --server.enableXsrfProtection false"
23
+ },
24
+ "portsAttributes": {
25
+ "8501": {
26
+ "label": "Application",
27
+ "onAutoForward": "openPreview"
28
+ }
29
+ },
30
+ "forwardPorts": [
31
+ 8501
32
+ ]
33
+ }
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ .huggingface/
7
+ .cache/
8
+ local_llms/.cache/
9
+ local_llms/.locks/
10
+ local_llms/locks/
11
+ local_llms/instances/
12
+ local_llms/models--*/
.idea/Tilmash_Translator.iml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module version="4">
3
+ <component name="PyDocumentationSettings">
4
+ <option name="format" value="PLAIN" />
5
+ <option name="myDocStringFormat" value="Plain" />
6
+ </component>
7
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.13 (Tilmash_Translator)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (Tilmash_Translator)" project-jdk-type="Python SDK" />
7
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Image for running the Tilmash Translator Streamlit app (main.py).
# Resolved from an unresolved merge conflict: the variant that launches
# main.py on port 7860 is kept.
FROM python:3.11-slim

# Base system packages needed to build and run the Python dependencies.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential git libglib2.0-0 libgl1 \
    && rm -rf /var/lib/apt/lists/*

# Runtime environment: unbuffered output, no .pyc files, no pip cache,
# Hugging Face cache location, and the port the app is served on.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    HF_HOME=/data/.cache/huggingface \
    PORT=7860

WORKDIR /app

# Install Python dependencies first so this layer is cached across code changes.
COPY requirements.txt /app/requirements.txt
RUN pip install -U pip && pip install -r requirements.txt

# Copy the whole project into the image.
COPY . /app

# Start Streamlit inside the container.
CMD ["streamlit", "run", "main.py", "--server.port=7860", "--server.address=0.0.0.0"]
LICENSE.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2011-2025 The Bootstrap Authors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
+ title: Tilmash Translator
+ sdk: streamlit
+ app_file: main.py
+ python_version: "3.11"
+ pinned: false
+ ---
34
+
35
+ # Tilmash Translator
36
+
37
+ **Tilmash Translator** is an offline‑first, privacy‑preserving translation and readability toolkit for Russian, English and Kazakh.
38
+
39
+ It ships as a Streamlit web‑app and offers two core capabilities:
40
+
41
+ 1. **Neural Machine Translation**
42
+ • Primary model — [ISSAI/tilmash](https://huggingface.co/issai/tilmash) (Seq2Seq) for RU ↔ EN ↔ KK
43
+ • Smart chunking & streaming make multi‑page documents feel snappy
44
+ 2. **Readability Analysis**
45
+ • Calculates Flesch Reading Ease, Flesch‑Kincaid, Gunning Fog and SMOG
46
+ • Highlights complex words and supports RU/EN/KK
47
+
48
+
49
+ ---
50
+
51
+ ## Quick Start
52
+
53
+ ```bash
54
+ # 1. Clone & create a virtual environment
55
+ $ git clone https://github.com/medetshatayev/Tilmash_Translator.git
56
+ $ cd Tilmash_Translator
57
+ $ python3 -m venv .venv && source .venv/bin/activate
58
+
59
+ # 2. Install dependencies
60
+ $ pip install -r requirements.txt
61
+
62
+ # 3. (optional) authenticate once to download the Tilmash weights
63
+ $ echo "HF_TOKEN=🪄your_huggingface_token" > .env
64
+
65
+ # 4. Launch the Streamlit app
66
+ $ streamlit run main.py
67
+ ```
68
+
69
+ 💡 The helper script `start.sh` automates the above and sets safe memory limits for `llama‑cpp-python`.
70
+
71
+ ### GPU Off‑loading (Gemma‑3)
72
+
73
+ Set `GEMMA_GPU_LAYERS=<num_layers>` in your environment (defaults to **48**) to off‑load those layers to Metal/CUDA.
74
+
75
+ ---
76
+
77
+ ## Project Layout
78
+
79
+ ```
80
+ .
81
+ ├── main.py # Streamlit UI
82
+ ├── utils/ # Translation & analysis helpers
83
+ │ ├── tilmash_translation.py
84
+ │ ├── gemma_translation.py
85
+ │ ├── readability_indices.py
86
+ │ └── ...
87
+ ├── models/ # Extra resources (NLTK, etc.)
88
+ ├── config.py # Default env‑vars
89
+ ├── start.sh # Convenience launcher
90
+ └── requirements.txt # Python deps
91
+ ```
92
+
93
+ ## Configuration Keys
94
+
95
+ | Variable | Default | Purpose |
96
+ |------------------------|---------|-------------------------------------------|
97
+ | `GEMMA_GPU_LAYERS` | 48 | Layers to move to GPU (0 = CPU‑only) |
98
+ | `GEMMA_CONTEXT_SIZE` | 8192 | Context window for Gemma‑3 |
99
+ | `MAX_PARALLEL_MODELS` | 4 | Concurrency guard |
100
+ | `MAX_TOKENS` | 4096 | Generation cap per request |
101
+ | `CHUNK_SIZE` | 3000 | Token threshold before auto‑chunking |
102
+
103
+ Override any of these via the environment or edit **config.py**.
104
+
105
+ ---
106
+
107
+ ## How It Works
108
+
109
+ 1. **File ingestion** — `.txt`, `.docx`, `.pdf` loaded via `utils/file_readers.py`
110
+ 2. **Language detection** — `langdetect` (auto‑detect option in UI)
111
+ 3. **Translation pipeline** — <3000 tokens translate directly; longer texts are chunked (`utils/chunking.py`) and streamed through Tilmash or Gemma‑3
112
+ 4. **Readability analysis** — scores computed in `utils/readability_indices.py` and color‑coded in the app.
113
+
114
+ ---
check_gpu.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ """
4
+ Этот скрипт проверяет наличие GPU и совместимость с PyTorch.
5
+ Запустите его, чтобы проверить конфигурацию вашей системы:
6
+
7
+ python check_gpu.py
8
+ """
9
+
10
+ import platform
11
+ import os
12
+ import sys
13
+
14
def check_system():
    """Print a diagnostic report about the OS, Python and PyTorch GPU support.

    Everything is written to stdout; nothing is returned. If PyTorch cannot
    be imported an installation hint is printed instead, and any other error
    raised while probing PyTorch is reported rather than propagated.
    """
    print(f"Операционная система: {platform.system()} {platform.release()}")
    print(f"Python версия: {sys.version}")

    try:
        import torch
        print("\nПроверка PyTorch:")
        print(f"Версия PyTorch: {torch.__version__}")

        # CUDA version PyTorch was built against. The attribute can exist
        # but be None, so test the value rather than mere attribute presence.
        cuda_version = getattr(torch.version, "cuda", None)
        if cuda_version:
            print(f"CUDA версия: {cuda_version}")
        else:
            print("CUDA версия: не найдена")

        # Whether a CUDA device is actually usable at runtime.
        cuda_available = torch.cuda.is_available()
        print(f"CUDA доступен: {cuda_available}")

        if cuda_available:
            print(f"Обнаружено GPU: {torch.cuda.get_device_name(0)}")
            device_count = torch.cuda.device_count()
            print(f"Количество GPU: {device_count}")
            for i in range(device_count):
                print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")

        # MPS backend check (Apple Silicon); guarded because the attribute
        # may be missing from torch.backends.
        if hasattr(torch.backends, "mps"):
            mps_available = torch.backends.mps.is_available()
            print(f"\nApple MPS доступен: {mps_available}")
            if mps_available:
                print("Обнаружен Apple Silicon GPU (M1/M2/M3)")

    except ImportError:
        print("PyTorch не установлен.")
        print("Установите PyTorch командой: pip install torch")
    except Exception as e:
        # Best-effort diagnostics: report the failure instead of crashing.
        print(f"Ошибка при проверке PyTorch: {e}")
49
+
50
if __name__ == "__main__":
    # Script entry point: detailed report first, then a one-line verdict.
    print("===== Диагностика GPU для Tilmash =====")
    check_system()

    print("\n===== Готовность системы =====")
    try:
        import torch

        cuda_ready = torch.cuda.is_available()
        # MPS is only probed when CUDA is unavailable.
        mps_ready = (
            not cuda_ready
            and hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
        )
        if cuda_ready:
            verdict = "✅ GPU CUDA обнаружен и готов к использованию"
        elif mps_ready:
            verdict = "✅ Apple Silicon GPU (MPS) обнаружен и готов к использованию"
        else:
            verdict = "⚠️ GPU не обнаружен. Tilmash будет работать на CPU (медленнее)"
    except ImportError:
        verdict = "❌ PyTorch не установлен. Установите его командой: pip install torch"
    print(verdict)

    # Troubleshooting advice, printed regardless of the verdict.
    print("\nСовет: Если у вас есть GPU, но он не обнаружен, проверьте:\n"
          "1. Драйверы NVIDIA (для CUDA)\n"
          "2. Правильную версию PyTorch для вашей системы\n"
          "3. Переустановите PyTorch с поддержкой CUDA: pip install torch --upgrade")
config.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration defaults for the Translator application.
3
+ This file contains the default values for environment variables.
4
+ These are only used if the actual environment variables are not set.
5
+ """
6
+
7
# Built-in defaults, keyed by environment-variable name.
DEFAULT_CONFIG = dict(
    MAX_PARALLEL_MODELS=4,
    SESSION_TIMEOUT=1800,
    MODEL_INSTANCE_TIMEOUT=1800,
    ALLOW_GPU=True,  # allow GPU use when one is available
    LOGLEVEL="INFO",
    MAX_TOKENS=4096,
    CHUNK_SIZE=3000,
)

# The same defaults rendered as strings, ready to be written into os.environ.
# Booleans are lower-cased ("true"/"false"); every other value goes through str().
ENV_DEFAULTS = {
    name: str(setting) if not isinstance(setting, bool) else str(setting).lower()
    for name, setting in DEFAULT_CONFIG.items()
}
main.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+
3
+ import os
4
+ import streamlit as st
5
+ import logging
6
+ from dotenv import load_dotenv
7
+
8
+ # Load environment variables first, before any other code
9
+ load_dotenv()
10
+
11
+ # Import configuration defaults (after loading .env to prioritize environment variables)
12
+ from config import ENV_DEFAULTS, DEFAULT_CONFIG
13
+
14
# Configure logging from the LOGLEVEL environment variable, falling back to
# the default in config.DEFAULT_CONFIG when the variable is not set.
log_level = os.environ.get('LOGLEVEL', DEFAULT_CONFIG['LOGLEVEL']).upper()

# Resolve the level name to its numeric value. An unknown name (or a name that
# maps to a non-level attribute of the logging module) falls back to INFO
# instead of raising at import time.
_numeric_log_level = getattr(logging, log_level, None)
if not isinstance(_numeric_log_level, int):
    _numeric_log_level = logging.INFO

logging.basicConfig(
    level=_numeric_log_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # A console handler is attached for every level except exactly
        # 'WARNING', which gets a NullHandler (no console output at all).
        # NOTE(review): the previous comment said "only log to console if level
        # is INFO or higher", which does not match this condition — confirm
        # whether silencing the console at WARNING is intended.
        logging.StreamHandler() if log_level != 'WARNING' else logging.NullHandler()
    ]
)
24
+
25
+ # Configure app
26
+ st.set_page_config(page_title="Translator & Readability", layout="wide")
27
+
28
# Fill os.environ with the string defaults from config.ENV_DEFAULTS for every
# variable that is not already present (variables that are already set,
# including those loaded by load_dotenv() above, are left untouched).
for var, default in ENV_DEFAULTS.items():
    if var not in os.environ:
        logging.debug(f"Environment variable {var} not found, using default: {default}")
        os.environ[var] = default

# Model configuration used by the handlers below.
# NOTE(review): these values are read from DEFAULT_CONFIG, not from os.environ,
# so an environment override of MAX_PARALLEL_MODELS / SESSION_TIMEOUT /
# ALLOW_GPU does not reach MODEL_CONFIG — confirm whether that is intended.
MODEL_CONFIG = {
    "max_parallel_models": DEFAULT_CONFIG["MAX_PARALLEL_MODELS"],
    "session_timeout": DEFAULT_CONFIG["SESSION_TIMEOUT"],
    "allow_gpu": DEFAULT_CONFIG["ALLOW_GPU"]
}

# Semaphore with max_parallel_models slots; the handlers acquire a slot
# (non-blocking) before running a model and release it afterwards.
# NOTE(review): this is created at script top level; if Streamlit re-executes
# the script on each rerun, a fresh semaphore would be built every time and
# would not limit concurrency across sessions — verify against Streamlit's
# execution model.
import threading
model_semaphore = threading.Semaphore(MODEL_CONFIG["max_parallel_models"])
44
+
45
+ import tempfile
46
+ import io
47
+ from docx import Document
48
+ import uuid
49
+ import traceback
50
+
51
+ from models.nltk_resources import setup_nltk
52
+ from utils.file_readers import read_file
53
+ from utils.text_processing import detect_language
54
+ from utils.readability_indices import (
55
+ flesch_reading_ease,
56
+ flesch_kincaid_grade_level,
57
+ gunning_fog_index,
58
+ smog_index,
59
+ highlight_complex_text
60
+ )
61
+ from utils.formatting import color_code_index
62
+ from utils.tilmash_translation import tilmash_translate, display_tilmash_streaming_translation
63
+
64
# Seed per-session state the first time a session runs the script:
# a random session identifier and the "translation in progress" flag.
if 'session_id' not in st.session_state:
    st.session_state['session_id'] = str(uuid.uuid4())

if 'translation_lock' not in st.session_state:
    st.session_state['translation_lock'] = False
70
+
71
+ def handle_translation():
72
+ st.header("Перевод (Kazakh, Russian, English)")
73
+
74
+ # Show session ID in sidebar for debugging
75
+ with st.sidebar.expander("Session Info", expanded=False):
76
+ st.write(f"Session ID: {st.session_state.session_id}")
77
+
78
+ # Add GPU usage option
79
+ if MODEL_CONFIG["allow_gpu"]:
80
+ st.session_state.use_gpu = st.checkbox("Использовать GPU (быстрее)", value=True)
81
+ if st.session_state.use_gpu:
82
+ try:
83
+ import torch
84
+ if torch.cuda.is_available():
85
+ gpu_info = f"CUDA: {torch.cuda.get_device_name(0)}"
86
+ st.success(f"Доступен GPU: {gpu_info}")
87
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
88
+ st.success("Доступен Apple Silicon GPU (MPS)")
89
+ else:
90
+ st.warning("GPU не обнаружен, будет использован CPU")
91
+ st.session_state.use_gpu = False
92
+ except ImportError:
93
+ st.warning("PyTorch не установлен, будет использован CPU")
94
+ st.session_state.use_gpu = False
95
+ else:
96
+ st.session_state.use_gpu = False
97
+ st.write("GPU отключен в конфигурации")
98
+
99
+ translate_input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
100
+ input_text = ""
101
+
102
+ if translate_input_method == "Загрузить файл":
103
+ uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
104
+ if uploaded_file is not None:
105
+ suffix = os.path.splitext(uploaded_file.name)[1]
106
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
107
+ tmp_file.write(uploaded_file.getbuffer())
108
+ temp_file_path = tmp_file.name
109
+ input_text = read_file(temp_file_path)
110
+ os.remove(temp_file_path)
111
+ st.write("**Содержимое файла:**")
112
+ st.write(input_text)
113
+ else:
114
+ input_text = st.text_area("Вставьте ваш текст здесь", height=200)
115
+
116
+ if input_text:
117
+ auto_detect = st.checkbox("Автоматически определить язык", value=True)
118
+ src_lang = None
119
+ if auto_detect:
120
+ detected_lang = detect_language(input_text)
121
+ if detected_lang in ['ru','en','kk']:
122
+ st.info(f"Определён язык: {detected_lang}")
123
+ src_lang = detected_lang
124
+ else:
125
+ st.warning("Не удалось определить язык. Выберите вручную.")
126
+ src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])
127
+ else:
128
+ src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])
129
+
130
+ if src_lang == "ru":
131
+ tgt_options = ["en","kk"]
132
+ elif src_lang == "en":
133
+ tgt_options = ["ru","kk"]
134
+ else:
135
+ tgt_options = ["ru","en"]
136
+
137
+ tgt_lang = st.selectbox("Перевод на:", tgt_options)
138
+
139
+ if st.button("Перевести"):
140
+ # Prevent multiple concurrent translations from same session
141
+ if st.session_state.translation_lock:
142
+ st.warning("Перевод уже выполняется. Пожалуйста, дождитесь завершения.")
143
+ return
144
+
145
+ # Set translation lock
146
+ st.session_state.translation_lock = True
147
+
148
+ try:
149
+ # Use the model semaphore to limit concurrent model access
150
+ acquired = model_semaphore.acquire(blocking=False)
151
+ if not acquired:
152
+ st.warning("Максимальное количество параллельных моделей достигнуто. Пожалуйста, попробуйте позже.")
153
+ st.session_state.translation_lock = False
154
+ return
155
+
156
+ try:
157
+ st.subheader("Результат перевода:")
158
+ # Get the approximate size of the text to determine if chunking is needed
159
+ approx_text_size = len(input_text) / 4 # rough approximation (4 chars ≈ 1 token)
160
+ needs_chunking = approx_text_size > 500 # If text is likely over 500 tokens
161
+
162
+ # Display appropriate spinner message
163
+ spinner_message = "Processing text in chunks..." if needs_chunking else "Processing translation..."
164
+
165
+ # Create a dedicated translator instance for this session
166
+ from utils.tilmash_translation import TilmashTranslator
167
+ # Используем GPU если включено в настройках
168
+ use_gpu = getattr(st.session_state, 'use_gpu', False)
169
+ translator = TilmashTranslator(use_gpu=use_gpu)
170
+
171
+ with st.spinner(spinner_message):
172
+ try:
173
+ # Use direct streaming approach with session-specific translator
174
+ result = ""
175
+ translation_placeholder = st.empty()
176
+
177
+ # Stream translation
178
+ for chunk in translator.translate_streaming(input_text, src_lang, tgt_lang):
179
+ result += chunk
180
+ translation_placeholder.markdown(result)
181
+
182
+ except Exception as e:
183
+ st.error(f"Translation error: {str(e)}")
184
+ logging.error(f"Tilmash translation error: {traceback.format_exc()}")
185
+ result = None
186
+
187
+ if result:
188
+ # Prepare download capability
189
+ doc = Document()
190
+ doc.add_paragraph(result)
191
+ doc_io = io.BytesIO()
192
+ doc.save(doc_io)
193
+ doc_io.seek(0)
194
+
195
+ st.download_button(
196
+ label="Скачать переведённый текст (.docx)",
197
+ data=doc_io,
198
+ file_name="translated_text.docx",
199
+ mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
200
+ )
201
+ else:
202
+ st.warning("Не удалось выполнить перевод.")
203
+
204
+ # Unload Tilmash model after use
205
+ try:
206
+ if translator.initialized:
207
+ translator.unload_model()
208
+ except Exception as unload_error:
209
+ logging.error(f"Error unloading Tilmash model: {str(unload_error)}")
210
+ except Exception as tilmash_error:
211
+ st.error(f"Tilmash model error: {str(tilmash_error)}")
212
+ logging.error(f"Tilmash model error: {traceback.format_exc()}")
213
+ finally:
214
+ # Release the semaphore
215
+ model_semaphore.release()
216
+ except Exception as outer_error:
217
+ st.error(f"Unexpected error: {str(outer_error)}")
218
+ logging.error(f"Unexpected error: {traceback.format_exc()}")
219
+ finally:
220
+ # Release translation lock
221
+ st.session_state.translation_lock = False
222
+
223
+ def handle_readability_analysis():
224
+ st.header("Анализ удобочитаемости текста")
225
+ input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
226
+ text = ""
227
+
228
+ if input_method == "Загрузить файл":
229
+ uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
230
+ if uploaded_file is not None:
231
+ suffix = os.path.splitext(uploaded_file.name)[1]
232
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
233
+ tmp_file.write(uploaded_file.getbuffer())
234
+ temp_file_path = tmp_file.name
235
+ text = read_file(temp_file_path)
236
+ os.remove(temp_file_path)
237
+ st.write("**Содержимое файла:**")
238
+ st.write(text)
239
+ else:
240
+ text = st.text_area("Вставьте ваш текст здесь", height=200)
241
+
242
+ if text:
243
+ auto_detect = st.checkbox("Определить язык автоматически", value=True)
244
+ if auto_detect:
245
+ detected_lang = detect_language(text)
246
+ st.info(f"Определён язык: {detected_lang}")
247
+ lang_code = detected_lang if detected_lang in ['ru','en','kk'] else 'en'
248
+ else:
249
+ lang_code = st.selectbox("Язык текста", ["ru", "en", "kk"])
250
+
251
+ if st.button("Анализировать"):
252
+ # Prevent multiple concurrent analyses
253
+ if 'analysis_lock' in st.session_state and st.session_state.analysis_lock:
254
+ st.warning("Анализ уже выполняется. Пожалуйста, дождитесь завершения.")
255
+ return
256
+
257
+ # Set analysis lock
258
+ st.session_state.analysis_lock = True
259
+
260
+ try:
261
+ # Use the model semaphore for consistency with translation
262
+ acquired = model_semaphore.acquire(blocking=False)
263
+ if not acquired:
264
+ st.warning("Система загружена. Пожалуйста, попробуйте позже.")
265
+ st.session_state.analysis_lock = False
266
+ return
267
+
268
+ try:
269
+ with st.spinner("Выполняется анализ..."):
270
+ fre = flesch_reading_ease(text, lang_code)
271
+ fkgl = flesch_kincaid_grade_level(text, lang_code)
272
+ fog = gunning_fog_index(text, lang_code)
273
+ smog = smog_index(text, lang_code)
274
+ highlighted_text, complex_words_list = highlight_complex_text(text, lang_code)
275
+
276
+ st.subheader("Результаты удобочитаемости")
277
+ st.markdown(
278
+ f"**Индекс удобочитаемости Флеша:** {color_code_index('Flesch Reading Ease', fre)}",
279
+ unsafe_allow_html=True
280
+ )
281
+ st.markdown(
282
+ f"**Индекс Флеша-Кинкейда:** {color_code_index('Flesch-Kincaid Grade Level', fkgl)}",
283
+ unsafe_allow_html=True
284
+ )
285
+ st.markdown(
286
+ f"**Индекс тумана Ганнинга:** {color_code_index('Gunning Fog Index', fog)}",
287
+ unsafe_allow_html=True
288
+ )
289
+ st.markdown(
290
+ f"**Индекс SMOG:** {color_code_index('SMOG Index', smog)}",
291
+ unsafe_allow_html=True
292
+ )
293
+
294
+ st.subheader("Сложные слова")
295
+ st.write(", ".join(set(complex_words_list)))
296
+ finally:
297
+ # Release the semaphore
298
+ model_semaphore.release()
299
+ finally:
300
+ # Release analysis lock
301
+ st.session_state.analysis_lock = False
302
+
303
+ def main():
304
+ setup_nltk()
305
+
306
+ # Log the model configuration only once per session
307
+ if 'model_config_logged' not in st.session_state:
308
+ logging.info(f"Using model configuration: {MODEL_CONFIG}")
309
+ st.session_state.model_config_logged = True
310
+
311
+ # Проверка доступности GPU при запуске
312
+ try:
313
+ import torch
314
+ if torch.cuda.is_available():
315
+ gpu_name = torch.cuda.get_device_name(0)
316
+ cuda_ver = torch.version.cuda if hasattr(torch.version, "cuda") else "N/A"
317
+ logging.info(f"Обнаружен GPU: {gpu_name}, CUDA {cuda_ver}")
318
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
319
+ logging.info("Обнаружен Apple Silicon GPU (MPS)")
320
+ else:
321
+ logging.warning("GPU не обнаружен. Устанавливаем устройство на CPU")
322
+ if not torch.cuda.is_available():
323
+ # Вывод диагностической информации
324
+ logging.warning("Диагностика CUDA:")
325
+ logging.warning(f"torch.__version__: {torch.__version__}")
326
+ if hasattr(torch.version, "cuda"):
327
+ logging.warning(f"torch.version.cuda: {torch.version.cuda}")
328
+ if hasattr(torch.cuda, "is_available"):
329
+ logging.warning(f"torch.cuda.is_available(): {torch.cuda.is_available()}")
330
+ except ImportError:
331
+ logging.warning("PyTorch не установлен, будет использован CPU")
332
+ except Exception as e:
333
+ logging.warning(f"Ошибка при проверке GPU: {str(e)}")
334
+
335
+ st.title("Translation & Readability Analysis")
336
+ st.sidebar.header("Функциональность")
337
+ functionality = st.sidebar.radio("Выберите режим:", ["Перевод", "Анализ удобочитаемости"])
338
+
339
+ if functionality == "Перевод":
340
+ handle_translation()
341
+ elif functionality == "Анализ удобочитаемости":
342
+ handle_readability_analysis()
343
+
344
+ if __name__ == "__main__":
345
+ main()
models/nltk_resources.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models/nltk_resources.py
2
+
3
+ import nltk
4
+ import logging
5
+
6
def setup_nltk():
    """Make the 'punkt_tab' tokenizer data available to NLTK.

    Puts the 'nltk_data' directory (a relative path) at the front of NLTK's
    data search path if it is not already listed, then downloads 'punkt_tab'
    into that directory when NLTK cannot find 'tokenizers/punkt_tab'.
    """
    required_package = 'punkt_tab'
    local_data_dir = 'nltk_data'

    # Register the local data directory with the highest priority.
    search_path = nltk.data.path
    if local_data_dir not in search_path:
        search_path.insert(0, local_data_dir)

    # Probe for the tokenizer data; only a failed lookup triggers a download.
    try:
        nltk.data.find('tokenizers/punkt_tab')
    except LookupError:
        package_missing = True
    else:
        package_missing = False

    if package_missing:
        logging.info(f"Downloading NLTK package: {required_package}")
        nltk.download(required_package, download_dir=local_data_dir, quiet=True)
nltk_data/tokenizers/punkt_tab.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e57f64187974277726a3417ca6f181ec5403676c717672eef6a748a7b20e0106
3
+ size 4319076
nltk_data/tokenizers/punkt_tab/README ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
2
+
3
+ Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
4
+ been contributed by various people using NLTK for sentence boundary detection.
5
+
6
+ For information about how to use these models, please refer to the tokenization HOWTO:
7
+ https://www.nltk.org/howto/tokenize.html
8
+ and chapter 3.8 of the NLTK book:
9
+ https://www.nltk.org/book/ch03.html#sec-segmentation
10
+
11
+ There are pretrained tokenizers for the following languages:
12
+
13
+ File Language Source Contents Size of training corpus(in tokens) Model contributed by
14
+ =======================================================================================================================================================================
15
+ czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
16
+ Literarni Noviny
17
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
18
+ danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
19
+ (Berlingske Avisdata, Copenhagen) Weekend Avisen
20
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
21
+ dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
22
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
23
+ english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
24
+ (American)
25
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
26
+ estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
27
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
28
+ finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
29
+ Text Bank (Suomen Kielen newspapers
30
+ Tekstipankki)
31
+ Finnish Center for IT Science
32
+ (CSC)
33
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
34
+ french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
35
+ (European)
36
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
37
+ german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
38
+ (Switzerland) CD-ROM
39
+ (Uses "ss"
40
+ instead of "ß")
41
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
42
+ greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
43
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
44
+ italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
45
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
46
+ norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
47
+ (Bokmål and Information Technologies,
48
+ Nynorsk) Bergen
49
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
50
+ polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
51
+ (http://www.nkjp.pl/)
52
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
53
+ portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
54
+ (Brazilian) (Linguateca)
55
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
56
+ slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
57
+ Slovene Academy for Arts
58
+ and Sciences
59
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
60
+ spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
61
+ (European)
62
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
63
+ swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
64
+ (and some other texts)
65
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
66
+ turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
67
+ (Türkçe Derlem Projesi)
68
+ University of Ankara
69
+ -----------------------------------------------------------------------------------------------------------------------------------------------------------------------
70
+
71
+ The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
72
+ Unicode using the codecs module.
73
+
74
+ Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
75
+ Computational Linguistics 32: 485-525.
76
+
77
+ ---- Training Code ----
78
+
79
+ # import punkt
80
+ import nltk.tokenize.punkt
81
+
82
+ # Make a new Tokenizer
83
+ tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
84
+
85
+ # Read in training corpus (one example: Slovene)
86
+ import codecs
87
+ text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
88
+
89
+ # Train tokenizer
90
+ tokenizer.train(text)
91
+
92
+ # Dump pickled tokenizer
93
+ import pickle
94
+ out = open("slovene.pickle","wb")
95
+ pickle.dump(tokenizer, out)
96
+ out.close()
97
+
98
+ ---------
nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ t
2
+ množ
3
+ např
4
+ j.h
5
+ man
6
+ ú
7
+ jug
8
+ dr
9
+ bl
10
+ ml
11
+ okr
12
+ st
13
+ uh
14
+ šp
15
+ judr
16
+ u.s.a
17
+ p
18
+ arg
19
+ žitě
20
+ st.celsia
21
+ etc
22
+ p.s
23
+ t.r
24
+ lok
25
+ mil
26
+ ict
27
+ n
28
+ tl
29
+ min
30
+ č
31
+ d
32
+ al
33
+ ravenně
34
+ mj
35
+ nar
36
+ plk
37
+ s.p
38
+ a.g
39
+ roč
40
+ b
41
+ zdi
42
+ r.s.c
43
+ přek
44
+ m
45
+ gen
46
+ csc
47
+ mudr
48
+ vic
49
+ š
50
+ sb
51
+ resp
52
+ tzn
53
+ iv
54
+ s.r.o
55
+ mar
56
+ w
57
+ čs
58
+ vi
59
+ tzv
60
+ ul
61
+ pen
62
+ zv
63
+ str
64
+ čp
65
+ org
66
+ rak
67
+ sv
68
+ pplk
69
+ u.s
70
+ prof
71
+ c.k
72
+ op
73
+ g
74
+ vii
75
+ kr
76
+ ing
77
+ j.o
78
+ drsc
79
+ m3
80
+ l
81
+ tr
82
+ ceo
83
+ ch
84
+ fuk
85
+ vl
86
+ viii
87
+ líp
88
+ hl.m
89
+ t.zv
90
+ phdr
91
+ o.k
92
+ tis
93
+ doc
94
+ kl
95
+ ard
96
+ čkd
97
+ pok
98
+ apod
99
+ r
100
+
101
+ a.s
102
+ j
103
+ jr
104
+ i.m
105
+ e
106
+ kupř
107
+ f
108
+
109
+ xvi
110
+ mir
111
+ atď
112
+ vr
113
+ r.i.v
114
+ hl
115
+ kv
116
+ t.j
117
+ y
118
+ q.p.r
nltk_data/tokenizers/punkt_tab/czech/collocations.tab ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ i dejmala
2
+ ##number## prosince
3
+ h steina
4
+ ##number## listopadu
5
+ a dvořák
6
+ v klaus
7
+ i čnhl
8
+ ##number## wladyslawowo
9
+ ##number## letech
10
+ a jiráska
11
+ a dubček
12
+ ##number## štrasburk
13
+ ##number## juniorské
14
+ ##number## století
15
+ ##number## kola
16
+ ##number## pád
17
+ ##number## května
18
+ ##number## týdne
19
+ v dlouhý
20
+ k design
21
+ ##number## červenec
22
+ i ligy
23
+ ##number## kolo
24
+ z svěrák
25
+ ##number## mája
26
+ ##number## šimková
27
+ a bělého
28
+ a bradáč
29
+ ##number## ročníku
30
+ ##number## dubna
31
+ a vivaldiho
32
+ v mečiara
33
+ c carrićre
34
+ ##number## sjezd
35
+ ##number## výroční
36
+ ##number## kole
37
+ ##number## narozenin
38
+ k maleevová
39
+ i čnfl
40
+ ##number## pádě
41
+ ##number## září
42
+ ##number## výročí
43
+ a dvořáka
44
+ h g.
45
+ ##number## ledna
46
+ a dvorský
47
+ h měsíc
48
+ ##number## srpna
49
+ ##number## tř.
50
+ a mozarta
51
+ ##number## sudetoněmeckých
52
+ o sokolov
53
+ k škrach
54
+ v benda
55
+ ##number## symfonie
56
+ ##number## července
57
+ x šalda
58
+ c abrahama
59
+ a tichý
60
+ ##number## místo
61
+ k bielecki
62
+ v havel
63
+ ##number## etapu
64
+ a dubčeka
65
+ i liga
66
+ ##number## světový
67
+ v klausem
68
+ ##number## ženy
69
+ ##number## létech
70
+ ##number## minutě
71
+ ##number## listopadem
72
+ ##number## místě
73
+ o vlček
74
+ k peteraje
75
+ i sponzor
76
+ ##number## června
77
+ ##number## min.
78
+ ##number## oprávněnou
79
+ ##number## květnu
80
+ ##number## aktu
81
+ ##number## květnem
82
+ ##number## října
83
+ i rynda
84
+ ##number## února
85
+ i snfl
86
+ a mozart
87
+ z košler
88
+ a dvorskému
89
+ v marhoul
90
+ v mečiar
91
+ ##number## ročník
92
+ ##number## máje
93
+ v havla
94
+ k gott
95
+ s bacha
96
+ ##number## ad
nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ milena
3
+ tomáš
4
+ oznámila
5
+ podle
6
+ my
7
+ vyplývá
8
+ hlavní
9
+ jelikož
10
+ musíme
11
+ kdyby
12
+ foto
13
+ rozptylové
14
+ snad
15
+ zároveň
16
+ jaroslav
17
+ po
18
+ v
19
+ kromě
20
+ pokud
21
+ toto
22
+ jenže
23
+ oba
24
+ jak
25
+ zatímco
26
+ ten
27
+ myslím
28
+ navíc
29
+ dušan
30
+ zdá
31
+ dnes
32
+ přesto
33
+ tato
34
+ ti
35
+ bratislava
36
+ ale
37
+ když
38
+ nicméně
39
+ tento
40
+ mirka
41
+ přitom
42
+ dokud
43
+ jan
44
+ bohužel
45
+ ta
46
+ díky
47
+ prohlásil
48
+ praha
49
+ jestliže
50
+ jde
51
+ vždyť
52
+ moskva
53
+ proto
54
+ to
nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ t
2
+ tlf
3
+ b.p
4
+ evt
5
+ j.h
6
+ lenz
7
+ mht
8
+ gl
9
+ bl
10
+ stud.polit
11
+ e.j
12
+ st
13
+ o
14
+ dec
15
+ mag
16
+ h.b
17
+ p
18
+ adm
19
+ el.lign
20
+ e.s
21
+ saalba
22
+ styrt
23
+ nr
24
+ m.a.s.h
25
+ etc
26
+ pharm
27
+ hg
28
+ j.j
29
+ dj
30
+ mountainb
31
+ f.kr
32
+ h.r
33
+ cand.jur
34
+ sp
35
+ osv
36
+ s.g
37
+ ndr
38
+ inc
39
+ b.i.g
40
+ dk-sver
41
+ sl
42
+ v.s.o.d
43
+ cand.mag
44
+ d.v.s
45
+ v.i
46
+ bøddel
47
+ fr
48
+ ø«
49
+ dr.phil
50
+ chr
51
+ p.d
52
+ bj
53
+ fhv
54
+ tilskudsforhold
55
+ m.a
56
+ sek
57
+ p.g.a
58
+ int
59
+ pokalf
60
+ ik
61
+ dir
62
+ em-lodtrækn
63
+ a.h
64
+ o.lign
65
+ p.t
66
+ m.v
67
+ n.j
68
+ m.h.t
69
+ m.m
70
+ a.p
71
+ pers
72
+ 4-bakketurn
73
+ dr.med
74
+ w.ø
75
+ polit
76
+ fremsættes
77
+ techn
78
+ tidl
79
+ o.g
80
+ i.c.i
81
+ mill
82
+ skt
83
+ m.fl
84
+ cand.merc
85
+ kbh
86
+ indiv
87
+ stk
88
+ dk-maked
89
+ memorandum
90
+ mestersk
91
+ mag.art
92
+ kitzb
93
+ h
94
+ lic
95
+ fig
96
+ dressurst
97
+ sportsg
98
+ r.e.m
99
+ d.u.m
100
+ sct
101
+ kld
102
+ bl.a
103
+ hf
104
+ g.a
105
+ corp
106
+ w
107
+ konk
108
+ zoeterm
109
+ b.t
110
+ a.d
111
+ l.b
112
+ jf
113
+ s.b
114
+ kgl
115
+ ill
116
+ beck
117
+ tosset
118
+ afd
119
+ johs
120
+ pct
121
+ k.b
122
+ sv
123
+ verbalt
124
+ kgs
125
+ l.m.k
126
+ j.l
127
+ aus
128
+ superl
129
+ t.v
130
+ mia
131
+ kr
132
+ pr
133
+ præmien
134
+ j.b.s
135
+ j.o
136
+ o.s.v
137
+ edb-oplysninger
138
+ o.m.a
139
+ ca
140
+ 1b
141
+ f.eks
142
+ rens
143
+ ch
144
+ mr
145
+ schw
146
+ d.c
147
+ utraditionelt
148
+ idrætsgym
149
+ hhv
150
+ e.l
151
+ s.s
152
+ eks
153
+ f.o.m
154
+ dk-storbrit
155
+ dk-jugo
156
+ n.z
157
+ derivater
158
+ c
159
+ pt
160
+ vm-kval
161
+ kl
162
+ hr
163
+ cand
164
+ jur
165
+ sav
166
+ h.c
167
+ arab.-danm
168
+ d.a.d
169
+ fl
170
+ o.a
171
+ a.s
172
+ cand.polit
173
+ grundejerform
174
+ j
175
+ faglærte
176
+ cr
177
+ a.a
178
+ mou
179
+ f.r.i
180
+ årh
181
+ o.m.m
182
+ sve
183
+ c.a
184
+ engl
185
+ sikkerhedssystemerne
186
+ m.f
187
+ j.k
188
+ phil
189
+ f
190
+ vet
191
+ mio
192
+ k.e
193
+ m.k
194
+ atla
195
+ idrætsg
196
+ n.n
197
+ 4-bakketur
198
+ dvs
199
+ sdr
200
+ s.j
201
+ hol
202
+ s.h
203
+ pei
204
+ kbhvn
205
+ aa
206
+ m.g.i
207
+ fvt
208
+
209
+ b.c
210
+ th
211
+ lrs
nltk_data/tokenizers/punkt_tab/danish/collocations.tab ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## skak
2
+ ##number## speedway
3
+ ##number## rally
4
+ ##number## april
5
+ ##number## dm-fin
6
+ ##number## viceformand
7
+ m jensen
8
+ ##number## kano/kajak
9
+ ##number## bowling
10
+ ##number## dm-finale
11
+ ##number## årh.
12
+ ##number## januar
13
+ ##number## august
14
+ ##number## marathon
15
+ ##number## kamp
16
+ ##number## skihop
17
+ ##number## etage
18
+ ##number## tennis
19
+ ##number## cykling
20
+ e andersen
21
+ ##number## december
22
+ g h.
23
+ ##number## neb
24
+ ##number## sektion
25
+ ##number## afd.
26
+ ##number## klasse
27
+ ##number## trampolin
28
+ ##number## bordtennis
29
+ ##number## formel
30
+ ##number## århundredes
31
+ ##number## dm-semifin
32
+ ##number## heks
33
+ ##number## taekwondo
34
+ ##number## galop
35
+ ##number## basketball
36
+ ##number## dm
37
+ m skræl
38
+ ##number## trav
39
+ ##number## provins
40
+ ##number## triathlon
41
+ k axel
42
+ ##number## rugby
43
+ s h.
44
+ ##number## klaverkoncert
45
+ a p.
46
+ e løgstrup
47
+ k telefax
48
+ ##number## gyldendal
49
+ ##number## fodbold
50
+ e rosenfeldt
51
+ ##number## oktober
52
+ k o.
53
+ ##number## september
54
+ ##number## dec.
55
+ ##number## juledag
56
+ ##number## badminton
57
+ ##number## sejlsport
58
+ ##number## håndbold
59
+ r førsund
60
+ e jørgensen
61
+ d ##number##
62
+ k e
63
+ ##number## alp.ski
64
+ ##number## judo
65
+ ##number## roning
66
+ ##number## november
67
+ ##number## atletik
68
+ ##number## århundrede
69
+ ##number## ridning
70
+ ##number## marts
71
+ m andersen
72
+ d roosevelt
73
+ ##number## brydning
74
+ s kr.
75
+ ##number## runde
76
+ ##number## division
77
+ ##number## sal
78
+ ##number## boksning
79
+ ##number## minut
80
+ ##number## golf
81
+ ##number## juni
82
+ ##number## symfoni
83
+ ##number## hurtigløb
84
+ k jørgensen
85
+ ##number## jörgen
86
+ ##number## klasses
87
+ e jacobsen
88
+ k jensen
89
+ ##number## februar
90
+ k nielsen
91
+ ##number## volleyball
92
+ ##number## maj
93
+ ##number## verdenskrig
94
+ ##number## juli
95
+ ##number## ishockey
96
+ ##number## kunstskøjteløb
97
+ b jørgensen
98
+ ##number## gymnastik
99
+ ##number## svømning
100
+ ##number## tw
101
+ i pedersens
nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ kronik
2
+ alligevel
3
+ de
4
+ først
5
+ derfor
6
+ vi
7
+ selv
8
+ hertil
9
+ sådan
10
+ dette
11
+ sport
12
+ man
13
+ foto
14
+ begge
15
+ tag
16
+ dertil
17
+ reuter
18
+ efter
19
+ endelig
20
+ ifølge
21
+ lad
22
+ når
23
+ det
24
+ desuden
25
+ nu
26
+ reuters
27
+ årsagen
28
+ tænk
29
+ samtidig
30
+ udover
31
+ men
32
+ endvidere
33
+ rør
34
+ rb
35
+ udstillingen
36
+ faktabox
37
+ reception
38
+ blandt
39
+ hvad
40
+ skær
41
+ lilot
42
+ derudover
43
+ da
44
+ tilsæt
45
+ denne
46
+ afp
47
+ her
48
+ hvis
49
+ hæld
50
+ problemet
51
+ dermed
52
+ jeg
53
+ grafik
54
+ anmeldelse
55
+ den
56
+ ebbe
57
+ resultatet
58
+ tværtimod
59
+ hans
60
+ måske
61
+ feature
62
+ tillæg
63
+ hun
64
+ han
nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ m.j
2
+ t
3
+ ph
4
+ j.h
5
+ p.a.m
6
+ j.m
7
+ dr
8
+ st
9
+ j.b.m
10
+ p
11
+ nr
12
+ h.s
13
+ e.d
14
+ t.e
15
+ a.v
16
+ esb
17
+ s.z
18
+ drs
19
+ b.b
20
+ m.o
21
+ inc
22
+ n
23
+ pensioenfonds
24
+ s.v.p
25
+ bod
26
+ fr
27
+ pk
28
+ r.p
29
+ c.p.j
30
+ v.l.n.r
31
+ chr
32
+ m.v.d
33
+ int
34
+ o.m
35
+ j.v.d
36
+ u.o.m
37
+ f.c
38
+ k
39
+ bijgebracht
40
+ ontwaakte
41
+ m
42
+ j.w
43
+ a.l
44
+ a.v.d
45
+ s.v
46
+ s
47
+ j.d
48
+ binnengekomen
49
+ ds
50
+ schouwburg
51
+ b.v
52
+ h
53
+ a
54
+ j.a
55
+ aanvielen
56
+ h.g
57
+ p.f
58
+ j.l
59
+ mgr
60
+ c.j
61
+ blz
62
+ l.e.h
63
+ w.k
64
+ g
65
+ m.g
66
+ r.v.d
67
+ ing
68
+ v.d
69
+ c.q
70
+ l
71
+ h.p
72
+ mr
73
+ gesch
74
+ e.l
75
+ p.j
76
+ mm
77
+ j.g
78
+ j.f
79
+ c
80
+ f.m
81
+ jl
82
+ r
83
+ o.a
84
+ a.s
85
+ ir
86
+ v
87
+ j
88
+ jr
89
+ e
90
+ m.i.v
91
+ l.a
92
+ f.v.d
93
+ aansluit
94
+ c.c
95
+ a.m
96
+ f.o.j
97
+ m.b
98
+ y
99
+ th
nltk_data/tokenizers/punkt_tab/dutch/collocations.tab ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## sotelo
2
+ ##number## clas
3
+ ##number## buckler
4
+ ##number## carrera
5
+ ##number## rmo
6
+ ##number## orioli
7
+ w baron
8
+ ##number## morales
9
+ ##number## snotselelaank
10
+ ##number## arcarons
11
+ ##number## cavandoli
12
+ ##number## pdm
13
+ ##number## helvetia
14
+ ##number## panasonic
15
+ ##number## motorola
16
+ w bruinsma
17
+ ##number## heer
18
+ ##number## lotus
19
+ ##number## banesto
20
+ ##number## magnaldi
21
+ w jense
22
+ w heuvelmans
23
+ w spatje
24
+ ##number## telekom
25
+ f kennedy
26
+ ##number## gatorade
27
+ ##number## mg-gb
28
+ ##number## once
29
+ ##number## peterhansel
30
+ ##number## ariostea
31
+ ##number## tvm
32
+ ##number## höl
33
+ ##number## castorama
34
+ ##number## tulip
35
+ b situatie
36
+ ##number## mas
37
+ ##number## lotto
nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ het
2
+ daardoor
3
+ de
4
+ er
5
+ hoewel
6
+ wat
7
+ urlings
8
+ na
9
+ ze
10
+ alleen
11
+ dat
12
+ ik
13
+ pijls
14
+ wie
15
+ daarna
16
+ foto
17
+ als
18
+ boer
19
+ hammes
20
+ verder
21
+ ook
22
+ evers
23
+ vandaar
24
+ toen
25
+ we
26
+ langenberg
27
+ naast
28
+ want
29
+ in
30
+ wij
31
+ zo
32
+ hendrikx
33
+ daar
34
+ crouzen
35
+ dit
36
+ daarnaast
37
+ anp
38
+ zij
39
+ behalve
40
+ waarom
41
+ daarom
42
+ bovendien
43
+ hij
44
+ daarbij
45
+ nee
46
+ volgens
47
+ daarmee
48
+ bukkems
49
+ dvnl
50
+ eén
51
+ pas
52
+ tijdens
53
+ vooral
54
+ maar
nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ct
2
+ m.j
3
+ t
4
+ a.c
5
+ n.h
6
+ ms
7
+ p.a.m
8
+ dr
9
+ pa
10
+ p.m
11
+ u.k
12
+ st
13
+ dec
14
+ u.s.a
15
+ lt
16
+ g.k
17
+ adm
18
+ p
19
+ h.m
20
+ ga
21
+ tenn
22
+ yr
23
+ sen
24
+ n.c
25
+ j.j
26
+ d.h
27
+ s.g
28
+ inc
29
+ vs
30
+ s.p.a
31
+ a.t
32
+ n
33
+ feb
34
+ sr
35
+ jan
36
+ s.a.y
37
+ n.y
38
+ col
39
+ g.f
40
+ c.o.m.b
41
+ d
42
+ ft
43
+ va
44
+ r.k
45
+ e.f
46
+ chg
47
+ r.i
48
+ a.g
49
+ minn
50
+ a.h
51
+ k
52
+ n.j
53
+ m
54
+ l.f
55
+ f.j
56
+ gen
57
+ i.m.s
58
+ s.a
59
+ aug
60
+ j.p
61
+ okla
62
+ m.d.c
63
+ ltd
64
+ oct
65
+ s
66
+ vt
67
+ r.a
68
+ j.c
69
+ ariz
70
+ w.w
71
+ b.v
72
+ ore
73
+ h
74
+ w.r
75
+ e.h
76
+ mrs
77
+ cie
78
+ corp
79
+ w
80
+ n.v
81
+ a.d
82
+ r.j
83
+ ok
84
+ . .
85
+ e.m
86
+ w.c
87
+ ill
88
+ nov
89
+ u.s
90
+ prof
91
+ conn
92
+ u.s.s.r
93
+ mg
94
+ f.g
95
+ ph.d
96
+ g
97
+ calif
98
+ messrs
99
+ h.f
100
+ wash
101
+ tues
102
+ sw
103
+ bros
104
+ u.n
105
+ l
106
+ wis
107
+ mr
108
+ sep
109
+ d.c
110
+ ave
111
+ e.l
112
+ co
113
+ s.s
114
+ reps
115
+ c
116
+ r.t
117
+ h.c
118
+ r
119
+ wed
120
+ a.s
121
+ v
122
+ fla
123
+ jr
124
+ r.h
125
+ c.v
126
+ m.b.a
127
+ rep
128
+ a.a
129
+ e
130
+ c.i.t
131
+ l.a
132
+ b.f
133
+ j.b
134
+ d.w
135
+ j.k
136
+ ala
137
+ f
138
+ w.va
139
+ sept
140
+ mich
141
+ n.m
142
+ j.r
143
+ l.p
144
+ s.c
145
+ colo
146
+ fri
147
+ a.m
148
+ g.d
149
+ kan
150
+ maj
151
+ ky
152
+ a.m.e
153
+ n.d
154
+ t.j
155
+ cos
156
+ nev
nltk_data/tokenizers/punkt_tab/english/collocations.tab ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## international
2
+ ##number## rj
3
+ ##number## commodities
4
+ ##number## cooper
5
+ b stewart
6
+ ##number## genentech
7
+ ##number## wedgestone
8
+ i toussie
9
+ ##number## pepper
10
+ j fialka
11
+ o ludcke
12
+ ##number## insider
13
+ ##number## aes
14
+ i magnin
15
+ ##number## credit
16
+ ##number## corrections
17
+ ##number## financing
18
+ ##number## henley
19
+ ##number## business
20
+ ##number## pay-fone
21
+ b wigton
22
+ b edelman
23
+ b levine
24
+ ##number## leisure
25
+ b smith
26
+ j walter
27
+ ##number## pegasus
28
+ ##number## dividend
29
+ j aron
30
+ ##number## review
31
+ ##number## abreast
32
+ ##number## who
33
+ ##number## letters
34
+ ##number## colgate
35
+ ##number## cbot
36
+ ##number## notable
37
+ ##number## zimmer
nltk_data/tokenizers/punkt_tab/english/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/english/sent_starters.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ most
2
+ he
3
+ since
4
+ so
5
+ both
6
+ these
7
+ it
8
+ nevertheless
9
+ this
10
+ indeed
11
+ however
12
+ instead
13
+ under
14
+ similarly
15
+ some
16
+ though
17
+ while
18
+ when
19
+ in
20
+ despite
21
+ although
22
+ nonetheless
23
+ thus
24
+ there
25
+ if
26
+ the
27
+ nor
28
+ separately
29
+ moreover
30
+ but
31
+ they
32
+ yet
33
+ many
34
+ according
35
+ sales
36
+ among
37
+ meanwhile
38
+ even
39
+ i
nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eos
2
+ c
3
+ a.d
4
+ t.a.s.s
5
+ e.t
6
+ päevapiltnikud
7
+ c.h
8
+ b.p
9
+ amm
10
+ ameerika-mees
11
+ n.-ö
12
+ cm
13
+ b
14
+ mhm
15
+ a.s
16
+ m.e
17
+ j.l
18
+ j
19
+ u.t
20
+ vm
21
+ g.u.n
22
+ hajutada
23
+ p.s
24
+ a.b
25
+ c.h.-r
26
+ i.q
27
+ gr
28
+ fido
29
+ pankurit
30
+ s.v
31
+ l.l
32
+ c.-h
33
+ m.h
34
+ h.l
35
+ m.k
36
+ j.r
37
+ t.k
38
+ k.h
39
+ 89/90
40
+ h
41
+ a
42
+ dost
43
+ v.k
44
+ e.q
45
+ t.j
46
+ m.b
47
+ d
48
+ p.k
nltk_data/tokenizers/punkt_tab/estonian/collocations.tab ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## juuni
2
+ ##number## novembril
3
+ ##number## juulilt
4
+ r järve-vomm
5
+ ##number## mida
6
+ n liidu
7
+ ##number## milliseid
8
+ ##number## oktoobri
9
+ ##number## iidol
10
+ m e
11
+ ##number## klassist
12
+ ##number## millest
13
+ ##number## august
14
+ ##number## pariis
15
+ ##number## septembrist
16
+ ##number## oktoober
17
+ ##number## märtsini
18
+ ##number## kust
19
+ k mägi
20
+ ##number## detsembrist
21
+ ##number## jaanuari
22
+ ##number## epee
23
+ ##number## nimetage
24
+ ##number## novembrini
25
+ ##number## eluaasta
26
+ s mill
27
+ ##number## helsingi
28
+ ##number## jaanuarini
29
+ ##number## aastail
30
+ ##number## augustil
31
+ ##number## millise
32
+ ##number## juulist
33
+ ##number## mai
34
+ ##number## novembri
35
+ ##number## oktoobrist
36
+ ##number## juunini
37
+ ##number## septembriks
38
+ ##number## detsembril
39
+ p s
40
+ ##number## jaanuar
41
+ ##number## aastate
42
+ ##number## milline
43
+ ##number## kelle
44
+ ##number## jaanuaril
45
+ s stadnikov
46
+ ##number## aastaks
47
+ ##number## stockholm
48
+ ##number## suurim
49
+ ##number## aasta
50
+ ##number## sajandi
51
+ ##number## millega
52
+ ##number## aastast
53
+ ##number## aastal
54
+ ##number## kumb
55
+ ##number## septembril
56
+ ##number## korruselt
57
+ ##number## septembri
58
+ ##number## veebruarini
59
+ ##number## london
60
+ ##number## aastatel
61
+ ##number## september
62
+ ##number## veebruari
63
+ ##number## oktoobrini
64
+ ##number## mail
65
+ m kassovitz
66
+ ##number## action-film
67
+ ##number## mis
68
+ k herkül
69
+ n n
70
+ ##number## detsembrini
71
+ ##number## imre
72
+ t jõgeda
73
+ ##number## casino
74
+ ##number## septembrit
75
+ ##number## augustini
76
+ ##number## juulil
77
+ ##number## november
78
+ ##number## kuupäeval
79
+ ##number## taevas
80
+ ##number## septembrini
81
+ ##number## detsember
82
+ ##number## detsembri
83
+ ##number## juunil
84
+ ##number## augustist
85
+ n jurist
86
+ ##number## missugust
87
+ ##number## aastatesse
88
+ ##number## aprillil
89
+ ##number## augusti
90
+ ##number## oktoobril
91
+ ##number## märtsil
92
+ ##number## a
93
+ ##number## the
94
+ ##number## sajandil
95
+ ##number## aastani
96
+ ##number## juuli
97
+ ##number## septembrile
98
+ ##number## millist
99
+ ##number## millised
100
+ ##number## veebruaril
nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ kalad
2
+ õnneks
3
+ selle
4
+ maimu
5
+ teisipäeval
6
+ ma
7
+ skorpion
8
+ aga
9
+ lisaks
10
+ selleks
11
+ maitse
12
+ esiteks
13
+ erinevalt
14
+ pealegi
15
+ praegu
16
+ kas
17
+ tegelikult
18
+ neitsi
19
+ nädalavahetus
20
+ tema
21
+ kui
22
+ seega
23
+ täna
24
+ lugupidamisega
25
+ miks
26
+ teiseks
27
+ väldi
28
+ pohlak
29
+ osades
30
+ sõnn
31
+ samas
32
+ nimelt
33
+ juhtkiri
34
+ krimi
35
+ nädalavahetusel
36
+ näiteks
37
+ kuidas
38
+ ambur
39
+ telgmaa
40
+ laupäeval
41
+ seetõttu
42
+ rezhissöör
43
+ kahjuks
44
+ ent
45
+ samuti
46
+ ehkki
47
+ veevalaja
48
+ seepärast
49
+ muidugi
50
+ kuna
51
+ tänaseks
52
+ mina
53
+ loomulikult
54
+ ometi
55
+ arvamus
56
+ lõvi
57
+ ee
58
+ niisiis
59
+ mul
60
+ kaksikud
61
+ tõsi
62
+ hinnete
63
+ sestap
64
+ tõenäoliselt
65
+ samal
66
+ see
67
+ paraku
68
+ jäär
69
+ kokkuvõttes
70
+ küllap
71
+ muide
72
+ nüüd
73
+ kolmapäeval
74
+ võibolla
75
+ kuid
76
+ nädalavahetuse
77
+ kuigi
78
+ võid
79
+ lõpuks
80
+ kaalud
81
+ areen
82
+ kirjad
83
+ vähk
84
+ esmaspäeval
85
+ nii
86
+ need
87
+ uue
88
+ ta
89
+ minu
nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ t
2
+ suom
3
+ dr
4
+ st
5
+ 970125090.jtun
6
+ p
7
+ sis
8
+ t.h
9
+ 961221327.jtun
10
+ a.i
11
+ milj
12
+ ski
13
+ kp
14
+ 970131067.jtun
15
+ 970124030.jtun
16
+ nk
17
+ va
18
+ pan
19
+ yhteystiedot
20
+ ruots
21
+ jne
22
+ t.a
23
+ l.-g
24
+ k
25
+ j.w
26
+ p2
27
+ oik
28
+ 970102248.jtun
29
+ hj
30
+ s
31
+ vt
32
+ muistelmia
33
+ o.s
34
+ elo
35
+ h
36
+ ortod
37
+ o.l
38
+ w
39
+ tms
40
+ 970120219.jtun
41
+ pj
42
+ ok
43
+ toissapäiväinen
44
+ 28.t1
45
+ pelintekijä
46
+ 970111011.jtun
47
+ op
48
+ os
49
+ ns
50
+ m.g
51
+ o.-i
52
+ m3
53
+ pros
54
+ mr
55
+ 970102171.jtun
56
+ waller
57
+ hels
58
+ rotary-järjestössä
59
+ ins
60
+ esim
61
+ apul
62
+ fil
63
+ id
64
+ ym
65
+ j
66
+ rf
67
+ v.o
68
+ lis
69
+ c.a
70
+ em
71
+ kand
72
+ r.y
73
+ valt
74
+ dipl
75
+ ö
76
+ 970111092.jtun
77
+ ponteva
78
+ y
79
+ kapakoista
80
+ 970130160.jtun
81
+ th
nltk_data/tokenizers/punkt_tab/finnish/collocations.tab ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## sm
2
+ ##number## ohjelmassa
3
+ ##number## a3
4
+ ##number## rc3
5
+ ##number## rxd4
6
+ ##number## hxg4
7
+ o stenberg
8
+ ##number## lg5
9
+ ##number## tallitontun
10
+ ##number## lähetysohjeet
11
+ ##number## uimakoulu
12
+ ##number## jaana
13
+ ##number## alustuksen
14
+ ##number## uppo-nallen
15
+ ##number## anne
16
+ ##number## rxf3
17
+ a sjögren
18
+ ##number## kamarikuoro
19
+ ##number## vetäjänä
20
+ ##number## pääsymaksu
21
+ ##number## kerros
22
+ ##number## kurssi
23
+ ##number## kuori
24
+ ##number## g4
25
+ ##number## h3
26
+ ##number## tiede-teatterissa
27
+ ##number## kh2
28
+ ##number## kausimaksu
29
+ ##number## tia
30
+ ##number## gxf5
31
+ ##number## täky-galleria
32
+ ##number## le2
33
+ ##number## te8+
34
+ ##number## la4
35
+ ##number## keitä
36
+ ##number## huhtikuuta
37
+ ##number## menotiedoissa
38
+ ##number## valmista
39
+ ##number## txb5
40
+ ##number## maskeerauskurssin
41
+ ##number## rd2
42
+ ##number## re2
43
+ ##number## solisteina
44
+ ##number## esitelmä
45
+ ##number## puupiirrossarja
46
+ ##number## ta1
47
+ ##number## vaahdota
48
+ ##number## h4
49
+ ##number## kesäkuuta
50
+ ##number## liikkeitä
51
+ ##number## tuolloin
52
+ ##number## viikko
53
+ ##number## mittaa
54
+ a sjögrenin
55
+ ##number## exf6
56
+ ##number## rc6+
57
+ ##number## viimeistele
58
+ ##number## ld1
59
+ ##number## elokuuta
60
+ ##number## dh5+
61
+ ##number## syyskuuta
62
+ ##number## opettajina
63
+ ##number## b3
64
+ ##number## rauhankatu
65
+ c clarke
66
+ ##number## saakka
67
+ ##number## elokuvat
68
+ b huggins
69
+ g gahmberg
70
+ ##number## luento
71
+ ##number## lf3
72
+ ##number## tammikuuta
73
+ ##number## ryömä
74
+ ##number## meller
75
+ ##number## jäsenkortti
76
+ ##number## esiintyjinä
77
+ ##number## maria
78
+ ##number## lf4
79
+ ##number## siirto
80
+ ##number## aurinko
81
+ ##number## lxg6
82
+ ##number## marraskuuta
83
+ ##number## harjoituksissa
84
+ ##number## romantika-yhtye
85
+ ##number## g3
86
+ ##number## heinäkuuta
87
+ ##number## rxd5
88
+ ##number## kuumenna
89
+ e hämäläisen
90
+ ##number## bxc4
91
+ ##number## te1
92
+ ##number## kg2
93
+ ##number## osallistumismaksu
94
+ ##number## re5
95
+ ##number## ohjelma
96
+ ##number## varapuheenjohtajaksi
97
+ ##number## raisa
98
+ ##number## päivään
99
+ ##number## luokan
100
+ ##number## sulata
101
+ ##number## levitä
102
+ ##number## kaustinen
103
+ ##number## kuoroa
104
+ ##number## df3
105
+ v helsingistä
106
+ ##number## mieskuoro
107
+ ##number## lokakuuta
108
+ ##number## kerho
109
+ ##number## helmikuuta
110
+ ##number## kokkola
111
+ ##number## suuruusluokan
112
+ v kaupungista
113
+ ##number## krs
114
+ ##number## tekstit
115
+ ##number## menyy
116
+ ##number## rf3
117
+ ##number## ulkoasiainministeriön
118
+ ##number## kaada
119
+ ##number## cxd5
120
+ ##number## ilmailumuseo
121
+ e waris
122
+ ##number## kierros
123
+ ##number## tunnille
124
+ ##number## kh3
125
+ ##number## ohjaus
126
+ a t.
127
+ ##number## postimaksu
128
+ ##number## pane
129
+ ##number## th3
130
+ ##number## joulukuuta
131
+ ##number## vatkaa
132
+ ##number## kokeessa
133
+ l j.
134
+ ##number## asti
135
+ ##number## opastajana
136
+ ##number## kirsi
137
+ ##number## lc2
138
+ ##number## lh2
139
+ ##number## e4
140
+ ##number## sairaankuljetukset
141
+ ##number## sekoita
142
+ ##number## mervi
143
+ ##number## de2
144
+ a pietilän
145
+ ##number## kf1
146
+ ##number## toukokuuta
147
+ ##number## maaliskuuta
148
+ ##number## leikkaa
149
+ ##number## ryhmänäytökset
150
+ v maaseudulta
151
+ ##number## de3-e1
152
+ ##number## c4
153
+ ##number## ta1-b1
154
+ ##number## d5
155
+ ##number## pia
156
+ ##number## lxd6
157
+ ##number## d4
158
+ ##number## f3-f4
159
+ ##number## dxg6+
160
+ ##number## sari
161
+ ##number## pelkkään
162
+ ##number## ld3
163
+ ##number## perkaa
164
+ ##number## lg3
165
+ ##number## kg3
166
+ ##number## kvm
167
+ ##number## tb1xb6
nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ siinä
2
+ lämpötila
3
+ viiden
4
+ he
5
+ vapaa
6
+ viime
7
+ useimmat
8
+ kansallisooppera
9
+ rooleissa
10
+ näin
11
+ odotettavissa
12
+ tiedustelut
13
+ kansallisteatterin
14
+ sen
15
+ musiikki
16
+ monet
17
+ uusi
18
+ avoinna
19
+ pakkasta
20
+ freeze
21
+ tämä
22
+ lämpö
23
+ lautakunta
24
+ vastaväittäjänä
25
+ päivällä
26
+ tällä
27
+ esimerkiksi
28
+ varoituksia
29
+ merenkurkku
30
+ meriennuste
31
+ näyttelyssä
32
+ kun
33
+ pilvistä
34
+ silloin
35
+ selkämeren
36
+ suurin
37
+ se
38
+ jos
39
+ vaihtelevaa
40
+ vastaväittäjinä
41
+ sivu
42
+ kaupunginteatterin
43
+ pilvisyys
44
+ siellä
45
+ siksi
46
+ kurssimaksu
47
+ tämän
48
+ kotimaa
49
+ näiden
50
+ teatteri
51
+ kaikki
52
+ puolipilvistä
53
+ niiden
54
+ maksimilämpötila
55
+ lisäksi
56
+ kaupunginhallitus
57
+ helsingin
58
+ nyt
59
+ samalla
60
+ hänen
61
+ olen
62
+ kaupunkikierros
63
+ vastaväittäjä
64
+ ne
65
+ tästä
66
+ enimmäkseen
67
+ poika
68
+ niinpä
69
+ viirus
70
+ me
71
+ poliisi
72
+ liput
73
+ ilmoittautuminen
74
+ tarjoa
75
+ hän
76
+ molemmat
77
+ ulkomaat
78
+ rock
79
+ lääketieteen
80
+ tanssi
81
+ sainks
82
+ näyttely
83
+ lisätietoja
84
+ ulkomaiden
85
+ näyttelyn
86
+ palo
nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ p.o.l
2
+ pds
3
+ 3o
4
+ inscr
5
+ suè
6
+ z
7
+ abst
8
+ g.-b
9
+ tél
10
+ r
11
+ ed
12
+ o
13
+ b
14
+ esp
15
+ j.l
16
+ v
17
+ k
18
+ e.p
19
+ aus
20
+ jap
21
+ r.e
22
+ gb-bel
23
+ p
24
+ aut
25
+ usx
26
+ arg
27
+ g
28
+ e
29
+ etc
30
+ fra
31
+ p.s
32
+ j.-l
33
+ blu
34
+ e.-u
35
+ f.b
36
+ msf
37
+ e.d
38
+ shi
39
+ can
40
+ j.b
41
+ s.a
42
+ f.o
43
+ you
44
+ mir
45
+ inc
46
+ ital
47
+ expr
48
+ tch
49
+ g-b-bel
50
+ cid
51
+ c.u
52
+ ctk
53
+ j.-m.g
54
+ bta
55
+ p.-b
56
+ cie
57
+ ita
58
+ equ
59
+ corp
60
+ vot
61
+ w
nltk_data/tokenizers/punkt_tab/french/collocations.tab ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## shinozuka-magne
2
+ ##number## ambrosino-baumgartner
3
+ c tanvier
4
+ f b.
5
+ ##number## waldegaard-fenouil
6
+ ##number## fermé
7
+ a dechaume
8
+ i demongeot
9
+ s motos
10
+ ##number## rahier
11
+ ##number## magnaldi
12
+ ##number## orioli
13
+ f tél.
14
+ ##number## cowan-delferrier
15
+ ##number## vatanen-berglund
16
+ ##number## picco
17
+ ##number## masuoka-oligo
18
+ ##number## medardo
nltk_data/tokenizers/punkt_tab/french/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/french/sent_starters.txt ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ c
2
+ depuis
3
+ la
4
+ enfin
5
+ certains
6
+ selon
7
+ cet
8
+ car
9
+ ces
10
+ il
11
+ cependant
12
+ pour
13
+ j
14
+ alors
15
+ un
16
+ certes
17
+ les
18
+ nous
19
+ dans
20
+ le
21
+ une
22
+ si
23
+ mais
24
+ en
25
+ dès
26
+ or
27
+ tout
28
+ ils
29
+ l
30
+ mr
31
+ malgré
32
+ elles
33
+
34
+ je
35
+ on
36
+ quand
37
+ pourtant
38
+ cela
39
+ a
40
+ après
41
+ puis
42
+ ce
43
+ elle
44
+ voilà
45
+ cette
46
+ comment
47
+ quant
48
+ ainsi
nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ rfr
2
+ t
3
+ c
4
+ a.d
5
+ dk
6
+ he
7
+ mjm
8
+ inkl
9
+ bt
10
+ 69f
11
+ crz
12
+ dr
13
+ st
14
+ ib
15
+ liv
16
+ mrd
17
+ n.r
18
+ rg
19
+ v
20
+ vgl
21
+ mgr
22
+ cs
23
+ prof
24
+ j
25
+ kfr
26
+ bd
27
+ fre
28
+ gfh
29
+ fon
30
+ m
31
+ rp
32
+ nr
33
+ chr
34
+ etc
35
+ hg
36
+ sx
37
+ rz
38
+ 48f
39
+ kmu
40
+ abs
41
+ nkm
42
+ z.b
43
+ usw
44
+ f
45
+ d.h
46
+ lz
47
+ sc
48
+ usf
49
+ gir
50
+ hag
51
+ ff
52
+ mio
53
+ zr
54
+ k
55
+ h
56
+ mey
57
+ bst
58
+ ne
59
+ u.a
60
+ fem
61
+ bzw
62
+
63
+ med
64
+ u
65
+ lts
66
+ fr
67
+ s.o.s
68
+ w
69
+ lib
70
+ k.a
71
+ th
nltk_data/tokenizers/punkt_tab/german/collocations.tab ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##number## oktober
2
+ ##number## jahrhunderts
3
+ ##number## geburtstag
4
+ ##number## juni
5
+ s ##number##
6
+ ##number## september
7
+ ##number## mai
8
+ ##number## dezember
9
+ ##number## april
10
+ ##number## ahv-revision
11
+ ##number## revision
12
+ ##number## jahrhundert
13
+ ##number## landwirtschaftsbericht
14
+ ##number## altersjahr
15
+ ##number## februar
16
+ a schumpeter
17
+ ##number## freiheit
18
+ ##number## august
19
+ ##number## januar
20
+ ##number## märz
21
+ a meyers
22
+ ##number## november
23
+ ##number## bauetappe
24
+ ##number## ahv-
25
+ ##number## eu-richtlinie
26
+ ##number## juli
27
+ a meyer
28
+ ##number## säule
nltk_data/tokenizers/punkt_tab/german/ortho_context.tab ADDED
The diff for this file is too large to render. See raw diff
 
nltk_data/tokenizers/punkt_tab/german/sent_starters.txt ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ das
2
+ man
3
+ es
4
+ wir
5
+ dabei
6
+ ferner
7
+ ähnliches
8
+ während
9
+ entscheidend
10
+ ausserdem
11
+ ein
12
+ in
13
+ der
14
+ daraus
15
+ obschon
16
+ beide
17
+ hier
18
+ all
19
+ neben
20
+ solche
21
+ hingegen
22
+ selbstverständlich
23
+ daneben
24
+ hinzu
25
+ vielmehr
26
+ sie
27
+ natürlich
28
+ obwohl
29
+ nun
30
+ doch
31
+ ob
32
+ abgesehen
33
+ überdies
34
+ im
35
+ zweitens
36
+ darin
37
+ erstens
38
+ dieses
39
+ nach
40
+ wer
41
+ da
42
+ interessant
43
+ seit
44
+ zudem
45
+ darüber
46
+ umgekehrt
47
+ ähnlich
48
+ aber
49
+ was
50
+ nachdem
51
+ insbesondere
52
+ statt
53
+ angesichts
54
+ gefragt
55
+ gleiches
56
+ solange
57
+ wenn
58
+ dies
59
+ dass
60
+ wie
61
+ damit
62
+ allerdings
63
+ denn
64
+ letztere
65
+ eine
66
+ selbst
67
+ gleichzeitig
68
+ wo
69
+ weder
70
+ gerade
71
+ unter
72
+ problematischer
73
+ wieso
74
+ dennoch
75
+ bei
76
+ deshalb
77
+ davon
78
+ andernfalls
79
+ er
80
+ die
81
+ anders
82
+ auch
83
+ ebenso
84
+ so
85
+ inzwischen
86
+ sonst
87
+ immerhin
88
+ entsprechend
89
+ danach
90
+ am
91
+ trotz
92
+ trotzdem
93
+ worum
94
+ damals
95
+ dafür
96
+ schliesslich
97
+ gemäss
98
+ demgegenüber
99
+ warum
100
+ letzteres
101
+ mit
102
+ dazu
103
+ anderseits
104
+ ganz
105
+ zwar
106
+ dieser
107
+ diese
nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ κλ
2
+ δημ
3
+ χλμ
4
+ σ.τ.ε
5
+ ό.π
6
+ δρχ
7
+ κων
8
+ χρ
9
+ π.α
10
+ ριχ
11
+ π.χρ
12
+ υγ
13
+ tel
14
+ ζ
15
+ ο.π
16
+ βασ
17
+ γλ
18
+ n.c
19
+ d.j
20
+ σωκ
21
+ π
22
+ ιω
23
+ αχ
24
+ βα
25
+ γερ
26
+ εκδ
27
+ κλπ
28
+ φ
29
+ ελ
30
+ οσ
31
+ α
32
+ σελ
33
+ ευ
34
+ ε.έ
35
+ ρ
36
+ ε.τ.α
37
+ λ
38
+ εβ
39
+ θρ
40
+ ν
41
+ βλ
42
+ ηλ
43
+ γ
44
+ αρ
45
+ π.χ
46
+ ε.μ
47
+ κ.μ
48
+ α.ε
49
+ μιχ
50
+ δισ
51
+ ολ
52
+ μ
53
+ κ.ά
54
+ κ
55
+ δηλ
56
+ ε.α.χ
57
+ πρ
58
+ αγ
59
+ μac
60
+ κ.ο.κ
61
+ λ.χ
62
+ θ
63
+ αδσ
64
+ εκατ
65
+ δρη
66
+ εμμ
67
+ δ
68
+ δεκ
69
+ σ.σ
70
+ 55ο
71
+ κκ
72
+ αδ
73
+ τ.μ
74
+ ε.ε
75
+ μ.χ
76
+ ν.μ
77
+ κτλ
78
+ δολ
79
+ κ.ά.π
80
+ αγγ
81
+ μ.κ
82
+ δ.σ
83
+ μπ
84
+ έκδ
85
+ ι
86
+ v
87
+ χαρ
88
+ γρ
89
+ μ.μ.ε
90
+ σχ
91
+ λεκ
92
+ σπ
93
+ πλι
94
+ αθ
95
+ χ
96
+ τζ
97
+ τρισ
98
+ στ
99
+ ευθ
100
+ μ.μ