Spaces:
Sleeping
Sleeping
Commit
·
da8d2e4
0
Parent(s):
init
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .devcontainer/devcontainer.json +33 -0
- .gitattributes +35 -0
- .gitignore +12 -0
- .idea/Tilmash_Translator.iml +7 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +0 -0
- Dockerfile +50 -0
- LICENSE.txt +21 -0
- README.md +115 -0
- check_gpu.py +69 -0
- config.py +22 -0
- main.py +345 -0
- models/nltk_resources.py +21 -0
- nltk_data/tokenizers/punkt_tab.zip +3 -0
- nltk_data/tokenizers/punkt_tab/README +98 -0
- nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt +118 -0
- nltk_data/tokenizers/punkt_tab/czech/collocations.tab +96 -0
- nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt +54 -0
- nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt +211 -0
- nltk_data/tokenizers/punkt_tab/danish/collocations.tab +101 -0
- nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt +64 -0
- nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt +99 -0
- nltk_data/tokenizers/punkt_tab/dutch/collocations.tab +37 -0
- nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt +54 -0
- nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt +156 -0
- nltk_data/tokenizers/punkt_tab/english/collocations.tab +37 -0
- nltk_data/tokenizers/punkt_tab/english/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/english/sent_starters.txt +39 -0
- nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt +48 -0
- nltk_data/tokenizers/punkt_tab/estonian/collocations.tab +100 -0
- nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt +89 -0
- nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt +81 -0
- nltk_data/tokenizers/punkt_tab/finnish/collocations.tab +167 -0
- nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt +86 -0
- nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt +61 -0
- nltk_data/tokenizers/punkt_tab/french/collocations.tab +18 -0
- nltk_data/tokenizers/punkt_tab/french/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/french/sent_starters.txt +48 -0
- nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt +71 -0
- nltk_data/tokenizers/punkt_tab/german/collocations.tab +28 -0
- nltk_data/tokenizers/punkt_tab/german/ortho_context.tab +0 -0
- nltk_data/tokenizers/punkt_tab/german/sent_starters.txt +107 -0
- nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt +100 -0
.devcontainer/devcontainer.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Python 3",
|
| 3 |
+
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
| 4 |
+
"image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
|
| 5 |
+
"customizations": {
|
| 6 |
+
"codespaces": {
|
| 7 |
+
"openFiles": [
|
| 8 |
+
"README.md",
|
| 9 |
+
"main.py"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
"vscode": {
|
| 13 |
+
"settings": {},
|
| 14 |
+
"extensions": [
|
| 15 |
+
"ms-python.python",
|
| 16 |
+
"ms-python.vscode-pylance"
|
| 17 |
+
]
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
|
| 21 |
+
"postAttachCommand": {
|
| 22 |
+
"server": "streamlit run main.py --server.enableCORS false --server.enableXsrfProtection false"
|
| 23 |
+
},
|
| 24 |
+
"portsAttributes": {
|
| 25 |
+
"8501": {
|
| 26 |
+
"label": "Application",
|
| 27 |
+
"onAutoForward": "openPreview"
|
| 28 |
+
}
|
| 29 |
+
},
|
| 30 |
+
"forwardPorts": [
|
| 31 |
+
8501
|
| 32 |
+
]
|
| 33 |
+
}
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
+
.huggingface/
|
| 7 |
+
.cache/
|
| 8 |
+
local_llms/.cache/
|
| 9 |
+
local_llms/.locks/
|
| 10 |
+
local_llms/locks/
|
| 11 |
+
local_llms/instances/
|
| 12 |
+
local_llms/models--*/
|
.idea/Tilmash_Translator.iml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<module version="4">
|
| 3 |
+
<component name="PyDocumentationSettings">
|
| 4 |
+
<option name="format" value="PLAIN" />
|
| 5 |
+
<option name="myDocStringFormat" value="Plain" />
|
| 6 |
+
</component>
|
| 7 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<component name="InspectionProjectProfileManager">
|
| 2 |
+
<settings>
|
| 3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
| 4 |
+
<version value="1.0" />
|
| 5 |
+
</settings>
|
| 6 |
+
</component>
|
.idea/misc.xml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="Black">
|
| 4 |
+
<option name="sdkName" value="Python 3.13 (Tilmash_Translator)" />
|
| 5 |
+
</component>
|
| 6 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (Tilmash_Translator)" project-jdk-type="Python SDK" />
|
| 7 |
+
</project>
|
.idea/vcs.xml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="VcsDirectoryMappings">
|
| 4 |
+
<mapping directory="" vcs="Git" />
|
| 5 |
+
</component>
|
| 6 |
+
</project>
|
.idea/workspace.xml
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Dockerfile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
+
FROM python:3.9-slim
|
| 3 |
+
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
curl \
|
| 9 |
+
software-properties-common \
|
| 10 |
+
git \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
COPY requirements.txt ./
|
| 14 |
+
COPY src/ ./src/
|
| 15 |
+
|
| 16 |
+
RUN pip3 install -r requirements.txt
|
| 17 |
+
|
| 18 |
+
EXPOSE 8501
|
| 19 |
+
|
| 20 |
+
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 21 |
+
|
| 22 |
+
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
| 23 |
+
=======
|
| 24 |
+
FROM python:3.11-slim
|
| 25 |
+
|
| 26 |
+
# Базовые системные зависимости (для сборки/запуска)
|
| 27 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 28 |
+
build-essential git libglib2.0-0 libgl1 \
|
| 29 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
+
|
| 31 |
+
# Python and pip runtime settings (no CUDA setup happens in this step)
|
| 32 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 33 |
+
PYTHONDONTWRITEBYTECODE=1 \
|
| 34 |
+
PIP_NO_CACHE_DIR=1
|
| 35 |
+
|
| 36 |
+
WORKDIR /app
|
| 37 |
+
|
| 38 |
+
# Устанавливаем Python-зависимости
|
| 39 |
+
COPY requirements.txt /app/requirements.txt
|
| 40 |
+
ENV PIP_NO_CACHE_DIR=1 \
|
| 41 |
+
HF_HOME=/data/.cache/huggingface \
|
| 42 |
+
PORT=7860
|
| 43 |
+
RUN pip install -U pip && pip install -r requirements.txt
|
| 44 |
+
|
| 45 |
+
# Копируем весь проект
|
| 46 |
+
COPY . /app
|
| 47 |
+
|
| 48 |
+
# Запуск Streamlit внутри контейнера
|
| 49 |
+
CMD ["streamlit", "run", "main.py", "--server.port=7860", "--server.address=0.0.0.0"]
|
| 50 |
+
>>>>>>> 805a119 (Initial commit for HF Space)
|
LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The MIT License (MIT)
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2011-2025 The Bootstrap Authors
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in
|
| 13 |
+
all copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
| 21 |
+
THE SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<<<<<<< HEAD
|
| 2 |
+
---
|
| 3 |
+
title: Translate Tl
|
| 4 |
+
emoji: 🚀
|
| 5 |
+
colorFrom: red
|
| 6 |
+
colorTo: red
|
| 7 |
+
sdk: docker
|
| 8 |
+
app_port: 8501
|
| 9 |
+
tags:
|
| 10 |
+
- streamlit
|
| 11 |
+
pinned: false
|
| 12 |
+
short_description: Streamlit template space
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Welcome to Streamlit!
|
| 16 |
+
|
| 17 |
+
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 18 |
+
|
| 19 |
+
If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
|
| 20 |
+
forums](https://discuss.streamlit.io).
|
| 21 |
+
=======
|
| 22 |
+
$yaml = @"
|
| 23 |
+
---
|
| 24 |
+
title: Tilmash Translator
|
| 25 |
+
sdk: streamlit
|
| 26 |
+
app_file: main.py
|
| 27 |
+
python_version: "3.11"
|
| 28 |
+
pinned: false
|
| 29 |
+
---
|
| 30 |
+
"@
|
| 31 |
+
$orig = Get-Content -Raw README.md
|
| 32 |
+
Set-Content README.md $yaml -Encoding UTF8
|
| 33 |
+
Add-Content README.md $orig
|
| 34 |
+
|
| 35 |
+
# Tilmash Translator
|
| 36 |
+
|
| 37 |
+
**Tilmash Translator** is an offline‑first, privacy‑preserving translation and readability toolkit for Russian, English and Kazakh.
|
| 38 |
+
|
| 39 |
+
It ships as a Streamlit web‑app and offers two core capabilities:
|
| 40 |
+
|
| 41 |
+
1. **Neural Machine Translation**
|
| 42 |
+
• Primary model — [ISSAI/tilmash](https://huggingface.co/issai/tilmash) (Seq2Seq) for RU ↔ EN ↔ KK
|
| 43 |
+
• Smart chunking & streaming make multi‑page documents feel snappy
|
| 44 |
+
2. **Readability Analysis**
|
| 45 |
+
• Calculates Flesch Reading Ease, Flesch‑Kincaid, Gunning Fog and SMOG
|
| 46 |
+
• Highlights complex words and supports RU/EN/KK
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
---
|
| 50 |
+
|
| 51 |
+
## Quick Start
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
# 1. Clone & create a virtual environment
|
| 55 |
+
$ git clone https://github.com/medetshatayev/Tilmash_Translator.git
|
| 56 |
+
$ cd Tilmash_Translator
|
| 57 |
+
$ python3 -m venv .venv && source .venv/bin/activate
|
| 58 |
+
|
| 59 |
+
# 2. Install dependencies
|
| 60 |
+
$ pip install -r requirements.txt
|
| 61 |
+
|
| 62 |
+
# 3. (optional) authenticate once to download the Tilmash weights
|
| 63 |
+
$ echo "HF_TOKEN=🪄your_huggingface_token" > .env
|
| 64 |
+
|
| 65 |
+
# 4. Launch the Streamlit app
|
| 66 |
+
$ streamlit run main.py
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
💡 The helper script `start.sh` automates the above and sets safe memory limits for `llama-cpp-python`.
|
| 70 |
+
|
| 71 |
+
### GPU Off‑loading (Gemma‑3)
|
| 72 |
+
|
| 73 |
+
Set `GEMMA_GPU_LAYERS=<num_layers>` in your environment (defaults to **48**) to off‑load those layers to Metal/CUDA.
|
| 74 |
+
|
| 75 |
+
---
|
| 76 |
+
|
| 77 |
+
## Project Layout
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
.
|
| 81 |
+
├── main.py # Streamlit UI
|
| 82 |
+
├── utils/ # Translation & analysis helpers
|
| 83 |
+
│ ├── tilmash_translation.py
|
| 84 |
+
│ ├── gemma_translation.py
|
| 85 |
+
│ ├── readability_indices.py
|
| 86 |
+
│ └── ...
|
| 87 |
+
├── models/ # Extra resources (NLTK, etc.)
|
| 88 |
+
├── config.py # Default env‑vars
|
| 89 |
+
├── start.sh # Convenience launcher
|
| 90 |
+
└── requirements.txt # Python deps
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
## Configuration Keys
|
| 94 |
+
|
| 95 |
+
| Variable | Default | Purpose |
|
| 96 |
+
|------------------------|---------|-------------------------------------------|
|
| 97 |
+
| `GEMMA_GPU_LAYERS` | 48 | Layers to move to GPU (0 = CPU‑only) |
|
| 98 |
+
| `GEMMA_CONTEXT_SIZE` | 8192 | Context window for Gemma‑3 |
|
| 99 |
+
| `MAX_PARALLEL_MODELS` | 4 | Concurrency guard |
|
| 100 |
+
| `MAX_TOKENS` | 4096 | Generation cap per request |
|
| 101 |
+
| `CHUNK_SIZE` | 3000 | Token threshold before auto‑chunking |
|
| 102 |
+
|
| 103 |
+
Override any of these via the environment or edit **config.py**.
|
| 104 |
+
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
## How It Works
|
| 108 |
+
|
| 109 |
+
1. **File ingestion** — `.txt`, `.docx`, `.pdf` loaded via `utils/file_readers.py`
|
| 110 |
+
2. **Language detection** — `langdetect` (auto‑detect option in UI)
|
| 111 |
+
3. **Translation pipeline** — texts under 3000 tokens are translated directly; longer texts are chunked (`utils/chunking.py`) and streamed through Tilmash or Gemma‑3
|
| 112 |
+
4. **Readability analysis** — scores computed in `utils/readability_indices.py` and color‑coded in the app.
|
| 113 |
+
|
| 114 |
+
---
|
| 115 |
+
>>>>>>> 805a119 (Initial commit for HF Space)
|
check_gpu.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
Этот скрипт проверяет наличие GPU и совместимость с PyTorch.
|
| 5 |
+
Запустите его, чтобы проверить конфигурацию вашей системы:
|
| 6 |
+
|
| 7 |
+
python check_gpu.py
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import platform
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
def check_system():
    """Print a diagnostic report about the host and its PyTorch GPU support.

    Writes the operating system and Python version to stdout, then, if
    PyTorch can be imported, its version, the CUDA version it was built
    against, whether CUDA is usable, the visible CUDA devices, and whether
    Apple MPS is available.

    Returns:
        None. Problems (PyTorch missing, or any error raised while probing
        it) are reported on stdout instead of being raised.
    """
    print(f"Операционная система: {platform.system()} {platform.release()}")
    print(f"Python версия: {sys.version}")

    try:
        import torch
        print("\nПроверка PyTorch:")
        print(f"Версия PyTorch: {torch.__version__}")

        # CUDA version check. The attribute is present on every build but is
        # None when PyTorch was built without CUDA, so test the value itself
        # rather than the attribute's existence.
        cuda_version = getattr(torch.version, "cuda", None)
        if cuda_version:
            print(f"CUDA версия: {cuda_version}")
        else:
            print("CUDA версия: не найдена")

        # CUDA availability check.
        cuda_available = torch.cuda.is_available()
        print(f"CUDA доступен: {cuda_available}")

        if cuda_available:
            device_count = torch.cuda.device_count()
            print(f"Обнаружено GPU: {torch.cuda.get_device_name(0)}")
            print(f"Количество GPU: {device_count}")
            for index in range(device_count):
                print(f" GPU {index}: {torch.cuda.get_device_name(index)}")

        # MPS (Apple Silicon) check; older PyTorch builds have no mps backend.
        if hasattr(torch.backends, "mps"):
            mps_available = torch.backends.mps.is_available()
            print(f"\nApple MPS доступен: {mps_available}")
            if mps_available:
                print("Обнаружен Apple Silicon GPU (M1/M2/M3)")

    except ImportError:
        print("PyTorch не установлен.")
        print("Установите PyTorch командой: pip install torch")
    except Exception as e:
        # Best-effort diagnostics: report any probing failure, never crash.
        print(f"Ошибка при проверке PyTorch: {e}")
|
| 49 |
+
|
| 50 |
+
if __name__ == "__main__":
    # Script entry point: print the detailed report, then a one-line verdict
    # on which accelerator (if any) is usable, then troubleshooting advice.
    print("===== Диагностика GPU для Tilmash =====")
    check_system()

    print("\n===== Готовность системы =====")
    try:
        import torch

        has_cuda = torch.cuda.is_available()
        # MPS is only consulted when CUDA is unavailable.
        has_mps = (
            not has_cuda
            and hasattr(torch.backends, "mps")
            and torch.backends.mps.is_available()
        )
        if has_cuda:
            verdict = "✅ GPU CUDA обнаружен и готов к использованию"
        elif has_mps:
            verdict = "✅ Apple Silicon GPU (MPS) обнаружен и готов к использованию"
        else:
            verdict = "⚠️ GPU не обнаружен. Tilmash будет работать на CPU (медленнее)"
        print(verdict)
    except ImportError:
        print("❌ PyTorch не установлен. Установите его командой: pip install torch")

    # Troubleshooting checklist, printed regardless of the verdict above.
    advice_lines = (
        "\nСовет: Если у вас есть GPU, но он не обнаружен, проверьте:",
        "1. Драйверы NVIDIA (для CUDA)",
        "2. Правильную версию PyTorch для вашей системы",
        "3. Переустановите PyTorch с поддержкой CUDA: pip install torch --upgrade",
    )
    print("\n".join(advice_lines))
|
config.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
Fallback settings for the Translator application.

Every key in DEFAULT_CONFIG names an environment variable; the value here is
the default used only when that environment variable is not set.
"""


def _as_env_text(value):
    """Render a default as environment-variable text (booleans lower-cased)."""
    text = str(value)
    if isinstance(value, bool):
        return text.lower()
    return text


# Default model configuration
DEFAULT_CONFIG = dict(
    MAX_PARALLEL_MODELS=4,
    SESSION_TIMEOUT=1800,
    MODEL_INSTANCE_TIMEOUT=1800,
    ALLOW_GPU=True,  # allow GPU use when one is available
    LOGLEVEL="INFO",
    MAX_TOKENS=4096,
    CHUNK_SIZE=3000,
)

# String form of every default, suitable for storing in os.environ
ENV_DEFAULTS = {
    name: _as_env_text(setting) for name, setting in DEFAULT_CONFIG.items()
}
|
main.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# main.py
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import streamlit as st
|
| 5 |
+
import logging
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
|
| 8 |
+
# Load environment variables first, before any other code
|
| 9 |
+
load_dotenv()
|
| 10 |
+
|
| 11 |
+
# Import configuration defaults (after loading .env to prioritize environment variables)
|
| 12 |
+
from config import ENV_DEFAULTS, DEFAULT_CONFIG
|
| 13 |
+
|
| 14 |
+
# Configure logging from the LOGLEVEL environment variable, falling back to
# the configured default when the variable is unset.
log_level = os.environ.get('LOGLEVEL', DEFAULT_CONFIG['LOGLEVEL']).upper()
if not isinstance(getattr(logging, log_level, None), int):
    # An unrecognised level name would make the getattr() below raise
    # AttributeError at startup; use the default level instead.
    log_level = DEFAULT_CONFIG['LOGLEVEL'].upper()
logging.basicConfig(
    level=getattr(logging, log_level),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Log to the console for every level except WARNING; when LOGLEVEL is
        # exactly WARNING the records go to a NullHandler and are discarded.
        logging.StreamHandler() if log_level != 'WARNING' else logging.NullHandler()
    ]
)

# Configure app
st.set_page_config(page_title="Translator & Readability", layout="wide")

# Fill in every environment variable that is missing with its default from
# config, so later code can read os.environ unconditionally.
for var, default in ENV_DEFAULTS.items():
    if var not in os.environ:
        logging.debug(f"Environment variable {var} not found, using default: {default}")
        os.environ[var] = default

# Model configuration from default config.
# NOTE(review): these values come from DEFAULT_CONFIG, not os.environ, so
# environment overrides do not reach MODEL_CONFIG - confirm that is intended.
MODEL_CONFIG = {
    "max_parallel_models": DEFAULT_CONFIG["MAX_PARALLEL_MODELS"],
    "session_timeout": DEFAULT_CONFIG["SESSION_TIMEOUT"],
    "allow_gpu": DEFAULT_CONFIG["ALLOW_GPU"]
}

# Initialize model semaphore for limiting concurrent model usage.
# NOTE(review): Streamlit re-executes this script on every rerun, so this
# object is presumably re-created each time - confirm whether it must be
# shared across sessions to actually enforce the limit.
import threading
model_semaphore = threading.Semaphore(MODEL_CONFIG["max_parallel_models"])
|
| 44 |
+
|
| 45 |
+
import tempfile
|
| 46 |
+
import io
|
| 47 |
+
from docx import Document
|
| 48 |
+
import uuid
|
| 49 |
+
import traceback
|
| 50 |
+
|
| 51 |
+
from models.nltk_resources import setup_nltk
|
| 52 |
+
from utils.file_readers import read_file
|
| 53 |
+
from utils.text_processing import detect_language
|
| 54 |
+
from utils.readability_indices import (
|
| 55 |
+
flesch_reading_ease,
|
| 56 |
+
flesch_kincaid_grade_level,
|
| 57 |
+
gunning_fog_index,
|
| 58 |
+
smog_index,
|
| 59 |
+
highlight_complex_text
|
| 60 |
+
)
|
| 61 |
+
from utils.formatting import color_code_index
|
| 62 |
+
from utils.tilmash_translation import tilmash_translate, display_tilmash_streaming_translation
|
| 63 |
+
|
| 64 |
+
# Seed per-session defaults the first time a session runs the script; later
# reruns keep whatever values are already stored.
if 'session_id' not in st.session_state:
    # Random identifier, shown in the sidebar for debugging.
    st.session_state['session_id'] = str(uuid.uuid4())
if 'translation_lock' not in st.session_state:
    # Flag used to refuse a second translation while one is still running.
    st.session_state['translation_lock'] = False
|
| 70 |
+
|
| 71 |
+
def handle_translation():
|
| 72 |
+
st.header("Перевод (Kazakh, Russian, English)")
|
| 73 |
+
|
| 74 |
+
# Show session ID in sidebar for debugging
|
| 75 |
+
with st.sidebar.expander("Session Info", expanded=False):
|
| 76 |
+
st.write(f"Session ID: {st.session_state.session_id}")
|
| 77 |
+
|
| 78 |
+
# Add GPU usage option
|
| 79 |
+
if MODEL_CONFIG["allow_gpu"]:
|
| 80 |
+
st.session_state.use_gpu = st.checkbox("Использовать GPU (быстрее)", value=True)
|
| 81 |
+
if st.session_state.use_gpu:
|
| 82 |
+
try:
|
| 83 |
+
import torch
|
| 84 |
+
if torch.cuda.is_available():
|
| 85 |
+
gpu_info = f"CUDA: {torch.cuda.get_device_name(0)}"
|
| 86 |
+
st.success(f"Доступен GPU: {gpu_info}")
|
| 87 |
+
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
| 88 |
+
st.success("Доступен Apple Silicon GPU (MPS)")
|
| 89 |
+
else:
|
| 90 |
+
st.warning("GPU не обнаружен, будет использован CPU")
|
| 91 |
+
st.session_state.use_gpu = False
|
| 92 |
+
except ImportError:
|
| 93 |
+
st.warning("PyTorch не установлен, будет использован CPU")
|
| 94 |
+
st.session_state.use_gpu = False
|
| 95 |
+
else:
|
| 96 |
+
st.session_state.use_gpu = False
|
| 97 |
+
st.write("GPU отключен в конфигурации")
|
| 98 |
+
|
| 99 |
+
translate_input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
|
| 100 |
+
input_text = ""
|
| 101 |
+
|
| 102 |
+
if translate_input_method == "Загрузить файл":
|
| 103 |
+
uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
|
| 104 |
+
if uploaded_file is not None:
|
| 105 |
+
suffix = os.path.splitext(uploaded_file.name)[1]
|
| 106 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
|
| 107 |
+
tmp_file.write(uploaded_file.getbuffer())
|
| 108 |
+
temp_file_path = tmp_file.name
|
| 109 |
+
input_text = read_file(temp_file_path)
|
| 110 |
+
os.remove(temp_file_path)
|
| 111 |
+
st.write("**Содержимое файла:**")
|
| 112 |
+
st.write(input_text)
|
| 113 |
+
else:
|
| 114 |
+
input_text = st.text_area("Вставьте ваш текст здесь", height=200)
|
| 115 |
+
|
| 116 |
+
if input_text:
|
| 117 |
+
auto_detect = st.checkbox("Автоматически определить язык", value=True)
|
| 118 |
+
src_lang = None
|
| 119 |
+
if auto_detect:
|
| 120 |
+
detected_lang = detect_language(input_text)
|
| 121 |
+
if detected_lang in ['ru','en','kk']:
|
| 122 |
+
st.info(f"Определён язык: {detected_lang}")
|
| 123 |
+
src_lang = detected_lang
|
| 124 |
+
else:
|
| 125 |
+
st.warning("Не удалось определить язык. Выберите вручную.")
|
| 126 |
+
src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])
|
| 127 |
+
else:
|
| 128 |
+
src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])
|
| 129 |
+
|
| 130 |
+
if src_lang == "ru":
|
| 131 |
+
tgt_options = ["en","kk"]
|
| 132 |
+
elif src_lang == "en":
|
| 133 |
+
tgt_options = ["ru","kk"]
|
| 134 |
+
else:
|
| 135 |
+
tgt_options = ["ru","en"]
|
| 136 |
+
|
| 137 |
+
tgt_lang = st.selectbox("Перевод на:", tgt_options)
|
| 138 |
+
|
| 139 |
+
if st.button("Перевести"):
|
| 140 |
+
# Prevent multiple concurrent translations from same session
|
| 141 |
+
if st.session_state.translation_lock:
|
| 142 |
+
st.warning("Перевод уже выполняется. Пожалуйста, дождитесь завершения.")
|
| 143 |
+
return
|
| 144 |
+
|
| 145 |
+
# Set translation lock
|
| 146 |
+
st.session_state.translation_lock = True
|
| 147 |
+
|
| 148 |
+
try:
|
| 149 |
+
# Use the model semaphore to limit concurrent model access
|
| 150 |
+
acquired = model_semaphore.acquire(blocking=False)
|
| 151 |
+
if not acquired:
|
| 152 |
+
st.warning("Максимальное количество параллельных моделей достигнуто. Пожалуйста, попробуйте позже.")
|
| 153 |
+
st.session_state.translation_lock = False
|
| 154 |
+
return
|
| 155 |
+
|
| 156 |
+
try:
|
| 157 |
+
st.subheader("Результат перевода:")
|
| 158 |
+
# Get the approximate size of the text to determine if chunking is needed
|
| 159 |
+
approx_text_size = len(input_text) / 4 # rough approximation (4 chars ≈ 1 token)
|
| 160 |
+
needs_chunking = approx_text_size > 500 # If text is likely over 500 tokens
|
| 161 |
+
|
| 162 |
+
# Display appropriate spinner message
|
| 163 |
+
spinner_message = "Processing text in chunks..." if needs_chunking else "Processing translation..."
|
| 164 |
+
|
| 165 |
+
# Create a dedicated translator instance for this session
|
| 166 |
+
from utils.tilmash_translation import TilmashTranslator
|
| 167 |
+
# Используем GPU если включено в настройках
|
| 168 |
+
use_gpu = getattr(st.session_state, 'use_gpu', False)
|
| 169 |
+
translator = TilmashTranslator(use_gpu=use_gpu)
|
| 170 |
+
|
| 171 |
+
with st.spinner(spinner_message):
|
| 172 |
+
try:
|
| 173 |
+
# Use direct streaming approach with session-specific translator
|
| 174 |
+
result = ""
|
| 175 |
+
translation_placeholder = st.empty()
|
| 176 |
+
|
| 177 |
+
# Stream translation
|
| 178 |
+
for chunk in translator.translate_streaming(input_text, src_lang, tgt_lang):
|
| 179 |
+
result += chunk
|
| 180 |
+
translation_placeholder.markdown(result)
|
| 181 |
+
|
| 182 |
+
except Exception as e:
|
| 183 |
+
st.error(f"Translation error: {str(e)}")
|
| 184 |
+
logging.error(f"Tilmash translation error: {traceback.format_exc()}")
|
| 185 |
+
result = None
|
| 186 |
+
|
| 187 |
+
if result:
|
| 188 |
+
# Prepare download capability
|
| 189 |
+
doc = Document()
|
| 190 |
+
doc.add_paragraph(result)
|
| 191 |
+
doc_io = io.BytesIO()
|
| 192 |
+
doc.save(doc_io)
|
| 193 |
+
doc_io.seek(0)
|
| 194 |
+
|
| 195 |
+
st.download_button(
|
| 196 |
+
label="Скачать переведённый текст (.docx)",
|
| 197 |
+
data=doc_io,
|
| 198 |
+
file_name="translated_text.docx",
|
| 199 |
+
mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
| 200 |
+
)
|
| 201 |
+
else:
|
| 202 |
+
st.warning("Не удалось выполнить перевод.")
|
| 203 |
+
|
| 204 |
+
# Unload Tilmash model after use
|
| 205 |
+
try:
|
| 206 |
+
if translator.initialized:
|
| 207 |
+
translator.unload_model()
|
| 208 |
+
except Exception as unload_error:
|
| 209 |
+
logging.error(f"Error unloading Tilmash model: {str(unload_error)}")
|
| 210 |
+
except Exception as tilmash_error:
|
| 211 |
+
st.error(f"Tilmash model error: {str(tilmash_error)}")
|
| 212 |
+
logging.error(f"Tilmash model error: {traceback.format_exc()}")
|
| 213 |
+
finally:
|
| 214 |
+
# Release the semaphore
|
| 215 |
+
model_semaphore.release()
|
| 216 |
+
except Exception as outer_error:
|
| 217 |
+
st.error(f"Unexpected error: {str(outer_error)}")
|
| 218 |
+
logging.error(f"Unexpected error: {traceback.format_exc()}")
|
| 219 |
+
finally:
|
| 220 |
+
# Release translation lock
|
| 221 |
+
st.session_state.translation_lock = False
|
| 222 |
+
|
| 223 |
+
def handle_readability_analysis():
    """Render the readability-analysis page and run the analysis on demand.

    The text comes either from an uploaded .txt/.docx/.pdf file (written to a
    temporary file and passed to ``read_file``) or from a text area.  Once
    text is available the user picks or auto-detects its language, and the
    "analyze" button computes four readability indices plus the list of
    complex words.

    Two guards wrap the computation:

    * ``st.session_state.analysis_lock`` rejects a second analysis started
      from the same session while one is still running;
    * ``model_semaphore`` is acquired without blocking, so the request is
      refused with a warning instead of waiting when no slot is free.

    Both are released in ``finally`` clauses, so an exception raised by any
    metric function cannot leave them held.
    """
    st.header("Анализ удобочитаемости текста")
    input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
    text = ""

    if input_method == "Загрузить файл":
        uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
        if uploaded_file is not None:
            # Keep the original extension on the temp file; read_file receives
            # only the path (presumably it dispatches on the suffix -- confirm
            # against its definition).
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getbuffer())
                temp_file_path = tmp_file.name
            # The file is read after the ``with`` block so the data is flushed
            # and the handle closed.  It was created with delete=False, so it
            # must be removed explicitly -- even when read_file raises.
            try:
                text = read_file(temp_file_path)
            finally:
                os.remove(temp_file_path)
            st.write("**Содержимое файла:**")
            st.write(text)
    else:
        text = st.text_area("Вставьте ваш текст здесь", height=200)

    if not text:
        return

    auto_detect = st.checkbox("Определить язык автоматически", value=True)
    if auto_detect:
        detected_lang = detect_language(text)
        st.info(f"Определён язык: {detected_lang}")
        # Anything outside the three supported languages falls back to English.
        lang_code = detected_lang if detected_lang in ['ru', 'en', 'kk'] else 'en'
    else:
        lang_code = st.selectbox("Язык текста", ["ru", "en", "kk"])

    if not st.button("Анализировать"):
        return

    # Prevent multiple concurrent analyses from the same session.
    if 'analysis_lock' in st.session_state and st.session_state.analysis_lock:
        st.warning("Анализ уже выполняется. Пожалуйста, дождитесь завершения.")
        return

    st.session_state.analysis_lock = True
    try:
        # Same semaphore as the translation page; non-blocking so a busy
        # system answers immediately instead of hanging the session.
        if not model_semaphore.acquire(blocking=False):
            st.warning("Система загружена. Пожалуйста, попробуйте позже.")
            return

        try:
            with st.spinner("Выполняется анализ..."):
                fre = flesch_reading_ease(text, lang_code)
                fkgl = flesch_kincaid_grade_level(text, lang_code)
                fog = gunning_fog_index(text, lang_code)
                smog = smog_index(text, lang_code)
                # Only the word list is displayed; the highlighted text that
                # highlight_complex_text also returns is not used here.
                _, complex_words_list = highlight_complex_text(text, lang_code)

            st.subheader("Результаты удобочитаемости")
            # (displayed title, index name understood by color_code_index, value)
            metrics = (
                ("Индекс удобочитаемости Флеша", "Flesch Reading Ease", fre),
                ("Индекс Флеша-Кинкейда", "Flesch-Kincaid Grade Level", fkgl),
                ("Индекс тумана Ганнинга", "Gunning Fog Index", fog),
                ("Индекс SMOG", "SMOG Index", smog),
            )
            for title, index_name, value in metrics:
                st.markdown(
                    f"**{title}:** {color_code_index(index_name, value)}",
                    unsafe_allow_html=True
                )

            st.subheader("Сложные слова")
            # De-duplicate, then sort so the list keeps a stable order across
            # reruns (bare set iteration order is not stable for strings).
            st.write(", ".join(sorted(set(complex_words_list))))
        finally:
            # Release the semaphore
            model_semaphore.release()
    finally:
        # Release analysis lock (also covers the early return above).
        st.session_state.analysis_lock = False
|
| 302 |
+
|
| 303 |
+
def _log_gpu_status():
    """Log which accelerator PyTorch reports: CUDA, Apple MPS, or none.

    Purely informational: nothing is returned and no device is selected here.
    A missing PyTorch installation or any error raised while probing is
    logged as a warning and otherwise ignored.
    """
    try:
        import torch

        if torch.cuda.is_available():
            gpu_name = torch.cuda.get_device_name(0)
            cuda_ver = torch.version.cuda if hasattr(torch.version, "cuda") else "N/A"
            logging.info("Обнаружен GPU: %s, CUDA %s", gpu_name, cuda_ver)
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            logging.info("Обнаружен Apple Silicon GPU (MPS)")
        else:
            logging.warning("GPU не обнаружен. Устанавливаем устройство на CPU")
            # CUDA is known to be unavailable on this branch, so the
            # diagnostics are emitted without re-checking.
            logging.warning("Диагностика CUDA:")
            logging.warning("torch.__version__: %s", torch.__version__)
            if hasattr(torch.version, "cuda"):
                logging.warning("torch.version.cuda: %s", torch.version.cuda)
            logging.warning("torch.cuda.is_available(): %s", torch.cuda.is_available())
    except ImportError:
        logging.warning("PyTorch не установлен, будет использован CPU")
    except Exception as e:
        logging.warning("Ошибка при проверке GPU: %s", e)


def main():
    """Application entry point: prepare resources, then render the chosen page.

    Calls ``setup_nltk()``, logs ``MODEL_CONFIG`` once per Streamlit session,
    logs the GPU status, and dispatches to the translation or readability
    handler according to the sidebar radio selection.
    """
    setup_nltk()

    # Log the model configuration only once per session.
    if 'model_config_logged' not in st.session_state:
        logging.info("Using model configuration: %s", MODEL_CONFIG)
        st.session_state.model_config_logged = True

    # Check GPU availability at startup.
    _log_gpu_status()

    st.title("Translation & Readability Analysis")
    st.sidebar.header("Функциональность")
    functionality = st.sidebar.radio("Выберите режим:", ["Перевод", "Анализ удобочитаемости"])

    if functionality == "Перевод":
        handle_translation()
    elif functionality == "Анализ удобочитаемости":
        handle_readability_analysis()


if __name__ == "__main__":
    main()
|
models/nltk_resources.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# models/nltk_resources.py
|
| 2 |
+
|
| 3 |
+
import nltk
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
def setup_nltk():
    """Register the local ``nltk_data`` directory and ensure punkt_tab exists.

    The directory name is relative, so it is resolved against the process's
    current working directory.  It is put at the front of ``nltk.data.path``
    (once) so it is searched before other entries.  If the ``punkt_tab``
    tokenizer data cannot be found, it is downloaded into that directory; a
    failed download is logged as a warning rather than raised.
    """
    nltk_data_dir = 'nltk_data'

    # Add the nltk_data directory to the NLTK data path
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.insert(0, nltk_data_dir)

    # Define the required package
    required_package = 'punkt_tab'

    # Check if the package is installed locally; the lookup path is derived
    # from the package name so the two cannot drift apart.
    try:
        nltk.data.find(f'tokenizers/{required_package}')
    except LookupError:
        logging.info(f"Downloading NLTK package: {required_package}")
        # nltk.download reports failure through its boolean return value
        # instead of raising, so check it explicitly.
        downloaded = nltk.download(required_package, download_dir=nltk_data_dir, quiet=True)
        if not downloaded:
            logging.warning(f"Failed to download NLTK package: {required_package}")
|
nltk_data/tokenizers/punkt_tab.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e57f64187974277726a3417ca6f181ec5403676c717672eef6a748a7b20e0106
|
| 3 |
+
size 4319076
|
nltk_data/tokenizers/punkt_tab/README
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pretrained Punkt Models -- Jan Strunk (New version trained after issues 313 and 514 had been corrected)
|
| 2 |
+
|
| 3 |
+
Most models were prepared using the test corpora from Kiss and Strunk (2006). Additional models have
|
| 4 |
+
been contributed by various people using NLTK for sentence boundary detection.
|
| 5 |
+
|
| 6 |
+
For information about how to use these models, please confer the tokenization HOWTO:
|
| 7 |
+
http://nltk.googlecode.com/svn/trunk/doc/howto/tokenize.html
|
| 8 |
+
and chapter 3.8 of the NLTK book:
|
| 9 |
+
http://nltk.googlecode.com/svn/trunk/doc/book/ch03.html#sec-segmentation
|
| 10 |
+
|
| 11 |
+
There are pretrained tokenizers for the following languages:
|
| 12 |
+
|
| 13 |
+
File Language Source Contents Size of training corpus(in tokens) Model contributed by
|
| 14 |
+
=======================================================================================================================================================================
|
| 15 |
+
czech.pickle Czech Multilingual Corpus 1 (ECI) Lidove Noviny ~345,000 Jan Strunk / Tibor Kiss
|
| 16 |
+
Literarni Noviny
|
| 17 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 18 |
+
danish.pickle Danish Avisdata CD-Rom Ver. 1.1. 1995 Berlingske Tidende ~550,000 Jan Strunk / Tibor Kiss
|
| 19 |
+
(Berlingske Avisdata, Copenhagen) Weekend Avisen
|
| 20 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 21 |
+
dutch.pickle Dutch Multilingual Corpus 1 (ECI) De Limburger ~340,000 Jan Strunk / Tibor Kiss
|
| 22 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 23 |
+
english.pickle English Penn Treebank (LDC) Wall Street Journal ~469,000 Jan Strunk / Tibor Kiss
|
| 24 |
+
(American)
|
| 25 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 26 |
+
estonian.pickle Estonian University of Tartu, Estonia Eesti Ekspress ~359,000 Jan Strunk / Tibor Kiss
|
| 27 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 28 |
+
finnish.pickle Finnish Finnish Parole Corpus, Finnish Books and major national ~364,000 Jan Strunk / Tibor Kiss
|
| 29 |
+
Text Bank (Suomen Kielen newspapers
|
| 30 |
+
Tekstipankki)
|
| 31 |
+
Finnish Center for IT Science
|
| 32 |
+
(CSC)
|
| 33 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 34 |
+
french.pickle French Multilingual Corpus 1 (ECI) Le Monde ~370,000 Jan Strunk / Tibor Kiss
|
| 35 |
+
(European)
|
| 36 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 37 |
+
german.pickle German Neue Zürcher Zeitung AG Neue Zürcher Zeitung ~847,000 Jan Strunk / Tibor Kiss
|
| 38 |
+
(Switzerland) CD-ROM
|
| 39 |
+
(Uses "ss"
|
| 40 |
+
instead of "ß")
|
| 41 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 42 |
+
greek.pickle Greek Efstathios Stamatatos To Vima (TO BHMA) ~227,000 Jan Strunk / Tibor Kiss
|
| 43 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 44 |
+
italian.pickle Italian Multilingual Corpus 1 (ECI) La Stampa, Il Mattino ~312,000 Jan Strunk / Tibor Kiss
|
| 45 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 46 |
+
norwegian.pickle Norwegian Centre for Humanities Bergens Tidende ~479,000 Jan Strunk / Tibor Kiss
|
| 47 |
+
(Bokmål and Information Technologies,
|
| 48 |
+
Nynorsk) Bergen
|
| 49 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 50 |
+
polish.pickle Polish Polish National Corpus Literature, newspapers, etc. ~1,000,000 Krzysztof Langner
|
| 51 |
+
(http://www.nkjp.pl/)
|
| 52 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 53 |
+
portuguese.pickle Portuguese CETENFolha Corpus Folha de São Paulo ~321,000 Jan Strunk / Tibor Kiss
|
| 54 |
+
(Brazilian) (Linguateca)
|
| 55 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 56 |
+
slovene.pickle Slovene TRACTOR Delo ~354,000 Jan Strunk / Tibor Kiss
|
| 57 |
+
Slovene Academy for Arts
|
| 58 |
+
and Sciences
|
| 59 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 60 |
+
spanish.pickle Spanish Multilingual Corpus 1 (ECI) Sur ~353,000 Jan Strunk / Tibor Kiss
|
| 61 |
+
(European)
|
| 62 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 63 |
+
swedish.pickle Swedish Multilingual Corpus 1 (ECI) Dagens Nyheter ~339,000 Jan Strunk / Tibor Kiss
|
| 64 |
+
(and some other texts)
|
| 65 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 66 |
+
turkish.pickle Turkish METU Turkish Corpus Milliyet ~333,000 Jan Strunk / Tibor Kiss
|
| 67 |
+
(Türkçe Derlem Projesi)
|
| 68 |
+
University of Ankara
|
| 69 |
+
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
The corpora contained about 400,000 tokens on average and mostly consisted of newspaper text converted to
|
| 72 |
+
Unicode using the codecs module.
|
| 73 |
+
|
| 74 |
+
Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection.
|
| 75 |
+
Computational Linguistics 32: 485-525.
|
| 76 |
+
|
| 77 |
+
---- Training Code ----
|
| 78 |
+
|
| 79 |
+
# import punkt
|
| 80 |
+
import nltk.tokenize.punkt
|
| 81 |
+
|
| 82 |
+
# Make a new Tokenizer
|
| 83 |
+
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
|
| 84 |
+
|
| 85 |
+
# Read in training corpus (one example: Slovene)
|
| 86 |
+
import codecs
|
| 87 |
+
text = codecs.open("slovene.plain","Ur","iso-8859-2").read()
|
| 88 |
+
|
| 89 |
+
# Train tokenizer
|
| 90 |
+
tokenizer.train(text)
|
| 91 |
+
|
| 92 |
+
# Dump pickled tokenizer
|
| 93 |
+
import pickle
|
| 94 |
+
out = open("slovene.pickle","wb")
|
| 95 |
+
pickle.dump(tokenizer, out)
|
| 96 |
+
out.close()
|
| 97 |
+
|
| 98 |
+
---------
|
nltk_data/tokenizers/punkt_tab/czech/abbrev_types.txt
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
množ
|
| 3 |
+
např
|
| 4 |
+
j.h
|
| 5 |
+
man
|
| 6 |
+
ú
|
| 7 |
+
jug
|
| 8 |
+
dr
|
| 9 |
+
bl
|
| 10 |
+
ml
|
| 11 |
+
okr
|
| 12 |
+
st
|
| 13 |
+
uh
|
| 14 |
+
šp
|
| 15 |
+
judr
|
| 16 |
+
u.s.a
|
| 17 |
+
p
|
| 18 |
+
arg
|
| 19 |
+
žitě
|
| 20 |
+
st.celsia
|
| 21 |
+
etc
|
| 22 |
+
p.s
|
| 23 |
+
t.r
|
| 24 |
+
lok
|
| 25 |
+
mil
|
| 26 |
+
ict
|
| 27 |
+
n
|
| 28 |
+
tl
|
| 29 |
+
min
|
| 30 |
+
č
|
| 31 |
+
d
|
| 32 |
+
al
|
| 33 |
+
ravenně
|
| 34 |
+
mj
|
| 35 |
+
nar
|
| 36 |
+
plk
|
| 37 |
+
s.p
|
| 38 |
+
a.g
|
| 39 |
+
roč
|
| 40 |
+
b
|
| 41 |
+
zdi
|
| 42 |
+
r.s.c
|
| 43 |
+
přek
|
| 44 |
+
m
|
| 45 |
+
gen
|
| 46 |
+
csc
|
| 47 |
+
mudr
|
| 48 |
+
vic
|
| 49 |
+
š
|
| 50 |
+
sb
|
| 51 |
+
resp
|
| 52 |
+
tzn
|
| 53 |
+
iv
|
| 54 |
+
s.r.o
|
| 55 |
+
mar
|
| 56 |
+
w
|
| 57 |
+
čs
|
| 58 |
+
vi
|
| 59 |
+
tzv
|
| 60 |
+
ul
|
| 61 |
+
pen
|
| 62 |
+
zv
|
| 63 |
+
str
|
| 64 |
+
čp
|
| 65 |
+
org
|
| 66 |
+
rak
|
| 67 |
+
sv
|
| 68 |
+
pplk
|
| 69 |
+
u.s
|
| 70 |
+
prof
|
| 71 |
+
c.k
|
| 72 |
+
op
|
| 73 |
+
g
|
| 74 |
+
vii
|
| 75 |
+
kr
|
| 76 |
+
ing
|
| 77 |
+
j.o
|
| 78 |
+
drsc
|
| 79 |
+
m3
|
| 80 |
+
l
|
| 81 |
+
tr
|
| 82 |
+
ceo
|
| 83 |
+
ch
|
| 84 |
+
fuk
|
| 85 |
+
vl
|
| 86 |
+
viii
|
| 87 |
+
líp
|
| 88 |
+
hl.m
|
| 89 |
+
t.zv
|
| 90 |
+
phdr
|
| 91 |
+
o.k
|
| 92 |
+
tis
|
| 93 |
+
doc
|
| 94 |
+
kl
|
| 95 |
+
ard
|
| 96 |
+
čkd
|
| 97 |
+
pok
|
| 98 |
+
apod
|
| 99 |
+
r
|
| 100 |
+
př
|
| 101 |
+
a.s
|
| 102 |
+
j
|
| 103 |
+
jr
|
| 104 |
+
i.m
|
| 105 |
+
e
|
| 106 |
+
kupř
|
| 107 |
+
f
|
| 108 |
+
tř
|
| 109 |
+
xvi
|
| 110 |
+
mir
|
| 111 |
+
atď
|
| 112 |
+
vr
|
| 113 |
+
r.i.v
|
| 114 |
+
hl
|
| 115 |
+
kv
|
| 116 |
+
t.j
|
| 117 |
+
y
|
| 118 |
+
q.p.r
|
nltk_data/tokenizers/punkt_tab/czech/collocations.tab
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
i dejmala
|
| 2 |
+
##number## prosince
|
| 3 |
+
h steina
|
| 4 |
+
##number## listopadu
|
| 5 |
+
a dvořák
|
| 6 |
+
v klaus
|
| 7 |
+
i čnhl
|
| 8 |
+
##number## wladyslawowo
|
| 9 |
+
##number## letech
|
| 10 |
+
a jiráska
|
| 11 |
+
a dubček
|
| 12 |
+
##number## štrasburk
|
| 13 |
+
##number## juniorské
|
| 14 |
+
##number## století
|
| 15 |
+
##number## kola
|
| 16 |
+
##number## pád
|
| 17 |
+
##number## května
|
| 18 |
+
##number## týdne
|
| 19 |
+
v dlouhý
|
| 20 |
+
k design
|
| 21 |
+
##number## červenec
|
| 22 |
+
i ligy
|
| 23 |
+
##number## kolo
|
| 24 |
+
z svěrák
|
| 25 |
+
##number## mája
|
| 26 |
+
##number## šimková
|
| 27 |
+
a bělého
|
| 28 |
+
a bradáč
|
| 29 |
+
##number## ročníku
|
| 30 |
+
##number## dubna
|
| 31 |
+
a vivaldiho
|
| 32 |
+
v mečiara
|
| 33 |
+
c carrićre
|
| 34 |
+
##number## sjezd
|
| 35 |
+
##number## výroční
|
| 36 |
+
##number## kole
|
| 37 |
+
##number## narozenin
|
| 38 |
+
k maleevová
|
| 39 |
+
i čnfl
|
| 40 |
+
##number## pádě
|
| 41 |
+
##number## září
|
| 42 |
+
##number## výročí
|
| 43 |
+
a dvořáka
|
| 44 |
+
h g.
|
| 45 |
+
##number## ledna
|
| 46 |
+
a dvorský
|
| 47 |
+
h měsíc
|
| 48 |
+
##number## srpna
|
| 49 |
+
##number## tř.
|
| 50 |
+
a mozarta
|
| 51 |
+
##number## sudetoněmeckých
|
| 52 |
+
o sokolov
|
| 53 |
+
k škrach
|
| 54 |
+
v benda
|
| 55 |
+
##number## symfonie
|
| 56 |
+
##number## července
|
| 57 |
+
x šalda
|
| 58 |
+
c abrahama
|
| 59 |
+
a tichý
|
| 60 |
+
##number## místo
|
| 61 |
+
k bielecki
|
| 62 |
+
v havel
|
| 63 |
+
##number## etapu
|
| 64 |
+
a dubčeka
|
| 65 |
+
i liga
|
| 66 |
+
##number## světový
|
| 67 |
+
v klausem
|
| 68 |
+
##number## ženy
|
| 69 |
+
##number## létech
|
| 70 |
+
##number## minutě
|
| 71 |
+
##number## listopadem
|
| 72 |
+
##number## místě
|
| 73 |
+
o vlček
|
| 74 |
+
k peteraje
|
| 75 |
+
i sponzor
|
| 76 |
+
##number## června
|
| 77 |
+
##number## min.
|
| 78 |
+
##number## oprávněnou
|
| 79 |
+
##number## květnu
|
| 80 |
+
##number## aktu
|
| 81 |
+
##number## květnem
|
| 82 |
+
##number## října
|
| 83 |
+
i rynda
|
| 84 |
+
##number## února
|
| 85 |
+
i snfl
|
| 86 |
+
a mozart
|
| 87 |
+
z košler
|
| 88 |
+
a dvorskému
|
| 89 |
+
v marhoul
|
| 90 |
+
v mečiar
|
| 91 |
+
##number## ročník
|
| 92 |
+
##number## máje
|
| 93 |
+
v havla
|
| 94 |
+
k gott
|
| 95 |
+
s bacha
|
| 96 |
+
##number## ad
|
nltk_data/tokenizers/punkt_tab/czech/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/czech/sent_starters.txt
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
já
|
| 2 |
+
milena
|
| 3 |
+
tomáš
|
| 4 |
+
oznámila
|
| 5 |
+
podle
|
| 6 |
+
my
|
| 7 |
+
vyplývá
|
| 8 |
+
hlavní
|
| 9 |
+
jelikož
|
| 10 |
+
musíme
|
| 11 |
+
kdyby
|
| 12 |
+
foto
|
| 13 |
+
rozptylové
|
| 14 |
+
snad
|
| 15 |
+
zároveň
|
| 16 |
+
jaroslav
|
| 17 |
+
po
|
| 18 |
+
v
|
| 19 |
+
kromě
|
| 20 |
+
pokud
|
| 21 |
+
toto
|
| 22 |
+
jenže
|
| 23 |
+
oba
|
| 24 |
+
jak
|
| 25 |
+
zatímco
|
| 26 |
+
ten
|
| 27 |
+
myslím
|
| 28 |
+
navíc
|
| 29 |
+
dušan
|
| 30 |
+
zdá
|
| 31 |
+
dnes
|
| 32 |
+
přesto
|
| 33 |
+
tato
|
| 34 |
+
ti
|
| 35 |
+
bratislava
|
| 36 |
+
ale
|
| 37 |
+
když
|
| 38 |
+
nicméně
|
| 39 |
+
tento
|
| 40 |
+
mirka
|
| 41 |
+
přitom
|
| 42 |
+
dokud
|
| 43 |
+
jan
|
| 44 |
+
bohužel
|
| 45 |
+
ta
|
| 46 |
+
díky
|
| 47 |
+
prohlásil
|
| 48 |
+
praha
|
| 49 |
+
jestliže
|
| 50 |
+
jde
|
| 51 |
+
vždyť
|
| 52 |
+
moskva
|
| 53 |
+
proto
|
| 54 |
+
to
|
nltk_data/tokenizers/punkt_tab/danish/abbrev_types.txt
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
tlf
|
| 3 |
+
b.p
|
| 4 |
+
evt
|
| 5 |
+
j.h
|
| 6 |
+
lenz
|
| 7 |
+
mht
|
| 8 |
+
gl
|
| 9 |
+
bl
|
| 10 |
+
stud.polit
|
| 11 |
+
e.j
|
| 12 |
+
st
|
| 13 |
+
o
|
| 14 |
+
dec
|
| 15 |
+
mag
|
| 16 |
+
h.b
|
| 17 |
+
p
|
| 18 |
+
adm
|
| 19 |
+
el.lign
|
| 20 |
+
e.s
|
| 21 |
+
saalba
|
| 22 |
+
styrt
|
| 23 |
+
nr
|
| 24 |
+
m.a.s.h
|
| 25 |
+
etc
|
| 26 |
+
pharm
|
| 27 |
+
hg
|
| 28 |
+
j.j
|
| 29 |
+
dj
|
| 30 |
+
mountainb
|
| 31 |
+
f.kr
|
| 32 |
+
h.r
|
| 33 |
+
cand.jur
|
| 34 |
+
sp
|
| 35 |
+
osv
|
| 36 |
+
s.g
|
| 37 |
+
ndr
|
| 38 |
+
inc
|
| 39 |
+
b.i.g
|
| 40 |
+
dk-sver
|
| 41 |
+
sl
|
| 42 |
+
v.s.o.d
|
| 43 |
+
cand.mag
|
| 44 |
+
d.v.s
|
| 45 |
+
v.i
|
| 46 |
+
bøddel
|
| 47 |
+
fr
|
| 48 |
+
ø«
|
| 49 |
+
dr.phil
|
| 50 |
+
chr
|
| 51 |
+
p.d
|
| 52 |
+
bj
|
| 53 |
+
fhv
|
| 54 |
+
tilskudsforhold
|
| 55 |
+
m.a
|
| 56 |
+
sek
|
| 57 |
+
p.g.a
|
| 58 |
+
int
|
| 59 |
+
pokalf
|
| 60 |
+
ik
|
| 61 |
+
dir
|
| 62 |
+
em-lodtrækn
|
| 63 |
+
a.h
|
| 64 |
+
o.lign
|
| 65 |
+
p.t
|
| 66 |
+
m.v
|
| 67 |
+
n.j
|
| 68 |
+
m.h.t
|
| 69 |
+
m.m
|
| 70 |
+
a.p
|
| 71 |
+
pers
|
| 72 |
+
4-bakketurn
|
| 73 |
+
dr.med
|
| 74 |
+
w.ø
|
| 75 |
+
polit
|
| 76 |
+
fremsættes
|
| 77 |
+
techn
|
| 78 |
+
tidl
|
| 79 |
+
o.g
|
| 80 |
+
i.c.i
|
| 81 |
+
mill
|
| 82 |
+
skt
|
| 83 |
+
m.fl
|
| 84 |
+
cand.merc
|
| 85 |
+
kbh
|
| 86 |
+
indiv
|
| 87 |
+
stk
|
| 88 |
+
dk-maked
|
| 89 |
+
memorandum
|
| 90 |
+
mestersk
|
| 91 |
+
mag.art
|
| 92 |
+
kitzb
|
| 93 |
+
h
|
| 94 |
+
lic
|
| 95 |
+
fig
|
| 96 |
+
dressurst
|
| 97 |
+
sportsg
|
| 98 |
+
r.e.m
|
| 99 |
+
d.u.m
|
| 100 |
+
sct
|
| 101 |
+
kld
|
| 102 |
+
bl.a
|
| 103 |
+
hf
|
| 104 |
+
g.a
|
| 105 |
+
corp
|
| 106 |
+
w
|
| 107 |
+
konk
|
| 108 |
+
zoeterm
|
| 109 |
+
b.t
|
| 110 |
+
a.d
|
| 111 |
+
l.b
|
| 112 |
+
jf
|
| 113 |
+
s.b
|
| 114 |
+
kgl
|
| 115 |
+
ill
|
| 116 |
+
beck
|
| 117 |
+
tosset
|
| 118 |
+
afd
|
| 119 |
+
johs
|
| 120 |
+
pct
|
| 121 |
+
k.b
|
| 122 |
+
sv
|
| 123 |
+
verbalt
|
| 124 |
+
kgs
|
| 125 |
+
l.m.k
|
| 126 |
+
j.l
|
| 127 |
+
aus
|
| 128 |
+
superl
|
| 129 |
+
t.v
|
| 130 |
+
mia
|
| 131 |
+
kr
|
| 132 |
+
pr
|
| 133 |
+
præmien
|
| 134 |
+
j.b.s
|
| 135 |
+
j.o
|
| 136 |
+
o.s.v
|
| 137 |
+
edb-oplysninger
|
| 138 |
+
o.m.a
|
| 139 |
+
ca
|
| 140 |
+
1b
|
| 141 |
+
f.eks
|
| 142 |
+
rens
|
| 143 |
+
ch
|
| 144 |
+
mr
|
| 145 |
+
schw
|
| 146 |
+
d.c
|
| 147 |
+
utraditionelt
|
| 148 |
+
idrætsgym
|
| 149 |
+
hhv
|
| 150 |
+
e.l
|
| 151 |
+
s.s
|
| 152 |
+
eks
|
| 153 |
+
f.o.m
|
| 154 |
+
dk-storbrit
|
| 155 |
+
dk-jugo
|
| 156 |
+
n.z
|
| 157 |
+
derivater
|
| 158 |
+
c
|
| 159 |
+
pt
|
| 160 |
+
vm-kval
|
| 161 |
+
kl
|
| 162 |
+
hr
|
| 163 |
+
cand
|
| 164 |
+
jur
|
| 165 |
+
sav
|
| 166 |
+
h.c
|
| 167 |
+
arab.-danm
|
| 168 |
+
d.a.d
|
| 169 |
+
fl
|
| 170 |
+
o.a
|
| 171 |
+
a.s
|
| 172 |
+
cand.polit
|
| 173 |
+
grundejerform
|
| 174 |
+
j
|
| 175 |
+
faglærte
|
| 176 |
+
cr
|
| 177 |
+
a.a
|
| 178 |
+
mou
|
| 179 |
+
f.r.i
|
| 180 |
+
årh
|
| 181 |
+
o.m.m
|
| 182 |
+
sve
|
| 183 |
+
c.a
|
| 184 |
+
engl
|
| 185 |
+
sikkerhedssystemerne
|
| 186 |
+
m.f
|
| 187 |
+
j.k
|
| 188 |
+
phil
|
| 189 |
+
f
|
| 190 |
+
vet
|
| 191 |
+
mio
|
| 192 |
+
k.e
|
| 193 |
+
m.k
|
| 194 |
+
atla
|
| 195 |
+
idrætsg
|
| 196 |
+
n.n
|
| 197 |
+
4-bakketur
|
| 198 |
+
dvs
|
| 199 |
+
sdr
|
| 200 |
+
s.j
|
| 201 |
+
hol
|
| 202 |
+
s.h
|
| 203 |
+
pei
|
| 204 |
+
kbhvn
|
| 205 |
+
aa
|
| 206 |
+
m.g.i
|
| 207 |
+
fvt
|
| 208 |
+
i«
|
| 209 |
+
b.c
|
| 210 |
+
th
|
| 211 |
+
lrs
|
nltk_data/tokenizers/punkt_tab/danish/collocations.tab
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## skak
|
| 2 |
+
##number## speedway
|
| 3 |
+
##number## rally
|
| 4 |
+
##number## april
|
| 5 |
+
##number## dm-fin
|
| 6 |
+
##number## viceformand
|
| 7 |
+
m jensen
|
| 8 |
+
##number## kano/kajak
|
| 9 |
+
##number## bowling
|
| 10 |
+
##number## dm-finale
|
| 11 |
+
##number## årh.
|
| 12 |
+
##number## januar
|
| 13 |
+
##number## august
|
| 14 |
+
##number## marathon
|
| 15 |
+
##number## kamp
|
| 16 |
+
##number## skihop
|
| 17 |
+
##number## etage
|
| 18 |
+
##number## tennis
|
| 19 |
+
##number## cykling
|
| 20 |
+
e andersen
|
| 21 |
+
##number## december
|
| 22 |
+
g h.
|
| 23 |
+
##number## neb
|
| 24 |
+
##number## sektion
|
| 25 |
+
##number## afd.
|
| 26 |
+
##number## klasse
|
| 27 |
+
##number## trampolin
|
| 28 |
+
##number## bordtennis
|
| 29 |
+
##number## formel
|
| 30 |
+
##number## århundredes
|
| 31 |
+
##number## dm-semifin
|
| 32 |
+
##number## heks
|
| 33 |
+
##number## taekwondo
|
| 34 |
+
##number## galop
|
| 35 |
+
##number## basketball
|
| 36 |
+
##number## dm
|
| 37 |
+
m skræl
|
| 38 |
+
##number## trav
|
| 39 |
+
##number## provins
|
| 40 |
+
##number## triathlon
|
| 41 |
+
k axel
|
| 42 |
+
##number## rugby
|
| 43 |
+
s h.
|
| 44 |
+
##number## klaverkoncert
|
| 45 |
+
a p.
|
| 46 |
+
e løgstrup
|
| 47 |
+
k telefax
|
| 48 |
+
##number## gyldendal
|
| 49 |
+
##number## fodbold
|
| 50 |
+
e rosenfeldt
|
| 51 |
+
##number## oktober
|
| 52 |
+
k o.
|
| 53 |
+
##number## september
|
| 54 |
+
##number## dec.
|
| 55 |
+
##number## juledag
|
| 56 |
+
##number## badminton
|
| 57 |
+
##number## sejlsport
|
| 58 |
+
##number## håndbold
|
| 59 |
+
r førsund
|
| 60 |
+
e jørgensen
|
| 61 |
+
d ##number##
|
| 62 |
+
k e
|
| 63 |
+
##number## alp.ski
|
| 64 |
+
##number## judo
|
| 65 |
+
##number## roning
|
| 66 |
+
##number## november
|
| 67 |
+
##number## atletik
|
| 68 |
+
##number## århundrede
|
| 69 |
+
##number## ridning
|
| 70 |
+
##number## marts
|
| 71 |
+
m andersen
|
| 72 |
+
d roosevelt
|
| 73 |
+
##number## brydning
|
| 74 |
+
s kr.
|
| 75 |
+
##number## runde
|
| 76 |
+
##number## division
|
| 77 |
+
##number## sal
|
| 78 |
+
##number## boksning
|
| 79 |
+
##number## minut
|
| 80 |
+
##number## golf
|
| 81 |
+
##number## juni
|
| 82 |
+
##number## symfoni
|
| 83 |
+
##number## hurtigløb
|
| 84 |
+
k jørgensen
|
| 85 |
+
##number## jörgen
|
| 86 |
+
##number## klasses
|
| 87 |
+
e jacobsen
|
| 88 |
+
k jensen
|
| 89 |
+
##number## februar
|
| 90 |
+
k nielsen
|
| 91 |
+
##number## volleyball
|
| 92 |
+
##number## maj
|
| 93 |
+
##number## verdenskrig
|
| 94 |
+
##number## juli
|
| 95 |
+
##number## ishockey
|
| 96 |
+
##number## kunstskøjteløb
|
| 97 |
+
b jørgensen
|
| 98 |
+
##number## gymnastik
|
| 99 |
+
##number## svømning
|
| 100 |
+
##number## tw
|
| 101 |
+
i pedersens
|
nltk_data/tokenizers/punkt_tab/danish/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/danish/sent_starters.txt
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
kronik
|
| 2 |
+
alligevel
|
| 3 |
+
de
|
| 4 |
+
først
|
| 5 |
+
derfor
|
| 6 |
+
vi
|
| 7 |
+
selv
|
| 8 |
+
hertil
|
| 9 |
+
sådan
|
| 10 |
+
dette
|
| 11 |
+
sport
|
| 12 |
+
man
|
| 13 |
+
foto
|
| 14 |
+
begge
|
| 15 |
+
tag
|
| 16 |
+
dertil
|
| 17 |
+
reuter
|
| 18 |
+
efter
|
| 19 |
+
endelig
|
| 20 |
+
ifølge
|
| 21 |
+
lad
|
| 22 |
+
når
|
| 23 |
+
det
|
| 24 |
+
desuden
|
| 25 |
+
nu
|
| 26 |
+
reuters
|
| 27 |
+
årsagen
|
| 28 |
+
tænk
|
| 29 |
+
samtidig
|
| 30 |
+
udover
|
| 31 |
+
men
|
| 32 |
+
endvidere
|
| 33 |
+
rør
|
| 34 |
+
rb
|
| 35 |
+
udstillingen
|
| 36 |
+
faktabox
|
| 37 |
+
reception
|
| 38 |
+
blandt
|
| 39 |
+
hvad
|
| 40 |
+
skær
|
| 41 |
+
lilot
|
| 42 |
+
derudover
|
| 43 |
+
da
|
| 44 |
+
tilsæt
|
| 45 |
+
denne
|
| 46 |
+
afp
|
| 47 |
+
her
|
| 48 |
+
hvis
|
| 49 |
+
hæld
|
| 50 |
+
problemet
|
| 51 |
+
dermed
|
| 52 |
+
jeg
|
| 53 |
+
grafik
|
| 54 |
+
anmeldelse
|
| 55 |
+
den
|
| 56 |
+
ebbe
|
| 57 |
+
resultatet
|
| 58 |
+
tværtimod
|
| 59 |
+
hans
|
| 60 |
+
måske
|
| 61 |
+
feature
|
| 62 |
+
tillæg
|
| 63 |
+
hun
|
| 64 |
+
han
|
nltk_data/tokenizers/punkt_tab/dutch/abbrev_types.txt
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
m.j
|
| 2 |
+
t
|
| 3 |
+
ph
|
| 4 |
+
j.h
|
| 5 |
+
p.a.m
|
| 6 |
+
j.m
|
| 7 |
+
dr
|
| 8 |
+
st
|
| 9 |
+
j.b.m
|
| 10 |
+
p
|
| 11 |
+
nr
|
| 12 |
+
h.s
|
| 13 |
+
e.d
|
| 14 |
+
t.e
|
| 15 |
+
a.v
|
| 16 |
+
esb
|
| 17 |
+
s.z
|
| 18 |
+
drs
|
| 19 |
+
b.b
|
| 20 |
+
m.o
|
| 21 |
+
inc
|
| 22 |
+
n
|
| 23 |
+
pensioenfonds
|
| 24 |
+
s.v.p
|
| 25 |
+
bod
|
| 26 |
+
fr
|
| 27 |
+
pk
|
| 28 |
+
r.p
|
| 29 |
+
c.p.j
|
| 30 |
+
v.l.n.r
|
| 31 |
+
chr
|
| 32 |
+
m.v.d
|
| 33 |
+
int
|
| 34 |
+
o.m
|
| 35 |
+
j.v.d
|
| 36 |
+
u.o.m
|
| 37 |
+
f.c
|
| 38 |
+
k
|
| 39 |
+
bijgebracht
|
| 40 |
+
ontwaakte
|
| 41 |
+
m
|
| 42 |
+
j.w
|
| 43 |
+
a.l
|
| 44 |
+
a.v.d
|
| 45 |
+
s.v
|
| 46 |
+
s
|
| 47 |
+
j.d
|
| 48 |
+
binnengekomen
|
| 49 |
+
ds
|
| 50 |
+
schouwburg
|
| 51 |
+
b.v
|
| 52 |
+
h
|
| 53 |
+
a
|
| 54 |
+
j.a
|
| 55 |
+
aanvielen
|
| 56 |
+
h.g
|
| 57 |
+
p.f
|
| 58 |
+
j.l
|
| 59 |
+
mgr
|
| 60 |
+
c.j
|
| 61 |
+
blz
|
| 62 |
+
l.e.h
|
| 63 |
+
w.k
|
| 64 |
+
g
|
| 65 |
+
m.g
|
| 66 |
+
r.v.d
|
| 67 |
+
ing
|
| 68 |
+
v.d
|
| 69 |
+
c.q
|
| 70 |
+
l
|
| 71 |
+
h.p
|
| 72 |
+
mr
|
| 73 |
+
gesch
|
| 74 |
+
e.l
|
| 75 |
+
p.j
|
| 76 |
+
mm
|
| 77 |
+
j.g
|
| 78 |
+
j.f
|
| 79 |
+
c
|
| 80 |
+
f.m
|
| 81 |
+
jl
|
| 82 |
+
r
|
| 83 |
+
o.a
|
| 84 |
+
a.s
|
| 85 |
+
ir
|
| 86 |
+
v
|
| 87 |
+
j
|
| 88 |
+
jr
|
| 89 |
+
e
|
| 90 |
+
m.i.v
|
| 91 |
+
l.a
|
| 92 |
+
f.v.d
|
| 93 |
+
aansluit
|
| 94 |
+
c.c
|
| 95 |
+
a.m
|
| 96 |
+
f.o.j
|
| 97 |
+
m.b
|
| 98 |
+
y
|
| 99 |
+
th
|
nltk_data/tokenizers/punkt_tab/dutch/collocations.tab
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## sotelo
|
| 2 |
+
##number## clas
|
| 3 |
+
##number## buckler
|
| 4 |
+
##number## carrera
|
| 5 |
+
##number## rmo
|
| 6 |
+
##number## orioli
|
| 7 |
+
w baron
|
| 8 |
+
##number## morales
|
| 9 |
+
##number## snotselelaank
|
| 10 |
+
##number## arcarons
|
| 11 |
+
##number## cavandoli
|
| 12 |
+
##number## pdm
|
| 13 |
+
##number## helvetia
|
| 14 |
+
##number## panasonic
|
| 15 |
+
##number## motorola
|
| 16 |
+
w bruinsma
|
| 17 |
+
##number## heer
|
| 18 |
+
##number## lotus
|
| 19 |
+
##number## banesto
|
| 20 |
+
##number## magnaldi
|
| 21 |
+
w jense
|
| 22 |
+
w heuvelmans
|
| 23 |
+
w spatje
|
| 24 |
+
##number## telekom
|
| 25 |
+
f kennedy
|
| 26 |
+
##number## gatorade
|
| 27 |
+
##number## mg-gb
|
| 28 |
+
##number## once
|
| 29 |
+
##number## peterhansel
|
| 30 |
+
##number## ariostea
|
| 31 |
+
##number## tvm
|
| 32 |
+
##number## höl
|
| 33 |
+
##number## castorama
|
| 34 |
+
##number## tulip
|
| 35 |
+
b situatie
|
| 36 |
+
##number## mas
|
| 37 |
+
##number## lotto
|
nltk_data/tokenizers/punkt_tab/dutch/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/dutch/sent_starters.txt
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
het
|
| 2 |
+
daardoor
|
| 3 |
+
de
|
| 4 |
+
er
|
| 5 |
+
hoewel
|
| 6 |
+
wat
|
| 7 |
+
urlings
|
| 8 |
+
na
|
| 9 |
+
ze
|
| 10 |
+
alleen
|
| 11 |
+
dat
|
| 12 |
+
ik
|
| 13 |
+
pijls
|
| 14 |
+
wie
|
| 15 |
+
daarna
|
| 16 |
+
foto
|
| 17 |
+
als
|
| 18 |
+
boer
|
| 19 |
+
hammes
|
| 20 |
+
verder
|
| 21 |
+
ook
|
| 22 |
+
evers
|
| 23 |
+
vandaar
|
| 24 |
+
toen
|
| 25 |
+
we
|
| 26 |
+
langenberg
|
| 27 |
+
naast
|
| 28 |
+
want
|
| 29 |
+
in
|
| 30 |
+
wij
|
| 31 |
+
zo
|
| 32 |
+
hendrikx
|
| 33 |
+
daar
|
| 34 |
+
crouzen
|
| 35 |
+
dit
|
| 36 |
+
daarnaast
|
| 37 |
+
anp
|
| 38 |
+
zij
|
| 39 |
+
behalve
|
| 40 |
+
waarom
|
| 41 |
+
daarom
|
| 42 |
+
bovendien
|
| 43 |
+
hij
|
| 44 |
+
daarbij
|
| 45 |
+
nee
|
| 46 |
+
volgens
|
| 47 |
+
daarmee
|
| 48 |
+
bukkems
|
| 49 |
+
dvnl
|
| 50 |
+
eén
|
| 51 |
+
pas
|
| 52 |
+
tijdens
|
| 53 |
+
vooral
|
| 54 |
+
maar
|
nltk_data/tokenizers/punkt_tab/english/abbrev_types.txt
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ct
|
| 2 |
+
m.j
|
| 3 |
+
t
|
| 4 |
+
a.c
|
| 5 |
+
n.h
|
| 6 |
+
ms
|
| 7 |
+
p.a.m
|
| 8 |
+
dr
|
| 9 |
+
pa
|
| 10 |
+
p.m
|
| 11 |
+
u.k
|
| 12 |
+
st
|
| 13 |
+
dec
|
| 14 |
+
u.s.a
|
| 15 |
+
lt
|
| 16 |
+
g.k
|
| 17 |
+
adm
|
| 18 |
+
p
|
| 19 |
+
h.m
|
| 20 |
+
ga
|
| 21 |
+
tenn
|
| 22 |
+
yr
|
| 23 |
+
sen
|
| 24 |
+
n.c
|
| 25 |
+
j.j
|
| 26 |
+
d.h
|
| 27 |
+
s.g
|
| 28 |
+
inc
|
| 29 |
+
vs
|
| 30 |
+
s.p.a
|
| 31 |
+
a.t
|
| 32 |
+
n
|
| 33 |
+
feb
|
| 34 |
+
sr
|
| 35 |
+
jan
|
| 36 |
+
s.a.y
|
| 37 |
+
n.y
|
| 38 |
+
col
|
| 39 |
+
g.f
|
| 40 |
+
c.o.m.b
|
| 41 |
+
d
|
| 42 |
+
ft
|
| 43 |
+
va
|
| 44 |
+
r.k
|
| 45 |
+
e.f
|
| 46 |
+
chg
|
| 47 |
+
r.i
|
| 48 |
+
a.g
|
| 49 |
+
minn
|
| 50 |
+
a.h
|
| 51 |
+
k
|
| 52 |
+
n.j
|
| 53 |
+
m
|
| 54 |
+
l.f
|
| 55 |
+
f.j
|
| 56 |
+
gen
|
| 57 |
+
i.m.s
|
| 58 |
+
s.a
|
| 59 |
+
aug
|
| 60 |
+
j.p
|
| 61 |
+
okla
|
| 62 |
+
m.d.c
|
| 63 |
+
ltd
|
| 64 |
+
oct
|
| 65 |
+
s
|
| 66 |
+
vt
|
| 67 |
+
r.a
|
| 68 |
+
j.c
|
| 69 |
+
ariz
|
| 70 |
+
w.w
|
| 71 |
+
b.v
|
| 72 |
+
ore
|
| 73 |
+
h
|
| 74 |
+
w.r
|
| 75 |
+
e.h
|
| 76 |
+
mrs
|
| 77 |
+
cie
|
| 78 |
+
corp
|
| 79 |
+
w
|
| 80 |
+
n.v
|
| 81 |
+
a.d
|
| 82 |
+
r.j
|
| 83 |
+
ok
|
| 84 |
+
. .
|
| 85 |
+
e.m
|
| 86 |
+
w.c
|
| 87 |
+
ill
|
| 88 |
+
nov
|
| 89 |
+
u.s
|
| 90 |
+
prof
|
| 91 |
+
conn
|
| 92 |
+
u.s.s.r
|
| 93 |
+
mg
|
| 94 |
+
f.g
|
| 95 |
+
ph.d
|
| 96 |
+
g
|
| 97 |
+
calif
|
| 98 |
+
messrs
|
| 99 |
+
h.f
|
| 100 |
+
wash
|
| 101 |
+
tues
|
| 102 |
+
sw
|
| 103 |
+
bros
|
| 104 |
+
u.n
|
| 105 |
+
l
|
| 106 |
+
wis
|
| 107 |
+
mr
|
| 108 |
+
sep
|
| 109 |
+
d.c
|
| 110 |
+
ave
|
| 111 |
+
e.l
|
| 112 |
+
co
|
| 113 |
+
s.s
|
| 114 |
+
reps
|
| 115 |
+
c
|
| 116 |
+
r.t
|
| 117 |
+
h.c
|
| 118 |
+
r
|
| 119 |
+
wed
|
| 120 |
+
a.s
|
| 121 |
+
v
|
| 122 |
+
fla
|
| 123 |
+
jr
|
| 124 |
+
r.h
|
| 125 |
+
c.v
|
| 126 |
+
m.b.a
|
| 127 |
+
rep
|
| 128 |
+
a.a
|
| 129 |
+
e
|
| 130 |
+
c.i.t
|
| 131 |
+
l.a
|
| 132 |
+
b.f
|
| 133 |
+
j.b
|
| 134 |
+
d.w
|
| 135 |
+
j.k
|
| 136 |
+
ala
|
| 137 |
+
f
|
| 138 |
+
w.va
|
| 139 |
+
sept
|
| 140 |
+
mich
|
| 141 |
+
n.m
|
| 142 |
+
j.r
|
| 143 |
+
l.p
|
| 144 |
+
s.c
|
| 145 |
+
colo
|
| 146 |
+
fri
|
| 147 |
+
a.m
|
| 148 |
+
g.d
|
| 149 |
+
kan
|
| 150 |
+
maj
|
| 151 |
+
ky
|
| 152 |
+
a.m.e
|
| 153 |
+
n.d
|
| 154 |
+
t.j
|
| 155 |
+
cos
|
| 156 |
+
nev
|
nltk_data/tokenizers/punkt_tab/english/collocations.tab
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## international
|
| 2 |
+
##number## rj
|
| 3 |
+
##number## commodities
|
| 4 |
+
##number## cooper
|
| 5 |
+
b stewart
|
| 6 |
+
##number## genentech
|
| 7 |
+
##number## wedgestone
|
| 8 |
+
i toussie
|
| 9 |
+
##number## pepper
|
| 10 |
+
j fialka
|
| 11 |
+
o ludcke
|
| 12 |
+
##number## insider
|
| 13 |
+
##number## aes
|
| 14 |
+
i magnin
|
| 15 |
+
##number## credit
|
| 16 |
+
##number## corrections
|
| 17 |
+
##number## financing
|
| 18 |
+
##number## henley
|
| 19 |
+
##number## business
|
| 20 |
+
##number## pay-fone
|
| 21 |
+
b wigton
|
| 22 |
+
b edelman
|
| 23 |
+
b levine
|
| 24 |
+
##number## leisure
|
| 25 |
+
b smith
|
| 26 |
+
j walter
|
| 27 |
+
##number## pegasus
|
| 28 |
+
##number## dividend
|
| 29 |
+
j aron
|
| 30 |
+
##number## review
|
| 31 |
+
##number## abreast
|
| 32 |
+
##number## who
|
| 33 |
+
##number## letters
|
| 34 |
+
##number## colgate
|
| 35 |
+
##number## cbot
|
| 36 |
+
##number## notable
|
| 37 |
+
##number## zimmer
|
nltk_data/tokenizers/punkt_tab/english/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/english/sent_starters.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
most
|
| 2 |
+
he
|
| 3 |
+
since
|
| 4 |
+
so
|
| 5 |
+
both
|
| 6 |
+
these
|
| 7 |
+
it
|
| 8 |
+
nevertheless
|
| 9 |
+
this
|
| 10 |
+
indeed
|
| 11 |
+
however
|
| 12 |
+
instead
|
| 13 |
+
under
|
| 14 |
+
similarly
|
| 15 |
+
some
|
| 16 |
+
though
|
| 17 |
+
while
|
| 18 |
+
when
|
| 19 |
+
in
|
| 20 |
+
despite
|
| 21 |
+
although
|
| 22 |
+
nonetheless
|
| 23 |
+
thus
|
| 24 |
+
there
|
| 25 |
+
if
|
| 26 |
+
the
|
| 27 |
+
nor
|
| 28 |
+
separately
|
| 29 |
+
moreover
|
| 30 |
+
but
|
| 31 |
+
they
|
| 32 |
+
yet
|
| 33 |
+
many
|
| 34 |
+
according
|
| 35 |
+
sales
|
| 36 |
+
among
|
| 37 |
+
meanwhile
|
| 38 |
+
even
|
| 39 |
+
i
|
nltk_data/tokenizers/punkt_tab/estonian/abbrev_types.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
eos
|
| 2 |
+
c
|
| 3 |
+
a.d
|
| 4 |
+
t.a.s.s
|
| 5 |
+
e.t
|
| 6 |
+
päevapiltnikud
|
| 7 |
+
c.h
|
| 8 |
+
b.p
|
| 9 |
+
amm
|
| 10 |
+
ameerika-mees
|
| 11 |
+
n.-ö
|
| 12 |
+
cm
|
| 13 |
+
b
|
| 14 |
+
mhm
|
| 15 |
+
a.s
|
| 16 |
+
m.e
|
| 17 |
+
j.l
|
| 18 |
+
j
|
| 19 |
+
u.t
|
| 20 |
+
vm
|
| 21 |
+
g.u.n
|
| 22 |
+
hajutada
|
| 23 |
+
p.s
|
| 24 |
+
a.b
|
| 25 |
+
c.h.-r
|
| 26 |
+
i.q
|
| 27 |
+
gr
|
| 28 |
+
fido
|
| 29 |
+
pankurit
|
| 30 |
+
s.v
|
| 31 |
+
l.l
|
| 32 |
+
c.-h
|
| 33 |
+
m.h
|
| 34 |
+
h.l
|
| 35 |
+
m.k
|
| 36 |
+
j.r
|
| 37 |
+
t.k
|
| 38 |
+
k.h
|
| 39 |
+
89/90
|
| 40 |
+
h
|
| 41 |
+
a
|
| 42 |
+
dost
|
| 43 |
+
v.k
|
| 44 |
+
e.q
|
| 45 |
+
t.j
|
| 46 |
+
m.b
|
| 47 |
+
d
|
| 48 |
+
p.k
|
nltk_data/tokenizers/punkt_tab/estonian/collocations.tab
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## juuni
|
| 2 |
+
##number## novembril
|
| 3 |
+
##number## juulilt
|
| 4 |
+
r järve-vomm
|
| 5 |
+
##number## mida
|
| 6 |
+
n liidu
|
| 7 |
+
##number## milliseid
|
| 8 |
+
##number## oktoobri
|
| 9 |
+
##number## iidol
|
| 10 |
+
m e
|
| 11 |
+
##number## klassist
|
| 12 |
+
##number## millest
|
| 13 |
+
##number## august
|
| 14 |
+
##number## pariis
|
| 15 |
+
##number## septembrist
|
| 16 |
+
##number## oktoober
|
| 17 |
+
##number## märtsini
|
| 18 |
+
##number## kust
|
| 19 |
+
k mägi
|
| 20 |
+
##number## detsembrist
|
| 21 |
+
##number## jaanuari
|
| 22 |
+
##number## epee
|
| 23 |
+
##number## nimetage
|
| 24 |
+
##number## novembrini
|
| 25 |
+
##number## eluaasta
|
| 26 |
+
s mill
|
| 27 |
+
##number## helsingi
|
| 28 |
+
##number## jaanuarini
|
| 29 |
+
##number## aastail
|
| 30 |
+
##number## augustil
|
| 31 |
+
##number## millise
|
| 32 |
+
##number## juulist
|
| 33 |
+
##number## mai
|
| 34 |
+
##number## novembri
|
| 35 |
+
##number## oktoobrist
|
| 36 |
+
##number## juunini
|
| 37 |
+
##number## septembriks
|
| 38 |
+
##number## detsembril
|
| 39 |
+
p s
|
| 40 |
+
##number## jaanuar
|
| 41 |
+
##number## aastate
|
| 42 |
+
##number## milline
|
| 43 |
+
##number## kelle
|
| 44 |
+
##number## jaanuaril
|
| 45 |
+
s stadnikov
|
| 46 |
+
##number## aastaks
|
| 47 |
+
##number## stockholm
|
| 48 |
+
##number## suurim
|
| 49 |
+
##number## aasta
|
| 50 |
+
##number## sajandi
|
| 51 |
+
##number## millega
|
| 52 |
+
##number## aastast
|
| 53 |
+
##number## aastal
|
| 54 |
+
##number## kumb
|
| 55 |
+
##number## septembril
|
| 56 |
+
##number## korruselt
|
| 57 |
+
##number## septembri
|
| 58 |
+
##number## veebruarini
|
| 59 |
+
##number## london
|
| 60 |
+
##number## aastatel
|
| 61 |
+
##number## september
|
| 62 |
+
##number## veebruari
|
| 63 |
+
##number## oktoobrini
|
| 64 |
+
##number## mail
|
| 65 |
+
m kassovitz
|
| 66 |
+
##number## action-film
|
| 67 |
+
##number## mis
|
| 68 |
+
k herkül
|
| 69 |
+
n n
|
| 70 |
+
##number## detsembrini
|
| 71 |
+
##number## imre
|
| 72 |
+
t jõgeda
|
| 73 |
+
##number## casino
|
| 74 |
+
##number## septembrit
|
| 75 |
+
##number## augustini
|
| 76 |
+
##number## juulil
|
| 77 |
+
##number## november
|
| 78 |
+
##number## kuupäeval
|
| 79 |
+
##number## taevas
|
| 80 |
+
##number## septembrini
|
| 81 |
+
##number## detsember
|
| 82 |
+
##number## detsembri
|
| 83 |
+
##number## juunil
|
| 84 |
+
##number## augustist
|
| 85 |
+
n jurist
|
| 86 |
+
##number## missugust
|
| 87 |
+
##number## aastatesse
|
| 88 |
+
##number## aprillil
|
| 89 |
+
##number## augusti
|
| 90 |
+
##number## oktoobril
|
| 91 |
+
##number## märtsil
|
| 92 |
+
##number## a
|
| 93 |
+
##number## the
|
| 94 |
+
##number## sajandil
|
| 95 |
+
##number## aastani
|
| 96 |
+
##number## juuli
|
| 97 |
+
##number## septembrile
|
| 98 |
+
##number## millist
|
| 99 |
+
##number## millised
|
| 100 |
+
##number## veebruaril
|
nltk_data/tokenizers/punkt_tab/estonian/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/estonian/sent_starters.txt
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
kalad
|
| 2 |
+
õnneks
|
| 3 |
+
selle
|
| 4 |
+
maimu
|
| 5 |
+
teisipäeval
|
| 6 |
+
ma
|
| 7 |
+
skorpion
|
| 8 |
+
aga
|
| 9 |
+
lisaks
|
| 10 |
+
selleks
|
| 11 |
+
maitse
|
| 12 |
+
esiteks
|
| 13 |
+
erinevalt
|
| 14 |
+
pealegi
|
| 15 |
+
praegu
|
| 16 |
+
kas
|
| 17 |
+
tegelikult
|
| 18 |
+
neitsi
|
| 19 |
+
nädalavahetus
|
| 20 |
+
tema
|
| 21 |
+
kui
|
| 22 |
+
seega
|
| 23 |
+
täna
|
| 24 |
+
lugupidamisega
|
| 25 |
+
miks
|
| 26 |
+
teiseks
|
| 27 |
+
väldi
|
| 28 |
+
pohlak
|
| 29 |
+
osades
|
| 30 |
+
sõnn
|
| 31 |
+
samas
|
| 32 |
+
nimelt
|
| 33 |
+
juhtkiri
|
| 34 |
+
krimi
|
| 35 |
+
nädalavahetusel
|
| 36 |
+
näiteks
|
| 37 |
+
kuidas
|
| 38 |
+
ambur
|
| 39 |
+
telgmaa
|
| 40 |
+
laupäeval
|
| 41 |
+
seetõttu
|
| 42 |
+
rezhissöör
|
| 43 |
+
kahjuks
|
| 44 |
+
ent
|
| 45 |
+
samuti
|
| 46 |
+
ehkki
|
| 47 |
+
veevalaja
|
| 48 |
+
seepärast
|
| 49 |
+
muidugi
|
| 50 |
+
kuna
|
| 51 |
+
tänaseks
|
| 52 |
+
mina
|
| 53 |
+
loomulikult
|
| 54 |
+
ometi
|
| 55 |
+
arvamus
|
| 56 |
+
lõvi
|
| 57 |
+
ee
|
| 58 |
+
niisiis
|
| 59 |
+
mul
|
| 60 |
+
kaksikud
|
| 61 |
+
tõsi
|
| 62 |
+
hinnete
|
| 63 |
+
sestap
|
| 64 |
+
tõenäoliselt
|
| 65 |
+
samal
|
| 66 |
+
see
|
| 67 |
+
paraku
|
| 68 |
+
jäär
|
| 69 |
+
kokkuvõttes
|
| 70 |
+
küllap
|
| 71 |
+
muide
|
| 72 |
+
nüüd
|
| 73 |
+
kolmapäeval
|
| 74 |
+
võibolla
|
| 75 |
+
kuid
|
| 76 |
+
nädalavahetuse
|
| 77 |
+
kuigi
|
| 78 |
+
võid
|
| 79 |
+
lõpuks
|
| 80 |
+
kaalud
|
| 81 |
+
areen
|
| 82 |
+
kirjad
|
| 83 |
+
vähk
|
| 84 |
+
esmaspäeval
|
| 85 |
+
nii
|
| 86 |
+
need
|
| 87 |
+
uue
|
| 88 |
+
ta
|
| 89 |
+
minu
|
nltk_data/tokenizers/punkt_tab/finnish/abbrev_types.txt
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
t
|
| 2 |
+
suom
|
| 3 |
+
dr
|
| 4 |
+
st
|
| 5 |
+
970125090.jtun
|
| 6 |
+
p
|
| 7 |
+
sis
|
| 8 |
+
t.h
|
| 9 |
+
961221327.jtun
|
| 10 |
+
a.i
|
| 11 |
+
milj
|
| 12 |
+
ski
|
| 13 |
+
kp
|
| 14 |
+
970131067.jtun
|
| 15 |
+
970124030.jtun
|
| 16 |
+
nk
|
| 17 |
+
va
|
| 18 |
+
pan
|
| 19 |
+
yhteystiedot
|
| 20 |
+
ruots
|
| 21 |
+
jne
|
| 22 |
+
t.a
|
| 23 |
+
l.-g
|
| 24 |
+
k
|
| 25 |
+
j.w
|
| 26 |
+
p2
|
| 27 |
+
oik
|
| 28 |
+
970102248.jtun
|
| 29 |
+
hj
|
| 30 |
+
s
|
| 31 |
+
vt
|
| 32 |
+
muistelmia
|
| 33 |
+
o.s
|
| 34 |
+
elo
|
| 35 |
+
h
|
| 36 |
+
ortod
|
| 37 |
+
o.l
|
| 38 |
+
w
|
| 39 |
+
tms
|
| 40 |
+
970120219.jtun
|
| 41 |
+
pj
|
| 42 |
+
ok
|
| 43 |
+
toissapäiväinen
|
| 44 |
+
28.t1
|
| 45 |
+
pelintekijä
|
| 46 |
+
970111011.jtun
|
| 47 |
+
op
|
| 48 |
+
os
|
| 49 |
+
ns
|
| 50 |
+
m.g
|
| 51 |
+
o.-i
|
| 52 |
+
m3
|
| 53 |
+
pros
|
| 54 |
+
mr
|
| 55 |
+
970102171.jtun
|
| 56 |
+
waller
|
| 57 |
+
hels
|
| 58 |
+
rotary-järjestössä
|
| 59 |
+
ins
|
| 60 |
+
esim
|
| 61 |
+
apul
|
| 62 |
+
fil
|
| 63 |
+
id
|
| 64 |
+
ym
|
| 65 |
+
j
|
| 66 |
+
rf
|
| 67 |
+
v.o
|
| 68 |
+
lis
|
| 69 |
+
c.a
|
| 70 |
+
em
|
| 71 |
+
kand
|
| 72 |
+
r.y
|
| 73 |
+
valt
|
| 74 |
+
dipl
|
| 75 |
+
ö
|
| 76 |
+
970111092.jtun
|
| 77 |
+
ponteva
|
| 78 |
+
y
|
| 79 |
+
kapakoista
|
| 80 |
+
970130160.jtun
|
| 81 |
+
th
|
nltk_data/tokenizers/punkt_tab/finnish/collocations.tab
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## sm
|
| 2 |
+
##number## ohjelmassa
|
| 3 |
+
##number## a3
|
| 4 |
+
##number## rc3
|
| 5 |
+
##number## rxd4
|
| 6 |
+
##number## hxg4
|
| 7 |
+
o stenberg
|
| 8 |
+
##number## lg5
|
| 9 |
+
##number## tallitontun
|
| 10 |
+
##number## lähetysohjeet
|
| 11 |
+
##number## uimakoulu
|
| 12 |
+
##number## jaana
|
| 13 |
+
##number## alustuksen
|
| 14 |
+
##number## uppo-nallen
|
| 15 |
+
##number## anne
|
| 16 |
+
##number## rxf3
|
| 17 |
+
a sjögren
|
| 18 |
+
##number## kamarikuoro
|
| 19 |
+
##number## vetäjänä
|
| 20 |
+
##number## pääsymaksu
|
| 21 |
+
##number## kerros
|
| 22 |
+
##number## kurssi
|
| 23 |
+
##number## kuori
|
| 24 |
+
##number## g4
|
| 25 |
+
##number## h3
|
| 26 |
+
##number## tiede-teatterissa
|
| 27 |
+
##number## kh2
|
| 28 |
+
##number## kausimaksu
|
| 29 |
+
##number## tia
|
| 30 |
+
##number## gxf5
|
| 31 |
+
##number## täky-galleria
|
| 32 |
+
##number## le2
|
| 33 |
+
##number## te8+
|
| 34 |
+
##number## la4
|
| 35 |
+
##number## keitä
|
| 36 |
+
##number## huhtikuuta
|
| 37 |
+
##number## menotiedoissa
|
| 38 |
+
##number## valmista
|
| 39 |
+
##number## txb5
|
| 40 |
+
##number## maskeerauskurssin
|
| 41 |
+
##number## rd2
|
| 42 |
+
##number## re2
|
| 43 |
+
##number## solisteina
|
| 44 |
+
##number## esitelmä
|
| 45 |
+
##number## puupiirrossarja
|
| 46 |
+
##number## ta1
|
| 47 |
+
##number## vaahdota
|
| 48 |
+
##number## h4
|
| 49 |
+
##number## kesäkuuta
|
| 50 |
+
##number## liikkeitä
|
| 51 |
+
##number## tuolloin
|
| 52 |
+
##number## viikko
|
| 53 |
+
##number## mittaa
|
| 54 |
+
a sjögrenin
|
| 55 |
+
##number## exf6
|
| 56 |
+
##number## rc6+
|
| 57 |
+
##number## viimeistele
|
| 58 |
+
##number## ld1
|
| 59 |
+
##number## elokuuta
|
| 60 |
+
##number## dh5+
|
| 61 |
+
##number## syyskuuta
|
| 62 |
+
##number## opettajina
|
| 63 |
+
##number## b3
|
| 64 |
+
##number## rauhankatu
|
| 65 |
+
c clarke
|
| 66 |
+
##number## saakka
|
| 67 |
+
##number## elokuvat
|
| 68 |
+
b huggins
|
| 69 |
+
g gahmberg
|
| 70 |
+
##number## luento
|
| 71 |
+
##number## lf3
|
| 72 |
+
##number## tammikuuta
|
| 73 |
+
##number## ryömä
|
| 74 |
+
##number## meller
|
| 75 |
+
##number## jäsenkortti
|
| 76 |
+
##number## esiintyjinä
|
| 77 |
+
##number## maria
|
| 78 |
+
##number## lf4
|
| 79 |
+
##number## siirto
|
| 80 |
+
##number## aurinko
|
| 81 |
+
##number## lxg6
|
| 82 |
+
##number## marraskuuta
|
| 83 |
+
##number## harjoituksissa
|
| 84 |
+
##number## romantika-yhtye
|
| 85 |
+
##number## g3
|
| 86 |
+
##number## heinäkuuta
|
| 87 |
+
##number## rxd5
|
| 88 |
+
##number## kuumenna
|
| 89 |
+
e hämäläisen
|
| 90 |
+
##number## bxc4
|
| 91 |
+
##number## te1
|
| 92 |
+
##number## kg2
|
| 93 |
+
##number## osallistumismaksu
|
| 94 |
+
##number## re5
|
| 95 |
+
##number## ohjelma
|
| 96 |
+
##number## varapuheenjohtajaksi
|
| 97 |
+
##number## raisa
|
| 98 |
+
##number## päivään
|
| 99 |
+
##number## luokan
|
| 100 |
+
##number## sulata
|
| 101 |
+
##number## levitä
|
| 102 |
+
##number## kaustinen
|
| 103 |
+
##number## kuoroa
|
| 104 |
+
##number## df3
|
| 105 |
+
v helsingistä
|
| 106 |
+
##number## mieskuoro
|
| 107 |
+
##number## lokakuuta
|
| 108 |
+
##number## kerho
|
| 109 |
+
##number## helmikuuta
|
| 110 |
+
##number## kokkola
|
| 111 |
+
##number## suuruusluokan
|
| 112 |
+
v kaupungista
|
| 113 |
+
##number## krs
|
| 114 |
+
##number## tekstit
|
| 115 |
+
##number## menyy
|
| 116 |
+
##number## rf3
|
| 117 |
+
##number## ulkoasiainministeriön
|
| 118 |
+
##number## kaada
|
| 119 |
+
##number## cxd5
|
| 120 |
+
##number## ilmailumuseo
|
| 121 |
+
e waris
|
| 122 |
+
##number## kierros
|
| 123 |
+
##number## tunnille
|
| 124 |
+
##number## kh3
|
| 125 |
+
##number## ohjaus
|
| 126 |
+
a t.
|
| 127 |
+
##number## postimaksu
|
| 128 |
+
##number## pane
|
| 129 |
+
##number## th3
|
| 130 |
+
##number## joulukuuta
|
| 131 |
+
##number## vatkaa
|
| 132 |
+
##number## kokeessa
|
| 133 |
+
l j.
|
| 134 |
+
##number## asti
|
| 135 |
+
##number## opastajana
|
| 136 |
+
##number## kirsi
|
| 137 |
+
##number## lc2
|
| 138 |
+
##number## lh2
|
| 139 |
+
##number## e4
|
| 140 |
+
##number## sairaankuljetukset
|
| 141 |
+
##number## sekoita
|
| 142 |
+
##number## mervi
|
| 143 |
+
##number## de2
|
| 144 |
+
a pietilän
|
| 145 |
+
##number## kf1
|
| 146 |
+
##number## toukokuuta
|
| 147 |
+
##number## maaliskuuta
|
| 148 |
+
##number## leikkaa
|
| 149 |
+
##number## ryhmänäytökset
|
| 150 |
+
v maaseudulta
|
| 151 |
+
##number## de3-e1
|
| 152 |
+
##number## c4
|
| 153 |
+
##number## ta1-b1
|
| 154 |
+
##number## d5
|
| 155 |
+
##number## pia
|
| 156 |
+
##number## lxd6
|
| 157 |
+
##number## d4
|
| 158 |
+
##number## f3-f4
|
| 159 |
+
##number## dxg6+
|
| 160 |
+
##number## sari
|
| 161 |
+
##number## pelkkään
|
| 162 |
+
##number## ld3
|
| 163 |
+
##number## perkaa
|
| 164 |
+
##number## lg3
|
| 165 |
+
##number## kg3
|
| 166 |
+
##number## kvm
|
| 167 |
+
##number## tb1xb6
|
nltk_data/tokenizers/punkt_tab/finnish/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/finnish/sent_starters.txt
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
siinä
|
| 2 |
+
lämpötila
|
| 3 |
+
viiden
|
| 4 |
+
he
|
| 5 |
+
vapaa
|
| 6 |
+
viime
|
| 7 |
+
useimmat
|
| 8 |
+
kansallisooppera
|
| 9 |
+
rooleissa
|
| 10 |
+
näin
|
| 11 |
+
odotettavissa
|
| 12 |
+
tiedustelut
|
| 13 |
+
kansallisteatterin
|
| 14 |
+
sen
|
| 15 |
+
musiikki
|
| 16 |
+
monet
|
| 17 |
+
uusi
|
| 18 |
+
avoinna
|
| 19 |
+
pakkasta
|
| 20 |
+
freeze
|
| 21 |
+
tämä
|
| 22 |
+
lämpö
|
| 23 |
+
lautakunta
|
| 24 |
+
vastaväittäjänä
|
| 25 |
+
päivällä
|
| 26 |
+
tällä
|
| 27 |
+
esimerkiksi
|
| 28 |
+
varoituksia
|
| 29 |
+
merenkurkku
|
| 30 |
+
meriennuste
|
| 31 |
+
näyttelyssä
|
| 32 |
+
kun
|
| 33 |
+
pilvistä
|
| 34 |
+
silloin
|
| 35 |
+
selkämeren
|
| 36 |
+
suurin
|
| 37 |
+
se
|
| 38 |
+
jos
|
| 39 |
+
vaihtelevaa
|
| 40 |
+
vastaväittäjinä
|
| 41 |
+
sivu
|
| 42 |
+
kaupunginteatterin
|
| 43 |
+
pilvisyys
|
| 44 |
+
siellä
|
| 45 |
+
siksi
|
| 46 |
+
kurssimaksu
|
| 47 |
+
tämän
|
| 48 |
+
kotimaa
|
| 49 |
+
näiden
|
| 50 |
+
teatteri
|
| 51 |
+
kaikki
|
| 52 |
+
puolipilvistä
|
| 53 |
+
niiden
|
| 54 |
+
maksimilämpötila
|
| 55 |
+
lisäksi
|
| 56 |
+
kaupunginhallitus
|
| 57 |
+
helsingin
|
| 58 |
+
nyt
|
| 59 |
+
samalla
|
| 60 |
+
hänen
|
| 61 |
+
olen
|
| 62 |
+
kaupunkikierros
|
| 63 |
+
vastaväittäjä
|
| 64 |
+
ne
|
| 65 |
+
tästä
|
| 66 |
+
enimmäkseen
|
| 67 |
+
poika
|
| 68 |
+
niinpä
|
| 69 |
+
viirus
|
| 70 |
+
me
|
| 71 |
+
poliisi
|
| 72 |
+
liput
|
| 73 |
+
ilmoittautuminen
|
| 74 |
+
tarjoa
|
| 75 |
+
hän
|
| 76 |
+
molemmat
|
| 77 |
+
ulkomaat
|
| 78 |
+
rock
|
| 79 |
+
lääketieteen
|
| 80 |
+
tanssi
|
| 81 |
+
sainks
|
| 82 |
+
näyttely
|
| 83 |
+
lisätietoja
|
| 84 |
+
ulkomaiden
|
| 85 |
+
näyttelyn
|
| 86 |
+
palo
|
nltk_data/tokenizers/punkt_tab/french/abbrev_types.txt
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
p.o.l
|
| 2 |
+
pds
|
| 3 |
+
3o
|
| 4 |
+
inscr
|
| 5 |
+
suè
|
| 6 |
+
z
|
| 7 |
+
abst
|
| 8 |
+
g.-b
|
| 9 |
+
tél
|
| 10 |
+
r
|
| 11 |
+
ed
|
| 12 |
+
o
|
| 13 |
+
b
|
| 14 |
+
esp
|
| 15 |
+
j.l
|
| 16 |
+
v
|
| 17 |
+
k
|
| 18 |
+
e.p
|
| 19 |
+
aus
|
| 20 |
+
jap
|
| 21 |
+
r.e
|
| 22 |
+
gb-bel
|
| 23 |
+
p
|
| 24 |
+
aut
|
| 25 |
+
usx
|
| 26 |
+
arg
|
| 27 |
+
g
|
| 28 |
+
e
|
| 29 |
+
etc
|
| 30 |
+
fra
|
| 31 |
+
p.s
|
| 32 |
+
j.-l
|
| 33 |
+
blu
|
| 34 |
+
e.-u
|
| 35 |
+
f.b
|
| 36 |
+
msf
|
| 37 |
+
e.d
|
| 38 |
+
shi
|
| 39 |
+
can
|
| 40 |
+
j.b
|
| 41 |
+
s.a
|
| 42 |
+
f.o
|
| 43 |
+
you
|
| 44 |
+
mir
|
| 45 |
+
inc
|
| 46 |
+
ital
|
| 47 |
+
expr
|
| 48 |
+
tch
|
| 49 |
+
g-b-bel
|
| 50 |
+
cid
|
| 51 |
+
c.u
|
| 52 |
+
ctk
|
| 53 |
+
j.-m.g
|
| 54 |
+
bta
|
| 55 |
+
p.-b
|
| 56 |
+
cie
|
| 57 |
+
ita
|
| 58 |
+
equ
|
| 59 |
+
corp
|
| 60 |
+
vot
|
| 61 |
+
w
|
nltk_data/tokenizers/punkt_tab/french/collocations.tab
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## shinozuka-magne
|
| 2 |
+
##number## ambrosino-baumgartner
|
| 3 |
+
c tanvier
|
| 4 |
+
f b.
|
| 5 |
+
##number## waldegaard-fenouil
|
| 6 |
+
##number## fermé
|
| 7 |
+
a dechaume
|
| 8 |
+
i demongeot
|
| 9 |
+
s motos
|
| 10 |
+
##number## rahier
|
| 11 |
+
##number## magnaldi
|
| 12 |
+
##number## orioli
|
| 13 |
+
f tél.
|
| 14 |
+
##number## cowan-delferrier
|
| 15 |
+
##number## vatanen-berglund
|
| 16 |
+
##number## picco
|
| 17 |
+
##number## masuoka-oligo
|
| 18 |
+
##number## medardo
|
nltk_data/tokenizers/punkt_tab/french/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/french/sent_starters.txt
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
c
|
| 2 |
+
depuis
|
| 3 |
+
la
|
| 4 |
+
enfin
|
| 5 |
+
certains
|
| 6 |
+
selon
|
| 7 |
+
cet
|
| 8 |
+
car
|
| 9 |
+
ces
|
| 10 |
+
il
|
| 11 |
+
cependant
|
| 12 |
+
pour
|
| 13 |
+
j
|
| 14 |
+
alors
|
| 15 |
+
un
|
| 16 |
+
certes
|
| 17 |
+
les
|
| 18 |
+
nous
|
| 19 |
+
dans
|
| 20 |
+
le
|
| 21 |
+
une
|
| 22 |
+
si
|
| 23 |
+
mais
|
| 24 |
+
en
|
| 25 |
+
dès
|
| 26 |
+
or
|
| 27 |
+
tout
|
| 28 |
+
ils
|
| 29 |
+
l
|
| 30 |
+
mr
|
| 31 |
+
malgré
|
| 32 |
+
elles
|
| 33 |
+
né
|
| 34 |
+
je
|
| 35 |
+
on
|
| 36 |
+
quand
|
| 37 |
+
pourtant
|
| 38 |
+
cela
|
| 39 |
+
a
|
| 40 |
+
après
|
| 41 |
+
puis
|
| 42 |
+
ce
|
| 43 |
+
elle
|
| 44 |
+
voilà
|
| 45 |
+
cette
|
| 46 |
+
comment
|
| 47 |
+
quant
|
| 48 |
+
ainsi
|
nltk_data/tokenizers/punkt_tab/german/abbrev_types.txt
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rfr
|
| 2 |
+
t
|
| 3 |
+
c
|
| 4 |
+
a.d
|
| 5 |
+
dk
|
| 6 |
+
he
|
| 7 |
+
mjm
|
| 8 |
+
inkl
|
| 9 |
+
bt
|
| 10 |
+
69f
|
| 11 |
+
crz
|
| 12 |
+
dr
|
| 13 |
+
st
|
| 14 |
+
ib
|
| 15 |
+
liv
|
| 16 |
+
mrd
|
| 17 |
+
n.r
|
| 18 |
+
rg
|
| 19 |
+
v
|
| 20 |
+
vgl
|
| 21 |
+
mgr
|
| 22 |
+
cs
|
| 23 |
+
prof
|
| 24 |
+
j
|
| 25 |
+
kfr
|
| 26 |
+
bd
|
| 27 |
+
fre
|
| 28 |
+
gfh
|
| 29 |
+
fon
|
| 30 |
+
m
|
| 31 |
+
rp
|
| 32 |
+
nr
|
| 33 |
+
chr
|
| 34 |
+
etc
|
| 35 |
+
hg
|
| 36 |
+
sx
|
| 37 |
+
rz
|
| 38 |
+
48f
|
| 39 |
+
kmu
|
| 40 |
+
abs
|
| 41 |
+
nkm
|
| 42 |
+
z.b
|
| 43 |
+
usw
|
| 44 |
+
f
|
| 45 |
+
d.h
|
| 46 |
+
lz
|
| 47 |
+
sc
|
| 48 |
+
usf
|
| 49 |
+
gir
|
| 50 |
+
hag
|
| 51 |
+
ff
|
| 52 |
+
mio
|
| 53 |
+
zr
|
| 54 |
+
k
|
| 55 |
+
h
|
| 56 |
+
mey
|
| 57 |
+
bst
|
| 58 |
+
ne
|
| 59 |
+
u.a
|
| 60 |
+
fem
|
| 61 |
+
bzw
|
| 62 |
+
bü
|
| 63 |
+
med
|
| 64 |
+
u
|
| 65 |
+
lts
|
| 66 |
+
fr
|
| 67 |
+
s.o.s
|
| 68 |
+
w
|
| 69 |
+
lib
|
| 70 |
+
k.a
|
| 71 |
+
th
|
nltk_data/tokenizers/punkt_tab/german/collocations.tab
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
##number## oktober
|
| 2 |
+
##number## jahrhunderts
|
| 3 |
+
##number## geburtstag
|
| 4 |
+
##number## juni
|
| 5 |
+
s ##number##
|
| 6 |
+
##number## september
|
| 7 |
+
##number## mai
|
| 8 |
+
##number## dezember
|
| 9 |
+
##number## april
|
| 10 |
+
##number## ahv-revision
|
| 11 |
+
##number## revision
|
| 12 |
+
##number## jahrhundert
|
| 13 |
+
##number## landwirtschaftsbericht
|
| 14 |
+
##number## altersjahr
|
| 15 |
+
##number## februar
|
| 16 |
+
a schumpeter
|
| 17 |
+
##number## freiheit
|
| 18 |
+
##number## august
|
| 19 |
+
##number## januar
|
| 20 |
+
##number## märz
|
| 21 |
+
a meyers
|
| 22 |
+
##number## november
|
| 23 |
+
##number## bauetappe
|
| 24 |
+
##number## ahv-
|
| 25 |
+
##number## eu-richtlinie
|
| 26 |
+
##number## juli
|
| 27 |
+
a meyer
|
| 28 |
+
##number## säule
|
nltk_data/tokenizers/punkt_tab/german/ortho_context.tab
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nltk_data/tokenizers/punkt_tab/german/sent_starters.txt
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
das
|
| 2 |
+
man
|
| 3 |
+
es
|
| 4 |
+
wir
|
| 5 |
+
dabei
|
| 6 |
+
ferner
|
| 7 |
+
ähnliches
|
| 8 |
+
während
|
| 9 |
+
entscheidend
|
| 10 |
+
ausserdem
|
| 11 |
+
ein
|
| 12 |
+
in
|
| 13 |
+
der
|
| 14 |
+
daraus
|
| 15 |
+
obschon
|
| 16 |
+
beide
|
| 17 |
+
hier
|
| 18 |
+
all
|
| 19 |
+
neben
|
| 20 |
+
solche
|
| 21 |
+
hingegen
|
| 22 |
+
selbstverständlich
|
| 23 |
+
daneben
|
| 24 |
+
hinzu
|
| 25 |
+
vielmehr
|
| 26 |
+
sie
|
| 27 |
+
natürlich
|
| 28 |
+
obwohl
|
| 29 |
+
nun
|
| 30 |
+
doch
|
| 31 |
+
ob
|
| 32 |
+
abgesehen
|
| 33 |
+
überdies
|
| 34 |
+
im
|
| 35 |
+
zweitens
|
| 36 |
+
darin
|
| 37 |
+
erstens
|
| 38 |
+
dieses
|
| 39 |
+
nach
|
| 40 |
+
wer
|
| 41 |
+
da
|
| 42 |
+
interessant
|
| 43 |
+
seit
|
| 44 |
+
zudem
|
| 45 |
+
darüber
|
| 46 |
+
umgekehrt
|
| 47 |
+
ähnlich
|
| 48 |
+
aber
|
| 49 |
+
was
|
| 50 |
+
nachdem
|
| 51 |
+
insbesondere
|
| 52 |
+
statt
|
| 53 |
+
angesichts
|
| 54 |
+
gefragt
|
| 55 |
+
gleiches
|
| 56 |
+
solange
|
| 57 |
+
wenn
|
| 58 |
+
dies
|
| 59 |
+
dass
|
| 60 |
+
wie
|
| 61 |
+
damit
|
| 62 |
+
allerdings
|
| 63 |
+
denn
|
| 64 |
+
letztere
|
| 65 |
+
eine
|
| 66 |
+
selbst
|
| 67 |
+
gleichzeitig
|
| 68 |
+
wo
|
| 69 |
+
weder
|
| 70 |
+
gerade
|
| 71 |
+
unter
|
| 72 |
+
problematischer
|
| 73 |
+
wieso
|
| 74 |
+
dennoch
|
| 75 |
+
bei
|
| 76 |
+
deshalb
|
| 77 |
+
davon
|
| 78 |
+
andernfalls
|
| 79 |
+
er
|
| 80 |
+
die
|
| 81 |
+
anders
|
| 82 |
+
auch
|
| 83 |
+
ebenso
|
| 84 |
+
so
|
| 85 |
+
inzwischen
|
| 86 |
+
sonst
|
| 87 |
+
immerhin
|
| 88 |
+
entsprechend
|
| 89 |
+
danach
|
| 90 |
+
am
|
| 91 |
+
trotz
|
| 92 |
+
trotzdem
|
| 93 |
+
worum
|
| 94 |
+
damals
|
| 95 |
+
dafür
|
| 96 |
+
schliesslich
|
| 97 |
+
gemäss
|
| 98 |
+
demgegenüber
|
| 99 |
+
warum
|
| 100 |
+
letzteres
|
| 101 |
+
mit
|
| 102 |
+
dazu
|
| 103 |
+
anderseits
|
| 104 |
+
ganz
|
| 105 |
+
zwar
|
| 106 |
+
dieser
|
| 107 |
+
diese
|
nltk_data/tokenizers/punkt_tab/greek/abbrev_types.txt
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
κλ
|
| 2 |
+
δημ
|
| 3 |
+
χλμ
|
| 4 |
+
σ.τ.ε
|
| 5 |
+
ό.π
|
| 6 |
+
δρχ
|
| 7 |
+
κων
|
| 8 |
+
χρ
|
| 9 |
+
π.α
|
| 10 |
+
ριχ
|
| 11 |
+
π.χρ
|
| 12 |
+
υγ
|
| 13 |
+
tel
|
| 14 |
+
ζ
|
| 15 |
+
ο.π
|
| 16 |
+
βασ
|
| 17 |
+
γλ
|
| 18 |
+
n.c
|
| 19 |
+
d.j
|
| 20 |
+
σωκ
|
| 21 |
+
π
|
| 22 |
+
ιω
|
| 23 |
+
αχ
|
| 24 |
+
βα
|
| 25 |
+
γερ
|
| 26 |
+
εκδ
|
| 27 |
+
κλπ
|
| 28 |
+
φ
|
| 29 |
+
ελ
|
| 30 |
+
οσ
|
| 31 |
+
α
|
| 32 |
+
σελ
|
| 33 |
+
ευ
|
| 34 |
+
ε.έ
|
| 35 |
+
ρ
|
| 36 |
+
ε.τ.α
|
| 37 |
+
λ
|
| 38 |
+
εβ
|
| 39 |
+
θρ
|
| 40 |
+
ν
|
| 41 |
+
βλ
|
| 42 |
+
ηλ
|
| 43 |
+
γ
|
| 44 |
+
αρ
|
| 45 |
+
π.χ
|
| 46 |
+
ε.μ
|
| 47 |
+
κ.μ
|
| 48 |
+
α.ε
|
| 49 |
+
μιχ
|
| 50 |
+
δισ
|
| 51 |
+
ολ
|
| 52 |
+
μ
|
| 53 |
+
κ.ά
|
| 54 |
+
κ
|
| 55 |
+
δηλ
|
| 56 |
+
ε.α.χ
|
| 57 |
+
πρ
|
| 58 |
+
αγ
|
| 59 |
+
μac
|
| 60 |
+
κ.ο.κ
|
| 61 |
+
λ.χ
|
| 62 |
+
θ
|
| 63 |
+
αδσ
|
| 64 |
+
εκατ
|
| 65 |
+
δρη
|
| 66 |
+
εμμ
|
| 67 |
+
δ
|
| 68 |
+
δεκ
|
| 69 |
+
σ.σ
|
| 70 |
+
55ο
|
| 71 |
+
κκ
|
| 72 |
+
αδ
|
| 73 |
+
τ.μ
|
| 74 |
+
ε.ε
|
| 75 |
+
μ.χ
|
| 76 |
+
ν.μ
|
| 77 |
+
κτλ
|
| 78 |
+
δολ
|
| 79 |
+
κ.ά.π
|
| 80 |
+
αγγ
|
| 81 |
+
μ.κ
|
| 82 |
+
δ.σ
|
| 83 |
+
μπ
|
| 84 |
+
έκδ
|
| 85 |
+
ι
|
| 86 |
+
v
|
| 87 |
+
χαρ
|
| 88 |
+
γρ
|
| 89 |
+
μ.μ.ε
|
| 90 |
+
σχ
|
| 91 |
+
λεκ
|
| 92 |
+
σπ
|
| 93 |
+
πλι
|
| 94 |
+
αθ
|
| 95 |
+
χ
|
| 96 |
+
τζ
|
| 97 |
+
τρισ
|
| 98 |
+
στ
|
| 99 |
+
ευθ
|
| 100 |
+
μ.μ
|