# m2m100_418M_br_fr / dl_opus.yaml
# Uploaded by lgrobol ("Upload dl_opus.yaml", revision f1d5efc)
# Settings shared by all steps.
# NOTE(review): the step types used in this file match OpusFilter's
# configuration format; presumably the relative file names used by the steps
# are resolved inside this directory — confirm against the tool's docs.
common:
  output_directory: local/opus
steps:
# The quality of WikiMatrix is really bad for this pair: very poor alignment.
# - type: opus_read
#   parameters:
#     corpus_name: WikiMatrix
#     source_language: br
#     target_language: fr
#     release: latest
#     preprocessing: raw
#     src_output: wiki.br.gz
#     tgt_output: wiki.fr.gz
# The quality of ccmatrix is really bad for this pair: very few usable Breton
# sentences.
# NOTE(review): this comment says "ccmatrix" but the disabled step below reads
# MultiCCAligned — confirm which corpus is meant.
# - type: opus_read
#   parameters:
#     corpus_name: MultiCCAligned
#     source_language: br
#     target_language: fr
#     release: latest
#     preprocessing: raw
#     src_output: cc.br.gz
#     tgt_output: cc.fr.gz
# Read the Breton-French (br-fr) OfisPublik corpus from OPUS (latest release,
# "raw" preprocessing variant) and write one file per language.
- type: opus_read
  parameters:
    corpus_name: OfisPublik
    source_language: br
    target_language: fr
    release: latest
    preprocessing: raw
    src_output: ofis.br.gz
    tgt_output: ofis.fr.gz
    # presumably skips the interactive download confirmation — TODO confirm
    suppress_prompts: true
# Read the Breton-French (br-fr) OpenSubtitles corpus from OPUS (latest
# release, "raw" preprocessing variant) and write one file per language.
# These files are later the only inputs of the "dubious" set.
- type: opus_read
  parameters:
    corpus_name: OpenSubtitles
    source_language: br
    target_language: fr
    release: latest
    preprocessing: raw
    src_output: ost.br.gz
    tgt_output: ost.fr.gz
    # presumably skips the interactive download confirmation — TODO confirm
    suppress_prompts: true
# Read the Breton-French (br-fr) Tatoeba corpus from OPUS (latest release,
# "raw" preprocessing variant) and write one file per language.
- type: opus_read
  parameters:
    corpus_name: Tatoeba
    source_language: br
    target_language: fr
    release: latest
    preprocessing: raw
    src_output: tatoeba.br.gz
    tgt_output: tatoeba.fr.gz
    # presumably skips the interactive download confirmation — TODO confirm
    suppress_prompts: true
# - type: opus_read
#   parameters:
#     corpus_name: wikimedia
#     source_language: br
#     target_language: fr
#     release: latest
#     preprocessing: raw
#     src_output: wikimedia.br.gz
#     tgt_output: wikimedia.fr.gz
#     suppress_prompts: true
# - type: opus_read
#   parameters:
#     corpus_name: Mozilla-I10n
#     source_language: br
#     target_language: fr
#     release: latest
#     preprocessing: raw
#     src_output: mozilla.br.gz
#     tgt_output: mozilla.fr.gz
#     suppress_prompts: true
# - type: opus_read
#   parameters:
#     corpus_name: KDE4
#     source_language: br
#     target_language: fr
#     release: latest
#     preprocessing: raw
#     src_output: kde.br.gz
#     tgt_output: kde.fr.gz
#     suppress_prompts: true
# - type: opus_read
#   parameters:
#     corpus_name: GNOME
#     source_language: br
#     target_language: fr
#     release: latest
#     preprocessing: raw
#     src_output: gnome.br.gz
#     tgt_output: gnome.fr.gz
#     suppress_prompts: true
# Breton side of the "good" set: OfisPublik followed by Tatoeba.
# Keep the input order in sync with the step that builds good.fr.gz so that
# the two outputs stay parallel line by line.
- type: concatenate
  parameters:
    inputs:
    - ofis.br.gz
    - tatoeba.br.gz
    output: good.br.gz
# French side of the "good" set: OfisPublik followed by Tatoeba.
# Keep the input order in sync with the step that builds good.br.gz so that
# the two outputs stay parallel line by line.
- type: concatenate
  parameters:
    inputs:
    - ofis.fr.gz
    - tatoeba.fr.gz
    output: good.fr.gz
# Breton side of the "dubious" set. Only OpenSubtitles is active; the
# commented-out inputs belong to opus_read steps that are disabled above.
# Keep the active inputs in sync with the step that builds dubious.fr.gz.
- type: concatenate
  parameters:
    inputs:
    # - wiki.br.gz
    # - cc.br.gz
    # - wikimedia.br.gz
    # - gnome.br.gz
    # - kde.br.gz
    # - mozilla.br.gz
    - ost.br.gz
    output: dubious.br.gz
# French side of the "dubious" set. Only OpenSubtitles is active; the
# commented-out inputs belong to opus_read steps that are disabled above.
# Keep the active inputs in sync with the step that builds dubious.br.gz.
- type: concatenate
  parameters:
    inputs:
    # - wiki.fr.gz
    # - cc.fr.gz
    # - wikimedia.fr.gz
    # - gnome.fr.gz
    # - kde.fr.gz
    # - mozilla.fr.gz
    - ost.fr.gz
    output: dubious.fr.gz
# Breton side of the data used to train the word alignment: the dubious set
# followed by the good set (same order as the align_train.fr.gz step).
- type: concatenate
  parameters:
    inputs:
    - dubious.br.gz
    - good.br.gz
    output: align_train.br.gz
# French side of the data used to train the word alignment: the dubious set
# followed by the good set (same order as the align_train.br.gz step).
- type: concatenate
  parameters:
    inputs:
    - dubious.fr.gz
    - good.fr.gz
    output: align_train.fr.gz
# Length-filter the alignment training data before it is passed to
# train_alignment: lengths are measured in words, with bounds 1 and 128.
- type: filter
  parameters:
    inputs:
    - align_train.br.gz
    - align_train.fr.gz
    outputs:
    - align_train-filtered.br.gz
    - align_train-filtered.fr.gz
    filters:
    - LengthFilter:
        unit: word
        min_length: 1
        max_length: 128
# Train word-alignment priors on the length-filtered data. The resulting
# alignment.priors file is the one given to the WordAlignFilter that cleans
# the dubious set.
- type: train_alignment
  parameters:
    src_data: align_train-filtered.br.gz
    tgt_data: align_train-filtered.fr.gz
    output: alignment.priors
    # Nested option mapping of the step itself (not a duplicate of the outer
    # `parameters` key); left empty, so nothing is overridden here.
    parameters: {}
# TODO: dedup and more aggressive filtering
# Clean the dubious set: a word-based length filter with bounds 4 and 128
# (a higher minimum than the 1 used for the alignment training data), then a
# word-alignment filter that uses the priors written by train_alignment.
- type: filter
  parameters:
    inputs:
    - dubious.br.gz
    - dubious.fr.gz
    outputs:
    - dubious-filtered.br.gz
    - dubious-filtered.fr.gz
    filters:
    - LengthFilter:
        unit: word
        min_length: 4
        max_length: 128
    - WordAlignFilter:
        priors: alignment.priors
# Breton side of the merged corpus: the filtered dubious set followed by the
# good set (same order as the all.fr.gz step).
- type: concatenate
  parameters:
    inputs:
    - dubious-filtered.br.gz
    - good.br.gz
    output: all.br.gz
# French side of the merged corpus: the filtered dubious set followed by the
# good set (same order as the all.br.gz step).
- type: concatenate
  parameters:
    inputs:
    - dubious-filtered.fr.gz
    - good.fr.gz
    output: all.fr.gz
# - type: remove_duplicates
#   parameters:
#     inputs:
#     - all.br.gz
#     outputs:
#     - dedup.br.gz
# - type: remove_duplicates
#   parameters:
#     inputs:
#     - all.fr.gz
#     outputs:
#     - dedup.fr.gz
# Final cleaning pass over the merged corpus (all.* -> filtered.*).
# The filter list carries the anchor &myfilters; no alias is visible in this
# part of the file, so the anchor is kept in case it is referenced elsewhere.
- type: filter
  parameters:
    inputs:
    - all.br.gz
    - all.fr.gz
    outputs:
    - filtered.br.gz
    - filtered.fr.gz
    filters: &myfilters
    # Word-based length bounds.
    - LengthFilter:
        unit: word
        min_length: 1
        max_length: 128
    # Word-based length ratio between the two sides, threshold 3.
    - LengthRatioFilter:
        unit: word
        threshold: 3
    # The next three filters are given empty mappings, i.e. no explicit
    # settings (presumably the tool's defaults apply — TODO confirm).
    - NonZeroNumeralsFilter: {}
    - AlphabetRatioFilter: {}
    - SimilarityFilter: {}
    - RepetitionFilter:
        threshold: 3
        min_length: 5
        max_length: 128