{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "451d890e", "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 2, "id": "eb0e4037", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3451cb7648e349cbbbdea3b672207ef7", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/1.68k [00:00 \"../3gram.arpa\"" ] }, { "cell_type": "code", "execution_count": 9, "id": "c2c8c8ce", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "=== 1/5 Counting and sorting n-grams ===\n", "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n", "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", "****************************************************************************************************\n", "Unigram tokens 32852369 types 1308846\n", "=== 2/5 Calculating and sorting adjusted counts ===\n", "Chain sizes: 1:15706152 2:14474877952 3:27140399104 4:43424632832 5:63327596544\n", "Statistics:\n", "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n", "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n", "3 23789023 D1=0.910002 D2=1.27136 D3+=1.38596\n", "4 28332665 D1=0.955371 D2=1.42566 D3+=1.4677\n", "5 30063763 D1=0.898851 D2=1.71714 D3+=1.29889\n", "Memory estimate for binary LM:\n", "type MB\n", "probing 2032 assuming -p 1.5\n", "probing 2408 assuming -r models -p 1.5\n", "trie 1058 without quantization\n", "trie 613 assuming -q 8 -b 8 quantization \n", "trie 921 assuming -a 22 array pointer compression\n", "trie 476 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n", "=== 3/5 Calculating and sorting initial probabilities ===\n", "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n", "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", "####################################################################################################\n", "=== 4/5 Calculating and writing order-interpolated probabilities ===\n", "Chain sizes: 1:15706140 2:203523824 3:475780460 4:679983960 5:841785364\n", "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", "####################################################################################################\n", "=== 5/5 Writing ARPA model ===\n", "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n", "****************************************************************************************************\n", "Name:lmplz\tVmPeak:145104204 kB\tVmRSS:38296 kB\tRSSMax:26419104 kB\tuser:89.0779\tsys:42.0565\tCPU:131.134\treal:97.4678\n" ] } ], "source": [ "!../../kenlm/build/bin/lmplz -o 5 <\"kenlm_text_te.txt\" > \"../5gram.arpa\"" ] }, { "cell_type": "code", "execution_count": null, "id": "62b727b7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "id": "c27f1ef3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 19.1 s, sys: 3.81 s, total: 22.9 s\n", "Wall time: 22.9 s\n" ] } ], "source": [ "%%time\n", "with open(\"../3gram.arpa\", \"r\") as read_file, open(\"../3gram_correct.arpa\", \"w\") as write_file:\n", " has_added_eos = False\n", " for line in read_file:\n", " if not has_added_eos and \"ngram 1=\" in line:\n", " count=line.strip().split(\"=\")[-1]\n", " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n", " elif not has_added_eos and \"\" in line:\n", " write_file.write(line)\n", " write_file.write(line.replace(\"\", \"\"))\n", " has_added_eos = True\n", " else:\n", " write_file.write(line)" ] }, { "cell_type": "code", "execution_count": 11, "id": "8c8d963b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 5s, sys: 12.8 s, total: 1min 18s\n", "Wall time: 1min 18s\n" ] } ], "source": [ "%%time\n", "with open(\"../5gram.arpa\", \"r\") as read_file, open(\"../5gram_correct.arpa\", \"w\") as write_file:\n", " has_added_eos = False\n", " for line in read_file:\n", " if not has_added_eos and \"ngram 1=\" in line:\n", " count=line.strip().split(\"=\")[-1]\n", " write_file.write(line.replace(f\"{count}\", f\"{int(count)+1}\"))\n", " elif not has_added_eos and \"\" in line:\n", " write_file.write(line)\n", " write_file.write(line.replace(\"\", \"\"))\n", " has_added_eos = True\n", " else:\n", " write_file.write(line)" ] }, { "cell_type": "code", "execution_count": null, "id": "9447691c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "95d50071", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }