Xenova's picture
Xenova HF staff
Upload 34 files
5a62850 verified
raw
history blame contribute delete
No virus
46.8 kB
"use strict";(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[165],{9191:function(e,t,s){s.d(t,{t2:function(){return AutoTokenizer}});var n=s(1802),r=s(4320),i=s(4297),o=s(6001),a=s(6106),l=s(8828);async function loadTokenizer(e,t){let s=await Promise.all([(0,r.yM)(e,"tokenizer.json",!0,t),(0,r.yM)(e,"tokenizer_config.json",!0,t)]);return null!==t.legacy&&(s[1].legacy=t.legacy),s}function createPattern(e,t=!0){if(void 0!==e.Regex){let t=e.Regex.replace(/\\([#&~])/g,"$1");return RegExp(t,"gu")}if(void 0===e.String)return console.warn("Unknown pattern type:",e),null;{let s=(0,n.hr)(e.String);return RegExp(t?s:`(${s})`,"gu")}}function objectToMap(e){return new Map(Object.entries(e))}function prepareTensorForDecode(e){let t=e.dims;switch(t.length){case 1:return e.tolist();case 2:if(1!==t[0])throw Error("Unable to decode tensor with `batch size !== 1`. Use `tokenizer.batch_decode(...)` for batched inputs.");return e.tolist()[0];default:throw Error(`Expected tensor to have 1-2 dimensions, got ${t.length}.`)}}function clean_up_tokenization(e){return e.replace(/ \./g,".").replace(/ \?/g,"?").replace(/ \!/g,"!").replace(/ ,/g,",").replace(/ \' /g,"'").replace(/ n\'t/g,"n't").replace(/ \'m/g,"'m").replace(/ \'s/g,"'s").replace(/ \'ve/g,"'ve").replace(/ \'re/g,"'re")}function remove_accents(e){return e.replace(/[\u0300-\u036f]/g,"")}let c="\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E";let AddedToken=class AddedToken{constructor(e){this.content=e.content,this.id=e.id,this.single_word=e.single_word??!1,this.lstrip=e.lstrip??!1,this.rstrip=e.rstrip??!1,this.special=e.special??!1,this.normalized=e.normalized??null}};let TokenizerModel=class TokenizerModel extends n.Ag{constructor(e){super(),this.config=e,this.vocab=[],this.tokens_to_ids=new Map,this.unk_token_id=void 0,this.unk_token=void 0,this.end_of_word_suffix=void 0,this.fuse_unk=this.config.fuse_unk??!1}static fromConfig(e,...t){switch(e.type){case"WordPiece":return new WordPieceTokenizer(e);case"Unigram":return new Unigram(e,...t);case"BPE":return new BPE(e);default:if(e.vocab)return new LegacyTokenizerModel(e,...t);throw Error(`Unknown TokenizerModel type: ${e.type}`)}}_call(e){let t=this.encode(e);return this.fuse_unk&&(t=function(e,t,s){let n=[],r=0;for(;r<e.length;){if(n.push(e[r]),(s.get(e[r])??t)!==t){++r;continue}for(;r<e.length&&(s.get(e[r])??t)===t;)++r}return n}(t,this.unk_token_id,this.tokens_to_ids)),t}encode(e){throw Error("encode should be implemented in subclass.")}convert_tokens_to_ids(e){return e.map(e=>this.tokens_to_ids.get(e)??this.unk_token_id)}convert_ids_to_tokens(e){return e.map(e=>this.vocab[e]??this.unk_token)}};let WordPieceTokenizer=class WordPieceTokenizer extends TokenizerModel{constructor(e){for(let[t,s]of(super(e),this.tokens_to_ids=objectToMap(e.vocab),this.unk_token_id=this.tokens_to_ids.get(e.unk_token),this.unk_token=e.unk_token,this.max_input_chars_per_word=e.max_input_chars_per_word??100,this.vocab=Array(this.tokens_to_ids.size),this.tokens_to_ids))this.vocab[s]=t}encode(e){let t=[];for(let s of e){let e=[...s];if(e.length>this.max_input_chars_per_word){t.push(this.unk_token);continue}let n=!1,r=0,i=[];for(;r<e.length;){let t=e.length,s=null;for(;r<t;){let n=e.slice(r,t).join("");if(r>0&&(n=this.config.continuing_subword_prefix+n),this.tokens_to_ids.has(n)){s=n;break}--t}if(null===s){n=!0;break}i.push(s),r=t}n?t.push(this.unk_token):t.push(...i)}return t}};let Unigram=class Unigram extends TokenizerModel{constructor(e,t){super(e);let s=e.vocab.length;this.vocab=Array(s),this.scores=Array(s);for(let t=0;t<s;++t){let s=e.vocab[t];this.vocab[t]=s[0],this.scores[t]=s[1]}this.unk_token_id=e.unk_id,this.unk_token=this.vocab[e.unk_id],this.tokens_to_ids=new Map(this.vocab.map((e,t)=>[e,t])),this.bosToken=" ",this.bosTokenId=this.tokens_to_ids.get(this.bosToken),this.eosToken=t.eos_token,this.eosTokenId=this.tokens_to_ids.get(this.eosToken),this.unkToken=this.vocab[this.unk_token_id],this.minScore=(0,i.VV)(this.scores)[0],this.unkScore=this.minScore-10,this.scores[this.unk_token_id]=this.unkScore,this.trie=new a.GA,this.trie.extend(this.vocab),this.fuse_unk=!0}populateNodes(e){let t=e.sentence,s=t.length,n=0;for(;n<s;){let s=!1,r=[];for(let i of this.trie.commonPrefixSearch(t.slice(n))){r.push(i);let t=this.tokens_to_ids.get(i),o=this.scores[t],a=i.length;e.insert(n,a,o,t),s||1!==a||(s=!0)}s||e.insert(n,1,this.unkScore,this.unk_token_id),n+=1}}tokenize(e){let t=new a.pQ(e,this.bosTokenId,this.eosTokenId);return this.populateNodes(t),t.tokens()}encode(e){let t=[];for(let s of e){let e=this.tokenize(s);t.push(...e)}return t}};let d=(()=>{let e=[...Array.from({length:94},(e,t)=>t+33),...Array.from({length:12},(e,t)=>t+161),...Array.from({length:82},(e,t)=>t+174)],t=e.slice(),s=0;for(let n=0;n<256;++n)e.includes(n)||(e.push(n),t.push(256+s),s+=1);let n=t.map(e=>String.fromCharCode(e));return Object.fromEntries(e.map((e,t)=>[e,n[t]]))})(),h=(0,n.$2)(d);let BPE=class BPE extends TokenizerModel{constructor(e){for(let[t,s]of(super(e),this.BPE_SPLIT_TOKEN=" ",this.tokens_to_ids=objectToMap(e.vocab),this.unk_token_id=this.tokens_to_ids.get(e.unk_token),this.unk_token=e.unk_token,this.vocab=Array(this.tokens_to_ids.size),this.tokens_to_ids))this.vocab[s]=t;this.bpe_ranks=new Map(e.merges.map((e,t)=>[e,t])),this.merges=e.merges.map(e=>e.split(this.BPE_SPLIT_TOKEN)),this.end_of_word_suffix=e.end_of_word_suffix,this.continuing_subword_suffix=e.continuing_subword_suffix??null,this.byte_fallback=this.config.byte_fallback??!1,this.byte_fallback&&(this.text_encoder=new TextEncoder),this.cache=new Map}bpe(e){if(0===e.length)return[];let t=this.cache.get(e);if(void 0!==t)return t;let s=Array.from(e);this.end_of_word_suffix&&(s[s.length-1]+=this.end_of_word_suffix);let n=[];if(s.length>1){let e=new a.Z3((e,t)=>e.score<t.score),t={token:s[0],bias:0,prev:null,next:null},r=t;for(let t=1;t<s.length;++t){let n={bias:t/s.length,token:s[t],prev:r,next:null};r.next=n,this._add_node(e,r),r=n}for(;!e.isEmpty();){let s=e.pop();if(s.deleted||!s.next||s.next.deleted)continue;if(s.deleted=!0,s.next.deleted=!0,s.prev){let e={...s.prev};s.prev.deleted=!0,s.prev=e,e.prev?e.prev.next=e:t=e}let n={token:s.token+s.next.token,bias:s.bias,prev:s.prev,next:s.next.next};n.prev?(n.prev.next=n,this._add_node(e,n.prev)):t=n,n.next&&(n.next.prev=n,this._add_node(e,n))}for(let e=t;null!==e;e=e.next)n.push(e.token)}else n=s;if(this.continuing_subword_suffix)for(let e=0;e<n.length-1;++e)n[e]+=this.continuing_subword_suffix;return this.cache.set(e,n),n}_add_node(e,t){let s=this.bpe_ranks.get(t.token+this.BPE_SPLIT_TOKEN+t.next.token);void 0!==s&&(t.score=s+t.bias,e.push(t))}encode(e){let t=[];for(let s of e){let e=this.bpe(s);for(let s of e)this.tokens_to_ids.has(s)?t.push(s):this.byte_fallback?t.push(...Array.from(this.text_encoder.encode(s)).map(e=>`<0x${e.toString(16).toUpperCase().padStart(2,"0")}>`)):t.push(this.unk_token)}return t}};let LegacyTokenizerModel=class LegacyTokenizerModel extends TokenizerModel{constructor(e,t){for(let[s,n]of(super(e),this.tokens_to_ids=objectToMap(t.target_lang?e.vocab[t.target_lang]:e.vocab),this.bos_token=t.bos_token,this.bos_token_id=this.tokens_to_ids.get(this.bos_token),this.eos_token=t.eos_token,this.eos_token_id=this.tokens_to_ids.get(this.eos_token),this.pad_token=t.pad_token,this.pad_token_id=this.tokens_to_ids.get(this.pad_token),this.unk_token=t.unk_token,this.unk_token_id=this.tokens_to_ids.get(this.unk_token),this.vocab=Array(this.tokens_to_ids.size),this.tokens_to_ids))this.vocab[n]=s}encode(e){return e}};let Normalizer=class Normalizer extends n.Ag{constructor(e){super(),this.config=e}static fromConfig(e){if(null===e)return null;switch(e.type){case"BertNormalizer":return new BertNormalizer(e);case"Precompiled":return new Precompiled(e);case"Sequence":return new NormalizerSequence(e);case"Replace":return new Replace(e);case"NFC":return new NFC(e);case"NFKC":return new NFKC(e);case"NFKD":return new NFKD(e);case"Strip":return new StripNormalizer(e);case"StripAccents":return new StripAccents(e);case"Lowercase":return new Lowercase(e);case"Prepend":return new Prepend(e);default:throw Error(`Unknown Normalizer type: ${e.type}`)}}normalize(e){throw Error("normalize should be implemented in subclass.")}_call(e){return this.normalize(e)}};let Replace=class Replace extends Normalizer{normalize(e){let t=createPattern(this.config.pattern);return null===t?e:e.replaceAll(t,this.config.content)}};let NFC=class NFC extends Normalizer{normalize(e){return e=e.normalize("NFC")}};let NFKC=class NFKC extends Normalizer{normalize(e){return e=e.normalize("NFKC")}};let NFKD=class NFKD extends Normalizer{normalize(e){return e=e.normalize("NFKD")}};let StripNormalizer=class StripNormalizer extends Normalizer{normalize(e){return this.config.strip_left&&this.config.strip_right?e=e.trim():(this.config.strip_left&&(e=e.trimStart()),this.config.strip_right&&(e=e.trimEnd())),e}};let StripAccents=class StripAccents extends Normalizer{normalize(e){return e=remove_accents(e)}};let Lowercase=class Lowercase extends Normalizer{normalize(e){return e=e.toLowerCase()}};let Prepend=class Prepend extends Normalizer{normalize(e){return e=this.config.prepend+e}};let NormalizerSequence=class NormalizerSequence extends Normalizer{constructor(e){super(e),this.normalizers=e.normalizers.map(e=>Normalizer.fromConfig(e))}normalize(e){return this.normalizers.reduce((e,t)=>t.normalize(e),e)}};let BertNormalizer=class BertNormalizer extends Normalizer{_tokenize_chinese_chars(e){let t=[];for(let s=0;s<e.length;++s){let n=e[s],r=n.charCodeAt(0);this._is_chinese_char(r)?(t.push(" "),t.push(n),t.push(" ")):t.push(n)}return t.join("")}_is_chinese_char(e){return e>=19968&&e<=40959||e>=13312&&e<=19903||e>=131072&&e<=173791||e>=173824&&e<=177983||e>=177984&&e<=178207||e>=178208&&e<=183983||e>=63744&&e<=64255||e>=194560&&e<=195103}stripAccents(e){return e.normalize("NFD").replace(/[\u0300-\u036f]/g,"")}_is_control(e){switch(e){case" ":case"\n":case"\r":return!1;default:return/^\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}$/u.test(e)}}_clean_text(e){let t=[];for(let s of e){let e=s.charCodeAt(0);0===e||65533===e||this._is_control(s)||(/^\s$/.test(s)?t.push(" "):t.push(s))}return t.join("")}normalize(e){return this.config.clean_text&&(e=this._clean_text(e)),this.config.handle_chinese_chars&&(e=this._tokenize_chinese_chars(e)),this.config.lowercase?(e=e.toLowerCase(),!1!==this.config.strip_accents&&(e=this.stripAccents(e))):this.config.strip_accents&&(e=this.stripAccents(e)),e}};let PreTokenizer=class PreTokenizer extends n.Ag{static fromConfig(e){if(null===e)return null;switch(e.type){case"BertPreTokenizer":return new BertPreTokenizer(e);case"Sequence":return new PreTokenizerSequence(e);case"WhitespaceSplit":return new WhitespaceSplit(e);case"Metaspace":return new MetaspacePreTokenizer(e);case"ByteLevel":return new ByteLevelPreTokenizer(e);case"Split":return new SplitPreTokenizer(e);case"Punctuation":return new PunctuationPreTokenizer(e);case"Digits":return new DigitsPreTokenizer(e);case"Replace":return new ReplacePreTokenizer(e);default:throw Error(`Unknown PreTokenizer type: ${e.type}`)}}pre_tokenize_text(e,t){throw Error("pre_tokenize_text should be implemented in subclass.")}pre_tokenize(e,t){return(Array.isArray(e)?e.map(e=>this.pre_tokenize_text(e,t)):this.pre_tokenize_text(e,t)).flat()}_call(e,t){return this.pre_tokenize(e,t)}};let BertPreTokenizer=class BertPreTokenizer extends PreTokenizer{constructor(e){super(),this.pattern=RegExp(`[^\\s${c}]+|[${c}]`,"gu")}pre_tokenize_text(e,t){return e.trim().match(this.pattern)||[]}};let ByteLevelPreTokenizer=class ByteLevelPreTokenizer extends PreTokenizer{constructor(e){super(),this.config=e,this.add_prefix_space=this.config.add_prefix_space,this.trim_offsets=this.config.trim_offsets,this.use_regex=this.config.use_regex??!0,this.pattern=/'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu,this.byte_encoder=d,this.text_encoder=new TextEncoder}pre_tokenize_text(e,t){this.add_prefix_space&&!e.startsWith(" ")&&(e=" "+e);let s=this.use_regex?e.match(this.pattern)||[]:[e];return s.map(e=>Array.from(this.text_encoder.encode(e),e=>this.byte_encoder[e]).join(""))}};let SplitPreTokenizer=class SplitPreTokenizer extends PreTokenizer{constructor(e){super(),this.config=e,this.pattern=createPattern(this.config.pattern,this.config.invert)}pre_tokenize_text(e,t){return null===this.pattern?[]:this.config.invert?e.match(this.pattern)||[]:function(e,t){let s=[],n=0;for(let r of e.matchAll(t)){let t=r[0];n<r.index&&s.push(e.slice(n,r.index)),t.length>0&&s.push(t),n=r.index+t.length}return n<e.length&&s.push(e.slice(n)),s}(e,this.pattern)}};let PunctuationPreTokenizer=class PunctuationPreTokenizer extends PreTokenizer{constructor(e){super(),this.config=e,this.pattern=RegExp(`[^${c}]+|[${c}]+`,"gu")}pre_tokenize_text(e,t){return e.match(this.pattern)||[]}};let DigitsPreTokenizer=class DigitsPreTokenizer extends PreTokenizer{constructor(e){super(),this.config=e;let t=`[^\\d]+|\\d${this.config.individual_digits?"":"+"}`;this.pattern=RegExp(t,"gu")}pre_tokenize_text(e,t){return e.match(this.pattern)||[]}};let PostProcessor=class PostProcessor extends n.Ag{constructor(e){super(),this.config=e}static fromConfig(e){if(null===e)return null;switch(e.type){case"TemplateProcessing":return new TemplateProcessing(e);case"ByteLevel":return new ByteLevelPostProcessor(e);case"RobertaProcessing":return new RobertaProcessing(e);case"BertProcessing":return new BertProcessing(e);default:throw Error(`Unknown PostProcessor type: ${e.type}`)}}post_process(e,...t){throw Error("post_process should be implemented in subclass.")}_call(e,...t){return this.post_process(e,...t)}};let BertProcessing=class BertProcessing extends PostProcessor{constructor(e){super(e),this.cls=e.cls[0],this.sep=e.sep[0]}post_process(e,t=null,{add_special_tokens:s=!0}={}){s&&(e=(0,n.eG)([this.cls],e,[this.sep]));let r=Array(e.length).fill(0);if(null!==t){let i=s&&this instanceof RobertaProcessing?[this.sep]:[],o=s?[this.sep]:[];e=(0,n.eG)(e,i,t,o),r=(0,n.eG)(r,Array(t.length+i.length+o.length).fill(1))}return{tokens:e,token_type_ids:r}}};let RobertaProcessing=class RobertaProcessing extends BertProcessing{};let TemplateProcessing=class TemplateProcessing extends PostProcessor{constructor(e){super(e),this.single=e.single,this.pair=e.pair}post_process(e,t=null,{add_special_tokens:s=!0}={}){let r=null===t?this.single:this.pair,i=[],o=[];for(let a of r)"SpecialToken"in a?s&&(i.push(a.SpecialToken.id),o.push(a.SpecialToken.type_id)):"Sequence"in a&&("A"===a.Sequence.id?(i=(0,n.eG)(i,e),o=(0,n.eG)(o,Array(e.length).fill(a.Sequence.type_id))):"B"===a.Sequence.id&&(i=(0,n.eG)(i,t),o=(0,n.eG)(o,Array(t.length).fill(a.Sequence.type_id))));return{tokens:i,token_type_ids:o}}};let ByteLevelPostProcessor=class ByteLevelPostProcessor extends PostProcessor{post_process(e,t=null){return t&&(e=(0,n.eG)(e,t)),{tokens:e}}};let Decoder=class Decoder extends n.Ag{constructor(e){super(),this.config=e,this.added_tokens=[],this.end_of_word_suffix=null,this.trim_offsets=e.trim_offsets}static fromConfig(e){if(null===e)return null;switch(e.type){case"WordPiece":return new WordPieceDecoder(e);case"Metaspace":return new MetaspaceDecoder(e);case"ByteLevel":return new ByteLevelDecoder(e);case"Replace":return new ReplaceDecoder(e);case"ByteFallback":return new ByteFallback(e);case"Fuse":return new FuseDecoder(e);case"Strip":return new StripDecoder(e);case"Sequence":return new DecoderSequence(e);case"CTC":return new CTCDecoder(e);case"BPEDecoder":return new BPEDecoder(e);default:throw Error(`Unknown Decoder type: ${e.type}`)}}_call(e){return this.decode(e)}decode(e){return this.decode_chain(e).join("")}decode_chain(e){throw Error("`decode_chain` should be implemented in subclass.")}};let ReplaceDecoder=class ReplaceDecoder extends Decoder{decode_chain(e){let t=createPattern(this.config.pattern);return null===t?e:e.map(e=>e.replaceAll(t,this.config.content))}};let ByteFallback=class ByteFallback extends Decoder{constructor(e){super(e),this.text_decoder=new TextDecoder}decode_chain(e){let t=[],s=[];for(let n of e){let e=null;if(6===n.length&&n.startsWith("<0x")&&n.endsWith(">")){let t=parseInt(n.slice(3,5),16);isNaN(t)||(e=t)}if(null!==e)s.push(e);else{if(s.length>0){let e=this.text_decoder.decode(Uint8Array.from(s));t.push(e),s=[]}t.push(n)}}if(s.length>0){let e=this.text_decoder.decode(Uint8Array.from(s));t.push(e),s=[]}return t}};let FuseDecoder=class FuseDecoder extends Decoder{decode_chain(e){return[e.join("")]}};let StripDecoder=class StripDecoder extends Decoder{constructor(e){super(e),this.content=this.config.content,this.start=this.config.start,this.stop=this.config.stop}decode_chain(e){return e.map(e=>{let t=0;for(let s=0;s<this.start;++s){if(e[s]===this.content){t=s+1;continue}break}let s=e.length;for(let t=0;t<this.stop;++t){let n=e.length-t-1;if(e[n]===this.content){s=n;continue}break}return e.slice(t,s)})}};let WordPieceDecoder=class WordPieceDecoder extends Decoder{constructor(e){super(e),this.cleanup=e.cleanup}decode_chain(e){return e.map((e,t)=>(0!==t&&(e=e.startsWith(this.config.prefix)?e.replace(this.config.prefix,""):" "+e),this.cleanup&&(e=clean_up_tokenization(e)),e))}};let ByteLevelDecoder=class ByteLevelDecoder extends Decoder{constructor(e){super(e),this.byte_decoder=h,this.text_decoder=new TextDecoder("utf-8",{fatal:!1,ignoreBOM:!0}),this.end_of_word_suffix=null}convert_tokens_to_string(e){let t=e.join(""),s=new Uint8Array([...t].map(e=>this.byte_decoder[e])),n=this.text_decoder.decode(s);return n}decode_chain(e){let t=[],s=[];for(let n of e)void 0!==this.added_tokens.find(e=>e.content===n)?(s.length>0&&(t.push(this.convert_tokens_to_string(s)),s=[]),t.push(n)):s.push(n);return s.length>0&&t.push(this.convert_tokens_to_string(s)),t}};let CTCDecoder=class CTCDecoder extends Decoder{constructor(e){super(e),this.pad_token=this.config.pad_token,this.word_delimiter_token=this.config.word_delimiter_token,this.cleanup=this.config.cleanup}convert_tokens_to_string(e){if(0===e.length)return"";let t=[e[0]];for(let s=1;s<e.length;++s)e[s]!==t.at(-1)&&t.push(e[s]);let s=t.filter(e=>e!==this.pad_token),n=s.join("");return this.cleanup&&(n=clean_up_tokenization(n).replaceAll(this.word_delimiter_token," ").trim()),n}decode_chain(e){return[this.convert_tokens_to_string(e)]}};let DecoderSequence=class DecoderSequence extends Decoder{constructor(e){super(e),this.decoders=e.decoders.map(e=>Decoder.fromConfig(e))}decode_chain(e){return this.decoders.reduce((e,t)=>t.decode_chain(e),e)}};let BPEDecoder=class BPEDecoder extends Decoder{constructor(e){super(e),this.suffix=this.config.suffix}decode_chain(e){return e.map((t,s)=>t.replaceAll(this.suffix,s===e.length-1?"":" "))}};let VitsDecoder=class VitsDecoder extends Decoder{decode_chain(e){let t="";for(let s=1;s<e.length;s+=2)t+=e[s];return[t]}};let MetaspacePreTokenizer=class MetaspacePreTokenizer extends PreTokenizer{constructor(e){super(),this.addPrefixSpace=e.add_prefix_space,this.replacement=e.replacement,this.strRep=e.str_rep||this.replacement,this.prepend_scheme=e.prepend_scheme??"always"}pre_tokenize_text(e,{section_index:t}={}){let s=e.replaceAll(" ",this.strRep);return this.addPrefixSpace&&!s.startsWith(this.replacement)&&("always"===this.prepend_scheme||"first"===this.prepend_scheme&&0===t)&&(s=this.strRep+s),[s]}};let MetaspaceDecoder=class MetaspaceDecoder extends Decoder{constructor(e){super(e),this.addPrefixSpace=e.add_prefix_space,this.replacement=e.replacement}decode_chain(e){let t=[];for(let s=0;s<e.length;++s){let n=e[s].replaceAll(this.replacement," ");this.addPrefixSpace&&0==s&&n.startsWith(" ")&&(n=n.substring(1)),t.push(n)}return t}};let Precompiled=class Precompiled extends Normalizer{constructor(e){super(e),this.charsmap=e.precompiled_charsmap}normalize(e){if((e=(e=e.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm,"")).replace(/[\u0009\u000A\u000C\u000D\u1680\u200B\u200C\u200E\u200F\u2028\u2029\u2581\uFEFF\uFFFD]/gm," ")).includes("~")){let t=e.split("~");e=t.map(e=>e.normalize("NFKC")).join("~")}else e=e.normalize("NFKC");return e}};let PreTokenizerSequence=class PreTokenizerSequence extends PreTokenizer{constructor(e){super(),this.tokenizers=e.pretokenizers.map(e=>PreTokenizer.fromConfig(e))}pre_tokenize_text(e,t){return this.tokenizers.reduce((e,s)=>s.pre_tokenize(e,t),[e])}};let WhitespaceSplit=class WhitespaceSplit extends PreTokenizer{constructor(e){super()}pre_tokenize_text(e,t){return e.match(/\S+/g)||[]}};let ReplacePreTokenizer=class ReplacePreTokenizer extends PreTokenizer{constructor(e){super(),this.config=e,this.pattern=createPattern(this.config.pattern),this.content=this.config.content}pre_tokenize_text(e,t){return null===this.pattern?[e]:[e.replaceAll(this.pattern,this.config.content)]}};let u=["bos_token","eos_token","unk_token","sep_token","pad_token","cls_token","mask_token"];let PreTrainedTokenizer=class PreTrainedTokenizer extends n.Ag{return_token_type_ids=!1;_default_chat_template=`{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}`;constructor(e,t){for(let s of(super(),this._tokenizer_config=t,this.normalizer=Normalizer.fromConfig(e.normalizer),this.pre_tokenizer=PreTokenizer.fromConfig(e.pre_tokenizer),this.model=TokenizerModel.fromConfig(e.model,t),this.post_processor=PostProcessor.fromConfig(e.post_processor),this.decoder=Decoder.fromConfig(e.decoder),this.special_tokens=[],this.all_special_ids=[],this.added_tokens=[],e.added_tokens)){let e=new AddedToken(s);this.added_tokens.push(e),this.model.tokens_to_ids.set(e.content,e.id),this.model.vocab[e.id]=e.content,e.special&&(this.special_tokens.push(e.content),this.all_special_ids.push(e.id))}this.additional_special_tokens=t.additional_special_tokens??[],this.special_tokens.push(...this.additional_special_tokens),this.special_tokens=[...new Set(this.special_tokens)],this.decoder&&(this.decoder.added_tokens=this.added_tokens,this.decoder.end_of_word_suffix=this.model.end_of_word_suffix),this.added_tokens_regex=this.added_tokens.length>0?new RegExp(this.added_tokens.map(e=>`${e.lstrip?"\\s*":""}(${(0,n.hr)(e.content)})${e.rstrip?"\\s*":""}`).join("|")):null,this.mask_token=this.getToken("mask_token"),this.mask_token_id=this.model.tokens_to_ids.get(this.mask_token),this.pad_token=this.getToken("pad_token","eos_token"),this.pad_token_id=this.model.tokens_to_ids.get(this.pad_token),this.sep_token=this.getToken("sep_token"),this.sep_token_id=this.model.tokens_to_ids.get(this.sep_token),this.unk_token=this.getToken(t,"unk_token"),this.unk_token_id=this.model.tokens_to_ids.get(this.unk_token),this.model_max_length=t.model_max_length,this.remove_space=t.remove_space,this.clean_up_tokenization_spaces=t.clean_up_tokenization_spaces??!0,this.do_lowercase_and_remove_accent=t.do_lowercase_and_remove_accent??!1,this.padding_side="right",this.legacy=!1,this.chat_template=t.chat_template??null,this._compiled_template_cache=new Map}getToken(...e){for(let t of e){let e=this._tokenizer_config[t];if(e){if("object"!=typeof e)return e;if("AddedToken"===e.__type)return e.content;throw Error(`Unknown token: ${e}`)}}return null}static async from_pretrained(e,{progress_callback:t=null,config:s=null,cache_dir:n=null,local_files_only:r=!1,revision:i="main",legacy:o=null}={}){let a=await loadTokenizer(e,{progress_callback:t,config:s,cache_dir:n,local_files_only:r,revision:i,legacy:o});return new this(...a)}_call(e,{text_pair:t=null,add_special_tokens:s=!0,padding:r=!1,truncation:a=null,max_length:l=null,return_tensor:c=!0}={}){let d;let h=Array.isArray(e);if(h){if(0===e.length)throw Error("text array must be non-empty");if(null!==t){if(Array.isArray(t)){if(e.length!==t.length)throw Error("text and text_pair must have the same length")}else throw Error("text_pair must also be an array");d=e.map((e,n)=>this._encode_plus(e,t[n],{add_special_tokens:s}))}else d=e.map(e=>this._encode_plus(e,null,{add_special_tokens:s}))}else{if(null===e)throw Error("text may not be null");if(Array.isArray(t))throw Error("When specifying `text_pair`, since `text` is a string, `text_pair` must also be a string (i.e., not an array).");d=[this._encode_plus(e,t,{add_special_tokens:s})]}if(null===l?l="max_length"===r?this.model_max_length:(0,i.Fp)(d.map(e=>e.input_ids.length))[0]:a||console.warn("Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=true` to explicitly truncate examples to max length."),l=Math.min(l,this.model_max_length),r||a)for(let e=0;e<d.length;++e)d[e].input_ids.length!==l&&(d[e].input_ids.length>l?a&&function(e,t){for(let s of Object.keys(e))e[s].length=t}(d[e],l):r&&function(e,t,s,r){for(let i of Object.keys(e)){let o=t-e[i].length,a=s(i),l=Array(o).fill(a);e[i]="right"===r?(0,n.eG)(e[i],l):(0,n.eG)(l,e[i])}}(d[e],l,e=>"input_ids"===e?this.pad_token_id:0,this.padding_side));let u={};if(c){if(!(r&&a)&&d.some(e=>{for(let t of Object.keys(e))if(e[t].length!==d[0][t]?.length)return!0;return!1}))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=true' and 'truncation=true' to have batched tensors with the same length.");let e=[d.length,d[0].input_ids.length];for(let t of Object.keys(d[0]))u[t]=new o.es("int64",BigInt64Array.from(d.flatMap(e=>e[t]).map(BigInt)),e)}else{for(let e of Object.keys(d[0]))u[e]=d.map(t=>t[e]);if(!h)for(let e of Object.keys(u))u[e]=u[e][0]}return u}_encode_text(e){if(null===e)return null;let t=this.added_tokens_regex?e.split(this.added_tokens_regex).filter(e=>e):[e],s=t.map((e,t)=>{let s=this.added_tokens.find(t=>t.content===e);if(void 0!==s)return e;{!0===this.remove_space&&(e=e.trim().split(/\s+/).join(" ")),this.do_lowercase_and_remove_accent&&(e=remove_accents(e.toLowerCase())),null!==this.normalizer&&(e=this.normalizer(e));let s=null!==this.pre_tokenizer?this.pre_tokenizer(e,{section_index:t}):[e],n=this.model(s);return n}}).flat();return s}_encode_plus(e,t=null,{add_special_tokens:s=!0}={}){let r=this._encode_text(e),i=this._encode_text(t),o=this.post_processor?this.post_processor(r,i,{add_special_tokens:s}):{tokens:(0,n.eG)(r??[],i??[])},a=this.model.convert_tokens_to_ids(o.tokens),l={input_ids:a,attention_mask:Array(a.length).fill(1)};return this.return_token_type_ids&&o.token_type_ids&&(l.token_type_ids=o.token_type_ids),l}encode(e,t=null,{add_special_tokens:s=!0}={}){let{input_ids:n}=this._encode_plus(e,t,{add_special_tokens:s});return n}batch_decode(e,t={}){return e instanceof o.es&&(e=e.tolist()),e.map(e=>this.decode(e,t))}decode(e,t={}){if(e instanceof o.es&&(e=prepareTensorForDecode(e)),!Array.isArray(e)||0===e.length||!(0,n.Wy)(e[0]))throw Error("token_ids must be a non-empty array of integers.");return this.decode_single(e,t)}decode_single(e,{skip_special_tokens:t=!1,clean_up_tokenization_spaces:s=null}){let n=this.model.convert_ids_to_tokens(e);t&&(n=n.filter(e=>!this.special_tokens.includes(e)));let r=this.decoder?this.decoder(n):n.join(" ");return this.decoder&&this.decoder.end_of_word_suffix&&(r=r.replaceAll(this.decoder.end_of_word_suffix," "),t&&(r=r.trim())),(s??this.clean_up_tokenization_spaces)&&(r=clean_up_tokenization(r)),r}get default_chat_template(){return this._warned_about_chat_template||(console.warn("No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information."),this._warned_about_chat_template=!0),this._default_chat_template}apply_chat_template(e,{chat_template:t=null,add_generation_prompt:s=!1,tokenize:n=!0,padding:r=!1,truncation:i=!1,max_length:o=null,return_tensor:a=!0}={}){t??=this.chat_template??this.default_chat_template;let c=this._compiled_template_cache.get(t);void 0===c&&(c=new l.YS(t),this._compiled_template_cache.set(t,c));let d=Object.create(null);for(let e of u){let t=this.getToken(e);t&&(d[e]=t)}let h=c.render({messages:e,add_generation_prompt:s,...d});return n?this._call(h,{add_special_tokens:!1,padding:r,truncation:i,max_length:o,return_tensor:a}).input_ids:h}};let BertTokenizer=class BertTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let AlbertTokenizer=class AlbertTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let MobileBertTokenizer=class MobileBertTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let SqueezeBertTokenizer=class SqueezeBertTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let DebertaTokenizer=class DebertaTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let DebertaV2Tokenizer=class DebertaV2Tokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let HerbertTokenizer=class HerbertTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let ConvBertTokenizer=class ConvBertTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let RoFormerTokenizer=class RoFormerTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let DistilBertTokenizer=class DistilBertTokenizer extends PreTrainedTokenizer{};let CamembertTokenizer=class CamembertTokenizer extends PreTrainedTokenizer{};let XLMTokenizer=class XLMTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0;constructor(e,t){super(e,t),console.warn('WARNING: `XLMTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.')}};let ElectraTokenizer=class ElectraTokenizer extends PreTrainedTokenizer{return_token_type_ids=!0};let T5Tokenizer=class T5Tokenizer extends PreTrainedTokenizer{};let GPT2Tokenizer=class GPT2Tokenizer extends PreTrainedTokenizer{_default_chat_template='{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}'};let BartTokenizer=class BartTokenizer extends PreTrainedTokenizer{};let MBartTokenizer=class MBartTokenizer extends PreTrainedTokenizer{constructor(e,t){super(e,t),this.languageRegex=/^[a-z]{2}_[A-Z]{2}$/,this.language_codes=this.special_tokens.filter(e=>this.languageRegex.test(e)),this.lang_to_token=e=>e}_build_translation_inputs(e,t,s){return _build_translation_inputs(this,e,t,s)}};let MBart50Tokenizer=class MBart50Tokenizer extends MBartTokenizer{};let RobertaTokenizer=class RobertaTokenizer extends PreTrainedTokenizer{};let BloomTokenizer=class BloomTokenizer extends GPT2Tokenizer{constructor(e,t){let s=".,!?…。,、।۔،",n=e.pre_tokenizer?.pretokenizers[0]?.pattern;n&&n.Regex===` ?[^(\\s|[${s}])]+`&&(n.Regex=` ?[^\\s${s}]+`),super(e,t)}};let LlamaTokenizer=class LlamaTokenizer extends PreTrainedTokenizer{_default_chat_template=`{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>
' + system_message + '
<</SYS>>
' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>
' + content.strip() + '
<</SYS>>
' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}`;DEFAULT_SYSTEM_PROMPT="You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.";constructor(e,t){super(e,t),this.use_default_system_prompt=t.use_default_system_prompt??!1,this.legacy=t.legacy??!0,this.legacy||(this.normalizer=null,this.pre_tokenizer=new MetaspacePreTokenizer({replacement:"▁",add_prefix_space:!0,prepend_scheme:"first"}))}_encode_text(e){if(null===e)return null;if(this.legacy||0===e.length)return super._encode_text(e);let t=super._encode_text("▁"+e.replaceAll("▁"," "));return t.length>1&&"▁"===t[0]&&this.special_tokens.includes(t[1])&&(t=t.slice(1)),t}get default_chat_template(){return super.default_chat_template.replaceAll("USE_DEFAULT_PROMPT",this.use_default_system_prompt?"true":"false").replaceAll("DEFAULT_SYSTEM_MESSAGE",this.DEFAULT_SYSTEM_PROMPT.replaceAll("\n","\\n").replaceAll("'","\\'"))}};let CodeLlamaTokenizer=class CodeLlamaTokenizer extends LlamaTokenizer{};let XLMRobertaTokenizer=class XLMRobertaTokenizer extends PreTrainedTokenizer{};let MPNetTokenizer=class MPNetTokenizer extends PreTrainedTokenizer{};let FalconTokenizer=class FalconTokenizer extends PreTrainedTokenizer{};let GPTNeoXTokenizer=class GPTNeoXTokenizer extends PreTrainedTokenizer{};let EsmTokenizer=class EsmTokenizer extends PreTrainedTokenizer{};function _build_translation_inputs(e,t,s,n){if(!("language_codes"in e)||!Array.isArray(e.language_codes))throw Error("Tokenizer must have `language_codes` attribute set and it should be an array of language ids.");if(!("languageRegex"in e)||!(e.languageRegex instanceof RegExp))throw Error("Tokenizer must have `languageRegex` attribute set and it should be a regular expression.");if(!("lang_to_token"in e)||"function"!=typeof e.lang_to_token)throw Error("Tokenizer must have `lang_to_token` attribute set and it should be a function.");let r=n.src_lang,i=n.tgt_lang;if(!e.language_codes.includes(i))throw Error(`Target language code "${i}" is not valid. Must be one of: {${e.language_codes.join(", ")}}`);if(void 0!==r){if(!e.language_codes.includes(r))throw Error(`Source language code "${r}" is not valid. Must be one of: {${e.language_codes.join(", ")}}`);for(let t of e.post_processor.config.single)if("SpecialToken"in t&&e.languageRegex.test(t.SpecialToken.id)){t.SpecialToken.id=e.lang_to_token(r);break}}return n.forced_bos_token_id=e.model.convert_tokens_to_ids([e.lang_to_token(i)])[0],e._call(t,s)}let NllbTokenizer=class NllbTokenizer extends PreTrainedTokenizer{constructor(e,t){super(e,t),this.languageRegex=/^[a-z]{3}_[A-Z][a-z]{3}$/,this.language_codes=this.special_tokens.filter(e=>this.languageRegex.test(e)),this.lang_to_token=e=>e}_build_translation_inputs(e,t,s){return _build_translation_inputs(this,e,t,s)}};let M2M100Tokenizer=class M2M100Tokenizer extends PreTrainedTokenizer{constructor(e,t){super(e,t),this.languageRegex=/^__[a-z]{2,3}__$/,this.language_codes=this.special_tokens.filter(e=>this.languageRegex.test(e)).map(e=>e.slice(2,-2)),this.lang_to_token=e=>`__${e}__`}_build_translation_inputs(e,t,s){return _build_translation_inputs(this,e,t,s)}};let _=[["en","english"],["zh","chinese"],["de","german"],["es","spanish"],["ru","russian"],["ko","korean"],["fr","french"],["ja","japanese"],["pt","portuguese"],["tr","turkish"],["pl","polish"],["ca","catalan"],["nl","dutch"],["ar","arabic"],["sv","swedish"],["it","italian"],["id","indonesian"],["hi","hindi"],["fi","finnish"],["vi","vietnamese"],["he","hebrew"],["uk","ukrainian"],["el","greek"],["ms","malay"],["cs","czech"],["ro","romanian"],["da","danish"],["hu","hungarian"],["ta","tamil"],["no","norwegian"],["th","thai"],["ur","urdu"],["hr","croatian"],["bg","bulgarian"],["lt","lithuanian"],["la","latin"],["mi","maori"],["ml","malayalam"],["cy","welsh"],["sk","slovak"],["te","telugu"],["fa","persian"],["lv","latvian"],["bn","bengali"],["sr","serbian"],["az","azerbaijani"],["sl","slovenian"],["kn","kannada"],["et","estonian"],["mk","macedonian"],["br","breton"],["eu","basque"],["is","icelandic"],["hy","armenian"],["ne","nepali"],["mn","mongolian"],["bs","bosnian"],["kk","kazakh"],["sq","albanian"],["sw","swahili"],["gl","galician"],["mr","marathi"],["pa","punjabi"],["si","sinhala"],["km","khmer"],["sn","shona"],["yo","yoruba"],["so","somali"],["af","afrikaans"],["oc","occitan"],["ka","georgian"],["be","belarusian"],["tg","tajik"],["sd","sindhi"],["gu","gujarati"],["am","amharic"],["yi","yiddish"],["lo","lao"],["uz","uzbek"],["fo","faroese"],["ht","haitian creole"],["ps","pashto"],["tk","turkmen"],["nn","nynorsk"],["mt","maltese"],["sa","sanskrit"],["lb","luxembourgish"],["my","myanmar"],["bo","tibetan"],["tl","tagalog"],["mg","malagasy"],["as","assamese"],["tt","tatar"],["haw","hawaiian"],["ln","lingala"],["ha","hausa"],["ba","bashkir"],["jw","javanese"],["su","sundanese"]],p=new Map(_),k=new Map([..._.map(([e,t])=>[t,e]),["burmese","my"],["valencian","ca"],["flemish","nl"],["haitian","ht"],["letzeburgesch","lb"],["pushto","ps"],["panjabi","pa"],["moldavian","ro"],["moldovan","ro"],["sinhalese","si"],["castilian","es"]]);let WhisperTokenizer=class WhisperTokenizer extends PreTrainedTokenizer{_default_chat_template='{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}';_decode_asr(e,{return_timestamps:t=!1,return_language:s=!1,time_precision:n=null,force_full_sequences:r=!0}={}){if(null===n)throw Error("Must specify time_precision");let o=null,a="word"===t;function new_chunk(){return{language:o,timestamp:[null,null],text:""}}let l=[],c=new_chunk(),d=0,h=this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0]+1,u=[],_=[],k=!1,g=null,f=new Set(this.all_special_ids);for(let s of e){let e=s.tokens,r=a?s.token_timestamps:null,m=null,T=h;if("stride"in s){let[t,r,i]=s.stride;if(d-=r,g=t-i,r&&(T=r/n+h),i)for(let t=e.length-1;t>=0;--t){let s=e[t];if(s>=h){if(null!==m&&(s-h)*n<g)break;m=s}}}let z=[],x=[];for(let s=0;s<e.length;++s){let g=e[s];if(f.has(g)){let e=this.decode([g]),s=p.get(e.slice(2,-2));if(void 0!==s){if(null!==o&&s!==o&&!t){u.push(z);let e=this.findLongestCommonSequence(u)[0],t=this.decode(e);c.text=t,l.push(c),u=[],z=[],c=new_chunk()}o=c.language=s}}else if(g>=h){let e=(g-h)*n+d,t=(0,i.NM)(e,2);if(null!==m&&g>=m)k=!0;else if(k||u.length>0&&g<T)k=!1;else if(null===c.timestamp[0])c.timestamp[0]=t;else if(t===c.timestamp[0]);else{c.timestamp[1]=t,u.push(z),a&&_.push(x);let[e,s]=this.findLongestCommonSequence(u,_),n=this.decode(e);c.text=n,a&&(c.words=this.collateWordTimestamps(e,s,o)),l.push(c),u=[],z=[],_=[],x=[],c=new_chunk()}}else if(z.push(g),a){let e,t=(0,i.NM)(r[s]+d,2);e=s+1<r.length?(0,i.NM)(r[s+1]+d,2):null,x.push([t,e])}}if("stride"in s){let[e,t,n]=s.stride;d+=e-n}z.length>0?(u.push(z),a&&_.push(x)):u.every(e=>0===e.length)&&(c=new_chunk(),u=[],z=[],_=[],x=[])}if(u.length>0){if(r&&t)throw Error("Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.");let[e,s]=this.findLongestCommonSequence(u,_),n=this.decode(e);c.text=n,a&&(c.words=this.collateWordTimestamps(e,s,o)),l.push(c)}let m=Object.create(null),T=l.map(e=>e.text).join("");if(t||s){for(let e=0;e<l.length;++e){let n=l[e];t||delete n.timestamp,s||delete n.language}if(a){let e=[];for(let t of l)for(let s of t.words)e.push(s);m={chunks:e}}else m={chunks:l}}return[T,m]}findLongestCommonSequence(e,t=null){let s=e[0],n=s.length,r=[],i=Array.isArray(t)&&t.length>0,o=i?[]:null,a=i?t[0]:null;for(let l=1;l<e.length;++l){let c=e[l],d=0,h=[n,n,0,0],u=c.length;for(let e=1;e<n+u;++e){let t=e/1e4,r=Math.max(0,n-e),i=Math.min(n,n+u-e),o=s.slice(r,i),a=Math.max(0,e-n),l=Math.min(u,e),_=c.slice(a,l);if(o.length!==_.length)throw Error("There is a bug within whisper `decode_asr` function, please report it. Dropping to prevent bad inference.");let p=o.filter((e,t)=>e===_[t]).length,k=p/e+t;p>1&&k>d&&(d=k,h=[r,i,a,l])}let[_,p,k,g]=h,f=Math.floor((p+_)/2),m=Math.floor((g+k)/2);r.push(...s.slice(0,f)),n=(s=c.slice(m)).length,i&&(o.push(...a.slice(0,f)),a=t[l].slice(m))}return(r.push(...s),i)?(o.push(...a),[r,o]):[r,[]]}collateWordTimestamps(e,t,s){let[n,r,i]=this.combineTokensIntoWords(e,s),o=[];for(let e=0;e<n.length;++e){let s=i[e];o.push({text:n[e],timestamp:[t[s.at(0)][0],t[s.at(-1)][1]]})}return o}combineTokensIntoWords(e,t,s="\"'“\xa1\xbf([{-",n="\"'.。,,!!??::”)]}、"){let r,i,o;return["chinese","japanese","thai","lao","myanmar"].includes(t=t??"english")?[r,i,o]=this.splitTokensOnUnicode(e):[r,i,o]=this.splitTokensOnSpaces(e),this.mergePunctuations(r,i,o,s,n)}decode(e,t){let s;return t&&t.decode_with_timestamps?(e instanceof o.es&&(e=prepareTensorForDecode(e)),s=this.decodeWithTimestamps(e,t)):s=super.decode(e,t),s}decodeWithTimestamps(e,t){let s=t?.time_precision??.02,n=Array.from(this.all_special_ids).at(-1)+1,r=[[]];for(let t of e)if(t>=n){let e=(0,i.NM)((t-n)*s,2);r.push(`<|${e}|>`),r.push([])}else r[r.length-1].push(t);return(r=r.map(e=>"string"==typeof e?e:super.decode(e,t))).join("")}splitTokensOnUnicode(e){let t=this.decode(e,{decode_with_timestamps:!0}),s=[],n=[],r=[],i=[],o=[],a=0;for(let l=0;l<e.length;++l){let c=e[l];i.push(c),o.push(l);let d=this.decode(i,{decode_with_timestamps:!0});d.includes("�")&&"�"!==t[a+d.indexOf("�")]||(s.push(d),n.push(i),r.push(o),i=[],o=[],a+=d.length)}return[s,n,r]}splitTokensOnSpaces(e){let[t,s,n]=this.splitTokensOnUnicode(e),r=[],i=[],o=[],a=RegExp(`^[${c}]$`,"gu");for(let e=0;e<t.length;++e){let l=t[e],c=s[e],d=n[e],h=c[0]>=this.model.tokens_to_ids.get("<|endoftext|>"),u=l.startsWith(" "),_=l.trim(),p=a.test(_);if(h||u||p||0===r.length)r.push(l),i.push(c),o.push(d);else{let e=r.length-1;r[e]+=l,i[e].push(...c),o[e].push(...d)}}return[r,i,o]}mergePunctuations(e,t,s,r,i){let o=structuredClone(e),a=structuredClone(t),l=structuredClone(s),c=o.length-2,d=o.length-1;for(;c>=0;)o[c].startsWith(" ")&&r.includes(o[c].trim())?(o[d]=o[c]+o[d],a[d]=(0,n.eG)(a[c],a[d]),l[d]=(0,n.eG)(l[c],l[d]),o[c]="",a[c]=[],l[c]=[]):d=c,--c;for(c=0,d=1;d<o.length;)!o[c].endsWith(" ")&&i.includes(o[d])?(o[c]+=o[d],a[c]=(0,n.eG)(a[c],a[d]),l[c]=(0,n.eG)(l[c],l[d]),o[d]="",a[d]=[],l[d]=[]):c=d,++d;return[o.filter(e=>e),a.filter(e=>e.length>0),l.filter(e=>e.length>0)]}get_decoder_prompt_ids({language:e=null,task:t=null,no_timestamps:s=!0}={}){let n=[];if(e){e=e.toLowerCase();let t=k.get(e);if(void 0===t){if(p.has(e))t=e;else{let t=2===e.length,s=t?p.keys():p.values();throw Error(`Language "${e}" is not supported. Must be one of: ${JSON.stringify(s)}`)}}let s=this.model.tokens_to_ids.get(`<|${t}|>`);if(void 0===s)throw Error(`Unable to find language "${t}" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.`);n.push(s)}else n.push(null);if(t){if("transcribe"!==(t=t.toLowerCase())&&"translate"!==t)throw Error(`Task "${t}" is not supported. Must be one of: ["transcribe", "translate"]`);let e=this.model.tokens_to_ids.get(`<|${t}|>`);if(void 0===e)throw Error(`Unable to find task "${t}" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.`);n.push(e)}else n.push(null);if(s){let e=this.model.tokens_to_ids.get("<|notimestamps|>");if(void 0===e)throw Error('Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at https://github.com/xenova/transformers.js/issues/new/choose.');n.push(e)}return n.map((e,t)=>[t+1,e]).filter(e=>null!==e[1])}};let CodeGenTokenizer=class CodeGenTokenizer extends PreTrainedTokenizer{};let CLIPTokenizer=class CLIPTokenizer extends PreTrainedTokenizer{};let SiglipTokenizer=class SiglipTokenizer extends PreTrainedTokenizer{};let MarianTokenizer=class MarianTokenizer extends PreTrainedTokenizer{constructor(e,t){super(e,t),this.languageRegex=/^(>>\w+<<)\s*/g,this.supported_language_codes=this.model.vocab.filter(e=>this.languageRegex.test(e)),console.warn('WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.')}_encode_text(e){if(null===e)return null;let[t,...s]=e.trim().split(this.languageRegex);if(0===s.length)return super._encode_text(t);if(2===s.length){let[e,t]=s;return this.supported_language_codes.includes(e)||console.warn(`Unsupported language code "${e}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`),(0,n.eG)([e],super._encode_text(t))}}};let Wav2Vec2CTCTokenizer=class Wav2Vec2CTCTokenizer extends PreTrainedTokenizer{};let BlenderbotTokenizer=class BlenderbotTokenizer extends PreTrainedTokenizer{_default_chat_template="{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"};let BlenderbotSmallTokenizer=class BlenderbotSmallTokenizer extends BlenderbotTokenizer{};let SpeechT5Tokenizer=class SpeechT5Tokenizer extends PreTrainedTokenizer{};let NougatTokenizer=class NougatTokenizer extends PreTrainedTokenizer{};let VitsTokenizer=class VitsTokenizer extends PreTrainedTokenizer{constructor(e,t){super(e,t),this.decoder=new VitsDecoder({})}};let AutoTokenizer=class AutoTokenizer{static TOKENIZER_CLASS_MAPPING={T5Tokenizer,DistilBertTokenizer,CamembertTokenizer,DebertaTokenizer,DebertaV2Tokenizer,BertTokenizer,HerbertTokenizer,ConvBertTokenizer,RoFormerTokenizer,XLMTokenizer,ElectraTokenizer,MobileBertTokenizer,SqueezeBertTokenizer,AlbertTokenizer,GPT2Tokenizer,BartTokenizer,MBartTokenizer,MBart50Tokenizer,RobertaTokenizer,WhisperTokenizer,CodeGenTokenizer,CLIPTokenizer,SiglipTokenizer,MarianTokenizer,BloomTokenizer,NllbTokenizer,M2M100Tokenizer,LlamaTokenizer,CodeLlamaTokenizer,XLMRobertaTokenizer,MPNetTokenizer,FalconTokenizer,GPTNeoXTokenizer,EsmTokenizer,Wav2Vec2CTCTokenizer,BlenderbotTokenizer,BlenderbotSmallTokenizer,SpeechT5Tokenizer,NougatTokenizer,VitsTokenizer,PreTrainedTokenizer};static async from_pretrained(e,{quantized:t=!0,progress_callback:s=null,config:n=null,cache_dir:r=null,local_files_only:i=!1,revision:o="main",legacy:a=null}={}){let[l,c]=await loadTokenizer(e,{quantized:t,progress_callback:s,config:n,cache_dir:r,local_files_only:i,revision:o,legacy:a}),d=c.tokenizer_class?.replace(/Fast$/,"")??"PreTrainedTokenizer",h=this.TOKENIZER_CLASS_MAPPING[d];return h||(console.warn(`Unknown tokenizer class "${d}", attempting to construct from base class.`),h=PreTrainedTokenizer),new h(l,c)}}}}]);