diff --git "a/assets/worker-CEIicWCS.js" "b/assets/worker-CEIicWCS.js" --- "a/assets/worker-CEIicWCS.js" +++ "b/assets/worker-CEIicWCS.js" @@ -2372,4 +2372,4 @@ If a question does not make any sense, or is not factually coherent, explain why ' }}{% endfor %}{% if add_generation_prompt %}{{'model '}}{% endif %}`)}}class Rb extends Te{}function Oo(t,e,r,n){if(!("language_codes"in t)||!Array.isArray(t.language_codes))throw new Error("Tokenizer must have `language_codes` attribute set and it should be an array of language ids.");if(!("languageRegex"in t)||!(t.languageRegex instanceof RegExp))throw new Error("Tokenizer must have `languageRegex` attribute set and it should be a regular expression.");if(!("lang_to_token"in t)||typeof t.lang_to_token!="function")throw new Error("Tokenizer must have `lang_to_token` attribute set and it should be a function.");const a=n.src_lang,s=n.tgt_lang;if(!t.language_codes.includes(s))throw new Error(`Target language code "${s}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);if(a!==void 0){if(!t.language_codes.includes(a))throw new Error(`Source language code "${a}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);for(const i of t.post_processor.config.single)if("SpecialToken"in i&&t.languageRegex.test(i.SpecialToken.id)){i.SpecialToken.id=t.lang_to_token(a);break}}return n.forced_bos_token_id=t.model.convert_tokens_to_ids([t.lang_to_token(s)])[0],t._call(e,r)}class Pb extends Te{constructor(e,r){super(e,r),this.languageRegex=/^[a-z]{3}_[A-Z][a-z]{3}$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)),this.lang_to_token=n=>n}_build_translation_inputs(e,r,n){return Oo(this,e,r,n)}}class Bb extends Te{constructor(e,r){super(e,r),this.languageRegex=/^__[a-z]{2,3}__$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)).map(n=>n.slice(2,-2)),this.lang_to_token=n=>`__${n}__`}_build_translation_inputs(e,r,n){return Oo(this,e,r,n)}}class Db extends Te{constructor(){super(...arguments);N(this,"_default_chat_template",'{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}')}get timestamp_begin(){return this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0]+1}_decode_asr(r,{return_timestamps:n=!1,return_language:a=!1,time_precision:s=null,force_full_sequences:i=!0}={}){if(s===null)throw Error("Must specify time_precision");let o=null;const u=n==="word";function l(){return{language:o,timestamp:[null,null],text:""}}const c=[];let h=l(),m=0;const _=this.timestamp_begin;let p=[],b=[],v=!1,S=null;const x=new Set(this.all_special_ids);for(const I of r){const R=I.tokens,B=u?I.token_timestamps:null;let W=null,j=_;if("stride"in I){const[ae,ne,ie]=I.stride;if(m-=ne,S=ae-ie,ne&&(j=ne/s+_),ie)for(let D=R.length-1;D>=0;--D){const M=Number(R[D]);if(M>=_){if(W!==null&&(M-_)*s=_){const ie=(ne-_)*s+m,D=ss(ie,2);if(W!==null&&ne>=W)v=!0;else if(v||p.length>0&&ne0?(p.push(q),u&&b.push(le)):p.every(ae=>ae.length===0)&&(h=l(),p=[],q=[],b=[],le=[])}if(p.length>0){if(i&&n)throw new Error("Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.");const[I,R]=this.findLongestCommonSequence(p,b),B=this.decode(I);h.text=B,u&&(h.words=this.collateWordTimestamps(I,R,o)),c.push(h)}let T=Object.create(null);const C=c.map(I=>I.text).join("");if(n||a){for(let I=0;I0;let u=o?[]:null,l=o?n[0]:null;for(let c=1;cD===ae[M]).length,ie=ne/I+R;ne>1&&ie>m&&(m=ie,_=[B,W,q,le])}const[b,v,S,x]=_,T=Math.floor((v+b)/2),C=Math.floor((x+S)/2);i.push(...a.slice(0,T)),a=h.slice(C),s=a.length,o&&(u.push(...l.slice(0,T)),l=n[c].slice(C))}return i.push(...a),o?(u.push(...l),[i,u]):[i,[]]}collateWordTimestamps(r,n,a){const[s,i,o]=this.combineTokensIntoWords(r,a),u=[];for(let l=0;l=s){const u=((o-s)*a).toFixed(2);i.push(`<|${u}|>`),i.push([])}else i[i.length-1].push(o);return i=i.map(o=>typeof o=="string"?o:super.decode(o,n)),i.join("")}splitTokensOnUnicode(r){const n=this.decode(r,{decode_with_timestamps:!0}),a="�",s=[],i=[],o=[];let u=[],l=[],c=0;for(let h=0;h=this.model.tokens_to_ids.get("<|endoftext|>"),b=h.startsWith(" "),v=h.trim(),S=l.test(v);if(p||b||S||i.length===0)i.push(h),o.push(m),u.push(_);else{const x=i.length-1;i[x]+=h,o[x].push(...m),u[x].push(..._)}}return[i,o,u]}mergePunctuations(r,n,a,s,i){const o=structuredClone(r),u=structuredClone(n),l=structuredClone(a);let c=o.length-2,h=o.length-1;for(;c>=0;)o[c].startsWith(" ")&&s.includes(o[c].trim())?(o[h]=o[c]+o[h],u[h]=lt(u[c],u[h]),l[h]=lt(l[c],l[h]),o[c]="",u[c]=[],l[c]=[]):h=c,--c;for(c=0,h=1;hm),u.filter(m=>m.length>0),l.filter(m=>m.length>0)]}get_decoder_prompt_ids({language:r=null,task:n=null,no_timestamps:a=!0}={}){const s=[];if(r){const i=mm(r),o=this.model.tokens_to_ids.get(`<|${i}|>`);if(o===void 0)throw new Error(`Unable to find language "${i}" in model vocabulary. Please report this issue at ${Io}.`);s.push(o)}else s.push(null);if(n){if(n=n.toLowerCase(),n!=="transcribe"&&n!=="translate")throw new Error(`Task "${n}" is not supported. Must be one of: ["transcribe", "translate"]`);const i=this.model.tokens_to_ids.get(`<|${n}|>`);if(i===void 0)throw new Error(`Unable to find task "${n}" in model vocabulary. Please report this issue at ${Io}.`);s.push(i)}else s.push(null);if(a){const i=this.model.tokens_to_ids.get("<|notimestamps|>");if(i===void 0)throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${Io}.`);s.push(i)}return s.map((i,o)=>[o+1,i]).filter(i=>i[1]!==null)}}class Nb extends Te{}class Fb extends Te{}class Lb extends Te{}class Ub extends Te{constructor(e,r){super(e,r),this.languageRegex=/^(>>\w+<<)\s*/g,this.supported_language_codes=this.model.vocab.filter(n=>this.languageRegex.test(n)),console.warn('WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.')}_encode_text(e){if(e===null)return null;const[r,...n]=e.trim().split(this.languageRegex);if(n.length===0)return super._encode_text(r);if(n.length===2){const[a,s]=n;return this.supported_language_codes.includes(a)||console.warn(`Unsupported language code "${a}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`),lt([a],super._encode_text(s))}}}class Wb extends Te{}class Cm extends Te{constructor(){super(...arguments);N(this,"_default_chat_template","{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}")}}class Vb extends Cm{}class Gb extends Te{}class Hb extends Te{}class jb extends Te{constructor(e,r){super(e,r),this.decoder=new Jw({})}}class qb extends Te{}class Im{static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:s=!1,revision:i="main",legacy:o=null}={}){var m;const[u,l]=await gm(e,{progress_callback:r,config:n,cache_dir:a,local_files_only:s,revision:i,legacy:o}),c=((m=l.tokenizer_class)==null?void 0:m.replace(/Fast$/,""))??"PreTrainedTokenizer";let h=this.TOKENIZER_CLASS_MAPPING[c];return h||(console.warn(`Unknown tokenizer class "${c}", attempting to construct from base class.`),h=Te),new h(u,l)}}N(Im,"TOKENIZER_CLASS_MAPPING",{T5Tokenizer:vb,DistilBertTokenizer:_b,CamembertTokenizer:yb,DebertaTokenizer:pb,DebertaV2Tokenizer:hb,BertTokenizer:ub,HerbertTokenizer:fb,ConvBertTokenizer:mb,RoFormerTokenizer:gb,XLMTokenizer:wb,ElectraTokenizer:bb,MobileBertTokenizer:db,SqueezeBertTokenizer:cb,AlbertTokenizer:lb,GPT2Tokenizer:km,BartTokenizer:$b,MBartTokenizer:Em,MBart50Tokenizer:xb,RobertaTokenizer:Sb,WhisperTokenizer:Db,CodeGenTokenizer:Nb,CLIPTokenizer:Fb,SiglipTokenizer:Lb,MarianTokenizer:Ub,BloomTokenizer:kb,NllbTokenizer:Pb,M2M100Tokenizer:Bb,LlamaTokenizer:Tm,CodeLlamaTokenizer:Eb,XLMRobertaTokenizer:Tb,MPNetTokenizer:Cb,FalconTokenizer:Ib,GPTNeoXTokenizer:Ab,EsmTokenizer:Mb,Wav2Vec2CTCTokenizer:Wb,BlenderbotTokenizer:Cm,BlenderbotSmallTokenizer:Vb,SpeechT5Tokenizer:Gb,NougatTokenizer:Hb,VitsTokenizer:jb,Qwen2Tokenizer:Ob,GemmaTokenizer:zb,Grok1Tokenizer:Rb,CohereTokenizer:qb,PreTrainedTokenizer:Te});async function Kb(t,e){return await Er(t,"config.json",!0,e)}function yn(t){const e={};let r={};switch(t.model_type){case"llava":case"paligemma":r=yn(t.text_config);break;case"moondream1":r=yn(t.phi_config);break;case"musicgen":r=yn(t.decoder);break;case"gpt2":case"gptj":case"codegen":case"gpt_bigcode":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"phi":case"phi3":case"falcon":e.num_heads="num_attention_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size";break;case"llama":case"mistral":case"starcoder2":case"qwen2":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size",e.num_attention_heads="num_attention_heads";break;case"gemma":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.dim_kv="head_dim";break;case"openelm":e.num_heads="num_kv_heads",e.num_layers="num_transformer_layers",e.dim_kv="head_dim";break;case"gpt_neo":e.num_heads="num_heads",e.num_layers="num_layers",e.hidden_size="hidden_size";break;case"bloom":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="hidden_size";break;case"mpt":e.num_heads="n_heads",e.num_layers="n_layers",e.hidden_size="d_model";break;case"t5":case"mt5":case"longt5":e.num_decoder_layers="num_decoder_layers",e.num_decoder_heads="num_heads",e.decoder_dim_kv="d_kv",e.num_encoder_layers="num_layers",e.num_encoder_heads="num_heads",e.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="d_model",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="d_model";break;case"speecht5":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="hidden_size",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="hidden_size";break;case"trocr":e.num_encoder_layers=e.num_decoder_layers="decoder_layers",e.num_encoder_heads=e.num_decoder_heads="decoder_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="d_model";break;case"musicgen_decoder":e.num_encoder_layers=e.num_decoder_layers="num_hidden_layers",e.num_encoder_heads=e.num_decoder_heads="num_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="hidden_size";break;case"vision-encoder-decoder":const a=yn(t.encoder),s=yn(t.decoder),i="num_decoder_layers"in s,o={};return i?(o.num_decoder_layers=s.num_layers,o.num_decoder_heads=s.num_heads,o.decoder_hidden_size=s.hidden_size,o.num_encoder_layers=a.num_layers,o.num_encoder_heads=a.num_heads,o.encoder_hidden_size=a.hidden_size):(o.num_layers=s.num_layers,o.num_heads=s.num_heads,o.hidden_size=s.hidden_size),o}const n={...r,...Dr(t,["model_type","multi_query","is_encoder_decoder"])};for(const a in e)n[a]=t[e[a]];return n}function Am(t,{prefix:e="past_key_values",encoder_add_pkv:r=!0}={}){const n={},a=t.normalized_config,s=1;if(a.is_encoder_decoder&&r){const i=a.encoder_dim_kv??a.encoder_hidden_size/a.num_encoder_heads,o=a.decoder_dim_kv??a.decoder_hidden_size/a.num_decoder_heads,u=[s,a.num_encoder_heads,0,i],l=[s,a.num_decoder_heads,0,o];for(let c=0;c=1&&i[i.length-1]>=this.timestamp_begin,u=i.length<2||i[i.length-2]>=this.timestamp_begin;if(o&&(u?s.subarray(this.timestamp_begin).fill(-1/0):s.subarray(0,this.eos_token_id).fill(-1/0)),e[n].length===this.begin_index&&this.max_initial_timestamp_index!==null){const m=this.timestamp_begin+this.max_initial_timestamp_index;s.subarray(m+1).fill(-1/0)}const l=Y_(s),c=Math.log(l.subarray(this.timestamp_begin).map(Math.exp).reduce((m,_)=>m+_)),h=mr(l.subarray(0,this.timestamp_begin))[0];c>h&&s.subarray(0,this.timestamp_begin).fill(-1/0)}return r}}class tv extends dr{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const r=e.length,n=[];for(let s=0;s1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,r){if(r.dims[0]!==2*e.length)throw new Error(`Logits should have twice the batch size of the input ids, the first half of batches corresponding to the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got batch size ${r.dims[0]} for the logits and ${e.length} for the input ids.`);const n=e.length,a=r.slice([0,n],null),s=r.slice([n,r.dims[0]],null);for(let i=0;i1)throw new Error(`\`top_p\` must be a float > 0 and < 1, but is ${e}`);if(!Number.isInteger(n)||n<1)throw new Error(`\`min_tokens_to_keep\` must be a positive integer, but is ${n}`);this.top_p=e,this.filter_value=r,this.min_tokens_to_keep=n}}class lv extends zo{constructor(e,{filter_value:r=-1/0,min_tokens_to_keep:n=1}={}){if(super(),!Number.isInteger(e)||e<0)throw new Error(`\`top_k\` must be a positive integer, but is ${e}`);this.top_k=Math.max(e,n),this.filter_value=r}}class zm{constructor(e){N(this,"max_length",20);N(this,"max_new_tokens",null);N(this,"min_length",0);N(this,"min_new_tokens",null);N(this,"early_stopping",!1);N(this,"max_time",null);N(this,"do_sample",!1);N(this,"num_beams",1);N(this,"num_beam_groups",1);N(this,"penalty_alpha",null);N(this,"use_cache",!0);N(this,"temperature",1);N(this,"top_k",50);N(this,"top_p",1);N(this,"typical_p",1);N(this,"epsilon_cutoff",0);N(this,"eta_cutoff",0);N(this,"diversity_penalty",0);N(this,"repetition_penalty",1);N(this,"encoder_repetition_penalty",1);N(this,"length_penalty",1);N(this,"no_repeat_ngram_size",0);N(this,"bad_words_ids",null);N(this,"force_words_ids",null);N(this,"renormalize_logits",!1);N(this,"constraints",null);N(this,"forced_bos_token_id",null);N(this,"forced_eos_token_id",null);N(this,"remove_invalid_values",!1);N(this,"exponential_decay_length_penalty",null);N(this,"suppress_tokens",null);N(this,"begin_suppress_tokens",null);N(this,"forced_decoder_ids",null);N(this,"guidance_scale",null);N(this,"num_return_sequences",1);N(this,"output_attentions",!1);N(this,"output_hidden_states",!1);N(this,"output_scores",!1);N(this,"return_dict_in_generate",!1);N(this,"pad_token_id",null);N(this,"bos_token_id",null);N(this,"eos_token_id",null);N(this,"encoder_no_repeat_ngram_size",0);N(this,"decoder_start_token_id",null);N(this,"generation_kwargs",{});Object.assign(this,Dr(e,Object.getOwnPropertyNames(this)))}}class Po extends wt{_call(e,r){throw Error("StoppingCriteria needs to be subclassed")}}class Bo extends wt{constructor(){super(),this.criteria=[]}push(e){this.criteria.push(e)}extend(e){e instanceof Bo?e=e.criteria:e instanceof Po&&(e=[e]),this.criteria.push(...e)}_call(e,r){const n=new Array(e.length).fill(!1);for(const a of this.criteria){const s=a(e,r);for(let i=0;ir.length>=this.max_length)}}class cv extends Po{constructor(e){super(),Array.isArray(e)||(e=[e]),this.eos_token_id=e}_call(e,r){return e.map(n=>{const a=n.at(-1);return this.eos_token_id.some(s=>a==s)})}}class Si extends wt{constructor(e){super(),this.generation_config=e}_call(e,r=-1){return this.sample(e,r)}sample(e,r){throw Error("sample should be implemented in subclasses.")}getLogits(e,r){let n=e.dims.at(-1),a=e.data;if(r===-1)a=a.slice(-n);else{let s=r*n;a=a.slice(s,s+n)}return a}randomSelect(e){let r=e.reduce((a,s)=>a+s,0),n=Math.random()*r;for(let a=0;a1)return new fv(e);if(e.num_return_sequences>1)throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);return new pv(e)}}class pv extends Si{sample(e,r=-1){let n=this.getLogits(e,r);return[[mr(n)[1],0]]}}class hv extends Si{sample(e,r=-1){let n=e.dims.at(-1);this.generation_config.top_k>0&&(n=Math.min(this.generation_config.top_k,n));const a=this.getLogits(e,r),s=wu(a,n),i=Bn(s.map(o=>o[1]));return Array.from({length:this.generation_config.num_beams},()=>{const o=this.randomSelect(i);return[s[o][0],Math.log(i[o])]})}}class fv extends Si{sample(e,r=-1){let n=e.dims.at(-1);this.generation_config.top_k>0&&(n=Math.min(this.generation_config.top_k,n));const a=this.getLogits(e,r),s=wu(a,n),i=Bn(s.map(o=>o[1]));return Array.from({length:this.generation_config.num_beams},(o,u)=>[s[u][0],Math.log(i[u])])}}class mv extends zm{constructor(){super(...arguments);N(this,"return_timestamps",null);N(this,"return_token_timestamps",null);N(this,"num_frames",null);N(this,"alignment_heads",null);N(this,"task",null);N(this,"language",null);N(this,"no_timestamps_token_id",null);N(this,"prompt_ids",null);N(this,"is_multilingual",null);N(this,"lang_to_id",null);N(this,"task_to_id",null);N(this,"max_initial_timestamp_index",1)}}const $e={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,MaskGeneration:5,ImageTextToText:6,Musicgen:7},ki=new Map,Rm=new Map,sa=new Map;async function gv(t,e,r){let n=r.device;n&&typeof n!="string"&&(n.hasOwnProperty(e)?n=n[e]:(console.warn(`Device not specified for ${e}. Using the default device.`),n=null));const a=Oy(n);let s=r.dtype;if(typeof s!="string"&&(s&&s.hasOwnProperty(e)?s=s[e]:(s=Qb[a[0]],console.warn(`Dtype not specified for ${e}. Using the default dtype: ${s}.`))),Om.hasOwnProperty(s)){if(s===Tt.fp16&&!await Yb())throw new Error("The device does not support fp16.")}else throw new Error(`Invalid dtype: ${s}. Should be one of: ${Object.keys(Tt).join(", ")}`);const i=Om[s],o=`${r.subfolder??""}/${e}${i}.onnx`,u={...r.session_options};u.executionProviders??(u.executionProviders=a);const l=Ga(t,o,!0,r);let c=[];if(r.use_external_data_format){if(Br.IS_NODE_ENV)throw new Error("External data format is not yet supported in Node.js");const m=`${e}${i}.onnx_data`,_=`${r.subfolder??""}/${m}`;c.push(new Promise(async(p,b)=>{const v=await Ga(t,_,!0,r);p({path:m,data:v})}))}else u.externalData!==void 0&&(c=u.externalData.map(async m=>{if(typeof m.data=="string"){const _=await Ga(t,m.data,!0,r);return{...m,data:_}}return m}));if(c.length>0&&(u.externalData=await Promise.all(c)),n==="webgpu"){const m=Am(r.config,{prefix:"present"});if(Object.keys(m).length>0){const _={};for(const p in m)_[p]="gpu-buffer";u.preferredOutputLocation=_}}return{buffer:await l,session_options:u}}async function Xr(t,e,r){const n=Object.keys(e),a=await Promise.all(n.map(async i=>gv(t,e[i],r))),s={};for(let i=0;i0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${n.join(", ")}.`);const a=Object.keys(e).length,s=t.inputNames.length;if(a>s){let i=Object.keys(e).filter(o=>!t.inputNames.includes(o));console.warn(`WARNING: Too many inputs were provided (${a} > ${s}). The following inputs will be ignored: "${i.join(", ")}".`)}return r}async function Mr(t,e){const r=_v(t,e);try{const n=Object.fromEntries(Object.entries(r).map(([s,i])=>[s,i.ort_tensor]));let a=await t.run(n);return a=Pm(a),a}catch(n){throw console.error(`An error occurred during model execution: "${n}".`),console.error("Inputs given to model:",r),n}}function Pm(t){for(let e in t)Zf(t[e])?t[e]=new fe(t[e]):typeof t[e]=="object"&&Pm(t[e]);return t}function Bm(t){if(t instanceof fe)return t;if(t.length===0)throw Error("items must be non-empty");if(Array.isArray(t[0])){if(t.some(e=>e.length!==t[0].length))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");return new fe("int64",BigInt64Array.from(t.flat().map(e=>BigInt(e))),[t.length,t[0].length])}else return new fe("int64",BigInt64Array.from(t.map(e=>BigInt(e))),[1,t.length])}function Dm(t){return new fe("bool",[t],[1])}async function Nm(t,e){let{encoder_outputs:r,past_key_values:n}=e;if(!r){const u=Dr(e,t.sessions.model.inputNames);r=(await oa(t,u)).last_hidden_state}const{input_ids:a,decoder_input_ids:s,...i}=e;return i.input_ids=s,i.encoder_hidden_states=r,t.sessions.decoder_model_merged.inputNames.includes("encoder_attention_mask")&&(i.encoder_attention_mask=e.attention_mask),await Do(t,i,!0)}async function oa(t,e){const r=t.sessions.model,n=Object.create(null);for(const a of r.inputNames)n[a]=e[a];return r.inputNames.includes("token_type_ids")&&!n.token_type_ids&&(n.token_type_ids=new fe("int64",new BigInt64Array(n.input_ids.data.length),n.input_ids.dims)),await Mr(r,n)}async function Do(t,e,r=!1){const n=t.sessions[r?"decoder_model_merged":"model"],{past_key_values:a,...s}=e;n.inputNames.includes("use_cache_branch")&&(s.use_cache_branch=Dm(!!a)),n.inputNames.includes("position_ids")&&s.attention_mask&&!s.position_ids&&(s.position_ids=wv(s,a)),t.addPastKeyValues(s,a);const i=Dr(s,n.inputNames);return await Mr(n,i)}async function yv(t,{input_ids:e=null,attention_mask:r=null,pixel_values:n=null,position_ids:a=null,inputs_embeds:s=null,past_key_values:i=null,generation_config:o=null,logits_processor:u=null,...l}){if(!s){if(s=await t.encode_text({input_ids:e}),n&&e.dims[1]!==1){const h=await t.encode_image({pixel_values:n});({inputs_embeds:s,attention_mask:r}=t._merge_input_ids_with_image_features({image_features:h,inputs_embeds:s,input_ids:e,attention_mask:r}))}else if(i&&n&&e.dims[1]===1){const h=e.dims[1],m=Object.values(i)[0].dims.at(-2);r=ur([Zn([e.dims[0],m]),r.slice(null,[r.dims[1]-h,r.dims[1]])],1)}}return await Do(t,{inputs_embeds:s,past_key_values:i,attention_mask:r,position_ids:a,generation_config:o,logits_processor:u},!0)}function wv(t,e=null){const{input_ids:r,inputs_embeds:n,attention_mask:a}=t,[s,i]=a.dims,o=new BigInt64Array(a.data.length);for(let l=0;ls.dims[1])){if(ao==t.config.image_token_index)){const o=t.config.num_image_tokens;if(!o)throw new Error("`num_image_tokens` is missing in the model configuration.");const u=s.dims[1]-(a-o);r.input_ids=s.slice(null,[-u,null]),r.attention_mask=Zn([1,a+u])}}}return r}function bv(t,e,r,n){const{...a}=r;return r.past_key_values&&(e=e.map(i=>[i.at(-1)])),a.decoder_input_ids=Bm(e),a}class re extends wt{constructor(r,n){super();N(this,"main_input_name","input_ids");N(this,"forward_params",["input_ids","attention_mask"]);this.config=r,this.sessions=n;const a=sa.get(this.constructor),s=ki.get(a);this.can_generate=!1,this._forward=null,this._prepare_inputs_for_generation=null,s===$e.DecoderOnly?(this.can_generate=!0,this._forward=Do,this._prepare_inputs_for_generation=Fm):s===$e.Seq2Seq||s===$e.Vision2Seq||s===$e.Musicgen?(this.can_generate=!0,this._forward=Nm,this._prepare_inputs_for_generation=bv):s===$e.EncoderDecoder?this._forward=Nm:s===$e.ImageTextToText?(this.can_generate=!0,this._forward=yv,this._prepare_inputs_for_generation=Fm):this._forward=oa,this.can_generate&&this.forward_params.push("past_key_values"),this.custom_config=this.config["transformers.js_config"]??{}}async dispose(){var n;const r=[];for(const a of Object.values(this.sessions))(n=a==null?void 0:a.handler)!=null&&n.dispose&&r.push(a.handler.dispose());return await Promise.all(r)}static async from_pretrained(r,{progress_callback:n=null,config:a=null,cache_dir:s=null,local_files_only:i=!1,revision:o="main",model_file_name:u=null,subfolder:l="onnx",device:c=null,dtype:h=null,use_external_data_format:m=null,session_options:_={}}={}){let p={progress_callback:n,config:a,cache_dir:s,local_files_only:i,revision:o,model_file_name:u,subfolder:l,device:c,dtype:h,use_external_data_format:m,session_options:_};const b=sa.get(this),v=ki.get(b);p.config=await Mm.from_pretrained(r,p);let S;return v===$e.DecoderOnly?S=await Promise.all([Xr(r,{model:p.model_file_name??"model"},p),Er(r,"generation_config.json",!1,p)]):v===$e.Seq2Seq||v===$e.Vision2Seq?S=await Promise.all([Xr(r,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},p),Er(r,"generation_config.json",!1,p)]):v===$e.MaskGeneration?S=await Promise.all([Xr(r,{model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"},p)]):v===$e.EncoderDecoder?S=await Promise.all([Xr(r,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},p)]):v===$e.ImageTextToText?S=await Promise.all([Xr(r,{embed_tokens:"embed_tokens",vision_encoder:"vision_encoder",decoder_model_merged:"decoder_model_merged"},p),Er(r,"generation_config.json",!1,p)]):v===$e.Musicgen?S=await Promise.all([Xr(r,{model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"},p),Er(r,"generation_config.json",!1,p)]):(v!==$e.EncoderOnly&&console.warn(`Model type for '${b??(a==null?void 0:a.model_type)}' not found, assuming encoder-only architecture. Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`),S=await Promise.all([Xr(r,{model:p.model_file_name??"model"},p)])),new this(p.config,...S)}async _call(r){return await this.forward(r)}async forward(r){return await this._forward(this,r)}_get_logits_warper(r){const n=new Ro;return r.temperature!==null&&r.temperature!==1&&n.push(new ov(r.temperature)),r.top_k!==null&&r.top_k!==0&&n.push(new lv(r.top_k)),r.top_p!==null&&r.top_p<1&&n.push(new uv(r.top_p)),n}_get_logits_processor(r,n,a=null){const s=new Ro;if(r.repetition_penalty!==null&&r.repetition_penalty!==1&&s.push(new rv(r.repetition_penalty)),r.no_repeat_ngram_size!==null&&r.no_repeat_ngram_size>0&&s.push(new tv(r.no_repeat_ngram_size)),r.bad_words_ids!==null&&s.push(new iv(r.bad_words_ids,r.eos_token_id)),r.min_length!==null&&r.eos_token_id!==null&&r.min_length>0&&s.push(new nv(r.min_length,r.eos_token_id)),r.min_new_tokens!==null&&r.eos_token_id!==null&&r.min_new_tokens>0&&s.push(new av(n,r.min_new_tokens,r.eos_token_id)),r.forced_bos_token_id!==null&&s.push(new Xb(r.forced_bos_token_id)),r.forced_eos_token_id!==null&&s.push(new Jb(r.max_length,r.forced_eos_token_id)),r.begin_suppress_tokens!==null){const i=n>1||r.forced_bos_token_id===null?n:n+1;s.push(new Zb(r.begin_suppress_tokens,i))}return r.guidance_scale!==null&&r.guidance_scale>1&&s.push(new sv(r.guidance_scale)),a!==null&&s.extend(a),s}_prepare_generation_config(r,n,a=zm){const s={...this.config};for(const o of["decoder","generator","text_config"])o in s&&Object.assign(s,s[o]);const i=new a(s);return"generation_config"in this&&Object.assign(i,this.generation_config),r&&Object.assign(i,r),n&&Object.assign(i,Dr(n,Object.getOwnPropertyNames(i))),i}_get_stopping_criteria(r,n=null){const a=new Bo;return r.max_length!==null&&a.push(new dv(r.max_length,this.config.max_position_embeddings??null)),r.eos_token_id!==null&&a.push(new cv(r.eos_token_id)),n&&a.extend(n),a}_validate_model_class(){if(!this.can_generate){const r=[Gg,Hg,Vg,Wg],n=sa.get(this.constructor),a=new Set,s=this.config.model_type;for(const o of r){const u=o.get(s);u&&a.add(u[0])}let i=`The current model class (${n}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw a.size>0&&(i+=` Please use the following class instead: ${[...a].join(", ")}`),Error(i)}}prepare_inputs_for_generation(...r){return this._prepare_inputs_for_generation(this,...r)}_update_model_kwargs_for_generation({generated_input_ids:r,outputs:n,model_inputs:a,is_encoder_decoder:s}){return a.past_key_values=this.getPastKeyValues(n,a.past_key_values),a.input_ids=new fe("int64",r.flat(),[r.length,1]),s||(a.attention_mask=ur([a.attention_mask,Zn([a.attention_mask.dims[0],1])],1)),a.position_ids=null,a}_prepare_model_inputs({inputs:r,bos_token_id:n,model_kwargs:a}){const s=Dr(a,this.forward_params),i=this.main_input_name;if(i in s){if(r)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else s[i]=r;return{inputs_tensor:s[i],model_inputs:s,model_input_name:i}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:r,model_inputs:n,model_input_name:a,generation_config:s}){const i=Dr(n,this.sessions.model.inputNames);let{last_hidden_state:o}=await oa(this,i);return s.guidance_scale!==null&&s.guidance_scale>1&&(o=ur([o,Fy(o,0)],0),"attention_mask"in n&&(n.attention_mask=ur([n.attention_mask,Wy(n.attention_mask)],0))),n.encoder_outputs=o,n}_prepare_decoder_input_ids_for_generation({batch_size:r,model_input_name:n,model_kwargs:a,decoder_start_token_id:s,bos_token_id:i,generation_config:o}){let{decoder_input_ids:u,...l}=a;if(!u)if(s??(s=i),this.config.model_type==="musicgen")u=Array.from({length:r*this.config.decoder.num_codebooks},()=>[s]);else if(Array.isArray(s)){if(s.length!==r)throw new Error(`\`decoder_start_token_id\` expcted to have length ${r} but got ${s.length}`);u=s}else u=Array.from({length:r},()=>[s]);return u=Bm(u),a.decoder_attention_mask=Ly(u),{input_ids:u,model_inputs:l}}async generate({inputs:r=null,generation_config:n=null,logits_processor:a=null,stopping_criteria:s=null,streamer:i=null,...o}){this._validate_model_class(),n=this._prepare_generation_config(n,o);let{inputs_tensor:u,model_inputs:l,model_input_name:c}=this._prepare_model_inputs({inputs:r,model_kwargs:o});const h=this.config.is_encoder_decoder;h&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:u,model_inputs:l,model_input_name:c,generation_config:n})));let m;h?{input_ids:m,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[c].dims.at(0),model_input_name:c,model_kwargs:l,decoder_start_token_id:n.decoder_start_token_id,bos_token_id:n.bos_token_id,generation_config:n}):m=l[c];let _=m.dims.at(-1);n.max_new_tokens!==null&&(n.max_length=_+n.max_new_tokens);const p=this._get_logits_processor(n,_,a),b=this._get_stopping_criteria(n,s),v=l[c].dims.at(0),S=Si.getSampler(n),x=new Array(v).fill(0),T=m.tolist();i&&i.put(T);let C=null;for(;;){l=this.prepare_inputs_for_generation(T,l,n);const R=await this.forward(l),B=R.logits.slice(null,-1,null),W=p(T,B),j=[];for(let le=0;lele)){n.return_dict_in_generate&&(C=this.getPastKeyValues(R,l.past_key_values,!1));break}l=this._update_model_kwargs_for_generation({generated_input_ids:j,outputs:R,model_inputs:l,is_encoder_decoder:h})}i&&i.end();const I=new fe("int64",T.flat(),[T.length,T[0].length]);return n.return_dict_in_generate?{sequences:I,past_key_values:C}:I}addAttentionsToBeam(r,n){if(this.config.is_encoder_decoder){if(!n.cross_attentions||n.cross_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce cross-attentions. This is most likely because the model was not exported with `output_attentions=True`.");r.cross_attentions||(r.cross_attentions=[]),r.cross_attentions.push(n.cross_attentions)}if(!n.decoder_attentions||n.decoder_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce decoder-attentions. This is most likely because the model was not exported with `output_attentions=True`.");r.decoder_attentions||(r.decoder_attentions=[]),r.decoder_attentions.push(n.decoder_attentions)}groupBeams(r){const n=Object.create(null);for(const a of r)n[a.id]===void 0?n[a.id]=[a]:n[a.id].push(a);return Object.values(n)}getPastKeyValues(r,n,a=!0){const s=Object.create(null);for(const i in r)if(i.startsWith("present")){let o=i.replace("present","past_key_values");if(n&&i.includes("encoder"))s[o]=n[o];else{if(a&&n){const u=n[o];u.location==="gpu-buffer"&&u.dispose()}s[o]=r[i]}}return s}getAttentions(r){const n=Object.create(null);for(const a of["cross_attentions","decoder_attentions"]){const s=[];for(const i in r)if(i.startsWith(a)){const o=i.split(".").pop();s[o]=r[i]}n[a]=s}return n}addPastKeyValues(r,n){if(n)Object.assign(r,n);else{const a=this.custom_config.kv_cache_dtype??"float32",s=a==="float16"?new Uint16Array:[],i=Am(this.config);for(const o in i)r[o]=new fe(a,s,i[o])}}}class Vt{}class ua extends re{}class vv extends ua{}class $v extends ua{async _call(e){return new _t(await super._call(e))}}class xv extends ua{async _call(e){return new ze(await super._call(e))}}class Sv extends ua{async _call(e){return new gt(await super._call(e))}}class kv extends ua{async _call(e){return new xt(await super._call(e))}}class Ev extends re{}class Tv extends Ev{}class la extends re{}class Cv extends la{}class Iv extends la{async _call(e){return new _t(await super._call(e))}}class Av extends la{async _call(e){return new ze(await super._call(e))}}class Mv extends la{async _call(e){return new gt(await super._call(e))}}class Ov extends la{async _call(e){return new xt(await super._call(e))}}class da extends re{}class zv extends da{}class Rv extends da{async _call(e){return new _t(await super._call(e))}}class Pv extends da{async _call(e){return new ze(await super._call(e))}}class Bv extends da{async _call(e){return new gt(await super._call(e))}}class Dv extends da{async _call(e){return new xt(await super._call(e))}}class ca extends re{}class Nv extends ca{}class Fv extends ca{async _call(e){return new _t(await super._call(e))}}class Lv extends ca{async _call(e){return new ze(await super._call(e))}}class Uv extends ca{async _call(e){return new gt(await super._call(e))}}class Wv extends ca{async _call(e){return new xt(await super._call(e))}}class pa extends re{}class Vv extends pa{}class Gv extends pa{async _call(e){return new _t(await super._call(e))}}class Hv extends pa{async _call(e){return new ze(await super._call(e))}}class jv extends pa{async _call(e){return new gt(await super._call(e))}}class qv extends pa{async _call(e){return new xt(await super._call(e))}}class ha extends re{}class Kv extends ha{}class Yv extends ha{async _call(e){return new _t(await super._call(e))}}class Qv extends ha{async _call(e){return new ze(await super._call(e))}}class Xv extends ha{async _call(e){return new gt(await super._call(e))}}class Jv extends ha{async _call(e){return new xt(await super._call(e))}}class fa extends re{}class Zv extends fa{}class e2 extends fa{async _call(e){return new _t(await super._call(e))}}class t2 extends fa{async _call(e){return new ze(await super._call(e))}}class r2 extends fa{async _call(e){return new gt(await super._call(e))}}class n2 extends fa{async _call(e){return new xt(await super._call(e))}}class ma extends re{}class a2 extends ma{}class i2 extends ma{async _call(e){return new ze(await super._call(e))}}class s2 extends ma{async _call(e){return new gt(await super._call(e))}}class o2 extends ma{async _call(e){return new xt(await super._call(e))}}class u2 extends ma{async _call(e){return new _t(await super._call(e))}}class Ei extends re{}class l2 extends Ei{}class d2 extends Ei{async _call(e){return new _t(await super._call(e))}}class c2 extends Ei{async _call(e){return new ze(await super._call(e))}}class p2 extends Ei{async _call(e){return new gt(await super._call(e))}}class Ti extends re{}class h2 extends Ti{}class f2 extends Ti{async _call(e){return new _t(await super._call(e))}}class m2 extends Ti{async _call(e){return new ze(await super._call(e))}}class g2 extends Ti{async _call(e){return new xt(await super._call(e))}}class ga extends re{}class _2 extends ga{}class y2 extends ga{async _call(e){return new _t(await super._call(e))}}class w2 extends ga{async _call(e){return new ze(await super._call(e))}}class b2 extends ga{async _call(e){return new gt(await super._call(e))}}class v2 extends ga{async _call(e){return new xt(await super._call(e))}}class Ci extends re{}class $2 extends Ci{}class x2 extends Ci{async _call(e){return new _t(await super._call(e))}}class S2 extends Ci{async _call(e){return new ze(await super._call(e))}}class k2 extends Ci{async _call(e){return new xt(await super._call(e))}}class Ii extends re{}class E2 extends Ii{}class T2 extends Ii{async _call(e){return new ze(await super._call(e))}}class C2 extends Ii{async _call(e){return new xt(await super._call(e))}}class I2 extends Ii{async _call(e){return new _t(await super._call(e))}}class Lm extends re{constructor(r,n,a){super(r,n);N(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"]);this.generation_config=a}}class A2 extends Lm{}class M2 extends Lm{}class Um extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class O2 extends Um{}class z2 extends Um{}class Wm extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class R2 extends Wm{}class P2 extends Wm{}class No extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class B2 extends No{}class D2 extends No{}class N2 extends No{async _call(e){return new ze(await super._call(e))}}class Ai extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class F2 extends Ai{}class L2 extends Ai{}class U2 extends Ai{async _call(e){return new ze(await super._call(e))}}class W2 extends Ai{}class Vm extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class V2 extends Vm{}class G2 extends Vm{}class Gm extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class H2 extends Gm{}class j2 extends Gm{}class _a extends re{}class q2 extends _a{}class K2 extends _a{async _call(e){return new _t(await super._call(e))}}class Y2 extends _a{async _call(e){return new ze(await super._call(e))}}class Q2 extends _a{async _call(e){return new gt(await super._call(e))}}class X2 extends _a{async _call(e){return new xt(await super._call(e))}}class ya extends re{}class J2 extends ya{}class Z2 extends ya{async _call(e){return new _t(await super._call(e))}}class e1 extends ya{async _call(e){return new ze(await super._call(e))}}class t1 extends ya{async _call(e){return new gt(await super._call(e))}}class r1 extends ya{async _call(e){return new xt(await super._call(e))}}class wa extends re{}class n1 extends wa{}class a1 extends wa{async _call(e){return new _t(await super._call(e))}}class i1 extends wa{async _call(e){return new ze(await super._call(e))}}class s1 extends wa{async _call(e){return new gt(await super._call(e))}}class o1 extends wa{async _call(e){return new xt(await super._call(e))}}class Hm extends re{}class u1 extends Hm{}class l1 extends Hm{}class jm extends re{constructor(r,n,a){super(r,n);N(this,"requires_attention_mask",!1);N(this,"main_input_name","input_features");N(this,"forward_params",["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"]);this.generation_config=a}}class d1 extends jm{}class qm extends jm{_prepare_generation_config(e,r){return super._prepare_generation_config(e,r,mv)}_retrieve_init_tokens(e){const r=[e.decoder_start_token_id];let n=e.language;const a=e.task;if(e.is_multilingual){n||(console.warn("No language specified - defaulting to English (en)."),n="en");const i=`<|${mm(n)}|>`;r.push(e.lang_to_id[i]),r.push(e.task_to_id[a??"transcribe"])}else if(n||a)throw new Error("Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&r.at(-1)!==e.no_timestamps_token_id?r.push(e.no_timestamps_token_id):e.return_timestamps&&r.at(-1)===e.no_timestamps_token_id&&(console.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),r.pop()),r.filter(s=>s!=null)}async generate({inputs:e=null,generation_config:r=null,logits_processor:n=null,stopping_criteria:a=null,...s}){r=this._prepare_generation_config(r,s);const i=this._retrieve_init_tokens(r);return r.return_timestamps&&(n??(n=new Ro),n.push(new ev(r,i))),await super.generate({inputs:e,generation_config:r,logits_processor:n,decoder_input_ids:i,...s})}_extract_token_timestamps(e,r,n=null,a=.02){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");let s=this.config.median_filter_width;s===void 0&&(console.warn("Model config has no `median_filter_width`, using default value of 7."),s=7);const i=e.cross_attentions.map(l=>{let c=Array.from({length:this.config.decoder_layers},(v,S)=>ur(l.map(x=>x[S]),2)),h=Jn(r.map(([v,S])=>n?c[v].slice(null,S,null,[0,n]):c[v].slice(null,S)));h=h.transpose(1,0,2,3);let[m,_]=By(h,-2,0,!0),p=h.clone();for(let v=0;vh[S+1]-h[S]),p=lt([1],_).map(v=>!!v),b=[];for(let v=0;vm.findIndex(_=>_==s)),u=o.every(m=>m===-1),l=o.every(m=>m!==-1);if(!u&&!l)throw new Error("Every input should contain either 0 or 1 image token.");if(u)return{inputs_embeds:e,attention_mask:a};const c=[],h=[];for(let m=0;ms*i,1);e.input_labels=new fe("int64",new BigInt64Array(a).fill(1n),n)}const r={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(r.input_points=e.input_points),e.input_labels&&(r.input_labels=e.input_labels),e.input_boxes&&(r.input_boxes=e.input_boxes),await Mr(this.sessions.prompt_encoder_mask_decoder,r)}async _call(e){return new Q$(await super._call(e))}}class Q$ extends Vt{constructor({iou_scores:e,pred_masks:r}){super(),this.iou_scores=e,this.pred_masks=r}}class Mg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class X$ extends Mg{}class J$ extends Mg{}class Og extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Z$ extends Og{}class ex extends Og{}class Jr extends re{}class tx extends Jr{}class rx extends Jr{async _call(e){return new wn(await super._call(e))}}class nx extends Jr{async _call(e){return new ze(await super._call(e))}}class ax extends Jr{async _call(e){return new gt(await super._call(e))}}class Lo extends re{}class ix extends Lo{}class sx extends Lo{async _call(e){return new wn(await super._call(e))}}class ox extends Lo{async _call(e){return new ze(await super._call(e))}}class Oi extends re{}class ux extends Oi{}class lx extends Oi{async _call(e){return new wn(await super._call(e))}}class dx extends Oi{async _call(e){return new ze(await super._call(e))}}class cx extends Oi{async _call(e){return new gt(await super._call(e))}}class Uo extends re{}class px extends Uo{}class hx extends Uo{async _call(e){return new wn(await super._call(e))}}class fx extends Uo{async _call(e){return new ze(await super._call(e))}}class mx extends Jr{}class gx extends Jr{async _call(e){return new wn(await super._call(e))}}class _x extends Jr{async _call(e){return new ze(await super._call(e))}}class ba extends re{}class yx extends ba{}class wx extends ba{async _call(e){return new wn(await super._call(e))}}class bx extends ba{async _call(e){return new ze(await super._call(e))}}class vx extends ba{async _call(e){return new mS(await super._call(e))}}class $x extends ba{async _call(e){return new gt(await super._call(e))}}class zg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class xx extends zg{}class Sx extends zg{async generate_speech(e,r,{threshold:n=.5,minlenratio:a=0,maxlenratio:s=20,vocoder:i=null}={}){const o={input_ids:e},{encoder_outputs:u,encoder_attention_mask:l}=await oa(this,o),c=u.dims[1]/this.config.reduction_factor,h=Math.floor(c*s),m=Math.floor(c*a),_=this.config.num_mel_bins;let p=[],b=null,v=null,S=0;for(;;){++S;const C=Dm(!!v);let I;v?I=v.output_sequence_out:I=new fe("float32",new Float32Array(_),[1,1,_]);let R={use_cache_branch:C,output_sequence:I,encoder_attention_mask:l,speaker_embeddings:r,encoder_hidden_states:u};this.addPastKeyValues(R,b),v=await Mr(this.sessions.decoder_model_merged,R),b=this.getPastKeyValues(v,b);const{prob:B,spectrum:W}=v;if(p.push(W),S>=m&&(Array.from(B.data).filter(j=>j>=n).length>0||S>=h))break}const x=ur(p),{waveform:T}=await Mr(i.sessions.model,{spectrogram:x});return{spectrogram:x,waveform:T}}}class kx extends re{constructor(){super(...arguments);N(this,"main_input_name","spectrogram")}}class Ex extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Tx extends Ex{}class Rg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Cx extends Rg{}class Ix extends Rg{}class Pg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Ax extends Pg{}class Mx extends Pg{}class Bg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Ox extends Bg{}class zx extends Bg{}class Wo extends re{}class Rx extends Wo{}class Px extends Wo{static async from_pretrained(e,r={}){return r.model_file_name??(r.model_file_name="text_model"),super.from_pretrained(e,r)}}class Bx extends Wo{static async from_pretrained(e,r={}){return r.model_file_name??(r.model_file_name="audio_model"),super.from_pretrained(e,r)}}class Dx extends re{}class Dg extends Dx{async _call(e){return new _S(await super._call(e))}}class Ng extends re{}class Nx extends Ng{}class Fx extends Ng{}class Fg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Lx extends Fg{}class Ux extends Fg{}class Lg extends re{}class Wx extends Lg{}class Vx extends Lg{async _call(e){return new ze(await super._call(e))}}class Ug extends re{constructor(r,n,a){super(r,n);N(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"]);this.generation_config=a}_apply_and_filter_by_delay_pattern_mask(r){const[n,a]=r.dims,s=this.config.decoder.num_codebooks,i=a-s;let o=0;for(let c=0;c0&&_<=i&&(r.data[o++]=r.data[c])}const u=Math.floor(n/s),l=o/(u*s);return new fe(r.type,r.data.slice(0,o),[u,s,l])}prepare_inputs_for_generation(r,n,a){let s=structuredClone(r);for(let o=0;o=u&&(s[o][u]=BigInt(this.config.decoder.pad_token_id));return a.guidance_scale!==null&&a.guidance_scale>1&&(s=s.concat(s)),super.prepare_inputs_for_generation(s,n,a)}async generate(r){const n=await super.generate(r),a=this._apply_and_filter_by_delay_pattern_mask(n).unsqueeze_(0),{audio_values:s}=await Mr(this.sessions.encodec_decode,{audio_codes:a});return s}}class Vo{static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:s=!1,revision:i="main",model_file_name:o=null,subfolder:u="onnx",device:l=null,dtype:c=null,use_external_data_format:h=null,session_options:m={}}={}){let _={progress_callback:r,config:n,cache_dir:a,local_files_only:s,revision:i,model_file_name:o,subfolder:u,device:l,dtype:c,use_external_data_format:h,session_options:m};if(_.config=await Mm.from_pretrained(e,_),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);for(let p of this.MODEL_CLASS_MAPPINGS){const b=p.get(_.config.model_type);if(b)return await b[1].from_pretrained(e,_)}if(this.BASE_IF_FAIL)return console.warn(`Unknown model class "${_.config.model_type}", attempting to construct from base class.`),await re.from_pretrained(e,_);throw Error(`Unsupported model type: ${_.config.model_type}`)}}N(Vo,"MODEL_CLASS_MAPPINGS",null),N(Vo,"BASE_IF_FAIL",!1);const Gx=new Map([["bert",["BertModel",vv]],["nomic_bert",["NomicBertModel",Tv]],["roformer",["RoFormerModel",Cv]],["electra",["ElectraModel",Nv]],["esm",["EsmModel",l2]],["convbert",["ConvBertModel",zv]],["camembert",["CamembertModel",Vv]],["deberta",["DebertaModel",Kv]],["deberta-v2",["DebertaV2Model",Zv]],["mpnet",["MPNetModel",_2]],["albert",["AlbertModel",E2]],["distilbert",["DistilBertModel",a2]],["roberta",["RobertaModel",q2]],["xlm",["XLMModel",J2]],["xlm-roberta",["XLMRobertaModel",n1]],["clap",["ClapModel",Rx]],["clip",["CLIPModel",f1]],["clipseg",["CLIPSegModel",$1]],["chinese_clip",["ChineseCLIPModel",v1]],["siglip",["SiglipModel",_1]],["mobilebert",["MobileBertModel",h2]],["squeezebert",["SqueezeBertModel",$2]],["wav2vec2",["Wav2Vec2Model",tx]],["wav2vec2-bert",["Wav2Vec2BertModel",px]],["unispeech",["UniSpeechModel",ix]],["unispeech-sat",["UniSpeechSatModel",ux]],["hubert",["HubertModel",mx]],["wavlm",["WavLMModel",yx]],["audio-spectrogram-transformer",["ASTModel",u1]],["vits",["VitsModel",Dg]],["detr",["DetrModel",g$]],["table-transformer",["TableTransformerModel",b$]],["vit",["ViTModel",e$]],["fastvit",["FastViTModel",r$]],["mobilevit",["MobileViTModel",s$]],["mobilevitv2",["MobileViTV2Model",u$]],["owlvit",["OwlViTModel",d$]],["owlv2",["Owlv2Model",p$]],["beit",["BeitModel",f$]],["deit",["DeiTModel",x$]],["convnext",["ConvNextModel",F$]],["convnextv2",["ConvNextV2Model",U$]],["dinov2",["Dinov2Model",V$]],["resnet",["ResNetModel",k$]],["swin",["SwinModel",T$]],["swin2sr",["Swin2SRModel",I$]],["donut-swin",["DonutSwinModel",N$]],["yolos",["YolosModel",H$]],["dpt",["DPTModel",M$]],["glpn",["GLPNModel",P$]],["hifigan",["SpeechT5HifiGan",kx]],["efficientnet",["EfficientNetModel",Wx]]]),Hx=new Map([["t5",["T5Model",A2]],["longt5",["LongT5Model",O2]],["mt5",["MT5Model",R2]],["bart",["BartModel",B2]],["mbart",["MBartModel",F2]],["marian",["MarianModel",X$]],["whisper",["WhisperModel",d1]],["m2m_100",["M2M100Model",Z$]],["blenderbot",["BlenderbotModel",V2]],["blenderbot-small",["BlenderbotSmallModel",H2]]]),jx=new Map([["bloom",["BloomModel",K1]],["gpt2",["GPT2Model",S1]],["gptj",["GPTJModel",A1]],["gpt_bigcode",["GPTBigCodeModel",O1]],["gpt_neo",["GPTNeoModel",E1]],["gpt_neox",["GPTNeoXModel",C1]],["codegen",["CodeGenModel",R1]],["llama",["LlamaModel",B1]],["gemma",["GemmaModel",N1]],["openelm",["OpenELMModel",L1]],["qwen2",["Qwen2Model",W1]],["phi",["PhiModel",G1]],["phi3",["Phi3Model",j1]],["mpt",["MptModel",Q1]],["opt",["OPTModel",J1]],["mistral",["MistralModel",Cx]],["starcoder2",["Starcoder2Model",Ax]],["falcon",["FalconModel",Ox]],["stablelm",["StableLmModel",Lx]]]),Wg=new Map([["speecht5",["SpeechT5ForSpeechToText",xx]],["whisper",["WhisperForConditionalGeneration",qm]]]),qx=new Map([["speecht5",["SpeechT5ForTextToSpeech",Sx]]]),Kx=new Map([["vits",["VitsModel",Dg]],["musicgen",["MusicgenForConditionalGeneration",Ug]]]),Yx=new Map([["bert",["BertForSequenceClassification",xv]],["roformer",["RoFormerForSequenceClassification",Av]],["electra",["ElectraForSequenceClassification",Lv]],["esm",["EsmForSequenceClassification",c2]],["convbert",["ConvBertForSequenceClassification",Pv]],["camembert",["CamembertForSequenceClassification",Hv]],["deberta",["DebertaForSequenceClassification",Qv]],["deberta-v2",["DebertaV2ForSequenceClassification",t2]],["mpnet",["MPNetForSequenceClassification",w2]],["albert",["AlbertForSequenceClassification",T2]],["distilbert",["DistilBertForSequenceClassification",i2]],["roberta",["RobertaForSequenceClassification",Y2]],["xlm",["XLMForSequenceClassification",e1]],["xlm-roberta",["XLMRobertaForSequenceClassification",i1]],["bart",["BartForSequenceClassification",N2]],["mbart",["MBartForSequenceClassification",U2]],["mobilebert",["MobileBertForSequenceClassification",m2]],["squeezebert",["SqueezeBertForSequenceClassification",S2]]]),Qx=new Map([["bert",["BertForTokenClassification",Sv]],["roformer",["RoFormerForTokenClassification",Mv]],["electra",["ElectraForTokenClassification",Uv]],["esm",["EsmForTokenClassification",p2]],["convbert",["ConvBertForTokenClassification",Bv]],["camembert",["CamembertForTokenClassification",jv]],["deberta",["DebertaForTokenClassification",Xv]],["deberta-v2",["DebertaV2ForTokenClassification",r2]],["mpnet",["MPNetForTokenClassification",b2]],["distilbert",["DistilBertForTokenClassification",s2]],["roberta",["RobertaForTokenClassification",Q2]],["xlm",["XLMForTokenClassification",t1]],["xlm-roberta",["XLMRobertaForTokenClassification",s1]]]),Vg=new Map([["t5",["T5ForConditionalGeneration",M2]],["longt5",["LongT5ForConditionalGeneration",z2]],["mt5",["MT5ForConditionalGeneration",P2]],["bart",["BartForConditionalGeneration",D2]],["mbart",["MBartForConditionalGeneration",L2]],["marian",["MarianMTModel",J$]],["m2m_100",["M2M100ForConditionalGeneration",ex]],["blenderbot",["BlenderbotForConditionalGeneration",G2]],["blenderbot-small",["BlenderbotSmallForConditionalGeneration",j2]]]),Gg=new Map([["bloom",["BloomForCausalLM",Y1]],["gpt2",["GPT2LMHeadModel",k1]],["gptj",["GPTJForCausalLM",M1]],["gpt_bigcode",["GPTBigCodeForCausalLM",z1]],["gpt_neo",["GPTNeoForCausalLM",T1]],["gpt_neox",["GPTNeoXForCausalLM",I1]],["codegen",["CodeGenForCausalLM",P1]],["llama",["LlamaForCausalLM",D1]],["gemma",["GemmaForCausalLM",F1]],["openelm",["OpenELMForCausalLM",U1]],["qwen2",["Qwen2ForCausalLM",V1]],["phi",["PhiForCausalLM",H1]],["phi3",["Phi3ForCausalLM",q1]],["mpt",["MptForCausalLM",X1]],["opt",["OPTForCausalLM",Z1]],["mbart",["MBartForCausalLM",W2]],["mistral",["MistralForCausalLM",Ix]],["starcoder2",["Starcoder2ForCausalLM",Mx]],["falcon",["FalconForCausalLM",zx]],["trocr",["TrOCRForCausalLM",Tx]],["stablelm",["StableLmForCausalLM",Ux]]]),Xx=new Map([["bert",["BertForMaskedLM",$v]],["roformer",["RoFormerForMaskedLM",Iv]],["electra",["ElectraForMaskedLM",Fv]],["esm",["EsmForMaskedLM",d2]],["convbert",["ConvBertForMaskedLM",Rv]],["camembert",["CamembertForMaskedLM",Gv]],["deberta",["DebertaForMaskedLM",Yv]],["deberta-v2",["DebertaV2ForMaskedLM",e2]],["mpnet",["MPNetForMaskedLM",y2]],["albert",["AlbertForMaskedLM",I2]],["distilbert",["DistilBertForMaskedLM",u2]],["roberta",["RobertaForMaskedLM",K2]],["xlm",["XLMWithLMHeadModel",Z2]],["xlm-roberta",["XLMRobertaForMaskedLM",a1]],["mobilebert",["MobileBertForMaskedLM",f2]],["squeezebert",["SqueezeBertForMaskedLM",x2]]]),Jx=new Map([["bert",["BertForQuestionAnswering",kv]],["roformer",["RoFormerForQuestionAnswering",Ov]],["electra",["ElectraForQuestionAnswering",Wv]],["convbert",["ConvBertForQuestionAnswering",Dv]],["camembert",["CamembertForQuestionAnswering",qv]],["deberta",["DebertaForQuestionAnswering",Jv]],["deberta-v2",["DebertaV2ForQuestionAnswering",n2]],["mpnet",["MPNetForQuestionAnswering",v2]],["albert",["AlbertForQuestionAnswering",C2]],["distilbert",["DistilBertForQuestionAnswering",o2]],["roberta",["RobertaForQuestionAnswering",X2]],["xlm",["XLMForQuestionAnswering",r1]],["xlm-roberta",["XLMRobertaForQuestionAnswering",o1]],["mobilebert",["MobileBertForQuestionAnswering",g2]],["squeezebert",["SqueezeBertForQuestionAnswering",k2]]]),Hg=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",c1]]]),Zx=new Map([["llava",["LlavaForConditionalGeneration",Km]],["moondream1",["Moondream1ForConditionalGeneration",h1]]]),eS=new Map([["vit",["ViTForImageClassification",t$]],["fastvit",["FastViTForImageClassification",n$]],["mobilevit",["MobileViTForImageClassification",o$]],["mobilevitv2",["MobileViTV2ForImageClassification",l$]],["beit",["BeitForImageClassification",m$]],["deit",["DeiTForImageClassification",S$]],["convnext",["ConvNextForImageClassification",L$]],["convnextv2",["ConvNextV2ForImageClassification",W$]],["dinov2",["Dinov2ForImageClassification",G$]],["resnet",["ResNetForImageClassification",E$]],["swin",["SwinForImageClassification",C$]],["segformer",["SegformerForImageClassification",Nx]],["efficientnet",["EfficientNetForImageClassification",Vx]]]),tS=new Map([["detr",["DetrForObjectDetection",_$]],["table-transformer",["TableTransformerForObjectDetection",v$]],["yolos",["YolosForObjectDetection",j$]]]),rS=new Map([["owlvit",["OwlViTForObjectDetection",c$]],["owlv2",["Owlv2ForObjectDetection",h$]]]),nS=new Map([["detr",["DetrForSegmentation",y$]],["clipseg",["CLIPSegForImageSegmentation",x1]]]),aS=new Map([["segformer",["SegformerForSemanticSegmentation",Fx]]]),iS=new Map([["sam",["SamModel",Y$]]]),sS=new Map([["wav2vec2",["Wav2Vec2ForCTC",rx]],["wav2vec2-bert",["Wav2Vec2BertForCTC",hx]],["unispeech",["UniSpeechForCTC",sx]],["unispeech-sat",["UniSpeechSatForCTC",lx]],["wavlm",["WavLMForCTC",wx]],["hubert",["HubertForCTC",gx]]]),oS=new Map([["wav2vec2",["Wav2Vec2ForSequenceClassification",nx]],["wav2vec2-bert",["Wav2Vec2BertForSequenceClassification",fx]],["unispeech",["UniSpeechForSequenceClassification",ox]],["unispeech-sat",["UniSpeechSatForSequenceClassification",dx]],["wavlm",["WavLMForSequenceClassification",bx]],["hubert",["HubertForSequenceClassification",_x]],["audio-spectrogram-transformer",["ASTForAudioClassification",l1]]]),uS=new Map([["wavlm",["WavLMForXVector",vx]]]),lS=new Map([["unispeech-sat",["UniSpeechSatForAudioFrameClassification",cx]],["wavlm",["WavLMForAudioFrameClassification",$x]],["wav2vec2",["Wav2Vec2ForAudioFrameClassification",ax]]]),dS=new Map([["vitmatte",["VitMatteForImageMatting",i$]]]),cS=new Map([["swin2sr",["Swin2SRForImageSuperResolution",A$]]]),pS=new Map([["dpt",["DPTForDepthEstimation",O$]],["depth_anything",["DepthAnythingForDepthEstimation",R$]],["glpn",["GLPNForDepthEstimation",B$]]]),hS=new Map([["clip",["CLIPVisionModelWithProjection",g1]],["siglip",["SiglipVisionModel",w1]]]),jg=[[Gx,$e.EncoderOnly],[Hx,$e.EncoderDecoder],[jx,$e.DecoderOnly],[Yx,$e.EncoderOnly],[Qx,$e.EncoderOnly],[Vg,$e.Seq2Seq],[Wg,$e.Seq2Seq],[Gg,$e.DecoderOnly],[Xx,$e.EncoderOnly],[Jx,$e.EncoderOnly],[Hg,$e.Vision2Seq],[Zx,$e.ImageTextToText],[eS,$e.EncoderOnly],[nS,$e.EncoderOnly],[aS,$e.EncoderOnly],[dS,$e.EncoderOnly],[cS,$e.EncoderOnly],[pS,$e.EncoderOnly],[tS,$e.EncoderOnly],[rS,$e.EncoderOnly],[iS,$e.MaskGeneration],[sS,$e.EncoderOnly],[oS,$e.EncoderOnly],[qx,$e.Seq2Seq],[Kx,$e.EncoderOnly],[uS,$e.EncoderOnly],[lS,$e.EncoderOnly],[hS,$e.EncoderOnly]];for(const[t,e]of jg)for(const[r,n]of t.values())ki.set(r,e),sa.set(n,r),Rm.set(r,n);const fS=[["MusicgenForConditionalGeneration",Ug,$e.Musicgen],["CLIPTextModelWithProjection",m1,$e.EncoderOnly],["SiglipTextModel",y1,$e.EncoderOnly],["ClapTextModelWithProjection",Px,$e.EncoderOnly],["ClapAudioModelWithProjection",Bx,$e.EncoderOnly]];for(const[t,e,r]of fS)ki.set(t,r),sa.set(e,t),Rm.set(t,e);class qg extends Vo{}N(qg,"MODEL_CLASS_MAPPINGS",jg.map(e=>e[0])),N(qg,"BASE_IF_FAIL",!0);class ze extends Vt{constructor({logits:e}){super(),this.logits=e}}class mS extends Vt{constructor({logits:e,embeddings:r}){super(),this.logits=e,this.embeddings=r}}class gt extends Vt{constructor({logits:e}){super(),this.logits=e}}class _t extends Vt{constructor({logits:e}){super(),this.logits=e}}class xt extends Vt{constructor({start_logits:e,end_logits:r}){super(),this.start_logits=e,this.end_logits=r}}class wn extends Vt{constructor({logits:e}){super(),this.logits=e}}class gS extends Vt{constructor({alphas:e}){super(),this.alphas=e}}class _S extends Vt{constructor({waveform:e,spectrogram:r}){super(),this.waveform=e,this.spectrogram=r}}if(!(typeof self<"u")){if(!Ge)throw new Error("Unable to load image processing library.")}function Kg(t){if(t<1)return new Float64Array;if(t===1)return new Float64Array([1]);const e=t-1,r=Math.PI/e,n=new Float64Array(t);for(let a=0;a2595*Math.log10(1+t/700),kaldi:t=>1127*Math.log(1+t/700),slaney:(t,e=1e3,r=15,n=27/Math.log(6.4))=>t>=e?r+Math.log(t/e)*n:3*t/200};function Go(t,e="htk"){const r=yS[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}const wS={htk:t=>700*(10**(t/2595)-1),kaldi:t=>700*(Math.exp(t/1127)-1),slaney:(t,e=1e3,r=15,n=Math.log(6.4)/27)=>t>=r?e*Math.exp(n*(t-r)):200*t/3};function bS(t,e="htk"){const r=wS[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}function vS(t,e){const r=Float64Array.from({length:e.length-1},(i,o)=>e[o+1]-e[o]),n=Array.from({length:t.length},()=>new Array(e.length));for(let i=0;inew Array(t.length));for(let i=0;it+n*s)}function va(t,e,r,n,a,s=null,i="htk",o=!1){if(s!==null&&s!=="slaney")throw new Error('norm must be one of null or "slaney"');const u=Go(r,i),l=Go(n,i),c=Yg(u,l,e+2);let h=bS(c,i),m;if(o){const p=a/(t*2);m=Go(Float64Array.from({length:t},(b,v)=>v*p),i),h=c}else m=Yg(0,Math.floor(a/2),t);const _=vS(m,h);if(s!==null&&s==="slaney")for(let p=0;pa)throw Error(`frame_length (${r}) may not be larger than fft_length (${a})`);if(C!==r)throw new Error(`Length of the window (${C}) must equal frame_length (${r})`);if(n<=0)throw new Error("hop_length must be greater than zero");if(s===null&&c!==null)throw new Error("You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram. Specify `power` to fix this issue.");if(i){if(o!=="reflect")throw new Error(`pad_mode="${o}" not implemented yet.`);const M=Math.floor((a-1)/2)+1;t=$S(t,M,M)}const I=Math.floor(1+Math.floor((t.length-r)/n)),R=u?Math.floor(a/2)+1:a;let B=I,W=I;S!==null&&(S>I?x&&(W=S):W=B=S);const j=new X_(a),q=new Float64Array(a),le=new Float64Array(j.outputBufferSize),ae=new Array(B);for(let M=0;M=1;--ee)q[ee]-=l*q[ee-1];q[0]*=1-l}for(let ee=0;eeMath.pow(o,.85));break;default:throw new Error(`Unknown window type ${e}.`)}if(r&&(i=i.subarray(0,t)),n===null)return i;if(t>n)throw new Error(`Length of the window (${t}) may not be larger than frame_length (${n})`);return i}function kS([t,e,r,n]){return[t-r/2,e-n/2,t+r/2,e+n/2]}function Ho(t,e=.5,r=null,n=!1){const a=t.logits,s=t.pred_boxes,[i,o,u]=a.dims;if(r!==null&&r.length!==i)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");let l=[];for(let c=0;ce&&S.push(T)}else{let T=mr(v.data)[1];if(T===u-1||(x=Bn(v.data),x[T]I*h[(R+1)%2])),m.boxes.push(C),m.classes.push(T),m.scores.push(x[T])}}l.push(m)}return l}function $a(t,e){var r;if(!(t instanceof Float32Array||t instanceof Float64Array))throw new Error(`${e} expects input to be a Float32Array or a Float64Array, but got ${((r=t==null?void 0:t.constructor)==null?void 0:r.name)??typeof t} instead. If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.`)}function Xg(t,e,r=0,n=null){const a=t/e;let s=Z_(a)*e;return n!==null&&s>n&&(s=Math.floor(a)*e),ss?l=Math.floor(s*u/a):s>a&&(u=Math.floor(a*l/s)),await e.resize(l,u,{resample:n}))}async crop_margin(e,r=200){const n=e.clone().grayscale(),a=bu(n.data)[0],i=mr(n.data)[0]-a;if(i===0)return e;const o=r/255;let u=n.width,l=n.height,c=0,h=0;const m=n.data;for(let _=0;_this.preprocess(s)));return{pixel_values:Jn(n.map(s=>s.pixel_values),0),original_sizes:n.map(s=>s.original_size),reshaped_input_sizes:n.map(s=>s.reshaped_input_size)}}}class ES extends Qe{post_process_semantic_segmentation(e,r=null){const n=e.logits,a=n.dims[0];if(r!==null&&r.length!==a)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");const s=[];for(let i=0;im[T]&&(m[T]=x[T],_[T]=S)}const p=new Array(u.dims[0]),b=h.data;for(let S=0;SS!==void 0);s.push({segmentation:h,labels:v})}return s}}class Jg extends Qe{}class TS extends Jg{}class CS extends Qe{}class IS extends Qe{}class Zg extends Qe{}class AS extends Zg{}class MS extends Qe{}class OS extends Qe{}class e_ extends Qe{constructor(e){super(e),this.crop_pct=this.config.crop_pct??224/256}async resize(e){var n;const r=(n=this.size)==null?void 0:n.shortest_edge;if(r===void 0)throw new Error("Size dictionary must contain 'shortest_edge' key.");if(r<384){const a=Math.floor(r/this.crop_pct),[s,i]=this.get_resize_output_image_size(e,{shortest_edge:a});e=await e.resize(s,i,{resample:this.resample}),e=await e.center_crop(r,r)}else e=await e.resize(r,r,{resample:this.resample});return e}}class zS extends e_{}class RS extends Qe{}class PS extends Qe{}class BS extends Qe{constructor(e){super(e),this.include_top=this.config.include_top??!0,this.include_top&&(this.image_std=this.image_std.map(r=>r*r))}}class t_ extends Qe{}class DS extends t_{}class r_ extends Qe{post_process_object_detection(...e){return Ho(...e)}}class NS extends r_{}class FS extends Qe{}class LS extends Qe{}class n_ extends Qe{pad_image(e,r,n,a={}){const[s,i,o]=r;let u=this.image_mean;Array.isArray(this.image_mean)||(u=new Array(o).fill(u));let l=this.image_std;Array.isArray(l)||(l=new Array(o).fill(u));const c=u.map((h,m)=>-h/l[m]);return super.pad_image(e,r,n,{center:!0,constant_values:c,...a})}}class US extends n_{}class WS extends Qe{async _call(e){const r=await super._call(e),n=[r.pixel_values.dims[0],64,64],a=new fe("int64",new BigInt64Array(n.reduce((s,i)=>s*i)).fill(1n),n);return{...r,pixel_mask:a}}post_process_object_detection(...e){return Ho(...e)}remove_low_and_no_objects(e,r,n,a){let s=[],i=[],o=[];for(let u=0;un&&(s.push(c),i.push(_),o.push(h))}return[s,i,o]}check_segment_validity(e,r,n,a=.5,s=.8){let i=[],o=0,u=0;const l=r[n].data;for(let h=0;h=a&&++u;let c=o>0&&u>0;return c&&(c=o/u>s),[c,i]}compute_segments(e,r,n,a,s,i=null,o=null){let[u,l]=o??e[0].dims,c=new fe("int32",new Int32Array(u*l),[u,l]),h=[];if(o!==null)for(let v=0;v_[T]&&(m[T]=v,_[T]=x[T])}let p=0;const b=c.data;for(let v=0;va!==r.dims[s]))throw Error(`The first ${n.length} dimensions of 'input_points' and 'input_labels' must be the same.`);return new fe("int64",e.flat(1/0).map(BigInt),n)}async _call(e,{input_points:r=null,input_labels:n=null,input_boxes:a=null}={}){const s=await super._call(e);if(r&&(s.input_points=this.reshape_input_points(r,s.original_sizes,s.reshaped_input_sizes)),n){if(!s.input_points)throw Error("`input_points` must be provided if `input_labels` are provided.");s.input_labels=this.add_input_labels(n,s.input_points)}return a&&(s.input_boxes=this.reshape_input_points(a,s.original_sizes,s.reshaped_input_sizes,!0)),s}async post_process_masks(e,r,n,{mask_threshold:a=0,binarize:s=!0,pad_size:i=null}={}){const o=[];i=i??this.pad_size;const u=[i.height,i.width];for(let l=0;la&&(p[b]=1);m=new fe("bool",p,m.dims)}o.push(m)}return o}generate_crop_boxes(e,r,{crop_n_layers:n=0,overlap_ratio:a=512/1500,points_per_crop:s=32,crop_n_points_downscale_factor:i=1}={}){}}class HS extends Qe{pad_image(e,r,n,a={}){const[s,i,o]=r;return super.pad_image(e,r,{width:i+(n-i%n)%n,height:s+(n-s%n)%n},{mode:"symmetric",center:!1,constant_values:-1,...a})}}class jS extends Qe{async _call(e,r){Array.isArray(e)||(e=[e]),Array.isArray(r)||(r=[r]);const n=await Promise.all(e.map(i=>this.preprocess(i))),a=await Promise.all(r.map(i=>this.preprocess(i,{do_normalize:!1,do_convert_rgb:!1,do_convert_grayscale:!0})));return{pixel_values:Jn(n.map((i,o)=>ur([i.pixel_values,a[o].pixel_values],0)),0),original_sizes:n.map(i=>i.original_size),reshaped_input_sizes:n.map(i=>i.reshaped_input_size)}}}class qS extends Zr{constructor(e){var r;super(e),(r=this.config).mel_filters??(r.mel_filters=va(Math.floor(1+this.config.n_fft/2),this.config.feature_size,0,8e3,this.config.sampling_rate,"slaney","slaney")),this.window=Ri(this.config.n_fft,"hann")}_extract_fbank_features(e){const{data:r,dims:n}=zi(e,this.window,this.config.n_fft,this.config.hop_length,{power:2,mel_filters:this.config.mel_filters,log_mel:"log10",max_num_frames:this.config.nb_max_frames}),a=mr(r)[0];for(let s=0;sthis.config.n_samples?(console.warn("Attempting to extract features for audio longer than 30 seconds. If using a pipeline to extract transcript from a long audio clip, remember to specify `chunk_length_s` and/or `stride_length_s`."),r=e.slice(0,this.config.n_samples)):(r=new Float32Array(this.config.n_samples),r.set(e));const{data:n,dims:a}=this._extract_fbank_features(r);return{input_features:new fe("float32",n,[1,...a])}}}class KS extends Zr{_zero_mean_unit_var_norm(e){const n=e.reduce((s,i)=>s+i,0)/e.length,a=e.reduce((s,i)=>s+(i-n)**2,0)/e.length;return e.map(s=>(s-n)/Math.sqrt(a+1e-7))}async _call(e){$a(e,"Wav2Vec2FeatureExtractor"),e instanceof Float64Array&&(e=new Float32Array(e));let r=e;this.config.do_normalize&&(r=this._zero_mean_unit_var_norm(r));const n=[1,r.length];return{input_values:new fe("float32",r,n),attention_mask:new fe("int64",new BigInt64Array(r.length).fill(1n),n)}}}class YS extends Zr{constructor(e){super(e);const r=this.config.sampling_rate,n=va(256,this.config.num_mel_bins,20,Math.floor(r/2),r,null,"kaldi",!0);for(let a=0;an*32768),zi(e,this.window,400,160,{fft_length:512,power:2,center:!1,preemphasis:.97,mel_filters:this.mel_filters,log_mel:"log",mel_floor:1192092955078125e-22,remove_dc_offset:!0,max_num_frames:r,transpose:!0})}async _call(e,{padding:r=!0,pad_to_multiple_of:n=2,do_normalize_per_mel_bins:a=!0,return_attention_mask:s=!0}={}){$a(e,"SeamlessM4TFeatureExtractor");let{data:i,dims:o}=this._extract_fbank_features(e,this.config.max_length);if(a){const[b,v]=o;for(let S=0;S0){const x=new Float32Array(v*(b+S));x.set(i),x.fill(this.config.padding_value,i.length);const T=b+S;i=x,o=[T,v],s&&(u=new fe("int64",new BigInt64Array(T),[1,T]),u.data.fill(1n,0,b))}}const[l,c]=o,h=this.config.stride;if(l%h!==0)throw new Error(`The number of frames (${l}) must be a multiple of the stride (${h}).`);const _=new fe("float32",i,o).view(1,Math.floor(l/h),c*h),p={input_features:_};if(s){const b=_.dims[1],v=new BigInt64Array(b);if(u){const S=u.data;for(let x=1,T=0;x0)if(n==="rand_trunc"){i=!0;const u=Math.floor(Math.random()*(o+1));e=e.subarray(u,u+r),s=this._extract_fbank_features(e,this.mel_filters_slaney,this.config.nb_max_samples),s.dims=[1,...s.dims]}else throw new Error(`Truncation strategy "${n}" not implemented`);else{if(o<0){let u=new Float64Array(r);if(u.set(e),a==="repeat")for(let l=e.length;lprocess.stdout.write(t):t=>console.log(t);class s3 extends a3{constructor(e,{skip_prompt:r=!1,callback_function:n=null,decode_kwargs:a={},...s}={}){super(),this.tokenizer=e,this.skip_prompt=r,this.callback_function=n??i3,this.decode_kwargs={...a,...s},this.token_cache=[],this.print_len=0,this.next_tokens_are_prompt=!0}put(e){if(e.length>1)throw Error("TextStreamer only supports batch size of 1");const r=e[0];if(this.skip_prompt&&this.next_tokens_are_prompt){this.next_tokens_are_prompt=!1;return}this.token_cache=lt(this.token_cache,r);const n=this.tokenizer.decode(this.token_cache,this.decode_kwargs);let a;n.endsWith(` `)?(a=n.slice(this.print_len),this.token_cache=[],this.print_len=0):n.length>0&&wm(n.charCodeAt(n.length-1))?(a=n.slice(this.print_len),this.print_len+=a.length):(a=n.slice(this.print_len,n.lastIndexOf(" ")+1),this.print_len+=a.length),this.on_finalized_text(a,!1)}end(){let e;this.token_cache.length>0?(e=this.tokenizer.decode(this.token_cache,this.decode_kwargs).slice(this.print_len),this.token_cache=[],this.print_len=0):e="",this.next_tokens_are_prompt=!0,this.on_finalized_text(e,!0)}on_finalized_text(e,r){e.length>0&&this.callback_function(e),r&&this.callback_function(` -`)}}class o3 extends s3{constructor(e,r){super(e,{skip_prompt:!0,skip_special_tokens:!0}),this.cb=r}on_finalized_text(e){this.cb(e)}}const u3=32;class vn{static async getInstance(e=null){return this.model_id="onnx-community/whisper-base",this.tokenizer??(this.tokenizer=Im.from_pretrained(this.model_id,{progress_callback:e})),this.processor??(this.processor=qo.from_pretrained(this.model_id,{progress_callback:e})),this.model??(this.model=qm.from_pretrained(this.model_id,{dtype:{encoder_model:"fp32",decoder_model_merged:"q4"},device:"webgpu",progress_callback:e})),Promise.all([this.tokenizer,this.processor,this.model])}}N(vn,"model_id",null),N(vn,"tokenizer",null),N(vn,"processor",null),N(vn,"model",null);let Ko=!1;async function l3({audio:t,language:e}){if(Ko)return;Ko=!0,self.postMessage({status:"start"});const[r,n,a]=await vn.getInstance();let s,i=0;const o=m=>{s??(s=performance.now());let _;i++>0&&(_=i/(performance.now()-s)*1e3),self.postMessage({status:"update",output:m,tps:_,numTokens:i})},u=new o3(r,o),l=await n(t),c=await a.generate({...l,max_new_tokens:u3,language:e,streamer:u}),h=r.batch_decode(c,{skip_special_tokens:!0});self.postMessage({status:"complete",output:h}),Ko=!1}async function d3(){self.postMessage({status:"loading",data:"Loading model..."});const[t,e,r]=await vn.getInstance(n=>{self.postMessage(n)});self.postMessage({status:"loading",data:"Compiling shaders and warming up model..."}),await r.generate({input_features:im([1,80,3e3],0),max_new_tokens:1}),self.postMessage({status:"ready"})}self.addEventListener("message",async t=>{const{type:e,data:r}=t.data;switch(e){case"load":d3();break;case"generate":l3(r);break}})})(); +`)}}class o3 extends s3{constructor(e,r){super(e,{skip_prompt:!0,skip_special_tokens:!0}),this.cb=r}on_finalized_text(e){this.cb(e)}}const u3=96;class vn{static async getInstance(e=null){return this.model_id="onnx-community/whisper-base",this.tokenizer??(this.tokenizer=Im.from_pretrained(this.model_id,{progress_callback:e})),this.processor??(this.processor=qo.from_pretrained(this.model_id,{progress_callback:e})),this.model??(this.model=qm.from_pretrained(this.model_id,{dtype:{encoder_model:"fp32",decoder_model_merged:"q4"},device:"webgpu",progress_callback:e})),Promise.all([this.tokenizer,this.processor,this.model])}}N(vn,"model_id",null),N(vn,"tokenizer",null),N(vn,"processor",null),N(vn,"model",null);let Ko=!1;async function l3({audio:t,language:e}){if(Ko)return;Ko=!0,self.postMessage({status:"start"});const[r,n,a]=await vn.getInstance();let s,i=0;const o=m=>{s??(s=performance.now());let _;i++>0&&(_=i/(performance.now()-s)*1e3),self.postMessage({status:"update",output:m,tps:_,numTokens:i})},u=new o3(r,o),l=await n(t),c=await a.generate({...l,max_new_tokens:u3,language:e,streamer:u}),h=r.batch_decode(c,{skip_special_tokens:!0});self.postMessage({status:"complete",output:h}),Ko=!1}async function d3(){self.postMessage({status:"loading",data:"Loading model..."});const[t,e,r]=await vn.getInstance(n=>{self.postMessage(n)});self.postMessage({status:"loading",data:"Compiling shaders and warming up model..."}),await r.generate({input_features:im([1,80,3e3],0),max_new_tokens:1}),self.postMessage({status:"ready"})}self.addEventListener("message",async t=>{const{type:e,data:r}=t.data;switch(e){case"load":d3();break;case"generate":l3(r);break}})})();