Spaces:
Sleeping
Sleeping
// Licensed to the Apache Software Foundation (ASF) under one | |
// or more contributor license agreements. See the NOTICE file | |
// distributed with this work for additional information | |
// regarding copyright ownership. The ASF licenses this file | |
// to you under the Apache License, Version 2.0 (the | |
// "License"); you may not use this file except in compliance | |
// with the License. You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, | |
// software distributed under the License is distributed on an | |
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
// KIND, either express or implied. See the License for the | |
// specific language governing permissions and limitations | |
// under the License. | |
import { Type } from './enum.js'; | |
import { clampRange } from './util/vector.js'; | |
import { DataType, strideForType } from './type.js'; | |
import { Data, makeData, DataProps } from './data.js'; | |
import { BigIntArray, TypedArray, TypedArrayDataType } from './interfaces.js'; | |
import { | |
isChunkedValid, | |
computeChunkOffsets, | |
computeChunkNullCounts, | |
sliceChunks, | |
wrapChunkedCall1, | |
wrapChunkedCall2, | |
wrapChunkedIndexOf, | |
} from './util/chunk.js'; | |
import { BigInt64Array, BigUint64Array } from './util/compat.js'; | |
import { instance as getVisitor } from './visitor/get.js'; | |
import { instance as setVisitor } from './visitor/set.js'; | |
import { instance as indexOfVisitor } from './visitor/indexof.js'; | |
import { instance as iteratorVisitor } from './visitor/iterator.js'; | |
import { instance as byteLengthVisitor } from './visitor/bytelength.js'; | |
// @ts-ignore | |
import type { vectorFromArray } from './factories.js'; | |
export interface Vector<T extends DataType = any> { | |
/// | |
// Virtual properties for the TypeScript compiler. | |
// These do not exist at runtime. | |
/// | |
readonly TType: T['TType']; | |
readonly TArray: T['TArray']; | |
readonly TValue: T['TValue']; | |
/** | |
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Symbol/isConcatSpreadable | |
*/ | |
[Symbol.isConcatSpreadable]: true; | |
} | |
const visitorsByTypeId = {} as { [typeId: number]: { get: any; set: any; indexOf: any; byteLength: any } }; | |
const vectorPrototypesByTypeId = {} as { [typeId: number]: any }; | |
/** | |
* Array-like data structure. Use the convenience method {@link makeVector} and {@link vectorFromArray} to create vectors. | |
*/ | |
export class Vector<T extends DataType = any> { | |
constructor(input: readonly (Data<T> | Vector<T>)[]) { | |
const data: Data<T>[] = input[0] instanceof Vector | |
? (input as Vector<T>[]).flatMap(x => x.data) | |
: input as Data<T>[]; | |
if (data.length === 0 || data.some((x) => !(x instanceof Data))) { | |
throw new TypeError('Vector constructor expects an Array of Data instances.'); | |
} | |
const type = data[0]?.type; | |
switch (data.length) { | |
case 0: this._offsets = [0]; break; | |
case 1: { | |
// special case for unchunked vectors | |
const { get, set, indexOf, byteLength } = visitorsByTypeId[type.typeId]; | |
const unchunkedData = data[0]; | |
this.isValid = (index: number) => isChunkedValid(unchunkedData, index); | |
this.get = (index: number) => get(unchunkedData, index); | |
this.set = (index: number, value: T) => set(unchunkedData, index, value); | |
this.indexOf = (index: number) => indexOf(unchunkedData, index); | |
this.getByteLength = (index: number) => byteLength(unchunkedData, index); | |
this._offsets = [0, unchunkedData.length]; | |
break; | |
} | |
default: | |
Object.setPrototypeOf(this, vectorPrototypesByTypeId[type.typeId]); | |
this._offsets = computeChunkOffsets(data); | |
break; | |
} | |
this.data = data; | |
this.type = type; | |
this.stride = strideForType(type); | |
this.numChildren = type.children?.length ?? 0; | |
this.length = this._offsets[this._offsets.length - 1]; | |
} | |
declare protected _offsets: number[] | Uint32Array; | |
declare protected _nullCount: number; | |
declare protected _byteLength: number; | |
/** | |
* The {@link DataType `DataType`} of this Vector. | |
*/ | |
public declare readonly type: T; | |
/** | |
* The primitive {@link Data `Data`} instances for this Vector's elements. | |
*/ | |
public declare readonly data: ReadonlyArray<Data<T>>; | |
/** | |
* The number of elements in this Vector. | |
*/ | |
public declare readonly length: number; | |
/** | |
* The number of primitive values per Vector element. | |
*/ | |
public declare readonly stride: number; | |
/** | |
* The number of child Vectors if this Vector is a nested dtype. | |
*/ | |
public declare readonly numChildren: number; | |
/** | |
* The aggregate size (in bytes) of this Vector's buffers and/or child Vectors. | |
*/ | |
public get byteLength() { | |
if (this._byteLength === -1) { | |
this._byteLength = this.data.reduce((byteLength, data) => byteLength + data.byteLength, 0); | |
} | |
return this._byteLength; | |
} | |
/** | |
* The number of null elements in this Vector. | |
*/ | |
public get nullCount() { | |
if (this._nullCount === -1) { | |
this._nullCount = computeChunkNullCounts(this.data); | |
} | |
return this._nullCount; | |
} | |
/** | |
* The Array or TypedAray constructor used for the JS representation | |
* of the element's values in {@link Vector.prototype.toArray `toArray()`}. | |
*/ | |
public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; } | |
/** | |
* The name that should be printed when the Vector is logged in a message. | |
*/ | |
public get [Symbol.toStringTag]() { | |
return `${this.VectorName}<${this.type[Symbol.toStringTag]}>`; | |
} | |
/** | |
* The name of this Vector. | |
*/ | |
public get VectorName() { return `${Type[this.type.typeId]}Vector`; } | |
/** | |
* Check whether an element is null. | |
* @param index The index at which to read the validity bitmap. | |
*/ | |
// @ts-ignore | |
public isValid(index: number): boolean { return false; } | |
/** | |
* Get an element value by position. | |
* @param index The index of the element to read. | |
*/ | |
// @ts-ignore | |
public get(index: number): T['TValue'] | null { return null; } | |
/** | |
* Set an element value by position. | |
* @param index The index of the element to write. | |
* @param value The value to set. | |
*/ | |
// @ts-ignore | |
public set(index: number, value: T['TValue'] | null): void { return; } | |
/** | |
* Retrieve the index of the first occurrence of a value in an Vector. | |
* @param element The value to locate in the Vector. | |
* @param offset The index at which to begin the search. If offset is omitted, the search starts at index 0. | |
*/ | |
// @ts-ignore | |
public indexOf(element: T['TValue'], offset?: number): number { return -1; } | |
public includes(element: T['TValue'], offset?: number): boolean { return this.indexOf(element, offset) > 0; } | |
/** | |
* Get the size in bytes of an element by index. | |
* @param index The index at which to get the byteLength. | |
*/ | |
// @ts-ignore | |
public getByteLength(index: number): number { return 0; } | |
/** | |
* Iterator for the Vector's elements. | |
*/ | |
public [Symbol.iterator](): IterableIterator<T['TValue'] | null> { | |
return iteratorVisitor.visit(this); | |
} | |
/** | |
* Combines two or more Vectors of the same type. | |
* @param others Additional Vectors to add to the end of this Vector. | |
*/ | |
public concat(...others: Vector<T>[]): Vector<T> { | |
return new Vector(this.data.concat(others.flatMap((x) => x.data).flat(Number.POSITIVE_INFINITY))); | |
} | |
/** | |
* Return a zero-copy sub-section of this Vector. | |
* @param start The beginning of the specified portion of the Vector. | |
* @param end The end of the specified portion of the Vector. This is exclusive of the element at the index 'end'. | |
*/ | |
public slice(begin?: number, end?: number): Vector<T> { | |
return new Vector(clampRange(this, begin, end, ({ data, _offsets }, begin, end) => | |
sliceChunks(data, _offsets, begin, end) | |
)); | |
} | |
public toJSON() { return [...this]; } | |
/** | |
* Return a JavaScript Array or TypedArray of the Vector's elements. | |
* | |
* @note If this Vector contains a single Data chunk and the Vector's type is a | |
* primitive numeric type corresponding to one of the JavaScript TypedArrays, this | |
* method returns a zero-copy slice of the underlying TypedArray values. If there's | |
* more than one chunk, the resulting TypedArray will be a copy of the data from each | |
* chunk's underlying TypedArray values. | |
* | |
* @returns An Array or TypedArray of the Vector's elements, based on the Vector's DataType. | |
*/ | |
public toArray(): T['TArray'] { | |
const { type, data, length, stride, ArrayType } = this; | |
// Fast case, return subarray if possible | |
switch (type.typeId) { | |
case Type.Int: | |
case Type.Float: | |
case Type.Decimal: | |
case Type.Time: | |
case Type.Timestamp: | |
switch (data.length) { | |
case 0: return new ArrayType(); | |
case 1: return data[0].values.subarray(0, length * stride); | |
default: return data.reduce((memo, { values, length: chunk_length }) => { | |
memo.array.set(values.subarray(0, chunk_length * stride), memo.offset); | |
memo.offset += chunk_length * stride; | |
return memo; | |
}, { array: new ArrayType(length * stride), offset: 0 }).array; | |
} | |
} | |
// Otherwise if not primitive, slow copy | |
return [...this] as T['TArray']; | |
} | |
/** | |
* Returns a string representation of the Vector. | |
* | |
* @returns A string representation of the Vector. | |
*/ | |
public toString() { | |
return `[${[...this].join(',')}]`; | |
} | |
/** | |
* Returns a child Vector by name, or null if this Vector has no child with the given name. | |
* @param name The name of the child to retrieve. | |
*/ | |
public getChild<R extends keyof T['TChildren']>(name: R) { | |
return this.getChildAt(this.type.children?.findIndex((f) => f.name === name)); | |
} | |
/** | |
* Returns a child Vector by index, or null if this Vector has no child at the supplied index. | |
* @param index The index of the child to retrieve. | |
*/ | |
public getChildAt<R extends DataType = any>(index: number): Vector<R> | null { | |
if (index > -1 && index < this.numChildren) { | |
return new Vector(this.data.map(({ children }) => children[index] as Data<R>)); | |
} | |
return null; | |
} | |
public get isMemoized(): boolean { | |
if (DataType.isDictionary(this.type)) { | |
return this.data[0].dictionary!.isMemoized; | |
} | |
return false; | |
} | |
/** | |
* Adds memoization to the Vector's {@link get} method. For dictionary | |
* vectors, this method return a vector that memoizes only the dictionary | |
* values. | |
* | |
* Memoization is very useful when decoding a value is expensive such as | |
* Uft8. The memoization creates a cache of the size of the Vector and | |
* therfore increases memory usage. | |
* | |
* @returns A new vector that memoizes calls to {@link get}. | |
*/ | |
public memoize(): MemoizedVector<T> { | |
if (DataType.isDictionary(this.type)) { | |
const dictionary = new MemoizedVector(this.data[0].dictionary!); | |
const newData = this.data.map((data) => { | |
const cloned = data.clone(); | |
cloned.dictionary = dictionary; | |
return cloned; | |
}); | |
return new Vector(newData); | |
} | |
return new MemoizedVector(this); | |
} | |
/** | |
* Returns a vector without memoization of the {@link get} method. If this | |
* vector is not memoized, this method returns this vector. | |
* | |
* @returns A a vector without memoization. | |
*/ | |
public unmemoize(): Vector<T> { | |
if (DataType.isDictionary(this.type) && this.isMemoized) { | |
const dictionary = this.data[0].dictionary!.unmemoize(); | |
const newData = this.data.map((data) => { | |
const newData = data.clone(); | |
newData.dictionary = dictionary; | |
return newData; | |
}); | |
return new Vector(newData); | |
} | |
return this; | |
} | |
// Initialize this static property via an IIFE so bundlers don't tree-shake | |
// out this logic, but also so we're still compliant with `"sideEffects": false` | |
protected static [Symbol.toStringTag] = ((proto: Vector) => { | |
(proto as any).type = DataType.prototype; | |
(proto as any).data = []; | |
(proto as any).length = 0; | |
(proto as any).stride = 1; | |
(proto as any).numChildren = 0; | |
(proto as any)._nullCount = -1; | |
(proto as any)._byteLength = -1; | |
(proto as any)._offsets = new Uint32Array([0]); | |
(proto as any)[Symbol.isConcatSpreadable] = true; | |
const typeIds: Type[] = Object.keys(Type) | |
.map((T: any) => Type[T] as any) | |
.filter((T: any) => typeof T === 'number' && T !== Type.NONE); | |
for (const typeId of typeIds) { | |
const get = getVisitor.getVisitFnByTypeId(typeId); | |
const set = setVisitor.getVisitFnByTypeId(typeId); | |
const indexOf = indexOfVisitor.getVisitFnByTypeId(typeId); | |
const byteLength = byteLengthVisitor.getVisitFnByTypeId(typeId); | |
visitorsByTypeId[typeId] = { get, set, indexOf, byteLength }; | |
vectorPrototypesByTypeId[typeId] = Object.create(proto, { | |
['isValid']: { value: wrapChunkedCall1(isChunkedValid) }, | |
['get']: { value: wrapChunkedCall1(getVisitor.getVisitFnByTypeId(typeId)) }, | |
['set']: { value: wrapChunkedCall2(setVisitor.getVisitFnByTypeId(typeId)) }, | |
['indexOf']: { value: wrapChunkedIndexOf(indexOfVisitor.getVisitFnByTypeId(typeId)) }, | |
['getByteLength']: { value: wrapChunkedCall1(byteLengthVisitor.getVisitFnByTypeId(typeId)) }, | |
}); | |
} | |
return 'Vector'; | |
})(Vector.prototype); | |
} | |
class MemoizedVector<T extends DataType = any> extends Vector<T> { | |
public constructor(vector: Vector<T>) { | |
super(vector.data); | |
const get = this.get; | |
const set = this.set; | |
const slice = this.slice; | |
const cache = new Array<T['TValue'] | null>(this.length); | |
Object.defineProperty(this, 'get', { | |
value(index: number) { | |
const cachedValue = cache[index]; | |
if (cachedValue !== undefined) { | |
return cachedValue; | |
} | |
const value = get.call(this, index); | |
cache[index] = value; | |
return value; | |
} | |
}); | |
Object.defineProperty(this, 'set', { | |
value(index: number, value: T['TValue'] | null) { | |
set.call(this, index, value); | |
cache[index] = value; | |
} | |
}); | |
Object.defineProperty(this, 'slice', { | |
value: (begin?: number, end?: number) => new MemoizedVector(slice.call(this, begin, end)) | |
}); | |
Object.defineProperty(this, 'isMemoized', { value: true }); | |
Object.defineProperty(this, 'unmemoize', { | |
value: () => new Vector(this.data) | |
}); | |
Object.defineProperty(this, 'memoize', { | |
value: () => this | |
}); | |
} | |
} | |
import * as dtypes from './type.js'; | |
/** | |
* Creates a Vector without data copies. | |
* | |
* @example | |
* ```ts | |
* const vector = makeVector(new Int32Array([1, 2, 3])); | |
* ``` | |
*/ | |
export function makeVector<T extends TypedArray | BigIntArray>(data: T | readonly T[]): Vector<TypedArrayDataType<T>>; | |
export function makeVector<T extends DataView>(data: T | readonly T[]): Vector<dtypes.Int8>; | |
export function makeVector<T extends DataType>(data: Data<T> | readonly Data<T>[]): Vector<T>; | |
export function makeVector<T extends DataType>(data: Vector<T> | readonly Vector<T>[]): Vector<T>; | |
export function makeVector<T extends DataType>(data: DataProps<T> | readonly DataProps<T>[]): Vector<T>; | |
export function makeVector(init: any) { | |
if (init) { | |
if (init instanceof Data) { return new Vector([init]); } | |
if (init instanceof Vector) { return new Vector(init.data); } | |
if (init.type instanceof DataType) { return new Vector([makeData(init)]); } | |
if (Array.isArray(init)) { | |
return new Vector(init.flatMap(v => unwrapInputs(v))); | |
} | |
if (ArrayBuffer.isView(init)) { | |
if (init instanceof DataView) { | |
init = new Uint8Array(init.buffer); | |
} | |
const props = { offset: 0, length: init.length, nullCount: 0, data: init }; | |
if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); } | |
if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); } | |
if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); } | |
if (init instanceof BigInt64Array) { return new Vector([makeData({ ...props, type: new dtypes.Int64 })]); } | |
if (init instanceof Uint8Array || init instanceof Uint8ClampedArray) { return new Vector([makeData({ ...props, type: new dtypes.Uint8 })]); } | |
if (init instanceof Uint16Array) { return new Vector([makeData({ ...props, type: new dtypes.Uint16 })]); } | |
if (init instanceof Uint32Array) { return new Vector([makeData({ ...props, type: new dtypes.Uint32 })]); } | |
if (init instanceof BigUint64Array) { return new Vector([makeData({ ...props, type: new dtypes.Uint64 })]); } | |
if (init instanceof Float32Array) { return new Vector([makeData({ ...props, type: new dtypes.Float32 })]); } | |
if (init instanceof Float64Array) { return new Vector([makeData({ ...props, type: new dtypes.Float64 })]); } | |
throw new Error('Unrecognized input'); | |
} | |
} | |
throw new Error('Unrecognized input'); | |
} | |
function unwrapInputs(x: any) { | |
return x instanceof Data ? [x] : (x instanceof Vector ? x.data : makeVector(x).data); | |
} | |