supercat666's picture
add igv
78c921d
raw
history blame
18.7 kB
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { Type } from './enum.js';
import { clampRange } from './util/vector.js';
import { DataType, strideForType } from './type.js';
import { Data, makeData, DataProps } from './data.js';
import { BigIntArray, TypedArray, TypedArrayDataType } from './interfaces.js';
import {
isChunkedValid,
computeChunkOffsets,
computeChunkNullCounts,
sliceChunks,
wrapChunkedCall1,
wrapChunkedCall2,
wrapChunkedIndexOf,
} from './util/chunk.js';
import { BigInt64Array, BigUint64Array } from './util/compat.js';
import { instance as getVisitor } from './visitor/get.js';
import { instance as setVisitor } from './visitor/set.js';
import { instance as indexOfVisitor } from './visitor/indexof.js';
import { instance as iteratorVisitor } from './visitor/iterator.js';
import { instance as byteLengthVisitor } from './visitor/bytelength.js';
// @ts-ignore
import type { vectorFromArray } from './factories.js';
export interface Vector<T extends DataType = any> {
///
// Virtual properties for the TypeScript compiler.
// These do not exist at runtime.
///
readonly TType: T['TType'];
readonly TArray: T['TArray'];
readonly TValue: T['TValue'];
/**
* @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Symbol/isConcatSpreadable
*/
[Symbol.isConcatSpreadable]: true;
}
const visitorsByTypeId = {} as { [typeId: number]: { get: any; set: any; indexOf: any; byteLength: any } };
const vectorPrototypesByTypeId = {} as { [typeId: number]: any };
/**
* Array-like data structure. Use the convenience method {@link makeVector} and {@link vectorFromArray} to create vectors.
*/
export class Vector<T extends DataType = any> {
constructor(input: readonly (Data<T> | Vector<T>)[]) {
const data: Data<T>[] = input[0] instanceof Vector
? (input as Vector<T>[]).flatMap(x => x.data)
: input as Data<T>[];
if (data.length === 0 || data.some((x) => !(x instanceof Data))) {
throw new TypeError('Vector constructor expects an Array of Data instances.');
}
const type = data[0]?.type;
switch (data.length) {
case 0: this._offsets = [0]; break;
case 1: {
// special case for unchunked vectors
const { get, set, indexOf, byteLength } = visitorsByTypeId[type.typeId];
const unchunkedData = data[0];
this.isValid = (index: number) => isChunkedValid(unchunkedData, index);
this.get = (index: number) => get(unchunkedData, index);
this.set = (index: number, value: T) => set(unchunkedData, index, value);
this.indexOf = (index: number) => indexOf(unchunkedData, index);
this.getByteLength = (index: number) => byteLength(unchunkedData, index);
this._offsets = [0, unchunkedData.length];
break;
}
default:
Object.setPrototypeOf(this, vectorPrototypesByTypeId[type.typeId]);
this._offsets = computeChunkOffsets(data);
break;
}
this.data = data;
this.type = type;
this.stride = strideForType(type);
this.numChildren = type.children?.length ?? 0;
this.length = this._offsets[this._offsets.length - 1];
}
declare protected _offsets: number[] | Uint32Array;
declare protected _nullCount: number;
declare protected _byteLength: number;
/**
* The {@link DataType `DataType`} of this Vector.
*/
public declare readonly type: T;
/**
* The primitive {@link Data `Data`} instances for this Vector's elements.
*/
public declare readonly data: ReadonlyArray<Data<T>>;
/**
* The number of elements in this Vector.
*/
public declare readonly length: number;
/**
* The number of primitive values per Vector element.
*/
public declare readonly stride: number;
/**
* The number of child Vectors if this Vector is a nested dtype.
*/
public declare readonly numChildren: number;
/**
* The aggregate size (in bytes) of this Vector's buffers and/or child Vectors.
*/
public get byteLength() {
if (this._byteLength === -1) {
this._byteLength = this.data.reduce((byteLength, data) => byteLength + data.byteLength, 0);
}
return this._byteLength;
}
/**
* The number of null elements in this Vector.
*/
public get nullCount() {
if (this._nullCount === -1) {
this._nullCount = computeChunkNullCounts(this.data);
}
return this._nullCount;
}
/**
* The Array or TypedAray constructor used for the JS representation
* of the element's values in {@link Vector.prototype.toArray `toArray()`}.
*/
public get ArrayType(): T['ArrayType'] { return this.type.ArrayType; }
/**
* The name that should be printed when the Vector is logged in a message.
*/
public get [Symbol.toStringTag]() {
return `${this.VectorName}<${this.type[Symbol.toStringTag]}>`;
}
/**
* The name of this Vector.
*/
public get VectorName() { return `${Type[this.type.typeId]}Vector`; }
/**
* Check whether an element is null.
* @param index The index at which to read the validity bitmap.
*/
// @ts-ignore
public isValid(index: number): boolean { return false; }
/**
* Get an element value by position.
* @param index The index of the element to read.
*/
// @ts-ignore
public get(index: number): T['TValue'] | null { return null; }
/**
* Set an element value by position.
* @param index The index of the element to write.
* @param value The value to set.
*/
// @ts-ignore
public set(index: number, value: T['TValue'] | null): void { return; }
/**
* Retrieve the index of the first occurrence of a value in an Vector.
* @param element The value to locate in the Vector.
* @param offset The index at which to begin the search. If offset is omitted, the search starts at index 0.
*/
// @ts-ignore
public indexOf(element: T['TValue'], offset?: number): number { return -1; }
public includes(element: T['TValue'], offset?: number): boolean { return this.indexOf(element, offset) > 0; }
/**
* Get the size in bytes of an element by index.
* @param index The index at which to get the byteLength.
*/
// @ts-ignore
public getByteLength(index: number): number { return 0; }
/**
* Iterator for the Vector's elements.
*/
public [Symbol.iterator](): IterableIterator<T['TValue'] | null> {
return iteratorVisitor.visit(this);
}
/**
* Combines two or more Vectors of the same type.
* @param others Additional Vectors to add to the end of this Vector.
*/
public concat(...others: Vector<T>[]): Vector<T> {
return new Vector(this.data.concat(others.flatMap((x) => x.data).flat(Number.POSITIVE_INFINITY)));
}
/**
* Return a zero-copy sub-section of this Vector.
* @param start The beginning of the specified portion of the Vector.
* @param end The end of the specified portion of the Vector. This is exclusive of the element at the index 'end'.
*/
public slice(begin?: number, end?: number): Vector<T> {
return new Vector(clampRange(this, begin, end, ({ data, _offsets }, begin, end) =>
sliceChunks(data, _offsets, begin, end)
));
}
public toJSON() { return [...this]; }
/**
* Return a JavaScript Array or TypedArray of the Vector's elements.
*
* @note If this Vector contains a single Data chunk and the Vector's type is a
* primitive numeric type corresponding to one of the JavaScript TypedArrays, this
* method returns a zero-copy slice of the underlying TypedArray values. If there's
* more than one chunk, the resulting TypedArray will be a copy of the data from each
* chunk's underlying TypedArray values.
*
* @returns An Array or TypedArray of the Vector's elements, based on the Vector's DataType.
*/
public toArray(): T['TArray'] {
const { type, data, length, stride, ArrayType } = this;
// Fast case, return subarray if possible
switch (type.typeId) {
case Type.Int:
case Type.Float:
case Type.Decimal:
case Type.Time:
case Type.Timestamp:
switch (data.length) {
case 0: return new ArrayType();
case 1: return data[0].values.subarray(0, length * stride);
default: return data.reduce((memo, { values, length: chunk_length }) => {
memo.array.set(values.subarray(0, chunk_length * stride), memo.offset);
memo.offset += chunk_length * stride;
return memo;
}, { array: new ArrayType(length * stride), offset: 0 }).array;
}
}
// Otherwise if not primitive, slow copy
return [...this] as T['TArray'];
}
/**
* Returns a string representation of the Vector.
*
* @returns A string representation of the Vector.
*/
public toString() {
return `[${[...this].join(',')}]`;
}
/**
* Returns a child Vector by name, or null if this Vector has no child with the given name.
* @param name The name of the child to retrieve.
*/
public getChild<R extends keyof T['TChildren']>(name: R) {
return this.getChildAt(this.type.children?.findIndex((f) => f.name === name));
}
/**
* Returns a child Vector by index, or null if this Vector has no child at the supplied index.
* @param index The index of the child to retrieve.
*/
public getChildAt<R extends DataType = any>(index: number): Vector<R> | null {
if (index > -1 && index < this.numChildren) {
return new Vector(this.data.map(({ children }) => children[index] as Data<R>));
}
return null;
}
public get isMemoized(): boolean {
if (DataType.isDictionary(this.type)) {
return this.data[0].dictionary!.isMemoized;
}
return false;
}
/**
* Adds memoization to the Vector's {@link get} method. For dictionary
* vectors, this method return a vector that memoizes only the dictionary
* values.
*
* Memoization is very useful when decoding a value is expensive such as
* Uft8. The memoization creates a cache of the size of the Vector and
* therfore increases memory usage.
*
* @returns A new vector that memoizes calls to {@link get}.
*/
public memoize(): MemoizedVector<T> {
if (DataType.isDictionary(this.type)) {
const dictionary = new MemoizedVector(this.data[0].dictionary!);
const newData = this.data.map((data) => {
const cloned = data.clone();
cloned.dictionary = dictionary;
return cloned;
});
return new Vector(newData);
}
return new MemoizedVector(this);
}
/**
* Returns a vector without memoization of the {@link get} method. If this
* vector is not memoized, this method returns this vector.
*
* @returns A a vector without memoization.
*/
public unmemoize(): Vector<T> {
if (DataType.isDictionary(this.type) && this.isMemoized) {
const dictionary = this.data[0].dictionary!.unmemoize();
const newData = this.data.map((data) => {
const newData = data.clone();
newData.dictionary = dictionary;
return newData;
});
return new Vector(newData);
}
return this;
}
// Initialize this static property via an IIFE so bundlers don't tree-shake
// out this logic, but also so we're still compliant with `"sideEffects": false`
protected static [Symbol.toStringTag] = ((proto: Vector) => {
(proto as any).type = DataType.prototype;
(proto as any).data = [];
(proto as any).length = 0;
(proto as any).stride = 1;
(proto as any).numChildren = 0;
(proto as any)._nullCount = -1;
(proto as any)._byteLength = -1;
(proto as any)._offsets = new Uint32Array([0]);
(proto as any)[Symbol.isConcatSpreadable] = true;
const typeIds: Type[] = Object.keys(Type)
.map((T: any) => Type[T] as any)
.filter((T: any) => typeof T === 'number' && T !== Type.NONE);
for (const typeId of typeIds) {
const get = getVisitor.getVisitFnByTypeId(typeId);
const set = setVisitor.getVisitFnByTypeId(typeId);
const indexOf = indexOfVisitor.getVisitFnByTypeId(typeId);
const byteLength = byteLengthVisitor.getVisitFnByTypeId(typeId);
visitorsByTypeId[typeId] = { get, set, indexOf, byteLength };
vectorPrototypesByTypeId[typeId] = Object.create(proto, {
['isValid']: { value: wrapChunkedCall1(isChunkedValid) },
['get']: { value: wrapChunkedCall1(getVisitor.getVisitFnByTypeId(typeId)) },
['set']: { value: wrapChunkedCall2(setVisitor.getVisitFnByTypeId(typeId)) },
['indexOf']: { value: wrapChunkedIndexOf(indexOfVisitor.getVisitFnByTypeId(typeId)) },
['getByteLength']: { value: wrapChunkedCall1(byteLengthVisitor.getVisitFnByTypeId(typeId)) },
});
}
return 'Vector';
})(Vector.prototype);
}
class MemoizedVector<T extends DataType = any> extends Vector<T> {
public constructor(vector: Vector<T>) {
super(vector.data);
const get = this.get;
const set = this.set;
const slice = this.slice;
const cache = new Array<T['TValue'] | null>(this.length);
Object.defineProperty(this, 'get', {
value(index: number) {
const cachedValue = cache[index];
if (cachedValue !== undefined) {
return cachedValue;
}
const value = get.call(this, index);
cache[index] = value;
return value;
}
});
Object.defineProperty(this, 'set', {
value(index: number, value: T['TValue'] | null) {
set.call(this, index, value);
cache[index] = value;
}
});
Object.defineProperty(this, 'slice', {
value: (begin?: number, end?: number) => new MemoizedVector(slice.call(this, begin, end))
});
Object.defineProperty(this, 'isMemoized', { value: true });
Object.defineProperty(this, 'unmemoize', {
value: () => new Vector(this.data)
});
Object.defineProperty(this, 'memoize', {
value: () => this
});
}
}
import * as dtypes from './type.js';
/**
* Creates a Vector without data copies.
*
* @example
* ```ts
* const vector = makeVector(new Int32Array([1, 2, 3]));
* ```
*/
export function makeVector<T extends TypedArray | BigIntArray>(data: T | readonly T[]): Vector<TypedArrayDataType<T>>;
export function makeVector<T extends DataView>(data: T | readonly T[]): Vector<dtypes.Int8>;
export function makeVector<T extends DataType>(data: Data<T> | readonly Data<T>[]): Vector<T>;
export function makeVector<T extends DataType>(data: Vector<T> | readonly Vector<T>[]): Vector<T>;
export function makeVector<T extends DataType>(data: DataProps<T> | readonly DataProps<T>[]): Vector<T>;
export function makeVector(init: any) {
if (init) {
if (init instanceof Data) { return new Vector([init]); }
if (init instanceof Vector) { return new Vector(init.data); }
if (init.type instanceof DataType) { return new Vector([makeData(init)]); }
if (Array.isArray(init)) {
return new Vector(init.flatMap(v => unwrapInputs(v)));
}
if (ArrayBuffer.isView(init)) {
if (init instanceof DataView) {
init = new Uint8Array(init.buffer);
}
const props = { offset: 0, length: init.length, nullCount: 0, data: init };
if (init instanceof Int8Array) { return new Vector([makeData({ ...props, type: new dtypes.Int8 })]); }
if (init instanceof Int16Array) { return new Vector([makeData({ ...props, type: new dtypes.Int16 })]); }
if (init instanceof Int32Array) { return new Vector([makeData({ ...props, type: new dtypes.Int32 })]); }
if (init instanceof BigInt64Array) { return new Vector([makeData({ ...props, type: new dtypes.Int64 })]); }
if (init instanceof Uint8Array || init instanceof Uint8ClampedArray) { return new Vector([makeData({ ...props, type: new dtypes.Uint8 })]); }
if (init instanceof Uint16Array) { return new Vector([makeData({ ...props, type: new dtypes.Uint16 })]); }
if (init instanceof Uint32Array) { return new Vector([makeData({ ...props, type: new dtypes.Uint32 })]); }
if (init instanceof BigUint64Array) { return new Vector([makeData({ ...props, type: new dtypes.Uint64 })]); }
if (init instanceof Float32Array) { return new Vector([makeData({ ...props, type: new dtypes.Float32 })]); }
if (init instanceof Float64Array) { return new Vector([makeData({ ...props, type: new dtypes.Float64 })]); }
throw new Error('Unrecognized input');
}
}
throw new Error('Unrecognized input');
}
function unwrapInputs(x: any) {
return x instanceof Data ? [x] : (x instanceof Vector ? x.data : makeVector(x).data);
}