File size: 1,642 Bytes
9705b6c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
const { TokenTextSplitter } = require('langchain/text_splitter');
/**
 * Splits a given text into token-based chunks using a TokenTextSplitter built
 * from the provided parameters.
 * Note: limit or memoize use of this function as its calculation is expensive.
 *
 * @param {Object} obj - Configuration object for the text splitting operation.
 * @param {string} obj.text - The text to be split.
 * @param {string} [obj.encodingName='cl100k_base'] - Encoding name passed to the splitter.
 *   Defaults to 'cl100k_base'.
 * @param {number} [obj.chunkSize=1] - The token size of each chunk. Defaults to 1.
 * @param {number} [obj.chunkOverlap=0] - The number of chunk elements to be overlapped
 *   between adjacent chunks. Defaults to 0.
 * @param {number} [obj.returnSize] - If specified and not 0, only the last
 *   `Math.abs(returnSize)` chunks are returned; the sign is ignored. When it exceeds the
 *   number of chunks, all chunks are returned.
 *
 * @returns {Promise<string[]>} Resolves to the array of text chunks.
 *   If no text is provided, an empty array is returned (no splitter is created).
 *   If returnSize is omitted or 0, every chunk is returned.
 *
 * @async
 * @function tokenSplit
 */
async function tokenSplit({
  text,
  encodingName = 'cl100k_base',
  chunkSize = 1,
  chunkOverlap = 0,
  returnSize,
}) {
  // Nothing to split: skip constructing the (expensive) splitter entirely.
  if (!text) {
    return [];
  }

  const splitter = new TokenTextSplitter({
    encodingName,
    chunkSize,
    chunkOverlap,
  });
  const chunks = await splitter.splitText(text);

  // Omitted or 0 means "no trimming".
  if (!returnSize) {
    return chunks;
  }

  // Keep only the trailing chunks. Math.abs lets callers pass either sign, matching the
  // documented "specified and not 0" contract; previously a `returnSize > 0` guard made
  // negative values silently return the full array.
  return chunks.slice(-Math.abs(returnSize));
}
module.exports = tokenSplit;
|