// Document Q&A script: loads local documents, estimates the OpenAI embedding
// cost, builds or reuses an HNSWLib vector store, and answers a fixed question
// through a RetrievalQAChain.
import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
import { JSONLoader } from "langchain/document_loaders/fs/json";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { CSVLoader } from "langchain/document_loaders/fs/csv";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
// 2. Import OpenAI language model and other related modules
import { OpenAI } from "langchain/llms/openai";
import { RetrievalQAChain } from "langchain/chains";
import { HNSWLib } from "langchain/vectorstores/hnswlib";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
// 3. Import Tiktoken for token counting
import { Tiktoken } from "@dqbd/tiktoken/lite";
import { load } from "@dqbd/tiktoken/load";
import registry from "@dqbd/tiktoken/registry.json" assert { type: "json" };
import models from "@dqbd/tiktoken/model_to_encoding.json" assert { type: "json" };
// 4. Import dotenv for loading environment variables and fs for file system operations
import dotenv from "dotenv";
import fs from "fs";
dotenv.config();
// 5. Initialize the document loader with supported file formats
// 5. One parser factory per supported file extension in ./documents.
const FILE_LOADERS = {
  ".json": (path) => new JSONLoader(path),
  ".txt": (path) => new TextLoader(path),
  ".csv": (path) => new CSVLoader(path),
  ".pdf": (path) => new PDFLoader(path),
};
const loader = new DirectoryLoader("./documents", FILE_LOADERS);
// 6. Read every document up front (top-level await; module is ESM).
console.log("Loading docs...");
const docs = await loader.load();
console.log("Docs loaded.");
// 7. Define a function to calculate the cost of tokenizing the documents
// 7. Define a function to estimate the cost of tokenizing the documents.
/**
 * Estimates the USD cost of embedding `documents` by counting tokens with
 * the tokenizer that matches `modelName`.
 *
 * @param {Array} documents - Documents to tokenize (defaults to the loaded `docs`).
 * @param {string} modelName - Embedding model used to select the encoding.
 * @param {number} ratePerThousandTokens - Price in USD per 1,000 tokens.
 * @returns {Promise<number>} Estimated cost in USD.
 */
async function calculateCost(
  documents = docs,
  modelName = "text-embedding-ada-002",
  ratePerThousandTokens = 0.0004
) {
  const modelKey = models[modelName];
  const model = await load(registry[modelKey]);
  const encoder = new Tiktoken(
    model.bpe_ranks,
    model.special_tokens,
    model.pat_str
  );
  try {
    // Token count of the serialized docs approximates the embedding payload.
    const tokenCount = encoder.encode(JSON.stringify(documents)).length;
    return (tokenCount / 1000) * ratePerThousandTokens;
  } finally {
    // Always release the WASM-backed encoder, even if encode() throws,
    // to avoid leaking native memory.
    encoder.free();
  }
}
// Path where the HNSWLib index is persisted and reloaded between runs.
const VECTOR_STORE_PATH = "Documents.index";
// Fixed question posed to the retrieval chain in run().
const question = "Tell me about these docs";
// 8. Define a function to normalize the content of the documents
// 8. Define a function to normalize the content of the documents.
/**
 * Flattens each document's pageContent into a plain string so the text
 * splitter can consume it.
 *
 * @param {Array<{pageContent: unknown}>} docs - Loaded documents.
 * @returns {string[]} One string per document.
 */
function normalizeDocuments(docs) {
  return docs.map((doc) => {
    if (typeof doc.pageContent === "string") {
      return doc.pageContent;
    }
    if (Array.isArray(doc.pageContent)) {
      return doc.pageContent.join("\n");
    }
    // Fallback: serialize any other shape instead of yielding `undefined`,
    // which would crash the downstream text splitter.
    return JSON.stringify(doc.pageContent ?? "");
  });
}
// 9. Define the main function to run the entire process
// 9. Main entry point: cost check, vector store load/build, then QA query.
export const run = async () => {
  // 10. Estimate the embedding cost before doing any paid work.
  console.log("Calculating cost...");
  const cost = await calculateCost();
  console.log("Cost calculated:", cost);
  // 11. Guard clause: bail out early when embedding would exceed the budget.
  if (cost > 1) {
    console.log("The cost of embedding exceeds $1. Skipping embeddings.");
    return;
  }
  // 12. Language model that answers the final question.
  const model = new OpenAI({});
  // 13. Reuse a previously saved vector store when one exists on disk.
  console.log("Checking for existing vector store...");
  let vectorStore;
  if (fs.existsSync(VECTOR_STORE_PATH)) {
    // 14. Load the persisted index instead of re-embedding.
    console.log("Loading existing vector store...");
    vectorStore = await HNSWLib.load(VECTOR_STORE_PATH, new OpenAIEmbeddings());
    console.log("Vector store loaded.");
  } else {
    // 15. Split the normalized docs into ~1000-character chunks.
    console.log("Creating new vector store...");
    const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
    const chunks = await splitter.createDocuments(normalizeDocuments(docs));
    // 16. Embed the chunks and build the HNSW index.
    vectorStore = await HNSWLib.fromDocuments(chunks, new OpenAIEmbeddings());
    // 17. Persist the index so later runs can skip re-embedding.
    await vectorStore.save(VECTOR_STORE_PATH);
    console.log("Vector store created.");
  }
  // 18. Wire the LLM to the vector-store retriever.
  console.log("Creating retrieval chain...");
  const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever());
  // 19. Ask the fixed question and print the chain's answer.
  console.log("Querying chain...");
  const res = await chain.call({ query: question });
  console.log({ res });
};
// 21. Run the main function; report failures instead of leaving an
// unhandled promise rejection (the original `run();` was never awaited).
run().catch((err) => {
  console.error("Run failed:", err);
  process.exitCode = 1;
});