Create main.js
Browse files
main.js
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
|
2 |
+
import { JSONLoader } from "langchain/document_loaders/fs/json";
|
3 |
+
import { TextLoader } from "langchain/document_loaders/fs/text";
|
4 |
+
import { CSVLoader } from "langchain/document_loaders/fs/csv";
|
5 |
+
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
|
6 |
+
|
7 |
+
// 2. Import OpenAI language model and other related modules
|
8 |
+
import { OpenAI } from "langchain/llms/openai";
|
9 |
+
import { RetrievalQAChain } from "langchain/chains";
|
10 |
+
import { HNSWLib } from "langchain/vectorstores/hnswlib";
|
11 |
+
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
|
12 |
+
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
|
13 |
+
|
14 |
+
// 3. Import Tiktoken for token counting
|
15 |
+
import { Tiktoken } from "@dqbd/tiktoken/lite";
|
16 |
+
import { load } from "@dqbd/tiktoken/load";
|
17 |
+
import registry from "@dqbd/tiktoken/registry.json" assert { type: "json" };
|
18 |
+
import models from "@dqbd/tiktoken/model_to_encoding.json" assert { type: "json" };
|
19 |
+
|
20 |
+
// 4. Import dotenv for loading environment variables and fs for file system operations
|
21 |
+
import dotenv from "dotenv";
|
22 |
+
import fs from "fs";
|
23 |
+
// Load environment variables from .env before any module reads them.
dotenv.config();

// 5. Initialize the document loader with supported file formats.
// Each extension maps to a factory that builds the matching loader.
const fileLoaders = {
  ".json": (path) => new JSONLoader(path),
  ".txt": (path) => new TextLoader(path),
  ".csv": (path) => new CSVLoader(path),
  ".pdf": (path) => new PDFLoader(path),
};
const loader = new DirectoryLoader("./documents", fileLoaders);

// 6. Load documents from the specified directory (top-level await, ESM).
console.log("Loading docs...");
const docs = await loader.load();
console.log("Docs loaded.");
|
37 |
+
|
38 |
+
// 7. Estimate the embedding cost (USD) of tokenizing the loaded documents.
// Counts tokens over the JSON serialization of `docs` (this includes
// metadata, so it slightly overestimates the page content alone).
// @param {number} [ratePerThousandTokens=0.0004] - USD price per 1K tokens
//   (default is the text-embedding-ada-002 rate).
// @returns {Promise<number>} estimated cost in USD.
async function calculateCost(ratePerThousandTokens = 0.0004) {
  const modelName = "text-embedding-ada-002";
  const modelKey = models[modelName];
  const model = await load(registry[modelKey]);
  const encoder = new Tiktoken(
    model.bpe_ranks,
    model.special_tokens,
    model.pat_str
  );
  try {
    const tokenCount = encoder.encode(JSON.stringify(docs)).length;
    return (tokenCount / 1000) * ratePerThousandTokens;
  } finally {
    // Release the WASM-backed encoder even if encode() throws;
    // previously a throw would leak the encoder.
    encoder.free();
  }
}
|
55 |
+
|
56 |
+
// Question posed to the retrieval chain.
const question = "Tell me about these docs";
// Path on disk where the HNSW vector index is persisted between runs.
const VECTOR_STORE_PATH = "Documents.index";
|
58 |
+
|
59 |
+
// 8. Normalize each document's pageContent to a plain string.
// String content passes through, string arrays are joined with newlines,
// and anything else (missing/object content) is coerced to "" — the
// original returned `undefined` for that case, which breaks the
// downstream textSplitter.createDocuments call.
// @param {Array<{pageContent: string|string[]}>} docs
// @returns {string[]} one string per input document.
function normalizeDocuments(docs) {
  return docs.map((doc) => {
    const content = doc.pageContent;
    if (typeof content === "string") {
      return content;
    }
    if (Array.isArray(content)) {
      return content.join("\n");
    }
    return "";
  });
}
|
69 |
+
|
70 |
+
// 9. Main entry point: estimate embedding cost, build or load the vector
// store, then answer `question` with a retrieval-QA chain.
export const run = async () => {
  // 10. Estimate the tokenization cost up front.
  console.log("Calculating cost...");
  const cost = await calculateCost();
  console.log("Cost calculated:", cost);

  // 11. Guard clause: skip embedding when the cost is not within the
  // $1 budget (`!(cost <= 1)` keeps the original comparison semantics).
  if (!(cost <= 1)) {
    // 20. If the cost exceeds the limit, skip the embedding process.
    console.log("The cost of embedding exceeds $1. Skipping embeddings.");
    return;
  }

  // 12. Initialize the OpenAI language model (API key comes from env).
  const model = new OpenAI({});

  let vectorStore;

  // 13. Reuse a persisted vector store when one exists on disk.
  console.log("Checking for existing vector store...");
  if (fs.existsSync(VECTOR_STORE_PATH)) {
    // 14. Load the existing vector store.
    console.log("Loading existing vector store...");
    vectorStore = await HNSWLib.load(VECTOR_STORE_PATH, new OpenAIEmbeddings());
    console.log("Vector store loaded.");
  } else {
    // 15. Otherwise split the docs into ~1000-char chunks...
    console.log("Creating new vector store...");
    const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
    const splitDocs = await textSplitter.createDocuments(
      normalizeDocuments(docs)
    );

    // 16. ...embed them into a fresh HNSW index...
    vectorStore = await HNSWLib.fromDocuments(splitDocs, new OpenAIEmbeddings());
    // 17. ...and persist the index for future runs.
    await vectorStore.save(VECTOR_STORE_PATH);

    console.log("Vector store created.");
  }

  // 18. Wire the language model and vector store into a retrieval chain.
  console.log("Creating retrieval chain...");
  const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever());

  // 19. Query the retrieval chain with the specified question.
  console.log("Querying chain...");
  const res = await chain.call({ query: question });
  console.log({ res });
};
|
127 |
+
|
128 |
+
// 21. Run the main function. `run` is async, so a bare `run();` would
// leave a floating promise and any failure would surface as an unhandled
// rejection; log it and signal failure via the exit code instead.
run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|