nahi002 committed on
Commit
646830a
1 Parent(s): 6316e88

Create main.js

Browse files
Files changed (1) hide show
  1. main.js +129 -0
main.js ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
2
+ import { JSONLoader } from "langchain/document_loaders/fs/json";
3
+ import { TextLoader } from "langchain/document_loaders/fs/text";
4
+ import { CSVLoader } from "langchain/document_loaders/fs/csv";
5
+ import { PDFLoader } from "langchain/document_loaders/fs/pdf";
6
+
7
+ // 2. Import OpenAI language model and other related modules
8
+ import { OpenAI } from "langchain/llms/openai";
9
+ import { RetrievalQAChain } from "langchain/chains";
10
+ import { HNSWLib } from "langchain/vectorstores/hnswlib";
11
+ import { OpenAIEmbeddings } from "langchain/embeddings/openai";
12
+ import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
13
+
14
+ // 3. Import Tiktoken for token counting
15
+ import { Tiktoken } from "@dqbd/tiktoken/lite";
16
+ import { load } from "@dqbd/tiktoken/load";
17
+ import registry from "@dqbd/tiktoken/registry.json" assert { type: "json" };
18
+ import models from "@dqbd/tiktoken/model_to_encoding.json" assert { type: "json" };
19
+
20
+ // 4. Import dotenv for loading environment variables and fs for file system operations
21
+ import dotenv from "dotenv";
22
+ import fs from "fs";
23
+ dotenv.config();
24
+
25
// 5. Initialize the document loader with supported file formats.
// Each supported extension is mapped to a factory so DirectoryLoader
// can pick the right parser per file.
const SUPPORTED_LOADERS = {
  ".json": (path) => new JSONLoader(path),
  ".txt": (path) => new TextLoader(path),
  ".csv": (path) => new CSVLoader(path),
  ".pdf": (path) => new PDFLoader(path),
};
const loader = new DirectoryLoader("./documents", SUPPORTED_LOADERS);

// 6. Load documents from the specified directory (runs at import time).
console.log("Loading docs...");
const docs = await loader.load();
console.log("Docs loaded.");
37
+
38
// 7. Estimate the cost (in USD) of embedding the loaded documents.
//
// The model name and price were previously hard-coded; they are now
// defaulted parameters so callers can estimate for other models without
// changing behavior for existing call sites.
//
// @param {string} [modelName] - embedding model whose tokenizer is used.
// @param {number} [ratePerThousandTokens] - USD price per 1,000 tokens.
// @returns {Promise<number>} estimated embedding cost in USD.
async function calculateCost(
  modelName = "text-embedding-ada-002",
  ratePerThousandTokens = 0.0004
) {
  const modelKey = models[modelName];
  const model = await load(registry[modelKey]);
  const encoder = new Tiktoken(
    model.bpe_ranks,
    model.special_tokens,
    model.pat_str
  );
  try {
    // Token count of the serialized docs drives the cost estimate.
    const tokens = encoder.encode(JSON.stringify(docs));
    return (tokens.length / 1000) * ratePerThousandTokens;
  } finally {
    // Always release the WASM-backed encoder, even if encode() throws;
    // the original leaked it on any error before reaching free().
    encoder.free();
  }
}
55
+
56
// Question posed to the retrieval chain in run().
const question = "Tell me about these docs";
// Path where the serialized HNSW vector store is persisted between runs.
const VECTOR_STORE_PATH = "Documents.index";
58
+
59
// 8. Normalize each document's content to a plain string so the text
// splitter receives consistent input.
//
// @param {Array<{pageContent: unknown}>} docs - loaded documents.
// @returns {string[]} one string per document.
function normalizeDocuments(docs) {
  return docs.map((doc) => {
    if (typeof doc.pageContent === "string") {
      return doc.pageContent;
    }
    if (Array.isArray(doc.pageContent)) {
      return doc.pageContent.join("\n");
    }
    // The original fell through and returned undefined here, which would
    // break the downstream text splitter. Serialize anything else instead.
    return doc.pageContent == null ? "" : JSON.stringify(doc.pageContent);
  });
}
69
+
70
// 9. Main entry point: estimate the embedding cost, build or load the
// vector store, and query it with the configured question.
export const run = async () => {
  // 10. Estimate the tokenization cost before doing any paid work.
  console.log("Calculating cost...");
  const cost = await calculateCost();
  console.log("Cost calculated:", cost);

  // 11. Abort early when the estimate is above the $1 budget.
  // `!(cost <= 1)` mirrors the original if/else split exactly.
  if (!(cost <= 1)) {
    // 20. The cost exceeds the limit; skip the embedding process.
    console.log("The cost of embedding exceeds $1. Skipping embeddings.");
    return;
  }

  // 12. Initialize the OpenAI language model with default settings.
  const llm = new OpenAI({});

  let store;

  // 13. Reuse a previously saved vector store when one exists on disk.
  console.log("Checking for existing vector store...");
  if (fs.existsSync(VECTOR_STORE_PATH)) {
    // 14. Load the persisted index.
    console.log("Loading existing vector store...");
    store = await HNSWLib.load(VECTOR_STORE_PATH, new OpenAIEmbeddings());
    console.log("Vector store loaded.");
  } else {
    // 15. Build a fresh index: split, embed, and persist the documents.
    console.log("Creating new vector store...");
    const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000 });
    const chunks = await splitter.createDocuments(normalizeDocuments(docs));

    // 16. Embed the chunks into a new HNSW index.
    store = await HNSWLib.fromDocuments(chunks, new OpenAIEmbeddings());
    // 17. Save the index so future runs can skip re-embedding.
    await store.save(VECTOR_STORE_PATH);
    console.log("Vector store created.");
  }

  // 18. Wire the LLM and retriever into a RetrievalQA chain.
  console.log("Creating retrieval chain...");
  const chain = RetrievalQAChain.fromLLM(llm, store.asRetriever());

  // 19. Ask the configured question and print the answer.
  console.log("Querying chain...");
  const res = await chain.call({ query: question });
  console.log({ res });
};
127
+
128
// 21. Run the main function. The promise is handled explicitly so a
// failure surfaces as a logged error and a non-zero exit code instead
// of an unhandled rejection (the original left the promise floating).
run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});