import { DirectoryLoader } from "langchain/document_loaders/fs/directory";
import { JSONLoader } from "langchain/document_loaders/fs/json";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { CSVLoader } from "langchain/document_loaders/fs/csv";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";

import { OpenAI } from "langchain/llms/openai";
import { RetrievalQAChain } from "langchain/chains";
import { HNSWLib } from "langchain/vectorstores/hnswlib";
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

import { Tiktoken } from "@dqbd/tiktoken/lite";
import { load } from "@dqbd/tiktoken/load";
import registry from "@dqbd/tiktoken/registry.json" assert { type: "json" };
import models from "@dqbd/tiktoken/model_to_encoding.json" assert { type: "json" };

import dotenv from "dotenv";
import fs from "fs";
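
// Load environment variables (e.g. OPENAI_API_KEY) from a local .env file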
dotenv.config();
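
// Map each supported file extension in ./documents to a loader that can parse it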
const loader = new DirectoryLoader("./documents", {
  ".json": (path) => new JSONLoader(path),
  ".txt": (path) => new TextLoader(path),
  ".csv": (path) => new CSVLoader(path),
  ".pdf": (path) => new PDFLoader(path),
});

console.log("Loading docs...");
const docs = await loader.load();
console.log("Docs loaded.");
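
// Estimate the embedding cost by tokenizing the docs locally with tiktoken.
// Encoding JSON.stringify(docs) also counts metadata and JSON punctuation,
// so treat the result as a rough estimate rather than an exact figure.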
async function calculateCost() {
  const modelName = "text-embedding-ada-002";
  const modelKey = models[modelName];
  const model = await load(registry[modelKey]);
  const encoder = new Tiktoken(
    model.bpe_ranks,
    model.special_tokens,
    model.pat_str
  );
  const tokens = encoder.encode(JSON.stringify(docs));
  const tokenCount = tokens.length;
  const ratePerThousandTokens = 0.0004; // ada-002 embedding price per 1K tokens at the time of writing
  const cost = (tokenCount / 1000) * ratePerThousandTokens;
  encoder.free(); // release the WASM encoder's memory
  return cost;
}
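
// Where the HNSW index is persisted on disk, and the question to ask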
const VECTOR_STORE_PATH = "Documents.index";
const question = "Tell me about these docs";
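
// Flatten each document's pageContent to a plain string so the text
// splitter can consume it; array contents are joined with newlines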
function normalizeDocuments(docs) {
  return docs.map((doc) => {
    if (typeof doc.pageContent === "string") {
      return doc.pageContent;
    } else if (Array.isArray(doc.pageContent)) {
      return doc.pageContent.join("\n");
    }
  });
}
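
// Main entry point: estimate cost, build or load the vector store, then
// answer the question with a retrieval QA chain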
export const run = async () => {
  console.log("Calculating cost...");
  const cost = await calculateCost();
  console.log("Cost calculated:", cost);
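
  // Only embed if the estimated cost is at most $1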
  if (cost <= 1) {
    const model = new OpenAI({}); // the LLM that will answer the question
    let vectorStore;

    console.log("Checking for existing vector store...");
    if (fs.existsSync(VECTOR_STORE_PATH)) {
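      // Reuse the saved index instead of re-embedding on every run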
      console.log("Loading existing vector store...");
      vectorStore = await HNSWLib.load(
        VECTOR_STORE_PATH,
        new OpenAIEmbeddings()
      );
      console.log("Vector store loaded.");
    } else {
      console.log("Creating new vector store...");
      const textSplitter = new RecursiveCharacterTextSplitter({
        chunkSize: 1000,
      });
      const normalizedDocs = normalizeDocuments(docs);
      const splitDocs = await textSplitter.createDocuments(normalizedDocs);
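
      // Embed each chunk with OpenAI and build the HNSW index in memory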
      vectorStore = await HNSWLib.fromDocuments(
        splitDocs,
        new OpenAIEmbeddings()
      );
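
      // Persist the index so subsequent runs can skip the embedding step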
      await vectorStore.save(VECTOR_STORE_PATH);

      console.log("Vector store created.");
    }
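
    // Wire the LLM to the vector store's retriever for question answering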
console.log("Creating retrieval chain..."); |
|
const chain = RetrievalQAChain.fromLLM(model, vectorStore.asRetriever()); |

    console.log("Querying chain...");
    const res = await chain.call({ query: question });
    console.log({ res });
  } else {
    console.log("The cost of embedding exceeds $1. Skipping embeddings.");
  }
};

run();