matt HOFFNER committed on
Commit
73a1dae
•
1 Parent(s): d072c37

add docHandle

Browse files
package-lock.json CHANGED
@@ -17,6 +17,7 @@
17
  "@xenova/transformers": "^2.1.1",
18
  "eslint": "8.40.0",
19
  "eslint-config-next": "13.4.2",
 
20
  "langchain": "^0.0.90",
21
  "next": "13.4.2",
22
  "react": "18.2.0",
@@ -25,6 +26,7 @@
25
  "uuid": "^9.0.0"
26
  },
27
  "devDependencies": {
 
28
  "@types/uuid": "^9.0.1"
29
  }
30
  },
@@ -1354,11 +1356,30 @@
1354
  "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.1.tgz",
1355
  "integrity": "sha512-LG4opVs2ANWZ1TJoKc937iMmNstM/d0ae1vNbnBvBhqCSezgVUOzcLCqbI5elV8Vy6WKwKjaqR+zO9VKirBBCA=="
1356
  },
 
 
 
 
 
 
 
 
 
 
1357
  "node_modules/@types/json5": {
1358
  "version": "0.0.29",
1359
  "resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz",
1360
  "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ=="
1361
  },
 
 
 
 
 
 
 
 
 
1362
  "node_modules/@types/long": {
1363
  "version": "4.0.2",
1364
  "resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
@@ -3144,16 +3165,16 @@
3144
  "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow=="
3145
  },
3146
  "node_modules/fs-extra": {
3147
- "version": "10.1.0",
3148
- "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz",
3149
- "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==",
3150
  "dependencies": {
3151
  "graceful-fs": "^4.2.0",
3152
  "jsonfile": "^6.0.1",
3153
  "universalify": "^2.0.0"
3154
  },
3155
  "engines": {
3156
- "node": ">=12"
3157
  }
3158
  },
3159
  "node_modules/fs.realpath": {
@@ -5422,6 +5443,19 @@
5422
  "node": ">= 8.0.0"
5423
  }
5424
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
5425
  "node_modules/run-applescript": {
5426
  "version": "5.0.0",
5427
  "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-5.0.0.tgz",
 
17
  "@xenova/transformers": "^2.1.1",
18
  "eslint": "8.40.0",
19
  "eslint-config-next": "13.4.2",
20
+ "fs-extra": "^11.1.1",
21
  "langchain": "^0.0.90",
22
  "next": "13.4.2",
23
  "react": "18.2.0",
 
26
  "uuid": "^9.0.0"
27
  },
28
  "devDependencies": {
29
+ "@types/fs-extra": "^11.0.1",
30
  "@types/uuid": "^9.0.1"
31
  }
32
  },
 
1356
  "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.1.tgz",
1357
  "integrity": "sha512-LG4opVs2ANWZ1TJoKc937iMmNstM/d0ae1vNbnBvBhqCSezgVUOzcLCqbI5elV8Vy6WKwKjaqR+zO9VKirBBCA=="
1358
  },
1359
+ "node_modules/@types/fs-extra": {
1360
+ "version": "11.0.1",
1361
+ "resolved": "https://registry.npmjs.org/@types/fs-extra/-/fs-extra-11.0.1.tgz",
1362
+ "integrity": "sha512-MxObHvNl4A69ofaTRU8DFqvgzzv8s9yRtaPPm5gud9HDNvpB3GPQFvNuTWAI59B9huVGV5jXYJwbCsmBsOGYWA==",
1363
+ "dev": true,
1364
+ "dependencies": {
1365
+ "@types/jsonfile": "*",
1366
+ "@types/node": "*"
1367
+ }
1368
+ },
1369
  "node_modules/@types/json5": {
1370
  "version": "0.0.29",
1371
  "resolved": "https://registry.npmjs.org/@types/json5/-/json5-0.0.29.tgz",
1372
  "integrity": "sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ=="
1373
  },
1374
+ "node_modules/@types/jsonfile": {
1375
+ "version": "6.1.1",
1376
+ "resolved": "https://registry.npmjs.org/@types/jsonfile/-/jsonfile-6.1.1.tgz",
1377
+ "integrity": "sha512-GSgiRCVeapDN+3pqA35IkQwasaCh/0YFH5dEF6S88iDvEn901DjOeH3/QPY+XYP1DFzDZPvIvfeEgk+7br5png==",
1378
+ "dev": true,
1379
+ "dependencies": {
1380
+ "@types/node": "*"
1381
+ }
1382
+ },
1383
  "node_modules/@types/long": {
1384
  "version": "4.0.2",
1385
  "resolved": "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz",
 
3165
  "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow=="
3166
  },
3167
  "node_modules/fs-extra": {
3168
+ "version": "11.1.1",
3169
+ "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-11.1.1.tgz",
3170
+ "integrity": "sha512-MGIE4HOvQCeUCzmlHs0vXpih4ysz4wg9qiSAu6cd42lVwPbTM1TjV7RusoyQqMmk/95gdQZX72u+YW+c3eEpFQ==",
3171
  "dependencies": {
3172
  "graceful-fs": "^4.2.0",
3173
  "jsonfile": "^6.0.1",
3174
  "universalify": "^2.0.0"
3175
  },
3176
  "engines": {
3177
+ "node": ">=14.14"
3178
  }
3179
  },
3180
  "node_modules/fs.realpath": {
 
5443
  "node": ">= 8.0.0"
5444
  }
5445
  },
5446
+ "node_modules/rollup-plugin-typescript2/node_modules/fs-extra": {
5447
+ "version": "10.1.0",
5448
+ "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz",
5449
+ "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==",
5450
+ "dependencies": {
5451
+ "graceful-fs": "^4.2.0",
5452
+ "jsonfile": "^6.0.1",
5453
+ "universalify": "^2.0.0"
5454
+ },
5455
+ "engines": {
5456
+ "node": ">=12"
5457
+ }
5458
+ },
5459
  "node_modules/run-applescript": {
5460
  "version": "5.0.0",
5461
  "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-5.0.0.tgz",
package.json CHANGED
@@ -17,6 +17,7 @@
17
  "@xenova/transformers": "^2.1.1",
18
  "eslint": "8.40.0",
19
  "eslint-config-next": "13.4.2",
 
20
  "langchain": "^0.0.90",
21
  "next": "13.4.2",
22
  "react": "18.2.0",
@@ -25,6 +26,7 @@
25
  "uuid": "^9.0.0"
26
  },
27
  "devDependencies": {
 
28
  "@types/uuid": "^9.0.1"
29
  }
30
  }
 
17
  "@xenova/transformers": "^2.1.1",
18
  "eslint": "8.40.0",
19
  "eslint-config-next": "13.4.2",
20
+ "fs-extra": "^11.1.1",
21
  "langchain": "^0.0.90",
22
  "next": "13.4.2",
23
  "react": "18.2.0",
 
26
  "uuid": "^9.0.0"
27
  },
28
  "devDependencies": {
29
+ "@types/fs-extra": "^11.0.1",
30
  "@types/uuid": "^9.0.1"
31
  }
32
  }
src/pages/api/docHandle.ts ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { NextApiRequest, NextApiResponse } from 'next';
2
+
3
+ import {
4
+ readHNSWLibModelFromLocal,
5
+ storesDir,
6
+ vectorStoreToHNSWLibModel,
7
+ } from '@/utils/file-handler';
8
+ import fs from 'fs-extra';
9
+ import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
10
+ import { HNSWLib } from 'langchain/vectorstores/hnswlib';
11
+ import { XenovaTransformersEmbeddings } from '../../embed/hf'
12
+
13
/**
 * Splits `text` into chunks and builds an HNSWLib vector store from them.
 *
 * Chunks are produced by a `RecursiveCharacterTextSplitter` with a chunk size
 * of 1000 and embedded with `XenovaTransformersEmbeddings`. The chunks and the
 * resulting store are both logged to the console.
 *
 * @param text - Raw text to split and index.
 * @returns The vector store created from the chunks.
 */
async function handleDocs(text: string) {
  const chunks = await new RecursiveCharacterTextSplitter({
    chunkSize: 1000,
  }).createDocuments([text]);
  console.log(chunks);

  const embeddings = new XenovaTransformersEmbeddings();
  const store = await HNSWLib.fromDocuments(chunks, embeddings);
  console.log(store);

  return store;
}
23
+
24
/**
 * API route that turns posted text into a serialized HNSWLib model.
 *
 * Expects a body of the form `{ "text": "..." }`, either as a raw JSON string
 * or already parsed into an object. Responds with:
 * - 400 when the body is not valid JSON or carries no non-empty `text` string;
 * - 200 with the model read from `storesDir` when that directory already exists;
 * - 200 with a model freshly built from `text` otherwise.
 *
 * @param req - Incoming request; only `req.body` is read.
 * @param res - Response used to send the status code and JSON payload.
 */
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse,
) {
  // With the built-in body parser enabled, a JSON request arrives already
  // parsed, so only string bodies are run through JSON.parse.
  let payload: unknown;
  try {
    payload = typeof req.body === 'string' ? JSON.parse(req.body) : req.body;
  } catch {
    return res.status(400).json({ message: 'Invalid JSON in the request body' });
  }

  const text =
    typeof payload === 'object' && payload !== null
      ? (payload as { text?: unknown }).text
      : undefined;

  if (typeof text !== 'string' || !text) {
    return res.status(400).json({ message: 'No question in the request' });
  }

  // NOTE(review): an existing store directory is returned as-is, without
  // checking that it was built from this `text` — confirm this is intended.
  if (await fs.pathExists(storesDir)) {
    console.log('read from ' + storesDir);
    const cached = await readHNSWLibModelFromLocal();
    return res.status(200).json(cached);
  }

  const vectorStore = await handleDocs(text);
  const model = await vectorStoreToHNSWLibModel(vectorStore);
  res.status(200).json(model);
}
52
+
53
+ export const config = {
54
+ api: {
55
+ bodyParser: true, // Built-in body parsing stays enabled (set to false to consume the body as a stream)
56
+ },
57
+ };
src/utils/file-handler.ts ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fs from 'fs-extra';
2
+ import type { OpenAIEmbeddings } from 'langchain/embeddings/openai';
3
+ import {
4
+ HNSWLib,
5
+ type HNSWLib as StoreTypeHNSWLib,
6
+ } from 'langchain/vectorstores/hnswlib';
7
+ import path from 'path';
8
+
9
+ const ifDev = process.env.NODE_ENV === 'development'; // true only when NODE_ENV is exactly 'development'
10
+ // Outside development the store lives under the absolute /tmp/ path, since that is the only place writes are allowed:
11
+ // https://vercel.com/guides/how-can-i-use-files-in-serverless-functions
12
+ export const storesDir = ifDev ? 'tmp/hnswlib-stores' : '/tmp/hnswlib-stores'; // development uses the relative tmp/ path instead
13
+
14
+ type HNSWLibModel = { // serialized store: one string per file kept in storesDir
15
+ args: string; // contents of args.json, read as UTF-8
16
+ docstore: string; // contents of docstore.json, read as UTF-8
17
+ hnswlibIndex: string; // contents of hnswlib.index, hex-encoded
18
+ };
19
+
20
+ const HNSWLibModelFilesName = { // file name inside storesDir for each HNSWLibModel field
21
+ args: 'args.json',
22
+ docstore: 'docstore.json',
23
+ hnswlibIndex: 'hnswlib.index',
24
+ };
25
+
26
/**
 * Rebuilds an HNSWLib vector store from a serialized model.
 *
 * The model is first written to `storesDir` and the store is then loaded back
 * from that directory. (Looking forward to a better way to transform
 * hnswlibStore <=> indexes.)
 *
 * @param model - Serialized store contents to write out.
 * @param embeddings - Embeddings instance handed to `HNSWLib.load`.
 * @returns The vector store loaded from `storesDir`.
 */
export async function HNSWLibModelToVectorStore(
  model: HNSWLibModel,
  embeddings: OpenAIEmbeddings,
) {
  await saveHNSWLibModelToLocal(model);
  return HNSWLib.load(storesDir, embeddings);
}
36
+
37
/**
 * Writes each part of a serialized model to its file inside `storesDir`.
 *
 * `args` and `docstore` are written as UTF-8 text. `hnswlibIndex` is written
 * with the 'hex' encoding so that it mirrors how `readHNSWLibModelFromLocal`
 * reads it and the index file ends up with its original bytes.
 *
 * @param model - Serialized store contents, one string per file.
 * @returns A promise that resolves once every file has been written.
 */
export async function saveHNSWLibModelToLocal(model: HNSWLibModel) {
  // fs.writeFile does not create missing directories, so make sure the target
  // directory exists before writing into it.
  await fs.ensureDir(storesDir);

  const keys = Object.keys(HNSWLibModelFilesName) as (keyof HNSWLibModel)[];
  await Promise.all(
    keys.map((key) => {
      const fullPath = path.join(storesDir, HNSWLibModelFilesName[key]);
      console.log(fullPath);

      // The index is carried as a hex string; decode it back to bytes on write.
      const encoding = key === 'hnswlibIndex' ? 'hex' : 'utf-8';
      return fs.writeFile(fullPath, model[key], encoding);
    }),
  );
}
53
+
54
/**
 * Serializes a vector store by saving it to `storesDir` and reading the
 * resulting files back.
 *
 * @param store - The HNSWLib store to persist.
 * @returns The serialized model read from `storesDir`.
 */
export async function vectorStoreToHNSWLibModel(
  store: StoreTypeHNSWLib,
): Promise<HNSWLibModel> {
  await store.save(storesDir);
  const serialized = await readHNSWLibModelFromLocal();
  return serialized;
}
60
+
61
/**
 * Reads the three store files from `storesDir` in parallel.
 *
 * `args.json` and `docstore.json` are read as UTF-8 text; `hnswlib.index` is
 * read with the 'hex' encoding.
 *
 * @returns The file contents keyed as `args`, `docstore` and `hnswlibIndex`.
 */
export async function readHNSWLibModelFromLocal(): Promise<HNSWLibModel> {
  const inStore = (fileName: string) => path.join(storesDir, fileName);

  const [args, docstore, hnswlibIndex] = await Promise.all([
    fs.readFile(inStore(HNSWLibModelFilesName.args), 'utf-8'),
    fs.readFile(inStore(HNSWLibModelFilesName.docstore), 'utf-8'),
    fs.readFile(inStore(HNSWLibModelFilesName.hnswlibIndex), 'hex'),
  ]);

  return { args, docstore, hnswlibIndex };
}
src/utils/index.ts CHANGED
@@ -89,3 +89,4 @@ export function throttle<T extends (...args: any[]) => any>(
89
 
90
  export const DEFAULT_TEMPERATURE =
91
  parseFloat(process.env.NEXT_PUBLIC_DEFAULT_TEMPERATURE || "1");
 
 
89
 
90
  export const DEFAULT_TEMPERATURE =
91
  parseFloat(process.env.NEXT_PUBLIC_DEFAULT_TEMPERATURE || "1");
92
+