File size: 3,833 Bytes
8d50bff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import sys
sys.path.append(".")
from server.knowledge_base.migrate import (create_tables, reset_tables, import_from_db,
                                           folder2db, prune_db_docs, prune_folder_files)
from configs.model_config import NLTK_DATA_PATH, EMBEDDING_MODEL
import nltk
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
from datetime import datetime


if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="please specify only one operate method once time.")

    parser.add_argument(
        "-r",
        "--recreate-vs",
        action="store_true",
        help=('''
            recreate vector store.
            use this option if you have copied document files to the content folder, but vector store has not been populated or DEFAUL_VS_TYPE/EMBEDDING_MODEL changed.
            '''
        )
    )
    parser.add_argument(
        "--create-tables",
        action="store_true",
        help=("create empty tables if not existed")
    )
    parser.add_argument(
        "--clear-tables",
        action="store_true",
        help=("create empty tables, or drop the database tables before recreate vector stores")
    )
    parser.add_argument(
        "--import-db",
        help="import tables from specified sqlite database"
    )
    parser.add_argument(
        "-u",
        "--update-in-db",
        action="store_true",
        help=('''
            update vector store for files exist in database.
            use this option if you want to recreate vectors for files exist in db and skip files exist in local folder only.
            '''
        )
    )
    parser.add_argument(
        "-i",
        "--increment",
        action="store_true",
        help=('''
            update vector store for files exist in local folder and not exist in database.
            use this option if you want to create vectors incrementally.
            '''
        )
    )
    parser.add_argument(
        "--prune-db",
        action="store_true",
        help=('''
            delete docs in database that not existed in local folder.
            it is used to delete database docs after user deleted some doc files in file browser
            '''
        )
    )
    parser.add_argument(
        "--prune-folder",
        action="store_true",
        help=('''
            delete doc files in local folder that not existed in database.
            is is used to free local disk space by delete unused doc files.
            '''
        )
    )
    parser.add_argument(
        "-n",
        "--kb-name",
        type=str,
        nargs="+",
        default=[],
        help=("specify knowledge base names to operate on. default is all folders exist in KB_ROOT_PATH.")
    )
    parser.add_argument(
        "-e",
        "--embed-model",
        type=str,
        default=EMBEDDING_MODEL,
        help=("specify embeddings model.")
    )

    args = parser.parse_args()
    start_time = datetime.now()

    if args.create_tables:
        create_tables() # confirm tables exist

    if args.clear_tables:
        reset_tables()
        print("database tables reset")

    if args.recreate_vs:
        create_tables()
        print("recreating all vector stores")
        folder2db(kb_names=args.kb_name, mode="recreate_vs", embed_model=args.embed_model)
    elif args.import_db:
        import_from_db(args.import_db)
    elif args.update_in_db:
        folder2db(kb_names=args.kb_name, mode="update_in_db", embed_model=args.embed_model)
    elif args.increment:
        folder2db(kb_names=args.kb_name, mode="increment", embed_model=args.embed_model)
    elif args.prune_db:
        prune_db_docs(args.kb_name)
    elif args.prune_folder:
        prune_folder_files(args.kb_name)

    end_time = datetime.now()
    print(f"总计用时: {end_time-start_time}")