File size: 3,349 Bytes
776c1a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8f5915
17d1354
776c1a4
 
 
 
 
 
 
 
3931ea9
 
776c1a4
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# https://github.com/jerryjliu/llama_index/issues/1900
from llama_index import (
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    GPTVectorStoreIndex,
    PromptHelper,
    load_index_from_storage,
    StorageContext,
)
from langchain.chat_models import ChatOpenAI
import gradio as gr
import openai

import os
from pathlib import Path

# Require the OpenAI key up front; raises KeyError immediately if unset
# rather than failing later inside a query.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Absolute directory containing this script; all folder_name paths are
# resolved relative to it.
BASE_DIR = Path(os.path.dirname(os.path.abspath(__file__)))

# Maximum LLM context window size, in tokens (gpt-3.5-turbo's 4k window).
max_input_size = 4096
# Maximum number of tokens the model may generate per response.
num_outputs = 512
# Chunk overlap as a ratio (see the llama_index issue linked above for the
# change from token count to ratio).
max_chunk_overlap = 0.20
# Maximum tokens per document chunk when splitting for embedding.
chunk_size_limit = 600


# PromptHelper packs document chunks into the model's context window.
prompt_helper = PromptHelper(
    max_input_size,
    num_outputs,
    max_chunk_overlap,
    chunk_size_limit=chunk_size_limit,
)
llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo", max_tokens=num_outputs)
llm_predictor = LLMPredictor(llm=llm)

# File extension collected from the docs folder when building the index.
target_extension = ".pdf"


def get_files_with_extension(directory_path, extension=None):
    """Recursively collect paths of files under ``directory_path`` whose
    names end with ``extension``.

    Args:
        directory_path: Root directory to walk (str or Path).
        extension: File suffix to match, e.g. ``".pdf"``. When ``None``
            (the default), falls back to the module-level
            ``target_extension``, preserving the original behavior.

    Returns:
        list[str]: Full paths of every matching file; empty list if the
        directory has no matches (or does not exist — ``os.walk`` yields
        nothing for a missing root).
    """
    if extension is None:
        extension = target_extension  # module default (".pdf")

    files_list = []
    for root, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith(extension):
                files_list.append(os.path.join(root, file))

    return files_list


def construct_index(folder_name):
    """Build a vector index from the PDFs under ``<folder_name>/docs`` and
    persist it to ``<folder_name>/storage``.

    Args:
        folder_name: Folder (relative to BASE_DIR) whose ``docs``
            subdirectory holds the source documents. Created if missing.

    Returns:
        The freshly built ``GPTVectorStoreIndex``.
    """
    docs_dir = Path(os.path.join(BASE_DIR, folder_name, "docs"))
    docs_dir.mkdir(parents=True, exist_ok=True)

    pdf_paths = get_files_with_extension(docs_dir)
    documents = SimpleDirectoryReader(input_files=pdf_paths).load_data()

    context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
    index = GPTVectorStoreIndex.from_documents(documents, service_context=context)

    # Persist so later sessions can reload without re-embedding everything.
    index.storage_context.persist(persist_dir=f"{folder_name}/storage")

    return index


def main(folder_name):
    """Build the index for ``folder_name`` and serve a Gradio chat UI that
    answers questions against it.

    Args:
        folder_name: Folder (relative to BASE_DIR) containing a ``docs``
            subdirectory of source PDFs; the index persists to
            ``<folder_name>/storage``.

    Side effects:
        Creates the docs directory if missing, (re)builds and persists the
        vector index, and blocks serving the Gradio interface.
    """
    docs_path = Path(os.path.join(BASE_DIR, folder_name, "docs"))
    docs_path.mkdir(parents=True, exist_ok=True)

    # Build the index once, up front. Previously the index was rebuilt here
    # and then *reloaded from disk on every single query* inside chatbot();
    # reusing the in-memory index (which carries our service context) avoids
    # that per-query disk round trip.
    index = construct_index(folder_name)
    query_engine = index.as_query_engine()

    def chatbot(input_text):
        """Answer a single user query against the prebuilt index."""
        print("chatbot")

        response = query_engine.query(input_text)

        print(response)
        return response.response

    iface = gr.Interface(
        fn=chatbot,
        inputs=gr.components.Textbox(lines=7, label="Enter your text"),
        outputs="text",
        title="Custom-trained AI Chatbot",
    )

    print("Launching")  # fixed typo: was "Launcing"
    # iface.launch(share=True)
    iface.launch()


if __name__ == "__main__":
    import sys

    # First CLI argument selects the document folder; default to "tax"
    # when no argument is supplied.
    folder_name = sys.argv[1] if len(sys.argv) > 1 else "tax"

    folder_path = Path(os.path.join(BASE_DIR, folder_name))
    if not folder_path.exists():
        raise Exception("folder does not exist.")

    main(folder_name)