Tax_AI / app.py
vvklingayat's picture
Upload folder using huggingface_hub
17d1354
# https://github.com/jerryjliu/llama_index/issues/1900
from llama_index import (
SimpleDirectoryReader,
LLMPredictor,
ServiceContext,
GPTVectorStoreIndex,
PromptHelper,
load_index_from_storage,
StorageContext,
)
from langchain.chat_models import ChatOpenAI
import gradio as gr
import openai
import os
from pathlib import Path
# --- Module-level configuration -------------------------------------------
# The OpenAI key is read from the environment; a missing OPENAI_API_KEY
# raises KeyError as soon as this module is imported.
openai.api_key = os.environ["OPENAI_API_KEY"]
# Absolute directory containing this file; document folders are resolved
# relative to it.
BASE_DIR = Path(os.path.dirname(os.path.abspath(__file__)))
# Maximum input size handed to PromptHelper.
max_input_size = 4096
# Number of output tokens; also reused as max_tokens for the chat model below.
num_outputs = 512
# Maximum chunk overlap handed to PromptHelper.
# NOTE(review): 0.20 looks like a ratio rather than a token count - confirm
# against the PromptHelper signature of the installed llama_index version.
max_chunk_overlap = 0.20
# Chunk size limit handed to PromptHelper.
chunk_size_limit = 600
# NOTE(review): prompt_helper is not referenced by any other live code in this
# file (only by a commented-out call) - confirm whether it was meant to be
# passed to the ServiceContext.
prompt_helper = PromptHelper(
max_input_size,
num_outputs,
max_chunk_overlap,
chunk_size_limit=chunk_size_limit,
)
# Chat model wrapped in an LLMPredictor, which construct_index passes to its
# ServiceContext.
llm = ChatOpenAI(temperature=0.7, model_name="gpt-3.5-turbo", max_tokens=num_outputs)
llm_predictor = LLMPredictor(llm=llm)
# Filename suffix that get_files_with_extension matches (with str.endswith).
target_extension = ".pdf"
def get_files_with_extension(directory_path, extension=None):
    """Recursively collect files under *directory_path* with a given suffix.

    Args:
        directory_path: Root directory to walk (``str`` or path-like). A
            directory that does not exist yields an empty list, because
            ``os.walk`` ignores listing errors by default.
        extension: Filename suffix to match, compared case-sensitively with
            ``str.endswith``. When omitted, the module-level
            ``target_extension`` is used, which keeps existing callers
            working unchanged.

    Returns:
        A sorted list of matching file paths as strings. Sorting makes the
        result independent of the order in which the OS lists directory
        entries.
    """
    if extension is None:
        # Looked up at call time so the module-level setting stays the default.
        extension = target_extension

    matches = []
    for root, _, names in os.walk(directory_path):
        matches.extend(
            os.path.join(root, name) for name in names if name.endswith(extension)
        )
    return sorted(matches)
def construct_index(folder_name):
    """Build and persist a vector index for one document folder.

    Files matching ``target_extension`` are collected recursively from
    ``<BASE_DIR>/<folder_name>/docs`` (the directory is created if missing),
    loaded with ``SimpleDirectoryReader`` and indexed with
    ``GPTVectorStoreIndex`` using the module-level ``llm_predictor``.

    Args:
        folder_name: Name of the folder (relative to ``BASE_DIR``) whose
            ``docs`` sub-directory holds the source documents.

    Returns:
        The newly built ``GPTVectorStoreIndex``.

    Raises:
        ValueError: If the docs directory contains no matching files.
    """
    docs_path = Path(os.path.join(BASE_DIR, folder_name, "docs"))
    docs_path.mkdir(parents=True, exist_ok=True)

    source_files = get_files_with_extension(docs_path)
    if not source_files:
        # Fail early with an actionable message instead of handing an empty
        # file list to the reader.
        raise ValueError(
            f"No '{target_extension}' files found under {docs_path}"
        )

    documents = SimpleDirectoryReader(input_files=source_files).load_data()
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )

    # NOTE(review): the persist directory is relative to the current working
    # directory, whereas the docs path above is anchored at BASE_DIR - confirm
    # this is intended before running from another directory.
    index.storage_context.persist(persist_dir=f"{folder_name}/storage")
    return index
def main(folder_name):
    """Index the documents of *folder_name* and serve a Gradio chat UI.

    The index is built once (and persisted by ``construct_index``) before the
    interface is launched. The resulting query engine is shared by all
    requests, so answers are produced with the ``llm_predictor`` the index
    was built with, and nothing is reloaded from disk per question.

    Args:
        folder_name: Name of the folder (relative to ``BASE_DIR``) whose
            ``docs`` sub-directory holds the source documents.
    """
    docs_path = Path(os.path.join(BASE_DIR, folder_name, "docs"))
    docs_path.mkdir(parents=True, exist_ok=True)

    # Build the index up front and keep it in memory: the closure below reuses
    # it instead of deserialising the persisted copy on every query.
    index = construct_index(folder_name)
    query_engine = index.as_query_engine()

    def chatbot(input_text):
        """Answer *input_text* against the index and return the reply text."""
        print("chatbot")
        response = query_engine.query(input_text)
        print(response)
        return response.response

    iface = gr.Interface(
        fn=chatbot,
        inputs=gr.components.Textbox(lines=7, label="Enter your text"),
        outputs="text",
        title="Custom-trained AI Chatbot",
    )

    print("Launcing")
    iface.launch()
if __name__ == "__main__":
    import sys

    # The first command-line argument names the document folder; fall back to
    # "tax" when none is given.
    cli_args = sys.argv[1:]
    folder_name = cli_args[0] if cli_args else "tax"

    # The folder itself (not its docs/ sub-directory) must already exist
    # next to this script.
    folder_path = Path(BASE_DIR, folder_name)
    if not folder_path.exists():
        raise Exception("folder does not exist.")

    main(folder_name)