import os
import pickle
from json import loads

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from llama_index import (
    Document,
    GPTVectorStoreIndex,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline  # only used by the commented-out local pipeline below

from utils.customLLM import CustomLLM

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
fs = HfFileSystem()
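# fs provides file-like access to Hugging Face repos; it is only needed by the
# commented-out remote persistence paths in initialize_index below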

# optional: load a local Hugging Face model instead of CustomLLM
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# define prompt helper
context_window = 2048  # maximum input size
num_output = 525  # number of tokens reserved for the generated output
chunk_overlap_ratio = 0.2  # maximum chunk overlap, as a ratio of chunk size
prompt_helper = PromptHelper(
    context_window=context_window,
    num_output=num_output,
    chunk_overlap_ratio=chunk_overlap_ratio,
)

# create a pipeline
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0, # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7
# )

# define llm
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper
)
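# the service context bundles the custom LLM predictor and prompt helper so
# that index building and querying below use them instead of library defaults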


def prepare_data(file_path: str):
    """Read the regulation JSON export and convert each record into a Document."""
    df = pd.read_json(file_path)
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)  # drop rows with empty fields

    parsed = loads(df.to_json(orient="records"))

    documents = []
    for item in parsed:
        document = Document(
            item["paragraphText"],
            item["_id"]["$oid"],
            extra_info={
                "chapter": item["chapter"],
                "article": item["article"],
                "title": item["title"],
            },
        )
        documents.append(document)

    return documents
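
# A minimal sketch of the record shape prepare_data expects, inferred from the
# keys accessed above (illustrative values; the real regItems.json may carry
# additional fields):
#
# {
#     "_id": {"$oid": "64a1f0c2e4b0f5a1d2c3e4f5"},
#     "paragraphText": "Full text of one regulation paragraph ...",
#     "chapter": "Chapter I",
#     "article": "Article 3",
#     "title": "Scope"
# }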


def initialize_index(index_name):
    """Load the persisted index for `index_name` if it exists, otherwise build and persist it."""
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # rebuild storage context
        storage_context = StorageContext.from_defaults(persist_dir=file_path)

        # local load index access; pass the service context so the custom LLM
        # and prompt helper are reused instead of the library defaults
        index = load_index_from_storage(storage_context, service_context=service_context)

        # huggingface repo load access
        # with fs.open(file_path, "rb") as file:
        #     index = pickle.load(file)
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        # local write access
        index.storage_context.persist(file_path)

        # huggingface repo write access
        # with fs.open(file_path, "wb") as file:
        #     pickle.dump(index, file)
        return index
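

# Minimal usage sketch. "regIndex" is an illustrative index name (any name
# works; it becomes the ./vectorStores/<name> persist directory), and
# as_query_engine is the standard query entry point in the llama_index
# 0.6.x-era API this file targets.
if __name__ == "__main__":
    index = initialize_index("regIndex")
    query_engine = index.as_query_engine()
    response = query_engine.query("Which article defines the scope of the regulation?")
    print(response)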