Zwea Htet committed on
Commit
9a2650e
·
1 Parent(s): 38bc9e2

added file system for huggingface and object serialization

Browse files
Files changed (3) hide show
  1. .gitignore +2 -1
  2. models/bloom.py +45 -16
  3. requirements.txt +3 -1
.gitignore CHANGED
@@ -3,4 +3,5 @@ data/__pycache__
3
  models/__pycache__
4
  .env
5
  __pycache__
6
- vectorStores
 
 
3
  models/__pycache__
4
  .env
5
  __pycache__
6
+ vectorStores
7
+ .vscode
models/bloom.py CHANGED
@@ -1,22 +1,31 @@
1
  import os
 
2
  from json import dumps, loads
3
 
4
  import numpy as np
5
  import openai
6
  import pandas as pd
7
  from dotenv import load_dotenv
8
- from llama_index import (Document, GPTVectorStoreIndex, LLMPredictor,
9
- PromptHelper, ServiceContext, StorageContext,
10
- load_index_from_storage)
 
 
 
 
 
 
 
11
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
12
 
13
  from utils.customLLM import CustomLLM
14
 
15
  load_dotenv()
16
  openai.api_key = os.getenv("OPENAI_API_KEY")
 
17
 
18
  # get model
19
- # model_name = "bigscience/bloom-560m"
20
  # tokenizer = AutoTokenizer.from_pretrained(model_name)
21
  # model = AutoModelForCausalLM.from_pretrained(model_name, config='T5Config')
22
 
@@ -44,35 +53,55 @@ prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)
44
 
45
  # define llm
46
  llm_predictor = LLMPredictor(llm=CustomLLM())
47
- service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
 
 
48
 
49
- def prepare_data(file_path:str):
 
50
  df = pd.read_json(file_path)
51
- df = df.replace(to_replace="", value=np.nan).dropna(axis=0) # remove null values
52
-
53
  parsed = loads(df.to_json(orient="records"))
54
 
55
  documents = []
56
  for item in parsed:
57
- document = Document(item['paragraphText'],
58
- item['_id']['$oid'],
59
- extra_info={"chapter": item['chapter'],
60
- "article": item['article'],
61
- "title": item['title']})
 
 
 
 
62
  documents.append(document)
63
 
64
  return documents
65
 
 
66
  def initialize_index(index_name):
67
  file_path = f"./vectorStores/{index_name}"
68
  if os.path.exists(file_path):
69
  # rebuild storage context
70
  storage_context = StorageContext.from_defaults(persist_dir=file_path)
71
- # load index
 
72
  index = load_index_from_storage(storage_context)
 
 
 
 
73
  return index
74
  else:
75
  documents = prepare_data(r"./assets/regItems.json")
76
- index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
 
 
 
77
  index.storage_context.persist(file_path)
78
- return index
 
 
 
 
 
1
  import os
2
+ import pickle
3
  from json import dumps, loads
4
 
5
  import numpy as np
6
  import openai
7
  import pandas as pd
8
  from dotenv import load_dotenv
9
+ from huggingface_hub import HfFileSystem
10
+ from llama_index import (
11
+ Document,
12
+ GPTVectorStoreIndex,
13
+ LLMPredictor,
14
+ PromptHelper,
15
+ ServiceContext,
16
+ StorageContext,
17
+ load_index_from_storage,
18
+ )
19
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
20
 
21
  from utils.customLLM import CustomLLM
22
 
23
  load_dotenv()
24
  openai.api_key = os.getenv("OPENAI_API_KEY")
25
+ fs = HfFileSystem()
26
 
27
  # get model
28
+ # model_name = "bigscience/bloom-560m"
29
  # tokenizer = AutoTokenizer.from_pretrained(model_name)
30
  # model = AutoModelForCausalLM.from_pretrained(model_name, config='T5Config')
31
 
 
53
 
54
  # define llm
55
  llm_predictor = LLMPredictor(llm=CustomLLM())
56
+ service_context = ServiceContext.from_defaults(
57
+ llm_predictor=llm_predictor, prompt_helper=prompt_helper
58
+ )
59
 
60
+
61
+ def prepare_data(file_path: str):
62
  df = pd.read_json(file_path)
63
+ df = df.replace(to_replace="", value=np.nan).dropna(axis=0) # remove null values
64
+
65
  parsed = loads(df.to_json(orient="records"))
66
 
67
  documents = []
68
  for item in parsed:
69
+ document = Document(
70
+ item["paragraphText"],
71
+ item["_id"]["$oid"],
72
+ extra_info={
73
+ "chapter": item["chapter"],
74
+ "article": item["article"],
75
+ "title": item["title"],
76
+ },
77
+ )
78
  documents.append(document)
79
 
80
  return documents
81
 
82
+
83
  def initialize_index(index_name):
84
  file_path = f"./vectorStores/{index_name}"
85
  if os.path.exists(file_path):
86
  # rebuild storage context
87
  storage_context = StorageContext.from_defaults(persist_dir=file_path)
88
+
89
+ # local load index access
90
  index = load_index_from_storage(storage_context)
91
+
92
+ # huggingface repo load access
93
+ with fs.open(file_path, "r") as file:
94
+ index = pickle.loads(file.readlines())
95
  return index
96
  else:
97
  documents = prepare_data(r"./assets/regItems.json")
98
+ index = GPTVectorStoreIndex.from_documents(
99
+ documents, service_context=service_context
100
+ )
101
+ # local write access
102
  index.storage_context.persist(file_path)
103
+
104
+ # huggingface repo write access
105
+ with fs.open(file_path, "w") as file:
106
+ file.write(pickle.dumps(index))
107
+ return index
requirements.txt CHANGED
@@ -8,4 +8,6 @@ openai
8
  faiss-cpu
9
  python-dotenv
10
  streamlit
11
- streamlit-chat
 
 
 
8
  faiss-cpu
9
  python-dotenv
10
  streamlit
11
+ streamlit-chat
12
+ huggingface_hub
13
+ pickle5