Carlosito16 committed on
Commit
8421d14
1 Parent(s): 5e44f32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -15
app.py CHANGED
@@ -5,6 +5,7 @@ import torch
5
  from tqdm.auto import tqdm
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
 
 
8
  # from langchain.vectorstores import Chroma
9
  from langchain.vectorstores import FAISS
10
  from langchain.embeddings import HuggingFaceInstructEmbeddings
@@ -23,28 +24,47 @@ st.set_page_config(
23
  st.markdown("# Hello")
24
 
25
 
26
- with open("ait-web-document", "rb") as fp:
27
- ait_web_documents = pickle.load(fp)
28
-
29
-
30
- text_splitter = RecursiveCharacterTextSplitter(
31
- # Set a really small chunk size, just to show.
32
- chunk_size = 500,
33
- chunk_overlap = 100,
34
- length_function = len,
35
- )
 
 
 
 
 
36
 
37
- chunked_text = text_splitter.create_documents([doc for doc in tqdm(ait_web_documents)])
 
38
 
39
 
40
- st.markdown(f"Number of Documents: {len(ait_web_documents)}")
41
- st.markdown(f"Number of chunked texts: {len(chunked_text)}")
42
 
43
 
44
- embedding_model = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-base',
 
 
45
  model_kwargs = {'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')})
 
 
 
 
 
 
 
46
 
47
- vector_database = FAISS.load_local("faiss_index", embedding_model)
 
 
 
 
 
 
48
  print("load done")
49
 
50
 
@@ -57,3 +77,4 @@ def retrieve_document(query_input):
57
 
58
  output = st.text_area(label = "Here is the relevant documents",
59
  value = retrieve_document(query_input))
 
 
5
  from tqdm.auto import tqdm
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
 
8
+
9
  # from langchain.vectorstores import Chroma
10
  from langchain.vectorstores import FAISS
11
  from langchain.embeddings import HuggingFaceInstructEmbeddings
 
24
  st.markdown("# Hello")
25
 
26
 
27
+ @st.cache_data
28
+ def load_scraped_web_info():
29
+ with open("/Users/carlosito/Library/CloudStorage/OneDrive-Personal/AIT material/99-AIT-thesis/aitGPT/ait-web-document", "rb") as fp:
30
+ ait_web_documents = pickle.load(fp)
31
+
32
+
33
+ text_splitter = RecursiveCharacterTextSplitter(
34
+ # Set a really small chunk size, just to show.
35
+ chunk_size = 500,
36
+ chunk_overlap = 100,
37
+ length_function = len,
38
+ )
39
+
40
+ chunked_text = text_splitter.create_documents([doc for doc in tqdm(ait_web_documents)])
41
+
42
 
43
+ st.markdown(f"Number of Documents: {len(ait_web_documents)}")
44
+ st.markdown(f"Number of chunked texts: {len(chunked_text)}")
45
 
46
 
 
 
47
 
48
 
49
+ @st.cache_resource
50
+ def load_embedding_model():
51
+ embedding_model = HuggingFaceInstructEmbeddings(model_name='hkunlp/instructor-base',
52
  model_kwargs = {'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')})
53
+ return embedding_model
54
+
55
+ @st.cache_data
56
+ def load_faiss_index():
57
+ vector_database = FAISS.load_local("faiss_index", embedding_model)
58
+ return vector_database
59
+
60
 
61
+ #--------------
62
+
63
+
64
+
65
+ load_scraped_web_info()
66
+ embedding_model = load_embedding_model()
67
+ vector_database = load_faiss_index()
68
  print("load done")
69
 
70
 
 
77
 
78
  output = st.text_area(label = "Here is the relevant documents",
79
  value = retrieve_document(query_input))
80
+