Carlos Salgado committed on
Commit
dbd084e
1 Parent(s): 2050d2a

update prompt template, minor fixes

Browse files
Files changed (3) hide show
  1. DocVerifyRAG.py +0 -148
  2. backend/generate_metadata.py +3 -3
  3. ingest.py +0 -7
DocVerifyRAG.py DELETED
@@ -1,148 +0,0 @@
1
- import streamlit as st
2
- from dotenv import load_dotenv
3
- from PyPDF2 import PdfReader
4
- from langchain import embeddings
5
- from langchain.text_splitter import CharacterTextSplitter
6
- from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
7
- from langchain.vectorstores import FAISS
8
- from langchain.vectorstores import faiss
9
- from langchain.chat_models import ChatOpenAI
10
- from langchain.memory import ConversationBufferMemory
11
- from langchain.chains import ConversationalRetrievalChain
12
- from html_templates import css, bot_template, user_template
13
- from langchain.llms import HuggingFaceHub
14
- import os
15
- import pickle
16
- from datetime import datetime
17
-
18
-
19
- def get_pdf_text(pdf_docs):
20
- text = ""
21
- for pdf in pdf_docs:
22
- pdf_reader = PdfReader(pdf)
23
- for page in pdf_reader.pages:
24
- text += page.extract_text()
25
- return text
26
-
27
-
28
- def get_text_chunks(text):
29
- text_splitter = CharacterTextSplitter(
30
- separator="\n",
31
- chunk_size=1000,
32
- chunk_overlap=200,
33
- length_function=len
34
- )
35
- chunks = text_splitter.split_text(text)
36
- return chunks
37
-
38
-
39
- def get_vectorstore(text_chunks):
40
- embeddings = OpenAIEmbeddings()
41
- # embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
42
- vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
43
- return vectorstore
44
-
45
-
46
- def get_conversation_chain(vectorstore):
47
- llm = ChatOpenAI()
48
- # llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
49
-
50
- memory = ConversationBufferMemory(
51
- memory_key='chat_history', return_messages=True)
52
- conversation_chain = ConversationalRetrievalChain.from_llm(
53
- llm=llm,
54
- retriever=vectorstore.as_retriever(),
55
- memory=memory
56
- )
57
- return conversation_chain
58
-
59
-
60
- def handle_userinput(user_question):
61
- response = st.session_state.conversation({'question': user_question})
62
- st.session_state.chat_history = response['chat_history']
63
-
64
- for i, message in enumerate(st.session_state.chat_history):
65
- # Display user message
66
- if i % 2 == 0:
67
- st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
68
- else:
69
- print(message)
70
- # Display AI response
71
- st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
72
- # Display source document information if available in the message
73
- if hasattr(message, 'source') and message.source:
74
- st.write(f"Source Document: {message.source}", unsafe_allow_html=True)
75
-
76
-
77
- def safe_vec_store():
78
- os.makedirs('vectorstore', exist_ok=True)
79
- filename = 'vectores' + datetime.now().strftime('%Y%m%d%H%M') + '.pkl'
80
- file_path = os.path.join('vectorstore', filename)
81
- vector_store = st.session_state.vectorstore
82
-
83
- # Serialize and save the entire FAISS object using pickle
84
- with open(file_path, 'wb') as f:
85
- pickle.dump(vector_store, f)
86
-
87
-
88
- def main():
89
- load_dotenv()
90
- st.set_page_config(page_title="DOC Verify RAG", page_icon=":hospital:")
91
- st.write(css, unsafe_allow_html=True)
92
-
93
- st.subheader("Your documents")
94
- pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
95
- filenames = [file.name for file in pdf_docs if file is not None]
96
-
97
- if "conversation" not in st.session_state:
98
- st.session_state.conversation = None
99
- if "chat_history" not in st.session_state:
100
- st.session_state.chat_history = None
101
-
102
- st.header("DOC Verify RAG :hospital:")
103
- user_question = st.text_input("Ask a question about your documents:")
104
- if user_question:
105
- handle_userinput(user_question)
106
-
107
- with st.sidebar:
108
-
109
- st.subheader("Classification Instrucitons")
110
- classifier_docs = st.file_uploader("Upload your instructions here and click on 'Process'", accept_multiple_files=True)
111
- filenames = [file.name for file in classifier_docs if file is not None]
112
-
113
- if st.button("Process"):
114
- with st.spinner("Processing"):
115
- loaded_vec_store = None
116
- for filename in filenames:
117
- if ".pkl" in filename:
118
- file_path = os.path.join('vectorstore', filename)
119
- with open(file_path, 'rb') as f:
120
- loaded_vec_store = pickle.load(f)
121
- raw_text = get_pdf_text(pdf_docs)
122
- text_chunks = get_text_chunks(raw_text)
123
- vec = get_vectorstore(text_chunks)
124
- if loaded_vec_store:
125
- vec.merge_from(loaded_vec_store)
126
- st.warning("loaded vectorstore")
127
- if "vectorstore" in st.session_state:
128
- vec.merge_from(st.session_state.vectorstore)
129
- st.warning("merged to existing")
130
- st.session_state.vectorstore = vec
131
- st.session_state.conversation = get_conversation_chain(vec)
132
- st.success("data loaded")
133
-
134
- # Save and Load Embeddings
135
- if st.button("Save Embeddings"):
136
- if "vectorstore" in st.session_state:
137
- safe_vec_store()
138
- # st.session_state.vectorstore.save_local("faiss_index")
139
- st.sidebar.success("safed")
140
- else:
141
- st.sidebar.warning("No embeddings to save. Please process documents first.")
142
-
143
- if st.button("Load Embeddings"):
144
- st.warning("this function is not in use, just upload the vectorstore")
145
-
146
-
147
- if __name__ == '__main__':
148
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/generate_metadata.py CHANGED
@@ -91,9 +91,9 @@ def extract_metadata(docs):
91
  }
92
  ]
93
  )
 
 
94
 
95
- created_user = json.loads(chat_completion.choices[0].message.content)
96
- return created_user
97
 
98
  if __name__ == "__main__":
99
  parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
@@ -108,4 +108,4 @@ if __name__ == "__main__":
108
 
109
  docs = ingest(args.document)
110
  metadata = extract_metadata(docs)
111
- print(json.dumps(metadata, indent=2))
 
91
  }
92
  ]
93
  )
94
+ # returns a dictionary
95
+ return json.loads(chat_completion.choices[0].message.content)
96
 
 
 
97
 
98
  if __name__ == "__main__":
99
  parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
 
108
 
109
  docs = ingest(args.document)
110
  metadata = extract_metadata(docs)
111
+ print(metadata)
ingest.py DELETED
@@ -1,7 +0,0 @@
1
- from langchain_community.document_loaders import UnstructuredPDFLoader
2
-
3
- def ingest_pdf(path):
4
- loader = UnstructuredPDFLoader()
5
- text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
6
-
7
- return data