Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	cosmetics
Browse files
- document_qa/document_qa_engine.py +22 -8
- streamlit_app.py +1 -0
    	
        document_qa/document_qa_engine.py
    CHANGED
    
    | @@ -23,7 +23,13 @@ class DocumentQAEngine: | |
| 23 | 
             
                embeddings_map_from_md5 = {}
         | 
| 24 | 
             
                embeddings_map_to_md5 = {}
         | 
| 25 |  | 
| 26 | 
            -
                def __init__(self, | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 27 | 
             
                    self.embedding_function = embedding_function
         | 
| 28 | 
             
                    self.llm = llm
         | 
| 29 | 
             
                    self.chain = load_qa_chain(llm, chain_type=qa_chain_type)
         | 
| @@ -81,14 +87,14 @@ class DocumentQAEngine: | |
| 81 | 
             
                    return self.embeddings_map_from_md5[md5]
         | 
| 82 |  | 
| 83 | 
             
                def query_document(self, query: str, doc_id, output_parser=None, context_size=4, extraction_schema=None,
         | 
| 84 | 
            -
                                   verbose=False) -> (
         | 
| 85 | 
             
                        Any, str):
         | 
| 86 | 
             
                    # self.load_embeddings(self.embeddings_root_path)
         | 
| 87 |  | 
| 88 | 
             
                    if verbose:
         | 
| 89 | 
             
                        print(query)
         | 
| 90 |  | 
| 91 | 
            -
                    response = self._run_query(doc_id, query, context_size=context_size)
         | 
| 92 | 
             
                    response = response['output_text'] if 'output_text' in response else response
         | 
| 93 |  | 
| 94 | 
             
                    if verbose:
         | 
| @@ -138,9 +144,15 @@ class DocumentQAEngine: | |
| 138 |  | 
| 139 | 
             
                    return parsed_output
         | 
| 140 |  | 
| 141 | 
            -
                def _run_query(self, doc_id, query, context_size=4):
         | 
| 142 | 
             
                    relevant_documents = self._get_context(doc_id, query, context_size)
         | 
| 143 | 
            -
                     | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 144 | 
             
                    # return self.chain({"input_documents": relevant_documents, "question": prompt_chat_template}, return_only_outputs=True)
         | 
| 145 |  | 
| 146 | 
             
                def _get_context(self, doc_id, query, context_size=4):
         | 
| @@ -150,6 +162,7 @@ class DocumentQAEngine: | |
| 150 | 
             
                    return relevant_documents
         | 
| 151 |  | 
| 152 | 
             
                def get_all_context_by_document(self, doc_id):
         | 
|  | |
| 153 | 
             
                    db = self.embeddings_dict[doc_id]
         | 
| 154 | 
             
                    docs = db.get()
         | 
| 155 | 
             
                    return docs['documents']
         | 
| @@ -161,6 +174,7 @@ class DocumentQAEngine: | |
| 161 | 
             
                    return relevant_documents
         | 
| 162 |  | 
| 163 | 
             
                def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
         | 
|  | |
| 164 | 
             
                    if verbose:
         | 
| 165 | 
             
                        print("File", pdf_file_path)
         | 
| 166 | 
             
                    filename = Path(pdf_file_path).stem
         | 
| @@ -215,12 +229,11 @@ class DocumentQAEngine: | |
| 215 | 
             
                        self.embeddings_dict[hash] = Chroma.from_texts(texts, embedding=self.embedding_function, metadatas=metadata,
         | 
| 216 | 
             
                                                                       collection_name=hash)
         | 
| 217 |  | 
| 218 | 
            -
             | 
| 219 | 
             
                    self.embeddings_root_path = None
         | 
| 220 |  | 
| 221 | 
             
                    return hash
         | 
| 222 |  | 
| 223 | 
            -
                def create_embeddings(self, pdfs_dir_path: Path):
         | 
| 224 | 
             
                    input_files = []
         | 
| 225 | 
             
                    for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
         | 
| 226 | 
             
                        for file_ in files:
         | 
| @@ -238,7 +251,8 @@ class DocumentQAEngine: | |
| 238 | 
             
                            print(data_path, "exists. Skipping it ")
         | 
| 239 | 
             
                            continue
         | 
| 240 |  | 
| 241 | 
            -
                        texts, metadata, ids = self.get_text_from_document(input_file, chunk_size= | 
|  | |
| 242 | 
             
                        filename = metadata[0]['filename']
         | 
| 243 |  | 
| 244 | 
             
                        vector_db_document = Chroma.from_texts(texts,
         | 
|  | |
| 23 | 
             
                embeddings_map_from_md5 = {}
         | 
| 24 | 
             
                embeddings_map_to_md5 = {}
         | 
| 25 |  | 
| 26 | 
            +
                def __init__(self,
         | 
| 27 | 
            +
                             llm,
         | 
| 28 | 
            +
                             embedding_function,
         | 
| 29 | 
            +
                             qa_chain_type="stuff",
         | 
| 30 | 
            +
                             embeddings_root_path=None,
         | 
| 31 | 
            +
                             grobid_url=None,
         | 
| 32 | 
            +
                             ):
         | 
| 33 | 
             
                    self.embedding_function = embedding_function
         | 
| 34 | 
             
                    self.llm = llm
         | 
| 35 | 
             
                    self.chain = load_qa_chain(llm, chain_type=qa_chain_type)
         | 
|  | |
| 87 | 
             
                    return self.embeddings_map_from_md5[md5]
         | 
| 88 |  | 
| 89 | 
             
                def query_document(self, query: str, doc_id, output_parser=None, context_size=4, extraction_schema=None,
         | 
| 90 | 
            +
                                   verbose=False, memory=None) -> (
         | 
| 91 | 
             
                        Any, str):
         | 
| 92 | 
             
                    # self.load_embeddings(self.embeddings_root_path)
         | 
| 93 |  | 
| 94 | 
             
                    if verbose:
         | 
| 95 | 
             
                        print(query)
         | 
| 96 |  | 
| 97 | 
            +
                    response = self._run_query(doc_id, query, context_size=context_size, memory=memory)
         | 
| 98 | 
             
                    response = response['output_text'] if 'output_text' in response else response
         | 
| 99 |  | 
| 100 | 
             
                    if verbose:
         | 
|  | |
| 144 |  | 
| 145 | 
             
                    return parsed_output
         | 
| 146 |  | 
| 147 | 
            +
                def _run_query(self, doc_id, query, memory=None, context_size=4):
         | 
| 148 | 
             
                    relevant_documents = self._get_context(doc_id, query, context_size)
         | 
| 149 | 
            +
                    if memory:
         | 
| 150 | 
            +
                        return self.chain.run(input_documents=relevant_documents,
         | 
| 151 | 
            +
                                              question=query)
         | 
| 152 | 
            +
                    else:
         | 
| 153 | 
            +
                        return self.chain.run(input_documents=relevant_documents,
         | 
| 154 | 
            +
                                              question=query,
         | 
| 155 | 
            +
                                              memory=memory)
         | 
| 156 | 
             
                    # return self.chain({"input_documents": relevant_documents, "question": prompt_chat_template}, return_only_outputs=True)
         | 
| 157 |  | 
| 158 | 
             
                def _get_context(self, doc_id, query, context_size=4):
         | 
|  | |
| 162 | 
             
                    return relevant_documents
         | 
| 163 |  | 
| 164 | 
             
                def get_all_context_by_document(self, doc_id):
         | 
| 165 | 
            +
                    """Return the full context from the document"""
         | 
| 166 | 
             
                    db = self.embeddings_dict[doc_id]
         | 
| 167 | 
             
                    docs = db.get()
         | 
| 168 | 
             
                    return docs['documents']
         | 
|  | |
| 174 | 
             
                    return relevant_documents
         | 
| 175 |  | 
| 176 | 
             
                def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
         | 
| 177 | 
            +
                    """Extract text from documents using Grobid; if chunk_size is < 0, it keeps each paragraph separately"""
         | 
| 178 | 
             
                    if verbose:
         | 
| 179 | 
             
                        print("File", pdf_file_path)
         | 
| 180 | 
             
                    filename = Path(pdf_file_path).stem
         | 
|  | |
| 229 | 
             
                        self.embeddings_dict[hash] = Chroma.from_texts(texts, embedding=self.embedding_function, metadatas=metadata,
         | 
| 230 | 
             
                                                                       collection_name=hash)
         | 
| 231 |  | 
|  | |
| 232 | 
             
                    self.embeddings_root_path = None
         | 
| 233 |  | 
| 234 | 
             
                    return hash
         | 
| 235 |  | 
| 236 | 
            +
                def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
         | 
| 237 | 
             
                    input_files = []
         | 
| 238 | 
             
                    for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
         | 
| 239 | 
             
                        for file_ in files:
         | 
|  | |
| 251 | 
             
                            print(data_path, "exists. Skipping it ")
         | 
| 252 | 
             
                            continue
         | 
| 253 |  | 
| 254 | 
            +
                        texts, metadata, ids = self.get_text_from_document(input_file, chunk_size=chunk_size,
         | 
| 255 | 
            +
                                                                           perc_overlap=perc_overlap)
         | 
| 256 | 
             
                        filename = metadata[0]['filename']
         | 
| 257 |  | 
| 258 | 
             
                        vector_db_document = Chroma.from_texts(texts,
         | 
    	
        streamlit_app.py
    CHANGED
    
    | @@ -97,6 +97,7 @@ def init_qa(model, api_key=None): | |
| 97 | 
             
                else:
         | 
| 98 | 
             
                    st.error("The model was not loaded properly. Try reloading. ")
         | 
| 99 | 
             
                    st.stop()
         | 
|  | |
| 100 |  | 
| 101 | 
             
                return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
         | 
| 102 |  | 
|  | |
| 97 | 
             
                else:
         | 
| 98 | 
             
                    st.error("The model was not loaded properly. Try reloading. ")
         | 
| 99 | 
             
                    st.stop()
         | 
| 100 | 
            +
                    return
         | 
| 101 |  | 
| 102 | 
             
                return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
         | 
| 103 |  | 
