ppsingh committed on
Commit
04f287e
1 Parent(s): 6e606fd

Create doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +43 -0
auditqa/doc_process.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import os
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
4
+ from transformers import AutoTokenizer
5
+ from torch import cuda
6
+ from langchain_community.document_loaders import PyMuPDFLoader
7
+ from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
8
+ from langchain_community.vectorstores import Qdrant
9
+ from auditqa.reports import files, report_list
10
+ device = 'cuda' if cuda.is_available() else 'cpu'
11
+ #from dotenv import load_dotenv
12
+ #load_dotenv()
13
+
14
+ #HF_token = os.environ["HF_TOKEN"]
15
+ path_to_data = "./data/pdf/"
16
+
17
def process_pdf(chunk_size=256, chunk_overlap=10):
    """Load the audit-report PDFs and prepare a token-aware text splitter.

    Each report named in ``report_list`` is loaded from ``path_to_data`` as
    a PDF via PyMuPDFLoader. A RecursiveCharacterTextSplitter is then built
    from the BAAI/bge-small-en-v1.5 tokenizer so chunk lengths are measured
    in that model's tokens rather than characters.

    Parameters
    ----------
    chunk_size : int, optional
        Maximum chunk length in tokens of the embedding model's tokenizer
        (default 256, the value previously hard-coded here).
    chunk_overlap : int, optional
        Token overlap between consecutive chunks (default 10).

    Returns
    -------
    dict
        Mapping of category name (from ``files``) to an — currently empty —
        per-category document dict. NOTE(review): the original returned
        ``None``; callers that ignored the return value are unaffected.
    """
    # Best-effort load: a missing or unreadable PDF is reported and skipped
    # so one bad file does not abort the whole ingestion run.
    docs = {}
    for file in report_list:
        pdf_path = os.path.join(path_to_data, file + '.pdf')
        try:
            docs[file] = PyMuPDFLoader(pdf_path).load()
        except Exception as e:
            print("Exception: ", e)

    # Text splitter driven by the embedding model's own tokenizer, so texts
    # fit exactly within the transformer's context window.
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5"),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,   # record each chunk's start offset in metadata
        strip_whitespace=True,
        separators=["\n\n", "\n"],
    )

    # TODO(review): `docs` and `text_splitter` are built but never consumed
    # below — the chunking of the loaded pages appears to be unfinished.
    all_documents = {category: {} for category in files.keys()}

    print(all_documents)
    return all_documents