ppsingh committed on
Commit
0a5f808
1 Parent(s): fd70dcf

Update auditqa/doc_process.py

Browse files
Files changed (1) hide show
  1. auditqa/doc_process.py +5 -3
auditqa/doc_process.py CHANGED
@@ -45,17 +45,18 @@ def process_pdf():
45
  categories = list(files.keys())
46
  # iterate through 'source'
47
  for category in categories:
48
- print(category)
49
  all_documents[category] = []
50
  subtypes = list(files[category].keys())
51
  # iterate through 'subtype' within the source
52
  # example source/category == 'District', has subtypes which is district names
53
  for subtype in subtypes:
54
- print(subtype)
55
  for file in files[category][subtype]:
56
 
57
  # create the chunks
58
  doc_processed = text_splitter.split_documents(docs[file])
 
59
 
60
  # add metadata information
61
  for doc in doc_processed:
@@ -69,6 +70,7 @@ def process_pdf():
69
  # convert list of list to flat list
70
  for key, docs_processed in all_documents.items():
71
  docs_processed = [item for sublist in docs_processed for item in sublist]
 
72
  all_documents[key] = docs_processed
73
  all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
74
  all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
@@ -90,7 +92,7 @@ def process_pdf():
90
  location=":memory:",
91
  collection_name=file,
92
  )
93
-
94
  print("vector embeddings done")
95
  return qdrant_collections
96
 
 
45
  categories = list(files.keys())
46
  # iterate through 'source'
47
  for category in categories:
48
+ print("documents splitting in source:",category)
49
  all_documents[category] = []
50
  subtypes = list(files[category].keys())
51
  # iterate through 'subtype' within the source
52
  # example source/category == 'District', has subtypes which is district names
53
  for subtype in subtypes:
54
+ print("document splitting for subtype:",subtype)
55
  for file in files[category][subtype]:
56
 
57
  # create the chunks
58
  doc_processed = text_splitter.split_documents(docs[file])
59
+ print("chunks in subtype:",subtype, "are:",len(doc_processed))
60
 
61
  # add metadata information
62
  for doc in doc_processed:
 
70
  # convert list of list to flat list
71
  for key, docs_processed in all_documents.items():
72
  docs_processed = [item for sublist in docs_processed for item in sublist]
73
+ print("length of chunks in source:",source, "are:",len(docs_processed)
74
  all_documents[key] = docs_processed
75
  all_documents['allreports'] = [sublist for key,sublist in all_documents.items()]
76
  all_documents['allreports'] = [item for sublist in all_documents['allreports'] for item in sublist]
 
92
  location=":memory:",
93
  collection_name=file,
94
  )
95
+ print(qdrant_collections)
96
  print("vector embeddings done")
97
  return qdrant_collections
98