import re

from docx import Document

from helpers import get_doc_blocks

# Patterns are compiled once at import time and written as raw strings so
# that ``\d`` reaches the regex engine verbatim instead of being an invalid
# string escape (a SyntaxWarning on recent Python versions).

# A run of 32 to 40 consecutive digits.
_IKZ_PDF_PATTERN = re.compile(r"\d{32,40}")

# Either 36 consecutive digits, or two digits followed by five
# hyphen-separated groups of 3 to 20 digits each.
_IKZ_DOC_PATTERNS = (
    re.compile(r"\d{36}"),
    re.compile(r"(?:\d{2})(?:-\d{3,20}){5}"),
)


def _find_ikz(patterns, texts):
    """Return the set of all matches of every pattern in every text."""
    found = set()
    for text in texts:
        for pattern in patterns:
            # The patterns have no capturing groups, so findall() yields
            # the full matched substrings.
            found.update(pattern.findall(text))
    return found


def get_ikz_pdf(pdf_blocks):
    """Collect IKZ candidates from the text blocks of a PDF.

    Args:
        pdf_blocks: Iterable of strings to search.

    Returns:
        A set with every substring of 32 to 40 consecutive digits found in
        any of the blocks. Matching is not anchored, so a longer digit run
        contributes its leading (up to 40-digit) chunk(s).
    """
    return _find_ikz((_IKZ_PDF_PATTERN,), pdf_blocks)


def get_ikz_doc(doc):
    """Collect IKZ candidates from a Word document.

    Both the blocks returned by ``get_doc_blocks(doc)`` and the text of
    every paragraph inside the cells of ``doc.tables`` are searched.

    Args:
        doc: Document object exposing ``tables`` (presumably a python-docx
            ``Document`` -- confirm against the caller). It is also passed
            to ``get_doc_blocks``, which is assumed to return an iterable
            of strings.

    Returns:
        A set of matched strings: plain 36-digit runs and hyphenated codes
        made of two digits followed by five digit groups.
    """
    ikz_docx = _find_ikz(_IKZ_DOC_PATTERNS, get_doc_blocks(doc))

    # Table text is searched explicitly in addition to the blocks above.
    cell_texts = (
        para.text
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
        for para in cell.paragraphs
    )
    ikz_docx.update(_find_ikz(_IKZ_DOC_PATTERNS, cell_texts))
    return ikz_docx