| import re | |
| from docx import Document | |
| from helpers import get_doc_blocks | |
def get_ikz_pdf(pdf_blocks):
    """Collect IKZ candidates from text extracted from a PDF.

    A candidate is any run of 32 to 40 consecutive digits. Matching is
    greedy, so a run longer than 40 digits contributes its first 40 digits
    (and the remainder too, if that is itself at least 32 digits long).

    Args:
        pdf_blocks: Iterable of strings to scan.

    Returns:
        A set with every distinct digit run found; empty if nothing matched.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    ikz_pattern = re.compile(r"\d{32,40}")
    return {
        match
        for block in pdf_blocks
        for match in ikz_pattern.findall(block)
    }
def get_ikz_doc(doc):
    """Collect IKZ candidates from a document's text blocks and its tables.

    Two textual forms are recognised:

    * exactly 36 consecutive digits (a longer digit run still yields its
      first 36 digits, since the pattern is not anchored);
    * a hyphenated form: two digits followed by five ``-``-separated groups
      of 3 to 20 digits each.

    Args:
        doc: Document object exposing ``tables`` -> ``rows`` -> ``cells`` ->
            ``paragraphs`` with a ``text`` attribute (presumably a
            python-docx ``Document`` -- confirm against the caller). It is
            also passed to ``get_doc_blocks``, whose result is iterated as
            strings.

    Returns:
        A set of every distinct match found in the text blocks and in the
        paragraphs of all table cells.
    """
    # Raw strings avoid the invalid "\d" escape warning; compiling once
    # keeps the pattern setup out of the nested loops.
    ikz_patterns = (
        re.compile(r"\d{36}"),
        re.compile(r"(?:\d{2})(?:-\d{3,20}){5}"),
    )

    def _scan(texts):
        # Apply every IKZ pattern to every text and gather the matches.
        return {
            match
            for text in texts
            for pattern in ikz_patterns
            for match in pattern.findall(text)
        }

    # Text blocks first, then table cells -- same order as before.
    ikz_docx = _scan(get_doc_blocks(doc))
    ikz_docx |= _scan(
        para.text
        for table in doc.tables
        for row in table.rows
        for cell in row.cells
        for para in cell.paragraphs
    )
    return ikz_docx