|
import re |
|
from docx import Document |
|
from helpers import get_doc_blocks |
|
|
|
def get_ikz_pdf(pdf_blocks):
    """Collect IKZ candidate codes from the text blocks of a PDF.

    A candidate is any run of 32 to 40 consecutive digits. Matching is
    greedy and non-overlapping, so a digit run longer than 40 characters
    yields its first 40 digits as one match (and the remainder only if it
    is itself at least 32 digits long).

    Args:
        pdf_blocks: Iterable of strings, one per extracted PDF text block.

    Returns:
        A set of the distinct digit strings found across all blocks.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    ikz_pattern = re.compile(r"\d{32,40}")

    ikz_pdf = set()
    for block in pdf_blocks:
        ikz_pdf.update(ikz_pattern.findall(block))
    return ikz_pdf
|
|
|
|
|
def _iter_table_texts(doc):
    """Yield the text of every paragraph inside every table cell of *doc*."""
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    yield para.text


def get_ikz_doc(doc):
    """Collect IKZ candidate codes from a document's body and its tables.

    Two spellings are recognised:

    * a run of 36 consecutive digits;
    * a dash-separated form: two digits followed by five groups of
      3 to 20 digits, each preceded by ``-``.

    Args:
        doc: Document object exposing ``tables`` (rows -> cells ->
            paragraphs with a ``text`` attribute); it is also passed to
            ``get_doc_blocks``.
            NOTE(review): ``get_doc_blocks`` is assumed to return an
            iterable of strings -- confirm against ``helpers``.

    Returns:
        A set of the distinct matched strings, exactly as they appear in
        the text (dashes are kept for the dash-separated form).
    """
    # Raw strings: "\d" in a normal literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    ikz_doc_patterns = (
        re.compile(r"\d{36}"),
        re.compile(r"(?:\d{2})(?:-\d{3,20}){5}"),
    )

    ikz_docx = set()
    # Body blocks are scanned first, then the table cells; the result is a
    # set, so the order only matters for when each source is evaluated.
    for texts in (get_doc_blocks(doc), _iter_table_texts(doc)):
        for text in texts:
            for pattern in ikz_doc_patterns:
                ikz_docx.update(pattern.findall(text))
    return ikz_docx
|
|