Patrol / ikz.py
Mariia5's picture
Upload 8 files
3478195 verified
raw
history blame contribute delete
866 Bytes
import re
from docx import Document
from helpers import get_doc_blocks
def get_ikz_pdf(pdf_blocks):
    """Collect candidate IKZ codes from text blocks extracted from a PDF.

    A candidate is any run of 32 to 40 consecutive digits. The match is
    greedy and not anchored, so a longer digit run is consumed in chunks
    of up to 40 digits, and a leftover chunk is reported only if it still
    has at least 32 digits.

    Args:
        pdf_blocks: Iterable of strings to scan.

    Returns:
        A set of the unique digit strings found across all blocks; empty
        when nothing matches or ``pdf_blocks`` is empty.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). Compiled once, outside the loop.
    ikz_pattern = re.compile(r"\d{32,40}")

    ikz_pdf = set()
    for block in pdf_blocks:
        ikz_pdf.update(ikz_pattern.findall(block))
    return ikz_pdf
def get_ikz_doc(doc):
    """Collect candidate IKZ codes from a Word document.

    Two textual forms are recognised:

    * a run of exactly 36 digits taken from the text (unanchored, so it
      may be part of a longer digit run);
    * a hyphenated form: two digits followed by five ``-``-separated
      groups of 3 to 20 digits each.

    Both the blocks returned by ``get_doc_blocks(doc)`` and the paragraphs
    inside every table cell of ``doc.tables`` are scanned.

    Args:
        doc: Document object exposing ``tables`` (rows -> cells ->
            paragraphs with a ``text`` attribute) and accepted by
            ``get_doc_blocks``. NOTE(review): presumably a python-docx
            ``Document`` -- confirm against the caller.

    Returns:
        A set of the unique matched strings.
    """
    # Raw strings: "\d" in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). All groups are non-capturing so that
    # re.findall returns the whole match rather than a tuple of groups.
    ikz_doc_regex = [r"\d{36}", r"(?:\d{2})(?:-\d{3,20}){5}"]
    ikz_docx = set()

    def _collect(text):
        # Apply every IKZ pattern to one piece of text and record the hits.
        for pattern in ikz_doc_regex:
            ikz_docx.update(re.findall(pattern, text))

    # get_doc_blocks is a project helper; its items are passed straight to
    # re.findall, so they are assumed to be strings -- TODO confirm.
    for docpara in get_doc_blocks(doc):
        _collect(docpara)

    # Table cells are walked explicitly in addition to the blocks above.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    _collect(para.text)

    return ikz_docx