adrien.aribaut-gaudin
test from page 9 to 13
62d4365
raw
history blame
3.93 kB
import os
import pdfplumber as pdfp
from src.model.paragraph import Paragraph
import asyncio
def skip_header(dictionary):
i = 0
if not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30):
i+=2
return i
def get_style_of_line(size : float):
if size >= 9 and size < 11.5:
return "content"
elif size >= 11.5 and size <= 12.7:
return "title5"
elif size >= 12.8 and size <= 13.5:
return "title4"
elif size > 13.5 and size <= 15.5:
return "title3"
elif size > 15.5 and size <= 18.5:
return "title2"
elif size > 19 and size < 30:
return "title1"
# elif size >= 12 and size <= 14.5:
# return "title2"
# elif size > 14.5 and size <= 16.5:
# return "title1"
else:
return "unknown"
def get_pdf_title_styles(path):
pdf_to_read = extract_all_lines_from_the_doc(path)
paragraphs = []
j = 0
while j < len(pdf_to_read):
dictionary = pdf_to_read[j]["content"]
i = skip_header(dictionary)
while i < len(dictionary):
#print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
if(dictionary[i]["text"].startswith("RESTAPIDeveloperGuide")):
i+=1
continue
p = Paragraph(dictionary[i]["text"],font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]),id_=i,page_id=pdf_to_read[j]["page_number"])
if(i != len(dictionary)-1):
while(dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]):
p.text += " " + dictionary[i+1]["text"]
i += 1
# if(i == len(dictionary)-1):
# print("PIDOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO")
# if(j == len(pdf_to_read)-1):
# print("JUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU")
# break
# else:
# if(dictionary[i]["chars"][0]["size"] == pdf_to_read[j+1]["content"][0]["chars"][0]["size"]):
# print("MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")
# j += 1
# p.text += " " + pdf_to_read[j]["content"][0]["text"]
# dictionary = pdf_to_read[j]["content"]
# i = 0
# else:
# print("RRIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIZ")
# break
else:
p.text = dictionary[i]["text"]
#print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}")
i += 1
# print(f'{p.page_id} : {p.font_style} ->>>>> {p.text}')
paragraphs.append(p)
j += 1
return paragraphs
def test_get_font_sizes_of_a_page(page : int, path):
with open(os.path.abspath(path)) as f:
reader = pdfp.PDF(f)
page = reader.pages[page]
dictionary = page.extract_text_lines()
for i in range(len(dictionary)):
print(f'{i} : {dictionary[i]["chars"][0]["size"]} ->>>>> {dictionary[i]["text"]}')
def extract_all_lines_from_the_doc(path):
lines_of_doc = []
with open(path, 'rb') as f:
reader = pdfp.PDF(f)
skip_table_of_contents = reader.pages[8:13]
j = 0
while j < len(skip_table_of_contents):
lines_of_doc.append({"page_number": j+9, "content": skip_table_of_contents[j].extract_text_lines()})
j += 1
return lines_of_doc
# path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf"
# get_pdf_title_styles(os.path.abspath(path))
# print("--------------------------------------------------")
# print("--------------------------------------------------")
#print(test_get_font_sizes_of_a_page(8))