Spaces:
Runtime error
Runtime error
import os | |
import pdfplumber as pdfp | |
from src.model.paragraph import Paragraph | |
import asyncio | |
def skip_header(dictionary): | |
i = 0 | |
if not (dictionary[i]["chars"][0]["size"] > 19 and dictionary[i]["chars"][0]["size"] < 30): | |
i+=2 | |
return i | |
def get_style_of_line(size : float): | |
if size >= 9 and size < 11.5: | |
return "content" | |
elif size >= 11.5 and size <= 12.7: | |
return "title5" | |
elif size >= 12.8 and size <= 13.5: | |
return "title4" | |
elif size > 13.5 and size <= 15.5: | |
return "title3" | |
elif size > 15.5 and size <= 18.5: | |
return "title2" | |
elif size > 19 and size < 30: | |
return "title1" | |
# elif size >= 12 and size <= 14.5: | |
# return "title2" | |
# elif size > 14.5 and size <= 16.5: | |
# return "title1" | |
else: | |
return "unknown" | |
def get_pdf_title_styles(path): | |
pdf_to_read = extract_all_lines_from_the_doc(path) | |
paragraphs = [] | |
j = 0 | |
while j < len(pdf_to_read): | |
dictionary = pdf_to_read[j]["content"] | |
i = skip_header(dictionary) | |
while i < len(dictionary): | |
#print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}") | |
if(dictionary[i]["text"].startswith("RESTAPIDeveloperGuide")): | |
i+=1 | |
continue | |
p = Paragraph(dictionary[i]["text"],font_style=get_style_of_line(dictionary[i]["chars"][0]["size"]),id_=i,page_id=pdf_to_read[j]["page_number"]) | |
if(i != len(dictionary)-1): | |
while(dictionary[i+1]["chars"][0]["size"] == dictionary[i]["chars"][0]["size"]): | |
p.text += " " + dictionary[i+1]["text"] | |
i += 1 | |
# if(i == len(dictionary)-1): | |
# print("PIDOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO") | |
# if(j == len(pdf_to_read)-1): | |
# print("JUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU") | |
# break | |
# else: | |
# if(dictionary[i]["chars"][0]["size"] == pdf_to_read[j+1]["content"][0]["chars"][0]["size"]): | |
# print("MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") | |
# j += 1 | |
# p.text += " " + pdf_to_read[j]["content"][0]["text"] | |
# dictionary = pdf_to_read[j]["content"] | |
# i = 0 | |
# else: | |
# print("RRIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIZ") | |
# break | |
else: | |
p.text = dictionary[i]["text"] | |
#print(f"{dictionary[i]['chars'][0]} : {dictionary[i]['text']}") | |
i += 1 | |
# print(f'{p.page_id} : {p.font_style} ->>>>> {p.text}') | |
paragraphs.append(p) | |
j += 1 | |
return paragraphs | |
def test_get_font_sizes_of_a_page(page : int, path): | |
with open(os.path.abspath(path)) as f: | |
reader = pdfp.PDF(f) | |
page = reader.pages[page] | |
dictionary = page.extract_text_lines() | |
for i in range(len(dictionary)): | |
print(f'{i} : {dictionary[i]["chars"][0]["size"]} ->>>>> {dictionary[i]["text"]}') | |
def extract_all_lines_from_the_doc(path): | |
lines_of_doc = [] | |
with open(path, 'rb') as f: | |
reader = pdfp.PDF(f) | |
skip_table_of_contents = reader.pages[8:13] | |
j = 0 | |
while j < len(skip_table_of_contents): | |
lines_of_doc.append({"page_number": j+9, "content": skip_table_of_contents[j].extract_text_lines()}) | |
j += 1 | |
return lines_of_doc | |
# path = "data/Illumio_Core_REST_API_Developer_Guide_23.3.pdf" | |
# get_pdf_title_styles(os.path.abspath(path)) | |
# print("--------------------------------------------------") | |
# print("--------------------------------------------------") | |
#print(test_get_font_sizes_of_a_page(8)) |