'''
This module contains helper functions to load PDFs, extract their text and
generate additional metadata.

It was initially created for the businessresponsibility.ch project of the
Prototype Fund. For more information visit https://github.com/bizres
'''

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from pdfminer.high_level import extract_text

import fitz

import langid
# Restrict langid to the four languages this project expects; classify()
# will only ever answer with one of these codes.
langid.set_languages(['en', 'de', 'fr', 'it'])

import pandas as pd


def pdf_to_text(file):
    '''
    Extract the text of a PDF and split it into paragraphs.

    Parameters:
        file: the PDF to read; handed unchanged to pdfminer's extract_text

    Returns:
        list of strings, obtained by splitting the extracted text on blank
        lines ('\\n\\n')
    '''
    text = extract_text(file)
    return text.split('\n\n')


def detect_language(text):
    '''
    Detect the language of a text using langid.

    Parameters:
        text: the text to classify

    Returns:
        the result of langid.classify(text), unchanged (per the langid
        documentation a (language_code, score) pair). Candidates are limited
        to 'en', 'de', 'fr' and 'it' by the module-level set_languages call.
    '''
    return langid.classify(text)


def count_pages(pdf_file):
    '''
    Count the pages of a PDF.

    Parameters:
        pdf_file: the PDF to inspect; handed unchanged to pdfminer's
            extract_pages

    Returns:
        number of page layouts yielded by extract_pages
    '''
    # Count while iterating instead of materialising every page layout in a
    # list first; only the count is needed.
    return sum(1 for _ in extract_pages(pdf_file))


def pdf_text_to_sections(text, page_separator='\n\n', section_separator='\n'):
    '''
    Generate a pandas DataFrame from extracted text.

    The text is first split into pages and every page is split into
    sections. Each section is stored with the 1-based number of the page it
    is on and a 0-based section_index that keeps counting across pages.

    Parameters:
        text: the extracted text as a single string
        page_separator: string that separates two pages. Defaults to
            '\\n\\n' (the historic behaviour of this function).
            NOTE(review): pdfminer's extract_text terminates each page with
            a form feed, so pass page_separator='\\f' to get real PDF page
            numbers for such input.
        section_separator: string that separates two sections within a
            page. Defaults to '\\n'.

    Returns:
        DataFrame with the columns 'page', 'section_index' and
        'section_text', one row per section. Empty sections are kept, so an
        empty text yields a single row with an empty section_text.

    Raises:
        ValueError: if a separator is the empty string (raised by str.split)
    '''
    sections = []
    section_index = 0
    for page_nr, page in enumerate(text.split(page_separator), start=1):
        for section in page.split(section_separator):
            sections.append([page_nr, section_index, section])
            section_index += 1
    return pd.DataFrame(
        sections, columns=['page', 'section_index', 'section_text']
    )