testitest / text_transformation_tools.py
crocidoc's picture
initial commit
cc83a1d
'''
This module contains helper functions to load PDFs, extract their text and generate additional metadata.
It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
information visit https://github.com/bizres
'''
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from pdfminer.high_level import extract_text
import fitz
import langid
langid.set_languages(['en', 'de','fr','it'])
import pandas as pd
def pdf_to_text(file):
    '''
    Extract the text of a pdf and return it cut into paragraphs.

    Parameters:
    file: the pdf to read; handed unchanged to pdfminer's extract_text

    Returns:
    list of strings obtained by splitting the extracted text at every
    double newline
    '''
    raw_text = extract_text(file)
    return raw_text.split('\n\n')
def detect_language(text):
    '''
    Classify the language of a text with langid.

    The module restricts langid to English, German, French and Italian when
    it is imported, so only those languages can be reported.

    Parameters:
    text: the text to classify

    Returns:
    the result of langid.classify(text), passed through unchanged
    (per the langid documentation a (language code, score) pair --
    confirm against the installed version)
    '''
    classification = langid.classify(text)
    return classification
def count_pages(pdf_file):
    '''
    Count the pages of a pdf.

    Parameters:
    pdf_file: the pdf to inspect; handed unchanged to pdfminer's extract_pages

    Returns:
    the number of items yielded by extract_pages (one per page)
    '''
    # Count while iterating instead of building a list first, so the layout
    # objects of all pages never have to be held in memory at the same time.
    return sum(1 for _ in extract_pages(pdf_file))
def pdf_text_to_sections(text, *, page_sep='\n\n', section_sep='\n'):
    '''
    Build a pandas DataFrame with one row per section of an extracted text.

    The text is first cut into pages at every page_sep, and each page is then
    cut into sections at every section_sep. Pages are numbered from 1 in order
    of appearance; section_index starts at 0 and keeps counting across pages.
    Empty sections are kept, so an empty text yields a single row whose
    section_text is the empty string.

    Parameters:
    text: the extracted text as one string
    page_sep: string that marks the boundary between two pages. Defaults to
        a double newline, which is what this function always used.
        NOTE(review): pdfminer's text output is reported to terminate each
        page with a form feed -- pass page_sep=chr(12) if real pdf pages are
        wanted; confirm against the pdfminer.six documentation.
    section_sep: string that marks the boundary between two sections of a
        page. Defaults to a single newline.

    Returns:
    DataFrame with the columns 'page', 'section_index' and 'section_text'
    '''
    # Pair every section with the number of the page it was found on.
    paged_sections = [
        (page_nr, section)
        for page_nr, page in enumerate(text.split(page_sep), start=1)
        for section in page.split(section_sep)
    ]
    # The section index is global: it is not reset at a page boundary.
    records = [
        [page_nr, section_index, section]
        for section_index, (page_nr, section) in enumerate(paged_sections)
    ]
    return pd.DataFrame(records, columns=['page', 'section_index', 'section_text'])