File size: 1,439 Bytes
cc83a1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
'''
This module contains helper functions to load PDFs, extract their text and generate additional metadata.

It was initially created for the businessresponsibility.ch project of the Prototype Fund. For more
information visit https://github.com/bizres

'''
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from pdfminer.high_level import extract_text

import fitz

import langid
# Restrict langid to the four candidate languages this module cares about
# (English, German, French, Italian); this runs once, at import time.
langid.set_languages(['en', 'de','fr','it'])

import pandas as pd

def pdf_to_text(file):
    '''
    Extract the text of a pdf and return it as a list of paragraphs.

    Parameters:
    file: the pdf to read; handed unchanged to pdfminer's extract_text

    Returns:
    list of strings, obtained by cutting the extracted text at every
    blank line (two consecutive newlines)
    '''
    # One blank line ('\n\n') is treated as the boundary between paragraphs.
    return extract_text(file).split('\n\n')


def detect_language(text):
    '''
    Classify the language of a text with langid.

    Parameters:
    text: the text to classify

    Returns:
    whatever langid.classify returns for the text, passed through unchanged
    (per the langid documentation, a (language code, score) pair)
    '''
    classification = langid.classify(text)
    return classification

def count_pages(pdf_file):
    '''
    Count the pages of a pdf.

    Parameters:
    pdf_file: the pdf to inspect; handed unchanged to pdfminer's extract_pages

    Returns:
    number of items yielded by extract_pages (one per page), as an int
    '''
    # Count the pages while iterating instead of collecting every page
    # layout into a list first; only the running total is kept.
    return sum(1 for _ in extract_pages(pdf_file))

def pdf_text_to_sections(text, *, page_separator='\n\n', section_separator='\n'):
    '''
    This function generates a pandas DataFrame from the extracted text. Each section
    is provided with a page number and a running section_index.

    Parameters:
    text: the extracted text as a single string
    page_separator: string at which the text is cut into "pages" (keyword-only).
        The default '\\n\\n' keeps the historical behaviour, where every
        blank-line-separated chunk counts as its own page. Text produced by
        pdfminer ends each real page with a form feed, so pass '\\f' to get
        true page numbers (NOTE: confirm against the pdfminer version in use).
    section_separator: string at which each page is cut into sections
        (keyword-only, default '\\n', i.e. one section per line)

    Returns:
    pandas DataFrame with the columns 'page' (1-based), 'section_index'
    (0-based, counted across the whole text) and 'section_text'

    Raises:
    ValueError: if a separator is the empty string (raised by str.split)
    '''
    rows = []
    section_index = 0
    # Pages are numbered from 1; section_index keeps counting across pages.
    for page_nr, page in enumerate(text.split(page_separator), start=1):
        for section in page.split(section_separator):
            rows.append([page_nr, section_index, section])
            section_index += 1

    return pd.DataFrame(rows, columns=['page', 'section_index', 'section_text'])