File size: 703 Bytes
22be37d
 
 
 
 
 
 
 
 
 
 
 
 
 
0c94c61
22be37d
 
 
0c94c61
22be37d
 
 
 
 
 
 
 
 
 
0c94c61
22be37d
 
 
0c94c61
22be37d
0c94c61
22be37d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""
Script for processing an input CV document
"""

import io

import fitz
from docx import Document


def parse_pdf(pdf_file) -> str:
    """Extract the plain text of an in-memory PDF.

    Args:
        pdf_file: The PDF content handed to PyMuPDF as an in-memory stream.
            Presumably the raw bytes read from Streamlit's file uploader --
            TODO confirm against the caller.

    Returns:
        The text of every page, in page order, with pages separated by a
        blank line. An input with no pages yields an empty string.
    """
    # Spell out stream/filetype instead of relying on the positional form,
    # where the first argument doubles as a file-type hint.
    pdf_document = fitz.open(stream=pdf_file, filetype="pdf")

    try:
        all_text = [page.get_text("text") for page in pdf_document]
    finally:
        # Release the document even when text extraction fails part-way.
        pdf_document.close()

    return "\n\n".join(all_text)


def parse_docx(docx_file) -> str:
    """Extract the paragraph text of a Word document supplied as bytes.

    Args:
        docx_file: Bytes-like content of the .docx file; it is wrapped in
            an in-memory buffer before being opened.

    Returns:
        The text of each entry in the document's ``paragraphs`` list, one
        per line. Only that list is read, so content held elsewhere in the
        file is not part of the result.
    """
    # Wrap the raw bytes so the document can be opened without a temp file.
    buffer = io.BytesIO(docx_file)
    document = Document(buffer)

    return "\n".join(paragraph.text for paragraph in document.paragraphs)