Spaces:
Running
Running
Upload reader.py
Browse files
reader.py
CHANGED
@@ -1,5 +1,11 @@
|
|
1 |
import pypdfium2 as pdfium
|
2 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
class ResumeReader:
|
4 |
|
5 |
def clean_text(self, raw_text):
|
@@ -12,14 +18,37 @@ class ResumeReader:
|
|
12 |
clean_text = re.sub(r'• ', " ", clean_text)
|
13 |
return clean_text
|
14 |
|
15 |
-
def
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
20 |
clean_text = self.clean_text(raw_text)
|
21 |
resume_lines = clean_text.splitlines(True)
|
22 |
resume_lines = [re.sub('\s+', ' ', line.strip()) for line in resume_lines if line.strip()]
|
|
|
|
|
23 |
return resume_lines
|
24 |
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pypdfium2 as pdfium
|
2 |
import re
|
3 |
+
import wordninja
|
4 |
+
from PIL import Image
|
5 |
+
from pytesseract import image_to_string
|
6 |
+
from utils import recover_text, get_average_line_len
|
7 |
+
import pdfplumber
|
8 |
+
|
9 |
class ResumeReader:
|
10 |
|
11 |
def clean_text(self, raw_text):
|
|
|
18 |
clean_text = re.sub(r'• ', " ", clean_text)
|
19 |
return clean_text
|
20 |
|
21 |
+
def recover_text(self, text_without_spaces):
    """Re-insert spaces into a run of text whose word boundaries were lost.

    The input is handed to ``wordninja.split`` and the pieces it returns
    are joined back together with single spaces.

    Args:
        text_without_spaces: The string to segment.

    Returns:
        str: The pieces returned by ``wordninja.split`` joined by ``" "``.
    """
    words = wordninja.split(text_without_spaces)
    return " ".join(words)
|
24 |
+
|
25 |
+
def read_image(self, path_file):
    """Run OCR on an image file and return its text as a list of cleaned lines.

    Args:
        path_file: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        list of str: The non-blank lines of the recognised text, each
        stripped and with every run of whitespace collapsed to one space.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR is done instead of lingering until GC.
    with Image.open(path_file) as image:
        raw_text = str(image_to_string(image))

    clean_text = self.clean_text(raw_text)
    # Raw-string pattern: '\s' in a plain literal is an invalid escape.
    return [
        re.sub(r'\s+', ' ', line.strip())
        for line in clean_text.splitlines()
        if line.strip()
    ]
|
33 |
|
34 |
+
def read_pdf(self, path_file):
    """Extract the text of every page of a PDF as a list of cleaned lines.

    Args:
        path_file: Path (or file object) accepted by ``pdfplumber.open``.

    Returns:
        list of str: The non-blank lines of the extracted text, each
        stripped and with every run of whitespace collapsed to one space.
    """
    with pdfplumber.open(path_file) as pdf:
        # Join pages with a newline so the last line of one page is not
        # fused with the first line of the next. ``or ""`` guards against
        # a page yielding no text (presumably ``None`` in some pdfplumber
        # versions -- TODO confirm), which would break concatenation.
        raw_text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    clean_text = self.clean_text(raw_text)
    # Raw-string pattern: '\s' in a plain literal is an invalid escape.
    return [
        re.sub(r'\s+', ' ', line.strip())
        for line in clean_text.splitlines()
        if line.strip()
    ]
|
47 |
+
def read(self, path_file):
    """Dispatch to the PDF or image reader based on the file extension.

    The extension check is case-insensitive, so ``CV.PDF`` and
    ``scan.JPG`` are handled the same as their lowercase forms.

    Args:
        path_file: Path of the file to read, as a string.

    Returns:
        Whatever ``read_pdf`` / ``read_image`` returns for a supported
        extension, or ``None`` (after printing a message) otherwise.
    """
    # Compare against a lowercased copy only; the original path is what
    # gets passed on, so case-sensitive file systems still work.
    suffix_probe = path_file.lower()
    if suffix_probe.endswith('.pdf'):
        return self.read_pdf(path_file)
    if suffix_probe.endswith(('.jpg', '.png', '.jpeg')):
        return self.read_image(path_file)
    print("Unsupported file format")
    return None
|