Spaces:
Sleeping
Sleeping
File size: 1,967 Bytes
e892881 c02a423 cbc2d14 e892881 c02a423 e892881 c02a423 e892881 cbc2d14 c02a423 cbc2d14 c02a423 2dc8310 e892881 c02a423 e892881 c02a423 cbc2d14 c02a423 cbc2d14 2dc8310 e892881 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
import requests
from io import BytesIO
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredFileLoader
from dotenv import load_dotenv
from yolo_text_extraction import pdf_to_text
load_dotenv()
class CV:
    """Download a CV from a URL and extract its text.

    The loader used for extraction is chosen from the file extension found in
    the URL. When no loader is registered for the extension, or the loader
    yields no usable text, extraction falls back to ``pdf_to_text``.
    """

    # Seconds to wait on the server (connect / between reads) before giving up,
    # so a stalled download cannot block the caller forever.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, file_url):
        """Remember the CV location and register the supported loaders.

        Args:
            file_url: URL the CV is downloaded from; may carry a query string.
        """
        self.file_url = file_url
        # Maps a lower-cased file extension to the loader class that parses it.
        self.doc_loader = {
            ".pdf": PyPDFLoader,
            ".docx": Docx2txtLoader,
            ".txt": UnstructuredFileLoader,
        }

    @staticmethod
    def _file_extension(file_url):
        """Return the lower-cased extension of *file_url*, ignoring its query string."""
        path = file_url.split("?", 1)[0]
        return os.path.splitext(path)[1].lower()

    def _load_text(self, payload, ext):
        """Extract text from *payload* with the loader registered for *ext*.

        Args:
            payload: Raw bytes of the downloaded document.
            ext: Lower-cased file extension including the dot, e.g. ``".pdf"``.

        Returns:
            The loader's page contents joined by newlines, or ``""`` when no
            loader is registered for *ext* or the loader returned nothing.

        Raises:
            Whatever the selected loader raises while parsing; the temporary
            file is removed in that case as well.
        """
        # Function-scope import keeps this class self-contained: the module's
        # import block does not bring ``tempfile`` into scope.
        import tempfile

        loader_cls = self.doc_loader.get(ext)
        if loader_cls is None:
            return ""

        # The loaders are constructed from a file path, not an in-memory
        # stream, so every supported type (PDF included) is spooled to disk.
        # delete=False lets the loader reopen the file by name after we close it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as temp_file:
            temp_file.write(payload)
            temp_file_path = temp_file.name

        try:
            pages = loader_cls(temp_file_path).load()
        finally:
            # Always clean up, even when the loader fails.
            os.unlink(temp_file_path)

        if not pages:
            return ""
        return "\n".join(page.page_content for page in pages)

    def get_cv_text(self):
        """Download the CV and return its text content.

        Returns:
            The text produced by the loader matching the URL's extension, or
            the result of ``pdf_to_text`` when that text is empty/blank or the
            extension has no registered loader.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection failures or timeouts.
        """
        # Download the file (the original author notes it is hosted on Supabase).
        response = requests.get(self.file_url, timeout=self.DOWNLOAD_TIMEOUT)
        # Fail loudly instead of parsing an HTTP error page as if it were a CV.
        response.raise_for_status()
        payload = response.content

        text = self._load_text(payload, self._file_extension(self.file_url))
        if text.strip():
            return text

        # NOTE(review): pdf_to_text comes from yolo_text_extraction and is
        # handed an in-memory stream positioned at the start; presumably an
        # image/OCR-based extractor — confirm against that module.
        return pdf_to_text(BytesIO(payload))

    def analyse_cv_quality(self):
        """Extract the CV text and return the result of ``analyze_cv`` on it."""
        # Imported at call time, as in the original code.
        from cv_analyzer import analyze_cv

        return analyze_cv(self.get_cv_text())