Spaces:
Sleeping
Sleeping
import os
import tempfile
from io import BytesIO

import requests
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, UnstructuredFileLoader

from yolo_text_extraction import pdf_to_text
load_dotenv() | |
class CV:
    """A CV document referenced by URL, with text extraction and quality analysis."""

    # Seconds to wait on the download; without a timeout ``requests`` can
    # block indefinitely on an unresponsive server.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, file_url):
        """Store the CV location and the loader class used for each extension.

        Args:
            file_url: URL the CV is downloaded from (e.g. a storage link);
                a trailing query string is allowed.
        """
        self.file_url = file_url
        self.doc_loader = {
            ".pdf": PyPDFLoader,
            ".docx": Docx2txtLoader,
            ".txt": UnstructuredFileLoader,
        }

    @staticmethod
    def _url_extension(url):
        """Return the lower-cased file extension of *url*, ignoring any query string."""
        _, ext = os.path.splitext(url.split("?")[0])
        return ext.lower()

    def _load_pages(self, loader_cls, payload, suffix):
        """Run *loader_cls* over the bytes in *payload* and return its documents.

        The document loaders are constructed from a filesystem path, not an
        in-memory buffer, so the bytes are written to a temporary file first.
        The file is removed even when the loader raises.
        """
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(payload)
            temp_file_path = temp_file.name
        # The handle is closed before loading so the loader can reopen the
        # path on platforms that forbid a second open of a live temp file.
        try:
            return loader_cls(temp_file_path).load()
        finally:
            os.unlink(temp_file_path)

    def get_cv_text(self):
        """Download the CV and return its text content.

        The loader registered for the URL's extension is tried first. When
        the extension is not supported, or the loader yields no non-blank
        text, the downloaded bytes are handed to ``pdf_to_text`` instead and
        its result is returned as-is.

        Raises:
            requests.HTTPError: the server answered with a 4xx/5xx status.
            requests.RequestException: the download failed or timed out.
        """
        response = requests.get(self.file_url, timeout=self.DOWNLOAD_TIMEOUT)
        # Fail loudly rather than parsing an HTML error page as a CV.
        response.raise_for_status()
        file_content = BytesIO(response.content)

        ext = self._url_extension(self.file_url)
        loader_cls = self.doc_loader.get(ext)
        if loader_cls is None:
            return pdf_to_text(file_content)

        data = self._load_pages(loader_cls, response.content, ext)
        text = "\n".join(page.page_content for page in data) if data else ""
        return text if text.strip() else pdf_to_text(file_content)

    def analyse_cv_quality(self):
        """Extract the CV text and return the result of ``analyze_cv`` on it."""
        # Imported here, as in the original, rather than at module import time.
        from cv_analyzer import analyze_cv

        cv_text = self.get_cv_text()
        return analyze_cv(cv_text)