Spaces:
Running
Running
from PIL import Image | |
import pytesseract | |
import os | |
import pymupdf | |
import spaces | |
import torch | |
import gradio as gr | |
from prepare import prepare | |
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
from langchain_community.llms import HuggingFacePipeline | |
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate | |
from langchain_core.output_parsers import StrOutputParser | |
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader | |
from langchain_community.vectorstores.utils import filter_complex_metadata | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.embeddings import HuggingFaceEmbeddings | |
from langchain_community.vectorstores import FAISS | |
from langchain.schema.runnable import RunnablePassthrough | |
from langchain_core.messages import AIMessage, HumanMessage | |
from langchain_community.llms import HuggingFaceEndpoint | |
from dotenv import load_dotenv | |
from huggingface_hub import InferenceClient | |
import huggingface_hub | |
# zero = torch.Tensor([0]).cuda()

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Hugging Face access token; None when HF_TOKEN is not set.
api_token = os.getenv("HF_TOKEN")
# Only log in when a token is actually configured: login(token=None) falls
# back to an interactive prompt, which cannot be answered in a headless
# deployment. If the model needs authentication, from_pretrained below will
# still fail with its own error.
if api_token:
    huggingface_hub.login(token=api_token)

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_ID = 'google/gemma-2-2b'
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID).to(device)
#@spaces.GPU | |
def read_pdf(file_path):
    """Extract the text of a PDF, falling back to OCR for pages without text.

    For each page the embedded text layer is used when it contains anything
    other than whitespace. Otherwise every image embedded in the page is
    encoded to PNG in memory and passed to Tesseract using the Vietnamese
    (``vie``) language data.

    Args:
        file_path: Location of the PDF, passed unchanged to ``pymupdf.open``.

    Returns:
        str: The page contents concatenated in page order. Text-layer output
        is appended exactly as returned by ``get_text``; each OCR result is
        followed by a newline.
    """
    import io  # local import: only needed for the in-memory PNG buffer

    chunks = []
    # The context manager closes the document even if extraction or OCR raises.
    with pymupdf.open(file_path) as doc:
        for page in doc:
            text = page.get_text()
            if text.strip():
                chunks.append(text)
                continue

            # No usable text layer on this page: OCR each embedded image instead.
            for img in page.get_images():
                xref = img[0]  # first field of the image record is its XREF
                pix = pymupdf.Pixmap(doc, xref)
                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                # Encode to PNG in memory rather than writing a fixed-name file
                # into the working directory: concurrent requests cannot
                # overwrite each other's images and nothing is left behind
                # when OCR fails.
                png_bytes = pix.tobytes("png")
                pix = None  # drop the pixmap as soon as it has been encoded
                with Image.open(io.BytesIO(png_bytes)) as image:
                    chunks.append(
                        pytesseract.image_to_string(image, lang='vie') + '\n'
                    )
    return ''.join(chunks)
def LLM_Inference(cv_text):
    """Ask the language model to extract structured details from CV text.

    The CV is embedded between a fixed introduction and a fixed list of
    extraction instructions, and the model continues the prompt with greedy
    decoding.

    Args:
        cv_text: Plain text of the CV. When the full prompt would exceed the
            2048-token limit, the CV text is shortened so that the
            instructions that follow it are kept.

    Returns:
        str: Only the text generated by the model (the prompt is not
        included). An empty string when ``cv_text`` is empty or whitespace,
        because there is nothing to extract from.
    """
    if not cv_text or not cv_text.strip():
        return ''

    max_prompt_tokens = 2048

    prompt_head = '''
You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
**CV**
'''
    prompt_tail = '''
**Information extraction and output format**
1. Candidate Information
- Full Name
- Contact Information (Phone, Email, Address, etc.)
- Date of Birth (if available)
2. Education
- Degree Name (e.g., Bachelor's, Master's, Ph.D.)
- Field of Study (e.g., Computer Science, Business Administration)
- Institution Name
- Year(s) of Graduation
3. Professional Experience
For each job, extract:
- Job Title
- Company Name
- Duration (start and end dates)
- Summarize key Responsibilities and Achievements
4. Skills
- List of technical, soft, or industry-specific skills mentioned.
5. Certifications
- Name of Certification
- Issuing Organization
- Year of Issuance
6. Language
- List the languages mentioned in the CV along with proficiency levels (if specified).
Do not explain, comment or make up any more information that is not relative to the list of Information extraction. Respond in the CV language. Let's work this out in a step by step way to ensure the correct answer. Do not repeat the step
'''

    # The instructions come after the CV, so plain right-side truncation of the
    # whole prompt would cut them off for long CVs. Reserve the tokens used by
    # the fixed parts and trim only the CV text to whatever budget is left.
    fixed_len = len(tokenizer(prompt_head + prompt_tail)['input_ids'])
    cv_budget = max(max_prompt_tokens - fixed_len, 0)
    cv_ids = tokenizer(cv_text, add_special_tokens=False)['input_ids']
    if len(cv_ids) > cv_budget:
        cv_text = tokenizer.decode(cv_ids[:cv_budget], skip_special_tokens=True)

    text = prompt_head + cv_text + prompt_tail
    # truncation stays on as a safety net in case re-tokenizing the trimmed
    # text yields a few more tokens than were budgeted.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_prompt_tokens,
        truncation=True,
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            # Greedy decoding, stated explicitly. Sampling knobs such as
            # temperature/top_k/top_p have no effect without do_sample=True.
            do_sample=False,
        )

    # generate() returns prompt + continuation; keep only the continuation so
    # the caller gets the summary rather than an echo of the whole prompt.
    prompt_len = inputs['input_ids'].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
def process(file_path):
    """Run the full pipeline for one uploaded file: extract text, then summarize.

    Args:
        file_path: Location of the uploaded PDF as supplied by the UI, or a
            falsy value (``None`` / empty string) when nothing was uploaded.

    Returns:
        tuple[str, str]: ``(cv_text, cv_summary)`` - the text read from the
        PDF and the model's extraction for it. Both are empty strings when no
        file was supplied.
    """
    # Submitting without an upload must not reach the PDF reader or the model:
    # there is no document to read and nothing to summarize.
    if not file_path:
        return '', ''

    cv_text = read_pdf(file_path)
    cv_summary = LLM_Inference(cv_text)
    return cv_text, cv_summary
# Gradio UI: one PDF upload in, two text boxes out (extracted text and summary).
_pdf_upload = gr.File(label="Upload a PDF file")
_result_boxes = [
    gr.Textbox(label="PDF Content"),  # text read from the PDF
    gr.Textbox(label="CV Summary"),  # output of LLM_Inference for that text
]
interface = gr.Interface(
    fn=process,
    inputs=_pdf_upload,
    outputs=_result_boxes,
    title="PDF Processor",
    description="Upload a PDF file and extract its content.",
)

# Start the app only when this file is executed directly. prepare() comes from
# the local `prepare` module and runs before the server is launched.
if __name__ == "__main__":
    prepare()
    interface.launch()