# cv_ocr_gradio / app.py
# Hugging Face Space by khoatran94 — commit 0b5b10c (verified)
from PIL import Image
import pytesseract
import os
import pymupdf
import spaces
import torch
import gradio as gr
from prepare import prepare
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.llms import HuggingFaceEndpoint
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import huggingface_hub
# Authenticate with the Hugging Face Hub using the HF_TOKEN from .env / environment
# (required to download the gated google/gemma-2-2b checkpoint).
load_dotenv()
api_token = os.getenv("HF_TOKEN")
huggingface_hub.login(token=api_token)
# Load the Gemma-2 2B base model onto the GPU when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').to(device)
def read_pdf(file_path):
    """Extract text from a PDF, falling back to OCR for image-only pages.

    Pages with an embedded text layer are read directly via PyMuPDF. Pages
    with no text layer have each embedded image rendered to a temporary PNG
    and run through Tesseract OCR with the Vietnamese language pack.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all pages as a single string.
    """
    output = ''
    # Context manager ensures the document is closed even on error
    # (the original leaked the open document handle).
    with pymupdf.open(file_path) as doc:
        for page_number, page in enumerate(doc):
            text = page.get_text()
            if text:
                # Text layer present: use it directly (no need for the old
                # encode/decode round-trip, which was a no-op).
                output += text
            else:
                # No text layer: OCR each embedded image on the page.
                for image_index, img in enumerate(page.get_images(), start=1):
                    xref = img[0]  # XREF of the embedded image object
                    pix = pymupdf.Pixmap(doc, xref)
                    if pix.n - pix.alpha > 3:  # CMYK etc.: convert to RGB first
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    path = "page_{}-image_{}.png".format(page_number, image_index)
                    pix.save(path)  # save the image as png for Tesseract
                    pix = None  # release Pixmap memory promptly
                    try:
                        # `with` closes the image handle; `finally` guarantees
                        # the temp PNG is deleted even if OCR raises.
                        with Image.open(path) as image:
                            output += pytesseract.image_to_string(image, lang='vie') + '\n'
                    finally:
                        os.remove(path)
    return output
@spaces.GPU(duration=60)
def LLM_Inference(cv_text):
    """Extract structured CV information from raw CV text using the LLM.

    Builds an instruction prompt around ``cv_text``, runs greedy decoding on
    the module-level Gemma model, and returns ONLY the newly generated
    answer (the original implementation decoded the full sequence, so the
    "summary" echoed the entire prompt, CV included).

    Args:
        cv_text: Plain text of the candidate's CV (e.g. from ``read_pdf``).

    Returns:
        The model's generated answer as a string.
    """
    text = f'''
    You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
    **CV**
    {cv_text}
    **Information extraction and output format**
    1. Candidate Information
    - Full Name
    - Contact Information (Phone, Email, Address, etc.)
    - Date of Birth (if available)
    2. Education
    - Degree Name (e.g., Bachelor's, Master's, Ph.D.)
    - Field of Study (e.g., Computer Science, Business Administration)
    - Institution Name
    - Year(s) of Graduation
    3. Professional Experience
    For each job, extract:
    - Job Title
    - Company Name
    - Duration (start and end dates)
    - Summarize key Responsibilities and Achievements
    4. Skills
    - List of technical, soft, or industry-specific skills mentioned.
    5. Certifications
    - Name of Certification
    - Issuing Organization
    - Year of Issuance
    6. Language
    - List the languages mentioned in the CV along with proficiency levels (if specified).
    Do not explain, comment or make up any more information that is not relative to the list of Information extraction. Respond in the CV language. Let's work this out in a step by step way to ensure the correct answer. Do not repeat the step
    '''
    # Truncate long CVs to the model's usable context window.
    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            # Explicit greedy decoding. The old temperature=0.0 / top_p /
            # top_k combination was ignored (do_sample defaults to False)
            # and triggers warnings in recent transformers versions.
            do_sample=False,
        )
    # Decode only the generated continuation, not the echoed prompt tokens.
    generated = outputs[0][inputs['input_ids'].shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)
def process(file_path):
    """Full pipeline for one uploaded PDF.

    Extracts the raw text (OCR-assisted) and then asks the LLM for the
    structured CV summary.

    Args:
        file_path: Path of the uploaded PDF file.

    Returns:
        A ``(raw_text, summary)`` tuple feeding the two Gradio textboxes.
    """
    extracted_text = read_pdf(file_path)
    return extracted_text, LLM_Inference(extracted_text)
# Create Gradio App: a single PDF-upload input mapped through `process`
# to two text outputs (raw extracted text + LLM summary).
interface = gr.Interface(
    fn=process,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=[
        gr.Textbox(label="PDF Content"),  # raw text extracted from the PDF
        gr.Textbox(label="CV Summary"),   # structured extraction from the LLM
    ],
    title="PDF Processor",
    description="Upload a PDF file and extract its content."
)
# Launch the Gradio App
if __name__ == "__main__":
    # One-time setup from prepare.py — NOTE(review): its exact behavior is
    # not visible in this file; presumably downloads/installs prerequisites.
    prepare()
    interface.launch()