Spaces:

khoatran94
/

cv_ocr_gradio

Sleeping

App Files Files Community

cv_ocr_gradio / app.py

khoatran94

Update app.py

0b5b10c verified 7 months ago

raw

history blame contribute delete

4.87 kB

	from PIL import Image
	import pytesseract
	import os
	import pymupdf
	import spaces
	import torch
	import gradio as gr
	from prepare import prepare

	from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
	from langchain_community.llms import HuggingFacePipeline
	from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
	from langchain_core.output_parsers import StrOutputParser
	from langchain_community.document_loaders import YoutubeLoader, DataFrameLoader
	from langchain_community.vectorstores.utils import filter_complex_metadata
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain.schema.runnable import RunnablePassthrough
	from langchain_core.messages import AIMessage, HumanMessage
	from langchain_community.llms import HuggingFaceEndpoint
	from dotenv import load_dotenv
	from huggingface_hub import InferenceClient
	import huggingface_hub
	#zero = torch.Tensor([0]).cuda()

	# Pull variables from a local .env file (if any) into the environment, then
	# read the Hugging Face access token from HF_TOKEN.
	load_dotenv()
	api_token = os.getenv("HF_TOKEN")
	# Only log in when a token is actually configured: login(token=None) falls
	# back to an interactive prompt, which cannot be answered in a headless app.
	if api_token:
	    huggingface_hub.login(token=api_token)
	# Prefer the GPU when torch reports one, otherwise run on the CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	# Tokenizer and model are loaded once at import time and shared by all requests.
	tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
	model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').to(device)

	#@spaces.GPU
	def read_pdf(file_path, ocr_lang='vie'):
	    """Return the text of a PDF, using OCR on pages that have no text layer.

	    For every page, the embedded text is used when ``get_text()`` returns a
	    non-empty string. Otherwise each image referenced by the page is rendered
	    to PNG in memory and passed to Tesseract; every OCR result is followed by
	    a newline.

	    Args:
	        file_path: Path of the PDF, passed straight to ``pymupdf.open``.
	        ocr_lang: Tesseract language code used for the OCR fallback.
	            Defaults to ``'vie'``, the value that was previously hard-coded.

	    Returns:
	        The concatenated text of all pages, in page order.
	    """
	    import io  # local import: only needed for the in-memory OCR path

	    chunks = []
	    # The context manager closes the document even if extraction or OCR raises.
	    with pymupdf.open(file_path) as doc:
	        for page in doc:
	            text = page.get_text()
	            if text:
	                chunks.append(text)
	                continue

	            # No text layer on this page: OCR every image it references.
	            for img in page.get_images():
	                xref = img[0]  # first item of the image entry is its XREF
	                pix = pymupdf.Pixmap(doc, xref)

	                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
	                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

	                # Encode to PNG in memory instead of writing a fixed-name file
	                # into the working directory: no collisions between concurrent
	                # requests and nothing left behind when OCR fails.
	                png_bytes = pix.tobytes("png")
	                pix = None  # release the pixmap before running OCR
	                with Image.open(io.BytesIO(png_bytes)) as image:
	                    chunks.append(
	                        pytesseract.image_to_string(image, lang=ocr_lang) + '\n'
	                    )
	    return ''.join(chunks)


	@spaces.GPU(duration=60)
	def LLM_Inference(cv_text):
	    """Ask the language model to extract structured details from a CV.

	    The CV text is embedded in a fixed instruction prompt, tokenized (cut to
	    at most 2048 tokens) and passed to ``model.generate`` for up to 1024 new
	    tokens.

	    Args:
	        cv_text: Plain text of the candidate's CV.

	    Returns:
	        The decoded text of the newly generated tokens only; the prompt is
	        not echoed back.
	    """
	    text = f'''
	You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:

	CV
	{cv_text}

	Information extraction and output format
	1. Candidate Information
	- Full Name
	- Contact Information (Phone, Email, Address, etc.)
	- Date of Birth (if available)

	2. Education
	- Degree Name (e.g., Bachelor's, Master's, Ph.D.)
	- Field of Study (e.g., Computer Science, Business Administration)
	- Institution Name
	- Year(s) of Graduation

	3. Professional Experience
	For each job, extract:
	- Job Title
	- Company Name
	- Duration (start and end dates)
	- Summarize key Responsibilities and Achievements

	4. Skills
	- List of technical, soft, or industry-specific skills mentioned.

	5. Certifications
	- Name of Certification
	- Issuing Organization
	- Year of Issuance

	6. Language
	- List the languages mentioned in the CV along with proficiency levels (if specified).

	Do not explain, comment or make up any more information that is not relative to the list of Information extraction. Respond in the CV language. Let's work this out in a step by step way to ensure the correct answer. Do not repeat the step
	'''
	    # NOTE(review): the instructions come after the CV in the prompt, so if
	    # truncation drops the end of the sequence (presumably the tokenizer's
	    # default side - confirm) a long CV pushes them out of the 2048-token
	    # window. Consider trimming the CV text instead.
	    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
	    prompt_length = inputs['input_ids'].shape[1]
	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=1024,
	            pad_token_id=tokenizer.eos_token_id,
	            # Deterministic greedy decoding, stated explicitly. The previous
	            # top_p / top_k / temperature=0.0 arguments are sampling-only
	            # settings and were passed without do_sample=True.
	            do_sample=False,
	        )
	    # generate() returns prompt + continuation; keep only the continuation so
	    # the summary does not repeat the instructions and the CV itself.
	    generated = outputs[0][prompt_length:]
	    return tokenizer.decode(generated, skip_special_tokens=True)

	def process(file_path):
	    """Run the full pipeline for one uploaded PDF.

	    Args:
	        file_path: Path of the uploaded PDF, or a falsy value (``None`` or
	            ``''``) when nothing was uploaded.

	    Returns:
	        A ``(cv_text, cv_summary)`` tuple: the text extracted by ``read_pdf``
	        and the model output from ``LLM_Inference``. Both are empty strings
	        when no file was provided.
	    """
	    # Nothing uploaded: skip PDF parsing and the GPU call entirely.
	    if not file_path:
	        return '', ''
	    cv_text = read_pdf(file_path)
	    cv_summary = LLM_Inference(cv_text)
	    return cv_text, cv_summary

	# Gradio UI: one file-upload input wired to process(), whose two return
	# values fill the two text boxes in order.
	_pdf_upload = gr.File(label="Upload a PDF file")
	_result_boxes = [
	    gr.Textbox(label="PDF Content"),  # text extracted from the PDF
	    gr.Textbox(label="CV Summary"),  # model output for that text
	]
	interface = gr.Interface(
	    fn=process,
	    inputs=_pdf_upload,
	    outputs=_result_boxes,
	    title="PDF Processor",
	    description="Upload a PDF file and extract its content.",
	)


	# Script entry point: run the setup step, then start the Gradio server.
	if __name__ == "__main__":
	    # prepare() comes from the local `prepare` module and runs before launch.
	    prepare()
	    interface.launch()