from PIL import Image
import pytesseract
import os
import pymupdf
import spaces
import torch
import gradio as gr
from prepare import prepare
from transformers import AutoModelForCausalLM, AutoTokenizer
from dotenv import load_dotenv
import huggingface_hub
# Load the HF token from .env and authenticate so gated models can be pulled.
load_dotenv()
api_token = os.getenv("HF_TOKEN")
huggingface_hub.login(token=api_token)

# Load Gemma-2 2B once at startup and keep it on the GPU when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b')
model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').to(device)
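# read_pdf() walks the document page by page: pages with a text layer are
# copied straight into the output, while image-only pages (e.g. scanned CVs)
# are rasterised and run through Tesseract with the Vietnamese language pack.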
#@spaces.GPU
def read_pdf(file_path):
    output = ''
    doc = pymupdf.open(file_path)
    for page in range(len(doc)):
        text = doc[page].get_text()
        if text.strip():
            output += text
        else:
            # No extractable text on this page: OCR each embedded image instead.
            image_list = doc[page].get_images()
            for image_index, img in enumerate(image_list, start=1):  # enumerate the image list
                xref = img[0]  # get the XREF of the image
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                path = "page_{}-image_{}.png".format(page, image_index)
                pix.save(path)  # save the image as a temporary PNG
                pix = None
                with Image.open(path) as image:  # don't shadow the loop variable `img`
                    output += pytesseract.image_to_string(image, lang='vie') + '\n'
                os.remove(path)
    return output
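# LLM_Inference() wraps the CV text in an extraction prompt and runs a single
# greedy generation pass, so the same CV always yields the same summary.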
def LLM_Inference(cv_text):
    text = f'''
You are an AI designed to extract structured information from unstructured text. Your task is to analyze the content of a candidate's CV and extract the following details:
**CV**
{cv_text}
**Information extraction and output format**
1. Candidate Information
- Full Name
- Contact Information (Phone, Email, Address, etc.)
- Date of Birth (if available)
2. Education
- Degree Name (e.g., Bachelor's, Master's, Ph.D.)
- Field of Study (e.g., Computer Science, Business Administration)
- Institution Name
- Year(s) of Graduation
3. Professional Experience
For each job, extract:
- Job Title
- Company Name
- Duration (start and end dates)
- Summary of key Responsibilities and Achievements
4. Skills
- List of technical, soft, or industry-specific skills mentioned.
5. Certifications
- Name of Certification
- Issuing Organization
- Year of Issuance
6. Languages
- List the languages mentioned in the CV along with proficiency levels (if specified).
Do not explain, comment, or make up any information that is not relevant to the list above. Respond in the language of the CV. Let's work this out step by step to ensure the correct answer. Do not repeat the steps.
'''
    inputs = tokenizer(text, return_tensors='pt', max_length=2048, truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False,  # greedy decoding: deterministic output for extraction
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
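# End-to-end pipeline for the Gradio app: raw PDF text in, structured summary out.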
def process(file_path):
    cv_text = read_pdf(file_path)
    cv_summary = LLM_Inference(cv_text)
    return cv_text, cv_summary
# Create the Gradio app
interface = gr.Interface(
    fn=process,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=[
        gr.Textbox(label="PDF Content"),  # raw extracted text
        gr.Textbox(label="CV Summary"),   # structured fields extracted by the LLM
    ],
    title="PDF Processor",
    description="Upload a CV as a PDF to extract its content and a structured summary."
)
# Launch the Gradio app
if __name__ == "__main__":
    prepare()
    interface.launch()