Spaces:

jmedina888
/

RAG_cv_jam

Running

App Files Files Community

RAG_cv_jam / app.py

jmedina888

Update app.py

4f3c57e verified 17 days ago

raw

history blame contribute delete

10.4 kB

	#!/usr/bin/env python
	# coding: utf-8

	# In[1]:


	import pandas as pd
	import numpy as np
	import fitz
	import io
	import re
	from sentence_transformers import SentenceTransformer


	# In[2]:


	import faiss
	import pickle
	import os
	os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

	# In[3]:


	# Structured resume content. Each top-level key is one section; the sections
	# are later flattened into text chunks for embedding and retrieval.
	resume_dict = {
	    "CONTACT": {
	        "NAME": "Juan Medina",
	        "CITY": "Toronto, Canada",
	        "EMAIL": "juanandresmedina125@gmail.com",
	    },
	    "RELEVANT WORK EXPERIENCE": {
	        "Vector Institute": {
	            "CITY": "Toronto, ON",
	            "ROLE": "Faculty Affiliate Researcher",
	            "DATE": "Sep 2023 - Feb 2025",
	            "DESCRIPTION": "Developed and implemented a comprehensive Natural Language Processing and Machine Learning pipeline utilizing large language models (LLM) in few shot, pre-training and fine-tuning settings, resulting in valuable policy insights on post-COVID conditions (PCC). Implemented a novel named entity recognition (NER) tool for identifying social determinants of PCC, facilitating the analysis of unexplored relationships between PCC and sociodemographic dimensions. Designed and executed a language entailment pipeline to automate a granular analysis of annotated data, providing actionable insights for over 26 SDOH entity dimensions in over 7,000 texts.",
	        },
	        "Rubik": {
	            "CITY": "London, UK (Remote)",
	            "ROLE": "AI Product Strategy [Consulting Project]",
	            "DATE": "Sep 2024 - Dec 2024",
	            "DESCRIPTION": "Developed a comprehensive business intelligence AI implementation strategy tailored to the waste management sector, including scalability in cloud systems, interoperability, and product differentiation.",
	        },
	        "J. Roy Gillis Lab, University of Toronto": {
	            "CITY": "Toronto, ON",
	            "ROLE": "Data Science, Quantitative Analysis Specialist",
	            "DATE": "Jun 2024 - Dec 2024",
	            "DESCRIPTION": "Designed an end-to-end sentiment analysis pipeline to analyze discourse around vaccination hesitancy in Canada, entailing the extraction, cleaning, annotation, modeling and visualization of over 100,000 data points from the Reddit API. Led an interdisciplinary team of 8 researchers, including engineers, social scientists and designers. Created an interactive visual story showcasing key trends and contextual patterns related to vaccination hesitancy, enhancing understanding and decision-making for stakeholders.",
	        },
	        "i4Health Research lab, York University": {
	            "CITY": "Toronto, ON",
	            "ROLE": "Machine Learning Research Assistant",
	            "DATE": "Oct 2023 - Sep 2024",
	            "DESCRIPTION": "Explored and developed ML-driven disparity analysis pipelines with Natural Language Processing and Causal Inference to assess discriminatory relationships in health. Collaborated in the development of a question-answering model for medical images.",
	        },
	    },
	    "EDUCATION": {
	        "University of Toronto": {
	            "CITY": "Toronto, ON, CA",
	            "DEGREE": "Master of Science (M.Sc.): Health Systems Artificial Intelligence emphasis",
	            "START DATE": "Sep 2023",
	            "GRADUATION DATE": "Mar 2025",
	        },
	        "University of California, San Diego": {
	            "CITY": "Remote",
	            "DEGREE": "Coursework in Object-Oriented Programming, Natural Language Processing, Probability and Statistics for Deep Learning, and Discrete Mathematics",
	            "START DATE": "Jun 2022",
	            "GRADUATION DATE": "Nov 2022",
	        },
	        "Wesleyan University": {
	            "CITY": "Middletown, CT, USA",
	            "DEGREE": "Bachelor of Arts Double Major: Economics, Science in Society Program (Mathematics/Neuroscience & Sociology emphases)",
	            "START DATE": "Aug 2018",
	            "GRADUATION DATE": "May 2022",
	        },
	    },
	    "SKILLS": {
	        "Technical": "Python, R, SQL, Tableau, SAS, SLURM (HPC), AWS, Spark, PowerBI, Stata",
	        "Relevant Courses": "Machine Learning, Deep Learning, Statistical Learning, Data Visualization, Causal Inference, AI Implementation, Biostatistics, Innovation Management, Health Policy",
	    },
	    "ASPIRATIONS": "I aspire to work in a multidisciplinary environment where I can utilize my skills in data science, economics, public health, and policy.",
	}


	# In[4]:


	# One text chunk per top-level resume section, in dictionary order:
	# "<section name>: <str() of the section value> }}".
	resume_chunks = [
	    str(section) + ": " + str(content) + " }}"
	    for section, content in resume_dict.items()
	]


	# In[5]:


	# Sentence-embedding model used to vectorise the resume chunks.
	model = SentenceTransformer("all-MiniLM-L6-v2")

	# Encode every chunk in one call; the result is indexed by FAISS below.
	resume_embeddings = model.encode(resume_chunks)

	# Sanity check: should be (num_chunks, 384).
	print(f"Embedding shape: {resume_embeddings.shape}")


	# In[6]:


	# Flat L2-distance FAISS index sized to the embedding width, filled with
	# the resume chunk embeddings.
	dimension = resume_embeddings.shape[1]
	index = faiss.IndexFlatL2(dimension)
	index.add(resume_embeddings)

	# Persist the index together with the chunk texts it was built from, so the
	# positions returned by a search can be mapped back to text.
	with open("resume_faiss.pkl", "wb") as out_file:
	    pickle.dump((index, resume_chunks), out_file)

	print("FAISS index built and saved.")


	# In[7]:


	# Read the index and its chunk texts back from the file written above.
	with open("resume_faiss.pkl", "rb") as in_file:
	    index, resume_chunks = pickle.load(in_file)


	# In[8]:


	import transformers
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer


	# In[9]:


	# Reuse the "all-MiniLM-L6-v2" SentenceTransformer already loaded as `model`
	# for building the index instead of loading a second copy of the same
	# weights. Queries are embedded with the very model that embedded the chunks.
	embedder = model

	# Tokenizer and causal language model used to generate the answers.
	model_name = "Qwen/Qwen2.5-0.5B"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	llm = AutoModelForCausalLM.from_pretrained(model_name)


	# In[10]:


	def retrieve_and_generate(query, top_k=2):
	    """Answer a question about the resume with retrieval-augmented generation.

	    The query is embedded with ``embedder``, the ``top_k`` nearest resume
	    chunks are looked up in the FAISS ``index``, and those chunks are placed
	    in a prompt from which ``llm`` generates the answer.

	    Args:
	        query: The user's question as text.
	        top_k: Number of resume chunks to retrieve as context.

	    Returns:
	        The generated answer (prompt tokens excluded, whitespace stripped),
	        or a short fixed message when the query is blank or no chunk could
	        be retrieved.
	    """
	    # A blank question cannot be answered; skip the expensive model calls.
	    if not query or not query.strip():
	        return "Please enter a question."

	    # Embed the query (as a batch of one)
	    query_embedding = embedder.encode([query])

	    # Search FAISS for top matching chunks
	    _distances, indices = index.search(query_embedding, k=top_k)

	    # FAISS fills missing results with the label -1 when fewer than `top_k`
	    # vectors are available. A negative value must be rejected explicitly,
	    # otherwise Python's negative indexing would return the last chunk.
	    retrieved_texts = [
	        resume_chunks[i] for i in indices[0] if 0 <= i < len(resume_chunks)
	    ]

	    if not retrieved_texts:
	        return "Sorry, I couldn't find relevant information in the resume."

	    # Combine context
	    context = "\n".join(retrieved_texts)

	    # Manually build the prompt for Qwen2.5
	    prompt = (
	        "You are Juan, a recent master's graduate. Based on your resume information below (in python dictionary format), "
	        "answer the user's question truthfully and concisely in first person, checking for the right key in the dictionary. Let's think step by step.\n\n"
	        f"Resume:\n{context}\n\n"
	        f"Question: {query}\nAnswer:"
	    )

	    # Tokenize and generate. The whole encoding is passed on so that the
	    # attention mask reaches `generate` together with the input ids.
	    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
	    output = llm.generate(**inputs, max_new_tokens=200)

	    # Only decode the newly generated part (everything after the prompt)
	    prompt_length = inputs["input_ids"].shape[-1]
	    generated_tokens = output[0][prompt_length:]
	    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

	    return response


	# In[11]:





	# In[12]:


	# Canned answers for the predefined topics offered in the dropdown.
	qa_dict = {
	    "Work experience": "My experience spans both academia and industry, including my role as a Faculty Affiliate Researcher at the Vector Institute for Artificial Intelligence, where I focused on Natural Language Processing in healthcare, and working in an AI strategy consulting project for Rubik, a London-based SaaS startup.",
	    "Education": "I recently earned an M.Sc. with an emphasis in Health Systems Artificial Intelligence from the University of Toronto and hold a B.A. in Economics and Science & Technology Studies from Wesleyan University.",
	    "Skillset": "With a multidisciplinary background in AI, statistics, economics, and health systems, I bring a unique perspective to solving complex problems.",
	    "Other": "Please reach out directly at juanandresmedina125[at]gmail[dot]com, and I would be delighted to discuss my background and experiences in greater detail.",
	}


	# In[14]:


	import gradio as gr
	import requests


	# In[15]:

	# Dark theme for the app: Gradio's Soft theme with purple/slate hues and the
	# Montserrat font, then individual colours overridden via .set().
	futuristic_theme = gr.themes.Soft(
	primary_hue="purple",
	secondary_hue="slate",
	font=["Montserrat", "sans-serif"]
	).set(
	body_background_fill="#0f1117", # Deep navy page background
	body_text_color="#74EE2D", # Bright lime-green text
	block_background_fill="#1f2937", # Slightly lighter panel
	block_border_color="#6b7280", # Cool gray border
	input_background_fill="#000000", # Black input fields
	input_border_color="#9CA3AF", # Light gray input border
	button_primary_background_fill="#74EE2D", # Lime-green primary button
	button_primary_text_color="#000000", # Black label on the primary button
	button_primary_background_fill_hover="#7C3AED" # Violet on hover
	)

	# Define logic
	def handle_predefined_question(topic):
	    """Return the canned answer for ``topic``.

	    Any value that is not a key of ``qa_dict`` (for example the dropdown's
	    placeholder entry) yields a message asking for a valid topic.
	    """
	    if topic in qa_dict:
	        return qa_dict[topic]
	    return "Please select a valid topic."

	#Gradio UI
	# Gradio UI: a title, then two side-by-side columns (canned topics on the
	# left, free-form AI-answered questions on the right).
	# NOTE(review): nesting was reconstructed from statement order because the
	# original indentation was lost; confirm the disclaimer belongs in the
	# second column.
	with gr.Blocks(title="Ask My Resume", theme=futuristic_theme, css="""
	/* Style the dropdown button */
	#custom-dropdown .gr-button {
	background-color: #000000 !important;
	color: #74EE2D !important;
	border: 1px solid #9CA3AF !important;
	}

	/* Global override for dropdown list */
	ul[role="listbox"] {
	background-color: #000000 !important;
	color: #74EE2D !important;
	border: 1px solid #9CA3AF !important;
	}

	/* Style individual options */
	ul[role="listbox"] > li {
	background-color: #000000 !important;
	color: #74EE2D !important;
	}

	/* Hover effect */
	ul[role="listbox"] > li:hover {
	background-color: #1f2937 !important;
	}
	""") as demo:
	    gr.Markdown(
	        "## ✨ Ask My Resume\n"
	        "Welcome! Ask me anything about my experience below.",
	        elem_id="title",
	    )

	    with gr.Row():
	        # Left column: predefined topics answered from qa_dict.
	        with gr.Column():
	            gr.Markdown("## Topics")
	            dropdown = gr.Dropdown(
	                choices=["Select...", *qa_dict],
	                label="Choose a topic",
	                elem_id="custom-dropdown",
	            )
	            predefined_output = gr.Textbox(
	                label="Answer",
	                lines=4,
	                interactive=False,
	                show_copy_button=True,
	            )
	            # Refresh the answer whenever a different topic is picked.
	            dropdown.change(
	                fn=handle_predefined_question,
	                inputs=dropdown,
	                outputs=predefined_output,
	            )

	        # Right column: free-form question answered by the RAG pipeline.
	        with gr.Column():
	            gr.Markdown("## Ask Your Own Question")
	            user_input = gr.Textbox(label="Enter your question")
	            custom_output = gr.Textbox(
	                label="Answer (AI-Powered)",
	                lines=5,
	                interactive=False,
	                show_copy_button=True,
	            )
	            ask_btn = gr.Button("Ask", variant="primary")
	            ask_btn.click(
	                fn=retrieve_and_generate,
	                inputs=user_input,
	                outputs=custom_output,
	            )
	            gr.Markdown("Disclaimer: LLM-based answers can be prone to errors.")

	demo.launch()


	# In[ ]: