Spaces:

cm0805
/

ResuMate_NVIDIA

Sleeping

App Files Files Community

cm0805 committed on May 31, 2023

Commit

54f4f78

•

1 Parent(s): 6cad64a

Create nlp.py

Browse files

Files changed (1) hide show

nlp.py +135 -0

nlp.py ADDED Viewed

	@@ -0,0 +1,135 @@

+import requests
+import json
+import re
+import numpy as np
+from sentence_transformers import SentenceTransformer, util
+from sklearn.metrics.pairwise import cosine_similarity
+import spacy
+import nltk
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
+# Langchain packages
+from langchain.text_splitter import CharacterTextSplitter #text splitter
+from langchain.embeddings import HuggingFaceEmbeddings #for using HuggingFace models
+from langchain.vectorstores import FAISS  #Facebook AI Similarity Search (FAISS) vector store
+from langchain import HuggingFaceHub
+from langchain.chains.question_answering import load_qa_chain
+from constants import StreamlitException
+from constants import API_URL_summary, API_URL_name, HEADERS, TECH_SKILLS
+from constants import SENTENCE_TRANSFORMER_MODEL, LLM_REPO_ID
# Function to summarize resume text
def summarize_text(text, max_length=100):
    """Summarize resume text by POSTing it to ``API_URL_summary``.

    Args:
        text: Resume text to summarize.
        max_length: Sent to the endpoint as the ``max_length`` parameter.

    Returns:
        * the string ``'nan'`` when ``text`` is the empty string (no request
          is made);
        * the ``summary_text`` field of the first item in the JSON response
          on success;
        * a ``StreamlitException`` instance when the HTTP status is not 200
          or the response body cannot be interpreted. Note that the
          exception object is *returned*, not raised, so callers must
          inspect the result.
    """
    if text == '':
        return 'nan'

    payload = json.dumps(
        {
            "inputs": text,
            "parameters": {"max_length": max_length},
        }
    )
    response = requests.post(API_URL_summary, headers=HEADERS, data=payload)
    if response.status_code != 200:
        return StreamlitException(f"**Error**: {response.status_code}")

    try:
        return response.json()[0]["summary_text"]
    except (KeyError, IndexError, TypeError, ValueError):
        # KeyError/IndexError: JSON decoded but lacks the expected entry.
        # TypeError: JSON decoded to a shape that cannot be indexed this way.
        # ValueError: the body is not valid JSON at all.
        return StreamlitException("**Error**: Invalid response from API.")
# Function to extract candidate name(s) and email address(es) from resume text
def extract_person_names_and_email(text):
    """Extract person names and email addresses from resume text.

    Email addresses are found locally with a regular expression. Person
    names come from POSTing the text to ``API_URL_name`` and keeping the
    ``word`` of every returned entity whose ``entity_group`` is ``"PER"``.

    Args:
        text: Resume text to analyse.

    Returns:
        A ``(person_names, emails)`` tuple of two sets of strings.

    Raises:
        KeyError, IndexError, TypeError: if the endpoint's JSON does not have
            the expected shape (a list whose first element is a list of
            entity dicts with ``entity_group`` and ``word`` keys). The HTTP
            status code is not checked.
    """
    # The TLD class is [A-Za-z]; a '|' inside the class would be matched
    # literally and glue pipe-separated addresses together.
    emails = set(
        re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
    )

    payload = json.dumps({"inputs": [text]})
    response = requests.post(API_URL_name, headers=HEADERS, data=payload)
    output = json.loads(response.content.decode("utf-8"))

    # One input string was sent, so only the first result list is read.
    person_names = {
        entity["word"]
        for entity in output[0]
        if entity["entity_group"] == "PER"
    }
    return person_names, emails
# Function to extract key technical skills from resume text
def extract_tech_skills(doc):
    """Collect the technical skills mentioned in ``doc``.

    Args:
        doc: Iterable of tokens, each exposing a ``text`` attribute.

    Returns:
        A set with the upper-cased text of every token whose lower-cased
        text is a member of ``TECH_SKILLS``.
    """
    skills = set()
    for token in doc:
        if token.text.lower() in TECH_SKILLS:
            skills.add(token.text.upper())
    return skills
# Function to calculate overall percentage match between job description and resume
def calculate_similarity(job_description, resume):
    """Score how closely a resume matches a job description, as a percentage.

    Both texts are embedded with a ``SentenceTransformer`` built from
    ``SENTENCE_TRANSFORMER_MODEL`` (constructed on every call) and compared
    with ``util.cos_sim``.

    Args:
        job_description: Job description text.
        resume: Resume text.

    Returns:
        ``np.nan`` when ``job_description`` is the empty string; otherwise
        element ``[0][0]`` of the cosine-similarity result multiplied by 100.
    """
    if job_description == '':
        # np.nan is the canonical spelling; the np.NaN alias no longer
        # exists in NumPy 2.0.
        return np.nan

    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    job_description_embeddings = model.encode(job_description)
    resume_embeddings = model.encode(resume)
    similarity_score = util.cos_sim(job_description_embeddings, resume_embeddings)
    return similarity_score[0][0] * 100
# Define a function to clean sentences
def clean_text(text):
    """Normalize raw resume text into sentence-like prose.

    Bullet markers and blank lines are removed, pipe separators and line
    breaks become sentence breaks, and the first letter of each word that
    follows sentence punctuation is upper-cased.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Remove bullet points
    text = re.sub(r'[\u2022\u2023\u25E6\u2043]', '', text).strip()
    # Remove leading whitespace and any remaining bullet markers that are
    # followed by whitespace at the start of each line
    text = re.sub(r'^\s*[-*•⁃◦▸▹]*\s+', '', text, flags=re.MULTILINE)
    # Collapse runs of newlines and trim leading/trailing whitespace
    text = re.sub(r'\n+', '\n', text).strip()
    # Replace pipe symbol with a dot
    text = re.sub(r'\s*\|\s*', '. ', text).strip()
    # Turn a line break that is not preceded by '.', '!' or '?' into '. '
    text = re.sub(r'([^.!?])\s*\n', r'\1. ', text)
    # Upper-case only the first letter of the word after sentence
    # punctuation. str.capitalize() is avoided because it lower-cases the
    # rest of the word and would turn acronyms such as "AWS" into "Aws".
    text = re.sub(
        r'(?<=[.!?]\s)(\w+)',
        lambda match: match.group()[0].upper() + match.group()[1:],
        text,
    )
    # Replace a hyphen (plus trailing whitespace) with '. ' when it is not
    # preceded by a letter and not followed by a digit or by word characters
    # leading to another hyphen.
    # NOTE(review): the trailing (?<!\d) is checked right after the '-', so
    # it always succeeds; it was probably meant to sit before the hyphen.
    text = re.sub(r'(?<![^\W\d_])-(?!\d|\w*-)(?<!\d)\s*', '. ', text)
    # Return cleaned text
    return text
# Define a function to split cleaned text into sentences
def split_text(string):
    """Split ``string`` into sentences with NLTK's ``sent_tokenize``.

    Args:
        string: Text to split.

    Returns:
        Whatever ``sent_tokenize`` returns for ``string``.
    """
    return sent_tokenize(string)
# Function to calculate average similarity of each job-description phrase to the resume phrases
def get_average_similarity_scores(job_description, resumes):
    """Average the cosine similarity of each job-description phrase over all resume phrases.

    Both inputs are embedded with a ``SentenceTransformer`` built from
    ``SENTENCE_TRANSFORMER_MODEL`` (constructed on every call).

    Args:
        job_description: A phrase or a list of phrases from the job
            description.
        resumes: A phrase or a list of phrases from the resume(s).

    Returns:
        A list with one float per job-description phrase: the mean of that
        phrase's cosine similarity to every resume phrase.
    """
    model = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)
    # cosine_similarity requires 2-D inputs. Encoding a single string gives
    # a 1-D vector, so promote it to a one-row matrix; 2-D results from
    # list inputs pass through unchanged.
    job_description_embeddings = np.atleast_2d(model.encode(job_description))
    resume_embeddings = np.atleast_2d(model.encode(resumes))
    # Rows: job-description phrases; columns: resume phrases
    similarity_matrix = cosine_similarity(job_description_embeddings, resume_embeddings)
    # Average across the resume axis and return plain Python floats
    return np.mean(similarity_matrix, axis=1).tolist()
# Function to respond to user Q&A
def qna_query(loader, query, chunk_size=500, chunk_overlap=10, temperature=1, max_length=100):
    """Answer ``query`` from the documents provided by ``loader``.

    The loaded pages are split into chunks and indexed in a FAISS store
    built with ``HuggingFaceEmbeddings``. The chunks returned by a
    similarity search for ``query`` are then passed to a "stuff" QA chain
    backed by the ``LLM_REPO_ID`` model on ``HuggingFaceHub``.

    Args:
        loader: Object exposing ``load_and_split()``.
        query: The user's question.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        temperature: ``temperature`` entry of the LLM's ``model_kwargs``.
        max_length: ``max_length`` entry of the LLM's ``model_kwargs``.

    Returns:
        The result of running the QA chain on the retrieved chunks.
    """
    pages = loader.load_and_split()
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(pages)

    # Index every chunk so the ones closest to the query can be retrieved
    embedder = HuggingFaceEmbeddings()
    index = FAISS.from_documents(chunks, embedder)

    generation_kwargs = {"temperature": temperature, "max_length": max_length}
    llm = HuggingFaceHub(repo_id=LLM_REPO_ID, model_kwargs=generation_kwargs)
    qa_chain = load_qa_chain(llm, chain_type="stuff")

    relevant_chunks = index.similarity_search(query)
    return qa_chain.run(input_documents=relevant_chunks, question=query)
+# Load spaCy's "en_core_web_sm" English pipeline once, when this module is imported.
+lang_model = spacy.load("en_core_web_sm")