# NOTE(review): removed non-Python extraction artifacts that preceded the
# imports (a "File size" banner, alternating git-hash residue, and a column
# of line numbers) — they were not part of the program and broke parsing.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from datasets import load_dataset
class CompanyDescriptionModel:
    """Look up job descriptions by company name, with a TF-IDF fallback.

    State:
        vectorizer: TF-IDF model fitted over one joined document per company.
        company_descriptions: dict mapping a lower-cased company name to
            either a single description string or a list of them (a repeated
            company promotes the value to a list).
        description_vectors: sparse TF-IDF matrix; row order follows the
            insertion order of ``company_descriptions``.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer()
        self.company_descriptions = {}
        self.description_vectors = None

    # ---- internal helpers -------------------------------------------------

    def _store_description(self, company, description):
        """Record *description* for *company*; promote to a list on repeats.

        Shared by ``load_huggingface_data`` and ``add_new_description``
        (the original duplicated this logic in both).
        """
        existing = self.company_descriptions.get(company)
        if existing is None:
            self.company_descriptions[company] = description
        elif isinstance(existing, list):
            existing.append(description)
        else:
            self.company_descriptions[company] = [existing, description]

    def _rebuild_vectors(self):
        """Re-fit the TF-IDF index over one joined document per company."""
        documents = [
            " ".join(desc) if isinstance(desc, list) else desc
            for desc in self.company_descriptions.values()
        ]
        self.description_vectors = self.vectorizer.fit_transform(documents)

    @staticmethod
    def _join(desc):
        """Render a str-or-list description value as one display string."""
        return "\n\n---\n\n".join(desc) if isinstance(desc, list) else desc

    # ---- public API -------------------------------------------------------

    def load_huggingface_data(self):
        """Load the HuggingFace job-descriptions dataset and build the index."""
        print("Loading dataset from HuggingFace...")
        dataset = load_dataset("jacob-hugging-face/job-descriptions")
        for item in dataset['train']:
            self._store_description(
                item['company_name'].strip().lower(),
                item['job_description'].strip(),
            )
        print(f"Loaded descriptions for {len(self.company_descriptions)} companies")
        self._rebuild_vectors()

    def get_description(self, company_name, similarity_threshold=0.3):
        """Return ``(found, message)`` for *company_name*.

        Tries an exact (case-insensitive) match first, then falls back to the
        company whose descriptions are most cosine-similar to the query.
        NOTE(review): the fallback vectorizes the company *name* against
        description text, so fuzzy hits key on words the name happens to
        share with descriptions — confirm this matching is intended.

        Args:
            company_name: company to look up; whitespace/case-insensitive.
            similarity_threshold: minimum cosine similarity for a fuzzy hit.
        """
        company_name = company_name.lower().strip()
        if company_name in self.company_descriptions:
            desc = self.company_descriptions[company_name]
            if isinstance(desc, list):
                return True, (
                    f"Found {len(desc)} job descriptions for {company_name}:\n\n"
                    + self._join(desc)
                )
            return True, f"Job description for {company_name}:\n\n{desc}"
        try:
            query_vector = self.vectorizer.transform([company_name])
            similarities = cosine_similarity(
                query_vector, self.description_vectors
            ).flatten()
            best = int(np.argmax(similarities))
            if similarities[best] >= similarity_threshold:
                # Row order of the TF-IDF matrix matches dict insertion order.
                similar_company = list(self.company_descriptions.keys())[best]
                desc = self.company_descriptions[similar_company]
                return True, f"Similar to '{similar_company}':\n\n" + self._join(desc)
            return False, (
                f"No job descriptions found for '{company_name}'. "
                "Please provide one for training."
            )
        except Exception as e:
            # Covers an unfitted vectorizer / missing index; the caller treats
            # any failure here as "not found".
            return False, f"Error processing company name: {str(e)}"

    def add_new_description(self, company_name, description):
        """Add a user-supplied description and re-fit the TF-IDF index."""
        self._store_description(company_name.lower().strip(), description)
        self._rebuild_vectors()

    def save_model(self, filename):
        """Pickle the full model state (descriptions, vectorizer, vectors)."""
        model_data = {
            'company_descriptions': self.company_descriptions,
            'vectorizer': self.vectorizer,
            'description_vectors': self.description_vectors,
        }
        with open(filename, 'wb') as f:
            pickle.dump(model_data, f)

    def load_model(self, filename):
        """Restore model state from *filename*; return True on success.

        SECURITY: ``pickle.load`` can execute arbitrary code during
        deserialization — only load model files this application wrote.
        """
        try:
            with open(filename, 'rb') as f:
                model_data = pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError):
            # Missing, unreadable, or corrupt file: report "no model" so the
            # caller rebuilds instead of crashing (original handled only
            # FileNotFoundError).
            return False
        self.company_descriptions = model_data['company_descriptions']
        self.vectorizer = model_data['vectorizer']
        self.description_vectors = model_data['description_vectors']
        return True
def main():
    """Interactive console loop: look up or add job descriptions by company."""
    model = CompanyDescriptionModel()
    store_path = 'company_description_model.pkl'
    # Bootstrap: reuse a saved model when one exists; otherwise build the
    # index from the HuggingFace dataset and persist it for next time.
    if not model.load_model(store_path):
        print("No existing model found. Loading data from HuggingFace...")
        model.load_huggingface_data()
        model.save_model(store_path)
        print("Initial model created and saved.")
    while True:
        print("\n=== Company Job Description System ===")
        query = input("Enter a company name to get job descriptions (or 'quit' to exit): ").strip()
        if query.lower() == 'quit':
            break
        matched, message = model.get_description(query)
        print(f"\nResult:\n{message}")
        if matched:
            continue
        # Unknown company: collect a description from the user, index it,
        # and persist the updated model immediately.
        print("\nLet's add this company to our database!")
        supplied = input("Please provide a job description for this company: ").strip()
        model.add_new_description(query, supplied)
        print(f"\nThank you! Job description for '{query}' has been added to the database.")
        model.save_model(store_path)
        print("Model has been updated and saved.")


if __name__ == "__main__":
    main()