Spaces:

marie000
/

salary-predictor

Sleeping

App Files Files Community

marie000 committed on May 27

Commit

301a219

•

1 Parent(s): 2301341

initial commit

Browse files

Files changed (7) hide show

.gitattributes +2 -0
.gitignore +1 -0
app.py +47 -0
model.pt +3 -0
model.py +39 -0
requirements.text +4 -0
vocab.pt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.pt filter=lfs diff=lfs merge=lfs -text
+vocab.pt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import gradio as gr
+import os
+import torch
+# tokenizer lives in model.py; it must be imported explicitly or
+# text_pipeline fails with NameError on the first prediction.
+from model import create_model, tokenizer
+
+# Rebuild the network and restore the trained weights on CPU.
+model = create_model()
+model.load_state_dict(torch.load(f="model.pt", map_location=torch.device("cpu")))
+# Switch to inference mode: the LSTM is built with dropout=0.5, and
+# torch.no_grad() alone does not turn dropout off, so without this call the
+# same input would yield a different prediction on every request.
+model.eval()
+# NOTE: torch.load unpickles the file; only load vocab.pt from a trusted source.
+vocab = torch.load("vocab.pt", map_location=torch.device("cpu"))
+
+
+def text_pipeline(x):
+    """Tokenize raw text and map each token to its vocabulary index."""
+    return vocab(tokenizer(x))
+def predict(text):
+    """Predict the salary for a pasted job description.
+
+    Args:
+        text: The job description as plain text.
+
+    Returns:
+        The model output scaled by 10,000 and rounded to the nearest 100.
+
+    Raises:
+        gr.Error: If the text is empty or yields no tokens, since the
+            network cannot process a zero-length sequence.
+    """
+    token_ids = text_pipeline(text or "")
+    if not token_ids:
+        raise gr.Error("Please paste a job description first.")
+    with torch.no_grad():
+        # Add a leading batch dimension of size 1 before calling the model.
+        batch = torch.tensor(token_ids, dtype=torch.int64).unsqueeze(0)
+        prediction = model(batch).squeeze().item()
+    # Scale first, then round: rounding before the multiplication leaves
+    # float artifacts such as 11000.000000000002 in the displayed number.
+    return round(prediction * 10000, -2)
+# Static copy shown on the page: heading, intro line, and footer HTML.
+title = "Salary Predictor"
+description = "This is a test project to see if I could build a Machine Learning model to predict salary offered based on a posted job description. To test, copy a whole job description from a LinkedIn post. Results are in USD."
+# Footer HTML: every paragraph is explicitly opened and closed so the markup
+# is well-formed.
+article = '''<p>This project was built by Marie Pelletier <br/>
+            <a href="https://www.linkedin.com/in/marie-pelletier-14837441/">LinkedIn</a><br/>
+            <a href="https://github.com/Marie000">Github</a>
+        </p>
+        <p>This is a work in progress and is not meant to be used as an accurate predictor of salary. It is limited by the <a
+            href="https://www.kaggle.com/datasets/arshkon/linkedin-job-postings/data">data that was
+            used</a>, which is US-based, from 2023. It also does not take into account regions or fluctuations in the
+        market over time. The dataset included over 33,000 job postings, but only 13,000 had salary information.</p>
+        <p>For all you statistics nerds, the r-squared score of the model was around 0.6. For the rest of you, that level of accuracy can probably be described as "better than nothing". It might have been better if I had
+        more data to work with. Or maybe there is only so much we can infer from a description alone.</p>
+        <p>For this first test, only the description of the job is taken into account. Adding other information, including the date of the posting and the location, could improve the prediction.</p>
+        <p>The google colab notebook used to generate this model can be found <a href="https://github.com/Marie000/Linkedin-predictor-model">here</a></p>'''
+# Single-textbox UI: pasted job description in, predicted salary (number) out.
+demo = gr.Interface(
+    fn=predict,
+    inputs=gr.Textbox(lines=20, placeholder="copy whole job description here"),
+    outputs="number",
+    title=title,
+    description=description,
+    article=article,
+)
+# share=True is dropped: the app is already served publicly by the Space, and
+# Gradio does not support share links there (it only emits a warning).
+demo.launch(debug=False)

model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b57443fe7d4d756a72036063848324e67c9aad8453f52f79ae972ce44c98d985
+size 5206736

model.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+import torch.nn as nn
+import torchtext
+from torchtext.data.utils import get_tokenizer
+tokenizer = get_tokenizer("basic_english")
+class RNNModel(nn.Module):
+    """Many-to-one LSTM regressor over token-index sequences.
+
+    Token indices are embedded, run through a stacked LSTM, and the final
+    hidden state of every layer is concatenated and projected to one value.
+
+    Args:
+        input_dim: Number of rows in the embedding table (vocabulary size).
+        embedding_dim: Size of each token embedding.
+        hidden_dim: Size of the LSTM hidden state.
+        num_layers: Number of stacked LSTM layers.
+    """
+
+    def __init__(self, input_dim=20000, embedding_dim=64, hidden_dim=32, num_layers=2):
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.embedding = nn.Embedding(input_dim, embedding_dim)
+        self.rnn = nn.LSTM(
+            embedding_dim,
+            hidden_dim,
+            num_layers=num_layers,
+            # Inter-layer dropout is meaningless (and warned about) for a
+            # single-layer LSTM, so it is only enabled for stacked layers.
+            dropout=0.5 if num_layers > 1 else 0.0,
+        )
+        self.fc = nn.Linear(hidden_dim * num_layers, 1)
+        self.init_weights()
+
+    def init_weights(self):
+        """Initialise embedding and output weights uniformly in [-0.5, 0.5]."""
+        self.embedding.weight.data.uniform_(-0.5, 0.5)
+        self.fc.weight.data.uniform_(-0.5, 0.5)
+
+    def forward(self, x):
+        """Return one prediction per sequence.
+
+        Args:
+            x: Integer tensor of token indices, shape (batch, seq_len).
+
+        Returns:
+            Tensor of shape (batch, 1).
+        """
+        # The LSTM is not batch_first, so move the sequence axis to the front.
+        x = x.permute(1, 0)
+        emb = self.embedding(x)
+        # Per-step outputs are unused because this is a many-to-one RNN.
+        _, (hidden, _) = self.rnn(emb)
+        # hidden is (num_layers, batch, hidden_dim). It is deliberately not
+        # squeezed: squeezing the layer axis when num_layers == 1 made the
+        # following transpose mix up samples within a batch.
+        hidden = hidden.transpose(0, 1)
+        hidden = hidden.reshape(-1, self.hidden_dim * self.num_layers)
+        return self.fc(hidden)
+
+
+def create_model():
+    """Return a freshly initialised RNNModel with the default hyper-parameters."""
+    return RNNModel()

requirements.text ADDED Viewed

	@@ -0,0 +1,4 @@

+# Pinned versions; pip needs "==" (a single "=" is not a valid specifier).
+# NOTE: pip and Hugging Face Spaces look for a file named requirements.txt.
+torch==2.3.0
+torchtext==0.18.0
+gradio==4.31.5

vocab.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45daf4ffad472370db34bff448ec3f03c0fde2925b80c8471d54c2f69d6bc76e
+size 366424