marie000 committed on
Commit
301a219
1 Parent(s): 2301341

initial commit

Browse files
Files changed (7) hide show
  1. .gitattributes +2 -0
  2. .gitignore +1 -0
  3. app.py +47 -0
  4. model.pt +3 -0
  5. model.py +39 -0
  6. requirements.text +4 -0
  7. vocab.pt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ model.pt filter=lfs diff=lfs merge=lfs -text
37
+ vocab.pt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import torch
4
+
5
+ from model import create_model
6
+
7
# `tokenizer` is defined in model.py next to `create_model`; it must be
# imported here because `text_pipeline` below calls it.
from model import tokenizer

# Rebuild the architecture and restore the trained weights on CPU.
model = create_model()
model.load_state_dict(torch.load(f="model.pt", map_location=torch.device("cpu")))
# Inference only: eval mode disables the LSTM's dropout so predictions are
# deterministic. `torch.no_grad()` alone does not turn dropout off.
model.eval()

# NOTE: torch.load unpickles the file, so model.pt / vocab.pt must only come
# from a trusted source (here: the files shipped with this repository).
vocab = torch.load("vocab.pt", map_location=torch.device("cpu"))


def text_pipeline(x):
    """Tokenize the raw string ``x`` and pass the tokens through ``vocab``.

    Returns whatever ``vocab(...)`` returns for the token list -- presumably a
    list of integer token ids (``predict`` converts it to an int64 tensor).
    """
    return vocab(tokenizer(x))
13
+
14
def predict(text):
    """Predict a salary figure for a pasted job description.

    Args:
        text: Raw job-description string coming from the Gradio textbox.

    Returns:
        float: The model output rounded to two decimals and scaled by
        10,000, with floating-point noise from the multiplication removed
        (e.g. ``700.0`` instead of ``700.0000000000001``).

    Raises:
        gr.Error: If the text produces no tokens; an empty sequence cannot be
            fed to the LSTM, so a readable message is shown instead.
    """
    token_ids = text_pipeline(text or "")
    if not token_ids:
        raise gr.Error("Please paste a job description before submitting.")

    with torch.no_grad():
        # The model expects a (batch, sequence) tensor; add a batch axis of 1.
        batch = torch.tensor(token_ids, dtype=torch.int64).unsqueeze(0)
        raw = model(batch).squeeze().item()

    # Same rounding as before (two decimals, then x10000); the outer round()
    # only strips binary floating-point residue from the product.
    return float(round(round(raw, 2) * 10000))
21
+
22
+
23
# Static texts shown by the Gradio interface.
title = "Salary Predictor"

# Shown under the title ("could built" -> "could build").
description = "This is a test project to see if I could build a Machine Learning model to predict salary offered based on a posted job description. To test, copy a whole job description from a linkedIn post. Results are in USD."

# HTML rendered below the demo. The first paragraph now has its opening <p>
# and the last paragraph its closing </p>, so every tag is balanced.
article = '''<p>This project was built by Marie Pelletier <br/>
<a href="https://www.linkedin.com/in/marie-pelletier-14837441/">LinkedIn</a><br/>
<a href="https://github.com/Marie000">Github</a>
</p>
<p>This is a work in progress and is not meant to be used as an accurate predictor of salary. It is limited by the <a
href="https://www.kaggle.com/datasets/arshkon/linkedin-job-postings/data">data that was
used</a>, which is US-based, from 2023. It also does not take into account regions or fluctuations in the
market over time. The dataset included over 33,000 job postings, but only 13,000 had salary information.</p>
<p>For all you statistic nerds, the r-squared score of the model was around 0.6. For the rest of you, that level of accuracy can probably be described as "better than nothing". It might have been better if I had
more data to work with. Or maybe there is just so much we can infer from a description alone.</p>
<p>For this first test, only the description of the job is taken into account. Adding other information, including the date of the posting and the location, could improve the prediction.</p>
<p>The google colab notebook used to generate this model can be found <a href="https://github.com/Marie000/Linkedin-predictor-model">here</a></p>'''
37
+
38
# Multi-line input box for the pasted job posting.
job_description_box = gr.Textbox(
    lines=20,
    placeholder="copy whole job description here",
)

# Wire `predict` to the textbox and a numeric output, with the page texts
# defined above.
demo = gr.Interface(
    fn=predict,
    inputs=job_description_box,
    outputs="number",
    title=title,
    description=description,
    article=article,
)

# Start the web app with a public share link and debug mode off.
demo.launch(share=True, debug=False)
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b57443fe7d4d756a72036063848324e67c9aad8453f52f79ae972ce44c98d985
3
+ size 5206736
model.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchtext
5
+ from torchtext.data.utils import get_tokenizer
6
+
7
+ tokenizer = get_tokenizer("basic_english")
8
+
9
class RNNModel(nn.Module):
    """Many-to-one LSTM regressor over sequences of token ids.

    Token ids are embedded, run through a stacked LSTM, and the final hidden
    state of every layer is concatenated and projected to a single value.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_dim=20000, embedding_dim=64, hidden_dim=32, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=num_layers,
            # Inter-layer dropout only exists with more than one layer;
            # passing it for a single layer just triggers a warning.
            dropout=0.5 if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim * num_layers, 1)
        self.init_weights()

    def init_weights(self):
        """Re-initialise embedding and output weights uniformly in [-0.5, 0.5]."""
        self.embedding.weight.data.uniform_(-0.5, 0.5)
        self.fc.weight.data.uniform_(-0.5, 0.5)

    def forward(self, x):
        """Return one prediction per row of ``x``.

        Args:
            x: Integer tensor of shape (batch, sequence).

        Returns:
            Tensor of shape (batch, 1).
        """
        # nn.LSTM is used in its default sequence-first layout.
        x = x.permute(1, 0)
        emb = self.embedding(x)
        # Many-to-one: only the final hidden state is used, not the outputs.
        _, (hidden, _) = self.rnn(emb)
        # hidden is (num_layers, batch, hidden_dim). Bring batch to the front
        # and flatten the per-layer states. The former in-place
        # `squeeze_(0)` was a no-op for num_layers > 1 and mixed up samples
        # across the batch for num_layers == 1, so it is gone.
        hidden = hidden.transpose(0, 1)
        hidden = hidden.reshape(-1, self.hidden_dim * self.num_layers)
        return self.fc(hidden)


def create_model(input_dim=20000, embedding_dim=64, hidden_dim=32, num_layers=2):
    """Build a freshly initialised ``RNNModel``.

    Called without arguments it produces the same architecture (and the same
    state-dict keys) as before, so existing saved weights still load. The
    keyword arguments are forwarded to ``RNNModel``.
    """
    return RNNModel(
        input_dim=input_dim,
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
    )
requirements.text ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ torch==2.3.0
3
+ torchtext==0.18.0
4
+ gradio==4.31.5
vocab.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45daf4ffad472370db34bff448ec3f03c0fde2925b80c8471d54c2f69d6bc76e
3
+ size 366424