ludekcizinsky committed on
Commit
f47e717
•
1 Parent(s): db4540e

feat(new models): added new models, refactor the ui

Browse files
app.py CHANGED
@@ -5,10 +5,21 @@ import os
5
  from homepage2vec.model import WebsiteClassifier as Homepage2Vec
6
 
7
  EXAMPLES = [
8
- ["gpt3.5", "tanjasenghaasdesigns.de"],
9
- ["gpt3.5", "epfl.ch"],
10
- ["gpt3.5", "cc.cz"],
11
- ["gpt3.5", "promaminky.cz"]
 
 
 
 
 
 
 
 
 
 
 
12
  ]
13
 
14
 
@@ -24,8 +35,11 @@ def predict(model_choice : str, url : str) -> Dict[str, float]:
24
  Dict[str, float]: The categories and their corresponding scores.
25
  """
26
 
27
- # Define the model directory path
28
- model_dir = os.path.join("models", model_choice)
 
 
 
29
 
30
  # Initialise model
31
  model = Homepage2Vec(model_dir=model_dir)
@@ -44,11 +58,11 @@ def predict(model_choice : str, url : str) -> Dict[str, float]:
44
 
45
  iface = gr.Interface(
46
  fn=predict,
47
- inputs=[gr.Dropdown(choices=["gpt3.5", "gpt4"], label="Select Model"),
48
- gr.Textbox(label="Enter Website URL", placeholder="www.mikasenghaas.de")],
49
  outputs=gr.Label(num_top_classes=14, label="Predicted Labels", show_label=True),
50
  title="Homepage2Vec",
51
- description="Use Homepage2Vec to predict the categories of any website you wish.",
52
  examples=EXAMPLES,
53
  live=False,
54
  allow_flagging="never",
 
5
  from homepage2vec.model import WebsiteClassifier as Homepage2Vec
6
 
7
  EXAMPLES = [
8
+ # Personal site
9
+ ["original", "tanjasenghaasdesigns.de"],
10
+ ["finetuned-gpt4", "tanjasenghaasdesigns.de"],
11
+
12
+ # EPFL
13
+ ["finetuned-gpt3.5", "epfl.ch"],
14
+ ["finetuned-gpt4", "epfl.ch"],
15
+
16
+ # Czech Crunch - czech tech news
17
+ ["original", "cc.cz"],
18
+ ["finetuned-gpt4", "cc.cz"],
19
+
20
+ # Promaminky - czech site for moms
21
+ ["original", "promaminky.cz"],
22
+ ["finetuned-gpt3.5", "promaminky.cz"],
23
  ]
24
 
25
 
 
35
  Dict[str, float]: The categories and their corresponding scores.
36
  """
37
 
38
+ if model_choice == "original":
39
+ model_dir = os.path.join("models", "homepage2vec")
40
+ else:
41
+ which_gpt = model_choice.split("-")[1]
42
+ model_dir = os.path.join("models", "finetuned", which_gpt)
43
 
44
  # Initialise model
45
  model = Homepage2Vec(model_dir=model_dir)
 
58
 
59
  iface = gr.Interface(
60
  fn=predict,
61
+ inputs=[gr.Dropdown(choices=["original", "finetuned-gpt3.5", "finetuned-gpt4"], label="Select Model", show_label=True, value="finetuned-gpt4"),
62
+ gr.Textbox(label="Enter Website's URL or domain", placeholder="e.g. ikea.com")],
63
  outputs=gr.Label(num_top_classes=14, label="Predicted Labels", show_label=True),
64
  title="Homepage2Vec",
65
+ description="Select a version of the Homepage2Vec model and enter a website's URL or domain to predict its categories. The original model was trained on 886K websites from Curlie directory. The finetuned models, in addition, were trained on GPT annotated websites. On average, the fintuned models should predict more labels than the original model while maintaining high accuracy.",
66
  examples=EXAMPLES,
67
  live=False,
68
  allow_flagging="never",
homepage2vec/__pycache__/data_collection.cpython-310.pyc CHANGED
Binary files a/homepage2vec/__pycache__/data_collection.cpython-310.pyc and b/homepage2vec/__pycache__/data_collection.cpython-310.pyc differ
 
models/.DS_Store CHANGED
Binary files a/models/.DS_Store and b/models/.DS_Store differ
 
models/{gpt3.5 → finetuned/gpt3.5}/features.txt RENAMED
File without changes
models/{gpt3.5 → finetuned/gpt3.5}/model.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d40bb85c577a8c0951b585714c35fa10509267f0d52ec1c6952f650e9622887
3
  size 19072308
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3c325b49245d46326daa3c9ff6dd4e52b806cf88b8986e27598419764aed3ab
3
  size 19072308
models/finetuned/gpt4/features.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ f_tld 27
2
+ f_url 768
3
+ f_metatags 30
4
+ f_title 768
5
+ f_description 768
6
+ f_keywords 768
7
+ f_links_50 768
8
+ f_text_100 768
models/finetuned/gpt4/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b0c10a4a65ad90f87e33f7c62364b4d44991e72819d186785beff84c24f0f22
3
+ size 19072308
models/homepage2vec/features.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ f_tld 27
2
+ f_url 768
3
+ f_metatags 30
4
+ f_title 768
5
+ f_description 768
6
+ f_keywords 768
7
+ f_links_50 768
8
+ f_text_100 768
models/homepage2vec/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634de448281c5026597c05b5132ac0dc802305689adc104bde71949186999215
3
+ size 19072357