ludekcizinsky committed
Commit 8e53f74
1 Parent(s): 0a71fa6

feat(init): init commit

.DS_Store ADDED
Binary file (6.15 kB).
 
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
app.py ADDED
@@ -0,0 +1,57 @@
+ import gradio as gr
+ from typing import Dict
+ import os
+
+ from homepage2vec.model import WebsiteClassifier as Homepage2Vec
+
+ EXAMPLES = [
+     ["gpt3.5", "tanjasenghaasdesigns.de"],
+     ["gpt3.5", "epfl.ch"],
+     ["gpt3.5", "cc.cz"],
+     ["gpt3.5", "promaminky.cz"]
+ ]
+
+
+ def predict(model_choice: str, url: str) -> Dict[str, float]:
+     """
+     Predict the categories of a website using the Homepage2Vec model.
+
+     Args:
+         model_choice (str): The model to use for prediction.
+         url (str): The URL of the website to classify.
+
+     Returns:
+         Dict[str, float]: The categories and their corresponding scores.
+     """
+
+     # Define the model directory path
+     model_dir = os.path.join("models", model_choice)
+
+     # Initialise the model
+     model = Homepage2Vec(model_dir=model_dir)
+
+     # Website to predict
+     website = model.fetch_website(url)
+
+     # Obtain scores and embeddings
+     scores, _ = model.predict(website)
+
+     # Keep only the categories with a score greater than 0.5
+     scores = {k: v for k, v in scores.items() if v > 0.5}
+
+     return scores
+
+
+ iface = gr.Interface(
+     fn=predict,
+     inputs=[gr.Dropdown(choices=["gpt3.5", "gpt4"], label="Select Model"),
+             gr.Textbox(label="Enter Website URL", placeholder="www.mikasenghaas.de")],
+     outputs=gr.Label(num_top_classes=14, label="Predicted Labels", show_label=True),
+     title="Homepage2Vec",
+     description="Use Homepage2Vec to predict the categories of any website you wish.",
+     examples=EXAMPLES,
+     live=False,
+     allow_flagging="never",
+ )
+
+ iface.launch()
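Once the Space is up, the same interface can also be queried programmatically with gradio_client. The snippet below is only a sketch: the Space id is a placeholder (not part of this commit), and the endpoint name may differ depending on the Gradio version.

from gradio_client import Client

client = Client("<user>/<space-name>")  # placeholder Space id, replace with the deployed Space
# Same positional inputs as the Interface above; api_name="/predict" is the usual default for fn=predict
result = client.predict("gpt3.5", "epfl.ch", api_name="/predict")
print(result)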
homepage2vec/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """
+ Adapted version of the code from Homepage2Vec (https://github.com/epfl-dlab/homepage2vec).
+ """
homepage2vec/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (268 Bytes).
 
homepage2vec/__pycache__/data_collection.cpython-310.pyc ADDED
Binary file (1.33 kB).
 
homepage2vec/__pycache__/model.cpython-310.pyc ADDED
Binary file (5.42 kB).
 
homepage2vec/__pycache__/textual_extractor.cpython-310.pyc ADDED
Binary file (8.94 kB).
 
homepage2vec/data_collection.py ADDED
@@ -0,0 +1,42 @@
+ """
+ Module to access and load a webpage to be used by the homepage2vec model.
+
+ Includes:
+ - access_website: Function to access a website and return its response.
+ """
+
+ import requests
+
+
+ def access_website(url, timeout=10):
+     """
+     Return the response corresponding to a url, or None if there was a request error.
+     """
+
+     try:
+         # Change the user-agent so that we don't look like a bot
+         headers = requests.utils.default_headers()
+         headers.update(
+             {
+                 "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:84.0) Gecko/20100101 Firefox/84.0",
+             }
+         )
+
+         # r_head = requests.head("http://" + url, timeout=timeout, headers=headers)
+         if not url.startswith("http://") and not url.startswith("https://"):
+             url = "http://" + url
+
+         r_get = requests.get(url, timeout=timeout, headers=headers)
+
+         # head_code = r_head.status_code
+         get_code = r_get.status_code
+         if r_get.encoding is None or r_get.encoding.lower() != "utf-8":
+             r_get.encoding = r_get.apparent_encoding
+         text = r_get.text
+         content_type = r_get.headers.get("content-type", "?").strip()
+         return text, get_code, content_type
+
+     except Exception:
+         return None
homepage2vec/model.py ADDED
@@ -0,0 +1,192 @@
+ """
+ Module that defines the Homepage2vec model (consisting of a textual extractor and a classifier).
+
+ Includes:
+ - WebsiteClassifier: Class to load and use the Homepage2vec model.
+ - SimpleClassifier: Class to define the architecture of the Homepage2vec model.
+ - Webpage: Class to define a webpage query.
+ """
+
+ import json
+ import os
+ import tempfile
+ import uuid
+ from typing import OrderedDict
+
+ import numpy as np
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+
+ from homepage2vec.data_collection import access_website
+ from homepage2vec.textual_extractor import TextualExtractor
+
+
+ class WebsiteClassifier:
+     """
+     Pretrained Homepage2vec model
+     """
+
+     def __init__(
+         self,
+         model_dir: str,
+         device=None,
+         cpu_threads_count=1,
+         dataloader_workers=1,
+         state_dict: OrderedDict | None = None,
+     ):
+         self.input_dim = 4665
+         self.output_dim = 14
+         self.classes = [
+             "Arts",
+             "Business",
+             "Computers",
+             "Games",
+             "Health",
+             "Home",
+             "Kids_and_Teens",
+             "News",
+             "Recreation",
+             "Reference",
+             "Science",
+             "Shopping",
+             "Society",
+             "Sports",
+         ]
+
+         self.temporary_dir = tempfile.gettempdir() + "/homepage2vec/"
+
+         self.device = device
+         self.dataloader_workers = dataloader_workers
+         os.environ["TOKENIZERS_PARALLELISM"] = "false"
+         if not device:
+             if torch.cuda.is_available():
+                 self.device = "cuda:0"
+             else:
+                 self.device = "cpu"
+                 torch.set_num_threads(cpu_threads_count)
+
+         # Load the state dict from disk if it was not passed in
+         if not state_dict:
+             weight_path = os.path.join(model_dir, "model.pt")
+             state_dict = torch.load(weight_path, map_location=torch.device(self.device))
+
+         # Load the pretrained model
+         self.model = SimpleClassifier(self.input_dim, self.output_dim)
+         self.model.load_state_dict(state_dict)
+
+         # Features used in training (one "name dimension" pair per line)
+         self.features_order = []
+         self.features_dim = {}
+         feature_path = os.path.join(model_dir, "features.txt")
+         with open(feature_path, "r") as file:
+             for f in file:
+                 name = f.split(" ")[0]
+                 dim = int(f.split(" ")[1][:-1])
+                 self.features_order.append(name)
+                 self.features_dim[name] = dim
+
+     def get_scores(self, x):
+         with torch.no_grad():
+             self.model.eval()
+             return self.model.forward(x)
+
+     def fetch_website(self, url):
+         response = access_website(url)
+         w = Webpage(url)
+         if response is not None:
+             html, get_code, content_type = response
+             w.http_code = get_code
+             if self.is_valid(get_code, content_type):
+                 w.is_valid = True
+                 w.html = html
+
+         return w
+
+     def get_features(self, url, html, screenshot_path):
+         te = TextualExtractor(self.device)
+         features = te.get_features(url, html)
+
+         return features
+
+     def predict(self, website):
+         website.features = self.get_features(
+             website.url, website.html, website.screenshot_path
+         )
+         all_features = self.concatenate_features(website)
+         input_features = torch.FloatTensor(all_features)
+         scores, embeddings = self.get_scores(input_features)
+         return (
+             dict(zip(self.classes, torch.sigmoid(scores).tolist())),
+             embeddings.tolist(),
+         )
+
+     def concatenate_features(self, w):
+         """
+         Concatenate the feature attributes of a webpage instance, following the feature order used by Homepage2vec
+         """
+
+         v = np.zeros(self.input_dim)
+
+         ix = 0
+
+         for f_name in self.features_order:
+             f_dim = self.features_dim[f_name]
+             f_value = w.features[f_name]
+             if f_value is None:
+                 f_value = f_dim * [0]  # if no feature, replace with zeros
+             v[ix : ix + f_dim] = f_value
+             ix += f_dim
+
+         return v
+
+     def is_valid(self, get_code, content_type):
+         valid_get_code = get_code == 200
+         valid_content_type = content_type.startswith("text/html")
+         return valid_get_code and valid_content_type
+
+
+ class SimpleClassifier(nn.Module):
+     """
+     Model architecture of Homepage2vec
+     """
+
+     def __init__(self, input_dim, output_dim, dropout=0.5):
+         super(SimpleClassifier, self).__init__()
+
+         self.layer1 = torch.nn.Linear(input_dim, 1000)
+         self.layer2 = torch.nn.Linear(1000, 100)
+         self.fc = torch.nn.Linear(100, output_dim)
+
+         self.drop = torch.nn.Dropout(dropout)  # dropout of 0.5 before each layer
+
+     def forward(self, x):
+         x = self.layer1(x)
+         x = F.relu(self.drop(x))
+
+         emb = self.layer2(x)
+         x = F.relu(self.drop(emb))
+
+         x = self.fc(x)
+
+         return x, emb
+
+
+ class Webpage:
+     """
+     Shell for a webpage query
+     """
+
+     def __init__(self, url):
+         self.url = url
+         self.uid = uuid.uuid4().hex
+         self.is_valid = False
+         self.http_code = False
+         self.html = None
+         self.screenshot_path = None
+         self.features = None
+         self.embedding = None
+         self.scores = None
+
+     def __repr__(self):
+         return json.dumps(self.__dict__)
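Putting the pieces of model.py together, a minimal usage sketch (assuming the LFS checkpoint under models/gpt3.5 has been pulled and the sentence-transformers weights can be downloaded): predict returns the per-class sigmoid scores together with the 100-dimensional embedding taken from the second layer of SimpleClassifier.

from homepage2vec.model import WebsiteClassifier

clf = WebsiteClassifier(model_dir="models/gpt3.5")
page = clf.fetch_website("epfl.ch")        # Webpage shell; html is filled in only if the fetch was valid
scores, embedding = clf.predict(page)

assert set(scores) == set(clf.classes)     # one sigmoid score per class (14 classes)
assert len(embedding) == 100               # embedding comes from SimpleClassifier.layer2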
homepage2vec/textual_extractor.py ADDED
@@ -0,0 +1,341 @@
+ """
+ Module to extract textual features from the html content of a webpage.
+
+ Includes:
+ - TextualExtractor: Class to extract textual features from the html content of a webpage.
+ - embed_text: Function to embed the text of a webpage.
+ - embed_description: Function to embed the description of a webpage.
+ - embed_keywords: Function to embed the keywords of a webpage.
+ - embed_title: Function to embed the title of a webpage.
+ - embed_links: Function to embed the links of a webpage.
+ - embed_url: Function to embed the url of a webpage.
+ - embed_tld: Function to embed the top-level domain of a webpage.
+ - embed_metatags: Function to embed the metatags of a webpage.
+ - split_in_sentences: Function to split the text of a webpage into sentences.
+ - clean_url: Function to clean the url of a webpage.
+ - clean_field: Function to clean a field of a webpage.
+ - clean_link: Function to clean a link of a webpage.
+ - trunc: Function to truncate the output of a tokenizer to a given length.
+ """
+
+ import logging
+ import re
+ from collections import Counter
+
+ from bs4 import BeautifulSoup
+ from sentence_transformers import SentenceTransformer
+
+ logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
+
+
+ class TextualExtractor:
+     """
+     Extract textual features from the html content of a webpage
+     """
+
+     xlmr = None
+
+     def __init__(self, device="cpu"):
+         if not TextualExtractor.xlmr:
+             # Turn off logging and progress bar
+             TextualExtractor.xlmr = SentenceTransformer(
+                 "paraphrase-xlm-r-multilingual-v1",
+                 device=device,
+             )
+             # self.xlmr = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1', device=device)
+
+         # TLDs used for one-hot encoding
+         self.rep_tld = [
+             "com",
+             "org",
+             "net",
+             "info",
+             "xyz",
+             "club",
+             "biz",
+             "top",
+             "edu",
+             "online",
+             "pro",
+             "site",
+             "vip",
+             "icu",
+             "buzz",
+             "app",
+             "asia",
+             "su",
+             "gov",
+             "space",
+         ]
+
+         # Metatags used for one-hot encoding
+         self.rep_metatags = [
+             "viewport",
+             "description",
+             "generator",
+             "keywords",
+             "robots",
+             "twitter:card",
+             "msapplication-tileimage",
+             "google-site-verification",
+             "author",
+             "twitter:title",
+             "twitter:description",
+             "theme-color",
+             "twitter:image",
+             "twitter:site",
+             "format-detection",
+             "msapplication-tilecolor",
+             "copyright",
+             "twitter:data1",
+             "twitter:label1",
+             "revisit-after",
+             "apple-mobile-web-app-capable",
+             "handheldfriendly",
+             "language",
+             "msvalidate.01",
+             "twitter:url",
+             "title",
+             "mobileoptimized",
+             "twitter:creator",
+             "skype_toolbar",
+             "rating",
+         ]
+
+         # Number of sentences and links over which we compute the features
+         self.k_sentences = 100
+         self.k_links = 50
+
+     def get_features(self, url, html):
+         features = {}
+
+         # url
+         url_feature = embed_url(url, TextualExtractor.xlmr)
+         features["f_url"] = url_feature
+
+         # tld
+         tld_feature = embed_tld(url, self.rep_tld)
+         features["f_tld"] = tld_feature
+
+         # print(html)
+         soup = BeautifulSoup(html, "lxml")
+
+         # metatags
+         metatags_feature = embed_metatags(soup, self.rep_metatags)
+         features["f_metatags"] = metatags_feature
+
+         # title
+         title_feature = embed_title(soup, TextualExtractor.xlmr)
+         features["f_title"] = title_feature
+
+         # description
+         description_feature = embed_description(soup, TextualExtractor.xlmr)
+         features["f_description"] = description_feature
+
+         # keywords
+         keywords_feature = embed_keywords(soup, TextualExtractor.xlmr)
+         features["f_keywords"] = keywords_feature
+
+         # links
+         links_feature = embed_links(soup, TextualExtractor.xlmr, self.k_links)
+         features["f_links_" + str(self.k_links)] = links_feature
+
+         # text
+         text_feature = embed_text(soup, TextualExtractor.xlmr, self.k_sentences)
+         features["f_text_" + str(self.k_sentences)] = text_feature
+
+         return features
+
+
+ def embed_text(soup, transformer, k_sentences):
+     """Embed the text of a webpage"""
+     sentences = split_in_sentences(soup)[:k_sentences]
+
+     if len(sentences) == 0:
+         return None
+
+     # This is needed to avoid some warnings: truncate the sentences
+     sentences_trunc = [
+         trunc(s, transformer.tokenizer, transformer.max_seq_length) for s in sentences
+     ]
+
+     sentences_emb = transformer.encode(sentences_trunc)
+
+     if sentences_emb.size == 0:
+         return None
+
+     text_emb = sentences_emb.mean(axis=0).tolist()  # mean of the sentences
+
+     return text_emb
+
+
+ def embed_description(soup, transformer):
+     """Embed the description of a webpage"""
+     desc = soup.find("meta", attrs={"name": ["description", "Description"]})
+
+     if not desc:
+         return None
+
+     content = desc.get("content", "")
+
+     if len(content.strip()) == 0:
+         return None
+
+     content = clean_field(content)
+
+     # This is needed to avoid some warnings
+     desc_trunc = trunc(content, transformer.tokenizer, transformer.max_seq_length)
+     desc_emb = transformer.encode(desc_trunc)
+
+     if desc_emb.size == 0:
+         return None
+
+     return desc_emb.tolist()
+
+
+ def embed_keywords(soup, transformer):
+     """Embed the keywords of a webpage"""
+     kw = soup.find("meta", attrs={"name": "keywords"})
+
+     if not kw:
+         return None
+
+     content = kw.get("content", "")
+
+     if len(content.strip()) == 0:
+         return None
+
+     # This is needed to avoid some warnings
+     kw_trunc = trunc(content, transformer.tokenizer, transformer.max_seq_length)
+     kw_emb = transformer.encode(kw_trunc)
+
+     if kw_emb.size == 0:
+         return None
+
+     return kw_emb.tolist()
+
+
+ def embed_title(soup, transformer):
+     """Embed the title of a webpage"""
+     title = soup.find("title")
+
+     if title is None:
+         return None
+
+     title = str(title.string)
+     title = clean_field(title)
+
+     if len(title) == 0:
+         return None
+
+     # This is needed to avoid some warnings
+     title_trunc = trunc(title, transformer.tokenizer, transformer.max_seq_length)
+     title_emb = transformer.encode(title_trunc)
+
+     if title_emb.size == 0:
+         return None
+
+     return title_emb.tolist()
+
+
+ def embed_links(soup, transformer, k_links):
+     """Embed the links of a webpage"""
+     a_tags = soup.find_all("a", href=True)
+
+     links = [a.get("href", "") for a in a_tags]
+     links = [clean_link(link) for link in links]
+     links = [link for link in links if len(link) != 0]
+
+     words = [w.lower() for w in " ".join(links).split(" ") if len(w) != 0]
+
+     if len(words) == 0:
+         return None
+
+     most_frequent_words = [w[0] for w in Counter(words).most_common(k_links)]
+
+     # most_frequent_words = pd.Series(words).value_counts()[:k_links].index.values
+
+     # This is needed to avoid some warnings
+     words_trunc = [
+         trunc(w, transformer.tokenizer, transformer.max_seq_length)
+         for w in most_frequent_words
+     ]
+     words_emb = transformer.encode(words_trunc)
+
+     if words_emb.size == 0:
+         return None
+
+     links_emb = words_emb.mean(axis=0).tolist()
+
+     return links_emb
+
+
+ def embed_url(url, transformer):
+     """Embed the url of a webpage"""
+     cleaned_url = clean_url(url)
+
+     url_emb = transformer.encode(cleaned_url)
+
+     if url_emb.size == 0:
+         return None
+
+     return url_emb.mean(axis=0).tolist()
+
+
+ def embed_tld(url, rep_tld):
+     """Embed the top-level domain of a webpage"""
+     tld = url.split(".")[-1]
+     rep_onehot = [int(tld.startswith(d)) for d in rep_tld]
+     continent_onehot = 7 * [0]  # TODO
+
+     return rep_onehot + continent_onehot
+
+
+ def embed_metatags(soup, rep_metatags):
+     """Embed the metatags of a webpage"""
+     metatags = soup.find_all("meta")
+     attr = [m.get("name", None) for m in metatags]
+     attr = [a.lower() for a in attr if a is not None]
+
+     attr_emb = [int(a in attr) for a in rep_metatags]
+
+     return attr_emb
+
+
+ def split_in_sentences(soup):
+     """From the raw html content of a website, extract the text visible to the user and split it into sentences"""
+
+     sep = soup.get_text("[SEP]").split(
+         "[SEP]"
+     )  # separate text elements with special separators [SEP]
+     strip = [s.strip() for s in sep if s != "\n"]
+     clean = [s for s in strip if len(s) != 0]
+
+     return clean
+
+
+ def clean_url(url):
+     """Clean the url of a webpage"""
+     url = re.sub(r"www.|http://|https://|-|_", "", url)
+     return url.split(".")[:-1]
+
+
+ def clean_field(field):
+     """Clean a field of a webpage"""
+     field = re.sub(r"\*|\n|\r|\t|\||:|-|–", "", field)
+     return field.strip()
+
+
+ def clean_link(link):
+     """Clean a link of a webpage"""
+     link = re.sub(r"www.|http://|https://|[0-9]+", "", link)
+     link = re.sub(r"-|_|=|\?|:", " ", link)
+     link = link.split("/")[1:]
+     return " ".join(link).strip()
+
+
+ def trunc(seq, tok, max_length):
+     """Truncate the output of a tokenizer to a given length; this does not affect performance"""
+     e = tok.encode(seq, truncation=True)
+     d = tok.decode(e[1:-1][: max_length - 2])
+     return d
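The dictionary returned by TextualExtractor.get_features uses the keys f_url, f_tld, f_metatags, f_title, f_description, f_keywords, f_links_50 and f_text_100, matching the rows of models/gpt3.5/features.txt further below. A rough sketch of inspecting it (note that the first call downloads the paraphrase-xlm-r-multilingual-v1 weights):

from homepage2vec.textual_extractor import TextualExtractor

te = TextualExtractor(device="cpu")
features = te.get_features("epfl.ch", "<html><head><title>EPFL</title></head><body>Example</body></html>")
print(sorted(features.keys()))
# Missing fields (e.g. no meta description) come back as None and are zero-filled later by concatenate_features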
models/.DS_Store ADDED
Binary file (6.15 kB).
 
models/gpt3.5/features.txt ADDED
@@ -0,0 +1,8 @@
+ f_tld 27
+ f_url 768
+ f_metatags 30
+ f_title 768
+ f_description 768
+ f_keywords 768
+ f_links_50 768
+ f_text_100 768
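For reference, these per-feature dimensions sum to 27 + 30 + 6 × 768 = 4665, which matches the input_dim = 4665 hard-coded in homepage2vec/model.py, and the feature names match the keys produced by TextualExtractor.get_features.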
models/gpt3.5/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3d40bb85c577a8c0951b585714c35fa10509267f0d52ec1c6952f650e9622887
+ size 19072308
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [tool.poetry]
+ name = "homepage2vec"
+ version = "0.1.0"
+ description = "Website Classifier"
+ authors = ["Your Name <you@example.com>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "3.10.13"
+ requests = "*"
+ torch = "*"
+ beautifulsoup4 = "*"
+ lxml = "*"
+ sentence-transformers = "*"
+ numpy = "*"
+
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ requests
+ torch
+ beautifulsoup4
+ lxml
+ sentence-transformers
+ numpy