Commit 64d4f97 by Benjamin Bossan
Parent: c19ef6e

Add actual code for processing web pages


This uses trafilatura to extract the main text from a web page.
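
As a rough illustration of that flow (a minimal sketch, not the project's exact code; the URL is simply the Wikipedia page used later in requests.org), trafilatura takes the raw HTML string and returns the main text:

# Minimal sketch: fetch a page and let trafilatura pull out the main text.
# httpx and trafilatura are both listed in requirements.txt in this commit.
import httpx
from trafilatura import extract

url = "https://en.wikipedia.org/wiki/Goulburn_Street"  # example URL from requests.org
html = httpx.get(url).text
main_text = extract(html)  # main article text as a string, or None if nothing is found
print(main_text)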

pyproject.toml CHANGED
@@ -17,5 +17,6 @@ addopts = "--cov=src --cov-report=term-missing"
 no_implicit_optional = true
 strict = true
 
-[[tool.mypy-transformers]]
+[[tool.mypy.overrides]]
+module = "transformers,trafilatura"
 ignore_missing_imports = true
requests.org CHANGED
@@ -22,7 +22,7 @@ curl -X 'POST' \
 #+end_src
 
 #+RESULTS:
-: Submitted job 6012b198ffe0467d9344a196a2ced121
+: Submitted job 04deee1a2a9b4d6ea986ffe0fa4017d9
 
 #+begin_src bash
 curl -X 'POST' \
@@ -36,16 +36,30 @@ curl -X 'POST' \
 #+end_src
 
 #+RESULTS:
-: Submitted job 05058b906f524fb4bfedc4f5a84eff06
+: Submitted job 730352e00e8145b39971fdc386c28a8f
+
+#+begin_src bash
+curl -X 'POST' \
+  'http://localhost:8080/submit/' \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+  "author": "ben",
+  "content": "https://en.wikipedia.org/wiki/Goulburn_Street"
+  }'
+#+end_src
+
+#+RESULTS:
+: Submitted job 1738d7daa96147198d80b93ea040863d
 
 #+begin_src bash
 curl -X 'GET' \
-  'http://localhost:8080/check_job_status/6012b198ffe0467d9344a196a2ced121' \
+  'http://localhost:8080/check_job_status/1738d7daa96147198d80b93ea040863d' \
   -H 'accept: application/json'
 #+end_src
 
 #+RESULTS:
-| {"id":"6012b198ffe0467d9344a196a2ced121" | status:"done" | last_updated:"2023-05-08T12:27:07"} |
+| {"id":"1738d7daa96147198d80b93ea040863d" | status:"pending" | last_updated:"2023-05-09T13:24:42"} |
 
 #+begin_src bash
 curl -X 'GET' \
@@ -54,4 +68,4 @@ curl -X 'GET' \
 #+end_src
 
 #+RESULTS:
-| [{"id":"05058b906f524fb4bfedc4f5a84eff06" | author:"ben" | summary:"A new approach to NLP that incorporates reinforcement learning and human feedback. How does it work? Why does it work? In this post | I’ll explain how it works. RLHF is a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback." | tags:["#general" | #rlhf] | date:"2023-05-08T12:27:31"} | {"id":"6012b198ffe0467d9344a196a2ced121" | author:"ben" | summary:"GitLab | the most comprehensive | scalable enterprise DevSecOps platform for software innovation | and Google Cloud today announced an extension of their strategic partnership to deliver secure AI offerings to the enterprise. By leveraging Google Cloud's customizable foundation models and open generative AI infrastructure | GitLab will provide customers with AI-assisted features directly within the enterprise DevSecOps platform. The company's AI capabilities are designed to help enterprises improve productivity and reduce costs." | tags:["#general"] | date:"2023-05-08T12:27:07"}] |
+| [{"id":"1738d7daa96147198d80b93ea040863d" | author:"ben" | summary:"Goulburn Street is a street in the central business district of Sydney | New South Wales | Australia. It runs from Darling Harbour and Chinatown in the west to Crown Street in the east at Darlinghurst and Surry Hills. The only car park operated by Sydney City Council within the CBD is at the corner of Goulburn and Elizabeth Streets. It was the first air rights car park in Australia | opening in 1963 over six tracks of the City Circle line.[3][4]" | tags:["#centralbusinessdistrict" | #darlinghurst | #general | #goulburnstreet | #surryhills | #sydney | #sydneymasoniccentre] | date:"2023-05-09T13:24:42"} | {"id":"730352e00e8145b39971fdc386c28a8f" | author:"ben" | summary:"A new approach to NLP that incorporates reinforcement learning and human feedback. How does it work? Why does it work? In this post | I’ll explain how it works. RLHF is a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback. It’s a new approach to NLP that incorporates reinforcement learning and human feedback." | tags:["#" | #general | #rlhf] | date:"2023-05-09T13:24:38"} | {"id":"04deee1a2a9b4d6ea986ffe0fa4017d9" | author:"ben" | summary:"GitLab | the most comprehensive | scalable enterprise DevSecOps platform for software innovation | and Google Cloud today announced an extension of their strategic partnership to deliver secure AI offerings to the enterprise. By leveraging Google Cloud's customizable foundation models and open generative AI infrastructure | GitLab will provide customers with AI-assisted features directly within the enterprise DevSecOps platform. The company's AI capabilities are designed to help enterprises improve productivity and reduce costs." | tags:["#ai-assistedfeatures" | #enterprisedevsecopsplatform | #general | #gitlab | #googlecloud] | date:"2023-05-09T13:24:36"}] |
requirements.txt CHANGED
@@ -3,3 +3,5 @@ httpx
 uvicorn[standard]
 torch
 transformers
+charset-normalizer
+trafilatura
src/gistillery/ml.py CHANGED
@@ -70,14 +70,17 @@ class HfTransformersTagger(Tagger):
 
         self.template = (
             "Create a list of tags for the text below. The tags should be high level "
-            "and specific. Prefix each tag with a hashtag.\n\n{}\n\nTags: #general"
+            "and specific. Return the results as a comma separated list.\n\n"
+            "{}\n\nTags:\n"
         )
 
     def _extract_tags(self, text: str) -> list[str]:
-        tags = set()
-        for tag in text.split():
-            if tag.startswith("#"):
-                tags.add(tag.lower())
+        tags = {"#general"}
+        for tag in text.split(","):
+            tag = tag.strip().lower().replace(" ", "")
+            if not tag.startswith("#"):
+                tag = "#" + tag
+            tags.add(tag)
         return sorted(tags)
 
     def __call__(self, x: str) -> list[str]:
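
The reworked prompt asks the model for a comma separated list instead of hashtag-prefixed tokens, and _extract_tags normalizes that output. A standalone sketch of the same parsing, with a made-up model response:

# Standalone sketch of the new parsing logic in HfTransformersTagger._extract_tags:
# split on commas, normalize each item, prefix it with "#", and always keep "#general".
def extract_tags(text: str) -> list[str]:
    tags = {"#general"}
    for tag in text.split(","):
        tag = tag.strip().lower().replace(" ", "")
        if not tag.startswith("#"):
            tag = "#" + tag
        tags.add(tag)
    return sorted(tags)

# Hypothetical model output for the new "comma separated list" prompt:
print(extract_tags("Sydney, Goulburn Street, central business district"))
# ['#centralbusinessdistrict', '#general', '#goulburnstreet', '#sydney']

Note that an empty item still becomes a bare "#", which is what produces the stray "#" tag visible in the requests.org results above.
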
src/gistillery/preprocessing.py CHANGED
@@ -2,7 +2,8 @@ import abc
 import logging
 import re
 
-import httpx
+from httpx import Client
+from trafilatura import extract
 
 from gistillery.base import JobInput
 
@@ -39,8 +40,9 @@ class RawTextProcessor(Processor):
 
 
 class DefaultUrlProcessor(Processor):
+    # uses trafilatura to extract text from html
     def __init__(self) -> None:
-        self.client = httpx.Client()
+        self.client = Client()
         self.regex = re.compile(r"(https?://[^\s]+)")
         self.url = None
         self.template = "{url}\n\n{content}"
@@ -57,5 +59,6 @@ class DefaultUrlProcessor(Processor):
         assert isinstance(self.url, str)
         text = self.client.get(self.url).text
         assert isinstance(text, str)
-        text = self.template.format(url=self.url, content=text)
+        extracted = extract(text)
+        text = self.template.format(url=self.url, content=extracted)
         return text
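
Taken together, DefaultUrlProcessor now fetches the page and runs it through trafilatura before filling in the template. A rough standalone equivalent of that path (a sketch, not the class itself):

# Rough sketch of what DefaultUrlProcessor does after this change: fetch the page,
# extract the main text with trafilatura, then apply the "{url}\n\n{content}" template.
from httpx import Client
from trafilatura import extract

def process_url(url: str) -> str:
    client = Client()
    html = client.get(url).text
    extracted = extract(html)  # main text only; may be None if extraction finds nothing
    return "{url}\n\n{content}".format(url=url, content=extracted)

The new test in tests/test_app.py relies on exactly this behavior: trafilatura reduces the mocked example.com markup to just the paragraph text that makes up the expected string.
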
src/gistillery/registry.py CHANGED
@@ -12,8 +12,11 @@ class MlRegistry:
         self.model = None
         self.tokenizer = None
 
-    def register_processor(self, processor: Processor) -> None:
-        self.processors.append(processor)
+    def register_processor(self, processor: Processor, last: bool = True) -> None:
+        if last:
+            self.processors.append(processor)
+        else:
+            self.processors.insert(0, processor)
 
     def register_summarizer(self, summarizer: Summarizer) -> None:
         self.summerizer = summarizer
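
The new last flag only decides whether a processor is appended or pushed to the front of the list; the test below registers the URL processor with last=False so it is tried before the default raw-text processor. A small illustration of the ordering, using plain strings instead of real Processor instances:

# Illustration of the ordering behavior of MlRegistry.register_processor.
processors = []

def register_processor(processor, last=True):
    if last:
        processors.append(processor)
    else:
        processors.insert(0, processor)

register_processor("RawTextProcessor")                 # default: appended at the end
register_processor("DefaultUrlProcessor", last=False)  # inserted at the front
print(processors)  # ['DefaultUrlProcessor', 'RawTextProcessor']
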
tests/test_app.py CHANGED
@@ -234,3 +234,65 @@ class TestWebservice:
         rows = cursor.execute("SELECT * FROM inputs").fetchall()
         assert len(rows) == 1
         assert rows[0].input == "this is a test"
+
+    def test_submit_url(self, client, cursor, mlregistry, monkeypatch):
+        class MockClient:
+            """Mock httpx Client, return www.example.com content"""
+
+            def get(self, url):
+                return SimpleNamespace(
+                    text=''' <!doctype html>\n<html>\n<head>\n <title>Example
+                    Domain</title>\n\n <meta charset="utf-8" />\n <meta
+                    http-equiv="Content-type" content="text/html; charset=utf-8"
+                    />\n <meta name="viewport" content="width=device-width,
+                    initial-scale=1" />\n <style type="text/css">\n body {\n
+                    background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n
+                    font-family: -apple-system, system-ui, BlinkMacSystemFont,
+                    "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial,
+                    sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em
+                    auto;\n padding: 2em;\n background-color: #fdfdff;\n
+                    border-radius: 0.5em;\n box-shadow: 2px 3px 7px 2px
+                    rgba(0,0,0,0.02);\n }\n a:link, a:visited {\n color:
+                    #38488f;\n text-decoration: none;\n }\n @media (max-width:
+                    700px) {\n div {\n margin: 0 auto;\n width: auto;\n }\n }\n
+                    </style> \n</head>\n\n<body>\n<div>\n <h1>Example
+                    Domain</h1>\n <p>This domain is for use in illustrative
+                    examples in documents. You may use this\n domain in
+                    literature without prior coordination or asking for
+                    permission.</p>\n <p><a
+                    href="https://www.iana.org/domains/example">More
+                    information...</a></p>\n</div>\n</body>\n</html>\n'''
+                )
+
+        monkeypatch.setattr("gistillery.preprocessing.Client", MockClient)
+
+        from gistillery.preprocessing import DefaultUrlProcessor
+
+        # register url processor, put it before the default processor
+        mlregistry.register_processor(DefaultUrlProcessor(), last=False)
+        client.post(
+            "/submit",
+            json={
+                "author": "ben",
+                "content": "https://en.wikipedia.org/wiki/non-existing-page",
+            },
+        )
+        self.process_jobs(mlregistry)
+
+        rows = cursor.execute("SELECT * FROM inputs").fetchall()
+        assert len(rows) == 1
+
+        expected = "\n".join(
+            [
+                'https://en.wikipedia.org/wiki/non-existing-page',
+                '',
+                'This domain is for use in illustrative',
+                'examples in documents. You may use this',
+                'domain in',
+                'literature without prior coordination or asking for',
+                'permission.',
+                'More',
+                'information...',
+            ]
+        )
+        assert rows[0].input == expected