File size: 14,916 Bytes
66bc8ec
 
b38dea1
66bc8ec
 
 
 
 
 
 
 
5757778
66bc8ec
5757778
 
66bc8ec
 
 
 
 
 
 
 
 
6ac056a
 
 
 
 
 
 
66bc8ec
 
 
 
 
 
 
 
 
01ae0bb
66bc8ec
01ae0bb
91194ca
01ae0bb
66bc8ec
 
 
1bfc3ce
66bc8ec
 
 
 
 
 
 
 
1bfc3ce
66bc8ec
 
 
 
 
 
01ae0bb
66bc8ec
 
 
01ae0bb
66bc8ec
 
01ae0bb
66bc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e4eb94
66bc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ae0bb
6ac056a
 
 
 
 
 
 
 
 
 
 
 
 
 
01ae0bb
6ac056a
 
 
 
 
 
 
 
 
01ae0bb
66bc8ec
 
 
 
01ae0bb
66bc8ec
 
 
 
 
 
 
 
 
 
01ae0bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66bc8ec
 
 
 
 
c19ef6e
 
66bc8ec
01ae0bb
66bc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71965fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ae0bb
66bc8ec
 
 
 
 
c19ef6e
 
66bc8ec
01ae0bb
66bc8ec
 
 
 
 
 
 
 
 
 
 
 
 
 
2e4eb94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ae0bb
66bc8ec
01ae0bb
6ac056a
66bc8ec
6ac056a
 
66bc8ec
01ae0bb
6ac056a
01ae0bb
6ac056a
 
 
64d4f97
01ae0bb
64d4f97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01ae0bb
64d4f97
 
 
 
 
 
 
01ae0bb
64d4f97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b38dea1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
import datetime as dt
import os
import sqlite3
from types import SimpleNamespace

import pytest
from fastapi.testclient import TestClient


def is_roughly_now(datetime_str, max_offset_seconds=3):
    """Check whether an ISO formatted datetime string is roughly "now".

    Args:
        datetime_str: Timestamp in a format understood by
            ``datetime.fromisoformat``. A timestamp without UTC offset is
            interpreted as UTC; a timestamp that carries an offset keeps it.
        max_offset_seconds: Exclusive upper bound, in seconds, for the distance
            between the timestamp and the current time.

    Returns:
        True if the timestamp is less than ``max_offset_seconds`` away from the
        current time (in either direction), False otherwise.

    Raises:
        ValueError: If ``datetime_str`` cannot be parsed by ``fromisoformat``.
    """
    now = dt.datetime.now(dt.timezone.utc)
    parsed = dt.datetime.fromisoformat(datetime_str)
    if parsed.tzinfo is None:
        # naive and aware datetimes cannot be subtracted, so attach UTC; an
        # offset that is already present must not be overwritten, as that
        # would silently shift the instant
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    # use the absolute distance so that timestamps from the future are rejected
    return abs((now - parsed).total_seconds()) < max_offset_seconds


class TestWebservice:
    """Tests for the gistillery web service endpoints.

    Every test runs against its own SQLite file (see ``db_file``). The
    gistillery modules are imported inside fixtures and tests instead of at
    module level -- presumably so that ``DB_FILE_NAME`` is set before they are
    first used (TODO confirm against ``gistillery.db``).
    """

    @pytest.fixture(autouse=True)
    def db_file(self, tmp_path, monkeypatch):
        """Set ``DB_FILE_NAME`` to a database file inside the test's tmp dir."""
        filename = tmp_path / "test-db.sqlite"
        # use monkeypatch instead of writing to os.environ directly so that the
        # previous value is restored after the test and does not leak
        monkeypatch.setenv("DB_FILE_NAME", str(filename))

    @pytest.fixture
    def cursor(self):
        """Yield a database cursor obtained from ``get_db_cursor``."""
        from gistillery.db import get_db_cursor

        with get_db_cursor() as cursor:
            yield cursor

    @pytest.fixture
    def client(self):
        """Return a ``TestClient`` for the app after requesting ``/clear``."""
        from gistillery.webservice import app

        client = TestClient(app)
        client.get("/clear")
        return client

    @pytest.fixture
    def registry(self):
        """Return a ``ToolRegistry`` that uses cheap dummy models."""
        # use dummy models
        from gistillery.tools import Summarizer, Tagger
        from gistillery.preprocessing import RawTextProcessor
        from gistillery.registry import ToolRegistry

        class DummySummarizer(Summarizer):
            """Returns the first 10 characters of the input"""

            def get_name(self):
                return "dummy summarizer"

            def __call__(self, x):
                return x[:10]

        class DummyTagger(Tagger):
            """Returns the first 3 words of the input, each prefixed with #"""

            def get_name(self):
                return "dummy tagger"

            def __call__(self, x):
                return ["#" + word for word in x.split(maxsplit=4)[:3]]

        registry = ToolRegistry()
        registry.register_processor(RawTextProcessor())

        # arguments don't matter for dummy summarizer and tagger
        summarizer = DummySummarizer()
        registry.register_summarizer(summarizer)

        tagger = DummyTagger()
        registry.register_tagger(tagger)
        return registry

    def process_jobs(self, registry):
        """Process all pending jobs, emulating the background worker."""
        from gistillery.worker import check_pending_jobs, process_job

        jobs = check_pending_jobs()
        for job in jobs:
            process_job(job, registry)

    @staticmethod
    def _submit_with_job_id(client, monkeypatch, job_id):
        """Submit the standard test entry with ``uuid4().hex`` fixed to ``job_id``.

        The job id is taken as a parameter (rather than captured from a loop
        variable) so that each patched ``uuid4`` is bound to its own value.
        """
        # monkeypatch uuid4 to return a known value
        monkeypatch.setattr("uuid.uuid4", lambda: SimpleNamespace(hex=job_id))
        client.post("/submit", json={"author": "ben", "content": "this is a test"})

    def test_status(self, client):
        """``/status`` answers with 200 and "OK"."""
        resp = client.get("/status")
        assert resp.status_code == 200
        assert resp.json() == "OK"

    def test_recent_empty(self, client):
        """``/recent`` returns an empty list when nothing was submitted."""
        resp = client.get("/recent")
        assert resp.json() == []

    def test_recent_tag_empty(self, client):
        """``/recent/<tag>`` returns an empty list when nothing was submitted."""
        resp = client.get("/recent/general")
        assert resp.json() == []

    def test_submitted_job_status_pending(self, client, monkeypatch):
        """A submitted but not yet processed job is reported as pending."""
        job_id = "abc1234"
        self._submit_with_job_id(client, monkeypatch, job_id)

        resp = client.get(f"/check_job_status/{job_id}")
        output = resp.json()
        last_updated = output.pop("last_updated")
        assert output == {
            "id": job_id,
            "status": "pending",
        }
        assert is_roughly_now(last_updated)

    def test_submitted_job_status_not_found(self, client, monkeypatch):
        """Asking for an unknown job id yields status "not found"."""
        job_id = "abc1234"
        self._submit_with_job_id(client, monkeypatch, job_id)

        other_job_id = "def5678"
        resp = client.get(f"/check_job_status/{other_job_id}")
        output = resp.json()
        last_updated = output.pop("last_updated")
        assert output == {
            "id": other_job_id,
            "status": "not found",
        }
        assert last_updated is None

    def test_submitted_job_failed(self, client, registry, monkeypatch):
        """A job whose processing raises is reported as failed."""
        job_id = "abc1234"
        self._submit_with_job_id(client, monkeypatch, job_id)

        def failing_process_job(job, registry):
            raise RuntimeError("something went wrong")

        # make the job processing fail
        monkeypatch.setattr("gistillery.worker._process_job", failing_process_job)
        self.process_jobs(registry)

        resp = client.get(f"/check_job_status/{job_id}")
        output = resp.json()
        output.pop("last_updated")
        assert output == {
            "id": job_id,
            "status": "failed",
        }

    def test_submitted_job_status_done(self, client, registry, monkeypatch):
        """A successfully processed job is reported as done."""
        job_id = "abc1234"
        self._submit_with_job_id(client, monkeypatch, job_id)
        self.process_jobs(registry)

        resp = client.get(f"/check_job_status/{job_id}")
        output = resp.json()
        last_updated = output.pop("last_updated")
        assert output == {
            "id": job_id,
            "status": "done",
        }
        assert is_roughly_now(last_updated)

    def test_status_pending_jobs(self, client, registry, monkeypatch):
        """``/check_job_status/`` summarizes the pending jobs."""
        resp = client.get("/check_job_status/")
        output = resp.json()
        assert output == "No pending jobs found"

        self._submit_with_job_id(client, monkeypatch, "abc0")
        resp = client.get("/check_job_status/")
        output = resp.json()
        expected = "Found 1 pending job(s): abc0"
        assert output == expected

        for i in range(1, 10):
            self._submit_with_job_id(client, monkeypatch, f"abc{i}")

        # only the first three job ids are expected to be listed
        resp = client.get("/check_job_status/")
        output = resp.json()
        expected = "Found 10 pending job(s): abc0, abc1, abc2, ..."
        assert output == expected

    def test_recent_with_entries(self, client, registry):
        """``/recent`` returns author, summary and tags of processed entries."""
        # submit 2 entries
        client.post(
            "/submit", json={"author": "maxi", "content": "this is a first test"}
        )
        client.post(
            "/submit",
            json={"author": "mini", "content": "this would be something else"},
        )
        self.process_jobs(registry)
        resp = client.get("/recent").json()

        # results are sorted by recency but since dummy models are so fast, the
        # date in the db could be the same, so we sort by author
        resp = sorted(resp, key=lambda x: x["author"])
        assert len(resp) == 2

        resp0 = resp[0]
        assert resp0["author"] == "maxi"
        assert resp0["summary"] == "this is a "
        assert resp0["tags"] == sorted(["#this", "#is", "#a"])

        resp1 = resp[1]
        assert resp1["author"] == "mini"
        assert resp1["summary"] == "this would"
        assert resp1["tags"] == sorted(["#this", "#would", "#be"])

    def test_recent_source_snippet_shortened(self, client, registry):
        """Long inputs are shortened in ``source_snippet``, short ones are not."""
        # submit 2 entries
        client.post("/submit", json={"author": "alice", "content": "this is short"})
        client.post(
            "/submit",
            json={"author": "bob", "content": "this is long " * 100},
        )
        self.process_jobs(registry)
        resp = client.get("/recent").json()
        resp = sorted(resp, key=lambda x: x["author"])

        assert resp[0]["source_snippet"] == "this is short"
        expected_shortened = (
            "this is long this is long this is long this is long this is long th"
            "...ng this is long this is long this is long this is long this is long "
        )
        assert resp[1]["source_snippet"] == expected_shortened

    def test_recent_tag_with_entries(self, client, registry):
        """``/recent/<tag>`` only returns the entries carrying that tag."""
        # submit 2 entries
        client.post(
            "/submit", json={"author": "maxi", "content": "this is a first test"}
        )
        client.post(
            "/submit",
            json={"author": "mini", "content": "this would be something else"},
        )
        self.process_jobs(registry)

        # the "this" tag is in both entries
        resp = client.get("/recent/this").json()
        assert len(resp) == 2

        # the "would" tag is in only one entry
        resp = client.get("/recent/would").json()
        assert len(resp) == 1

        resp0 = resp[0]
        assert resp0["author"] == "mini"
        assert resp0["summary"] == "this would"
        assert resp0["tags"] == sorted(["#this", "#would", "#be"])

    def test_recent_multiple_entries(self, client, registry):
        """Comma separated tags in ``/recent/<tags>`` match any of the tags."""
        # submit 3 entries
        client.post(
            "/submit", json={"author": "maxi", "content": "aardvark ant antelope"}
        )
        client.post(
            "/submit",
            json={"author": "mini", "content": "bat bear bee"},
        )
        client.post(
            "/submit",
            json={"author": "mini", "content": "camel canary cat"},
        )
        self.process_jobs(registry)

        # the "ant" tag is in only one entry
        resp = client.get("/recent/ant").json()
        assert len(resp) == 1

        # "ant" and "bee" are in two entries
        resp = client.get("/recent/ant,bee").json()
        assert len(resp) == 2

        # "ant" and "bee" and "cat" are in three entries
        resp = client.get("/recent/cat,ant,bee").json()
        assert len(resp) == 3

    def test_tag_count(self, client, registry):
        """``/tag_counts`` maps each tag to the number of entries using it."""
        # submit 3 entries
        client.post(
            "/submit", json={"author": "ben", "content": "aardvark ant antelope"}
        )
        client.post(
            "/submit",
            json={"author": "ben", "content": "aardvark ant bat"},
        )
        client.post(
            "/submit",
            json={"author": "ben", "content": "aardvark camel canary"},
        )
        self.process_jobs(registry)

        resp = client.get("/tag_counts").json()
        expected = {
            "#aardvark": 3,
            "#ant": 2,
            "#antelope": 1,
            "#bat": 1,
            "#camel": 1,
            "#canary": 1,
        }
        assert resp == expected

    def test_clear(self, client, cursor, registry):
        """``/clear`` removes all rows from the ``entries`` table."""
        client.post("/submit", json={"author": "ben", "content": "this is a test"})
        self.process_jobs(registry)
        assert cursor.execute("SELECT COUNT(*) c FROM entries").fetchone()[0] == 1

        client.get("/clear")
        assert cursor.execute("SELECT COUNT(*) c FROM entries").fetchone()[0] == 0

    def test_inputs_stored(self, client, cursor, registry):
        """The processed input is stored with surrounding whitespace removed."""
        client.post("/submit", json={"author": "ben", "content": "  this is a test\n"})
        self.process_jobs(registry)
        rows = cursor.execute("SELECT * FROM inputs").fetchall()
        assert len(rows) == 1
        assert rows[0].input == "this is a test"

    def test_submit_url(self, client, cursor, registry, monkeypatch):
        """Submitting a URL stores the URL followed by the extracted page text."""

        class MockClient:
            """Mock httpx Client, return www.example.com content"""

            def get(self, url):
                return SimpleNamespace(
                    text=''' <!doctype html>\n<html>\n<head>\n <title>Example
                    Domain</title>\n\n <meta charset="utf-8" />\n <meta
                    http-equiv="Content-type" content="text/html; charset=utf-8"
                    />\n <meta name="viewport" content="width=device-width,
                    initial-scale=1" />\n <style type="text/css">\n body {\n
                    background-color: #f0f0f2;\n margin: 0;\n padding: 0;\n
                    font-family: -apple-system, system-ui, BlinkMacSystemFont,
                    "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial,
                    sans-serif;\n \n }\n div {\n width: 600px;\n margin: 5em
                    auto;\n padding: 2em;\n background-color: #fdfdff;\n
                    border-radius: 0.5em;\n box-shadow: 2px 3px 7px 2px
                    rgba(0,0,0,0.02);\n }\n a:link, a:visited {\n color:
                    #38488f;\n text-decoration: none;\n }\n @media (max-width:
                    700px) {\n div {\n margin: 0 auto;\n width: auto;\n }\n }\n
                    </style> \n</head>\n\n<body>\n<div>\n <h1>Example
                    Domain</h1>\n <p>This domain is for use in illustrative
                    examples in documents. You may use this\n domain in
                    literature without prior coordination or asking for
                    permission.</p>\n <p><a
                    href="https://www.iana.org/domains/example">More
                    information...</a></p>\n</div>\n</body>\n</html>\n'''
                )

        # avoid real network access: the url processor gets the mock client
        monkeypatch.setattr("gistillery.preprocessing.Client", MockClient)

        from gistillery.preprocessing import DefaultUrlProcessor

        # register url processor, put it before the default processor
        registry.register_processor(DefaultUrlProcessor(), last=False)
        client.post(
            "/submit",
            json={
                "author": "ben",
                "content": "https://en.wikipedia.org/wiki/non-existing-page",
            },
        )
        self.process_jobs(registry)

        rows = cursor.execute("SELECT * FROM inputs").fetchall()
        assert len(rows) == 1

        expected = "\n".join(
            [
                'https://en.wikipedia.org/wiki/non-existing-page',
                '',
                'This domain is for use in illustrative',
                'examples in documents. You may use this',
                'domain in',
                'literature without prior coordination or asking for',
                'permission.',
                'More',
                'information...',
            ]
        )
        assert rows[0].input == expected

    def test_backup(self, client, tmp_path):
        """``/backup`` returns a SQLite database that contains submitted entries."""
        # submit an entry, create a backup, check that the backup contains the entry
        from gistillery.db import namedtuple_factory

        client.post("/submit", json={"author": "Pie Test", "content": "this is a pie"})
        resp = client.get("/backup")
        assert resp.status_code == 200

        # write and close the file before sqlite opens it
        backup_path = tmp_path / "backup.db"
        backup_path.write_bytes(resp.content)

        conn = sqlite3.connect(backup_path)
        try:
            conn.row_factory = namedtuple_factory
            cursor = conn.cursor()
            res = cursor.execute("select * from entries").fetchall()
        finally:
            # always release the connection, even if the query fails
            conn.close()

        assert len(res) == 1
        assert is_roughly_now(res[0].created_at)
        assert res[0].author == "Pie Test"