Spaces:
Sleeping
Sleeping
Futyn-Maker
committed on
Commit
•
4abc673
1
Parent(s):
7e1f5f6
Update data and use large model
Browse files
- config.yaml +2 -2
- data/raw/textmeme.json +2 -2
- indexes/bm25/bm25_index.pkl +2 -2
- indexes/semantic/embeddings.npy +2 -2
- logs/build_bm25s_index.log +3 -0
- logs/build_semantic_index.log +3 -0
- logs/data_collector.log +4 -0
- logs/make_db.log +3 -0
- meme_search.db +2 -2
- src/db/__pycache__/crud.cpython-311.pyc +0 -0
- src/db/__pycache__/models.cpython-311.pyc +0 -0
- src/indexing/__pycache__/bm25_indexer.cpython-311.pyc +0 -0
- src/indexing/__pycache__/semantic_indexer.cpython-311.pyc +0 -0
- src/parsing/__pycache__/vk_meme_parser.cpython-311.pyc +0 -0
- src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc +0 -0
config.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
# Configuration file for the Meme Search Engine project
|
2 |
|
3 |
vk_parser:
|
4 |
-
api_token: "
|
5 |
meme_pages:
|
6 |
- "textmeme"
|
7 |
# - "badtextmeme"
|
@@ -18,6 +18,6 @@ index_folders:
|
|
18 |
semantic: "indexes/semantic"
|
19 |
|
20 |
semantic_search:
|
21 |
-
model: "intfloat/multilingual-e5-
|
22 |
query_prefix: "query: "
|
23 |
document_prefix: "passage: "
|
|
|
1 |
# Configuration file for the Meme Search Engine project
|
2 |
|
3 |
vk_parser:
|
4 |
+
api_token: "<REDACTED>"  # secret removed: never commit access tokens; supply via an environment variable or the Space's secret store, and revoke the exposed token
|
5 |
meme_pages:
|
6 |
- "textmeme"
|
7 |
# - "badtextmeme"
|
|
|
18 |
semantic: "indexes/semantic"
|
19 |
|
20 |
semantic_search:
|
21 |
+
model: "intfloat/multilingual-e5-large"
|
22 |
query_prefix: "query: "
|
23 |
document_prefix: "passage: "
|
data/raw/textmeme.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:58f854e7a463d4610ddd2788776333974719c452de24dc81a8a96646e358aa6a
|
3 |
+
size 6423617
|
indexes/bm25/bm25_index.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f37a55508023ba875370fa96dfa6d461a926244092ea277ff23682042767861
|
3 |
+
size 3992705
|
indexes/semantic/embeddings.npy
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19467fef960bb15db9617947fe3700de84de22224461695f204440306f382f8a
|
3 |
+
size 28213376
|
logs/build_bm25s_index.log
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
2024-10-26 16:38:47.256 | INFO | __main__:get_meme_corpus:25 - Retrieved 6888 memes from the database
|
2 |
+
2024-10-26 16:39:13.340 | INFO | __main__:build_bm25_index:47 - BM25S index created and saved in indexes/bm25
|
3 |
+
2024-10-26 16:39:13.349 | INFO | __main__:main:72 - BM25S index building completed
|
logs/build_semantic_index.log
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
2024-10-26 16:39:34.195 | INFO | __main__:get_meme_corpus:25 - Retrieved 6888 memes from the database
|
2 |
+
2024-10-26 17:38:28.945 | INFO | __main__:build_semantic_index:46 - Semantic index created and saved in indexes/semantic
|
3 |
+
2024-10-26 17:38:28.960 | INFO | __main__:main:69 - Semantic index building completed
|
logs/data_collector.log
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
2024-10-26 16:28:56.857 | INFO | __main__:process_public:23 - Processing public: textmeme
|
2 |
+
2024-10-26 16:35:09.840 | INFO | __main__:process_public:23 - Processing public: textmeme
|
3 |
+
2024-10-26 16:37:32.331 | INFO | __main__:process_public:49 - Created new JSON file for textmeme with 6888 posts
|
4 |
+
2024-10-26 16:37:32.333 | INFO | __main__:main:70 - Data collection process completed
|
logs/make_db.log
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
2024-10-26 16:38:07.602 | INFO | __main__:process_json_files:47 - Processed file: textmeme.json, found 6888 memes
|
2 |
+
2024-10-26 16:38:08.888 | INFO | __main__:main:89 - Added 1 publics and 6888 memes to the database
|
3 |
+
2024-10-26 16:38:08.888 | INFO | __main__:main:93 - Database population completed
|
meme_search.db
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c9813efc5bf009587922861bdf630762a3c4eb36ea5848870788e0d437ecaec
|
3 |
+
size 6930432
|
src/db/__pycache__/crud.cpython-311.pyc
ADDED
Binary file (11.2 kB). View file
|
|
src/db/__pycache__/models.cpython-311.pyc
ADDED
Binary file (1.66 kB). View file
|
|
src/indexing/__pycache__/bm25_indexer.cpython-311.pyc
ADDED
Binary file (2.36 kB). View file
|
|
src/indexing/__pycache__/semantic_indexer.cpython-311.pyc
ADDED
Binary file (2.58 kB). View file
|
|
src/parsing/__pycache__/vk_meme_parser.cpython-311.pyc
ADDED
Binary file (6.33 kB). View file
|
|
src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc
CHANGED
Binary files a/src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc and b/src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc differ
|
|