Futyn-Maker committed on
Commit
4abc673
1 Parent(s): 7e1f5f6

Update data and use large model

Browse files
config.yaml CHANGED
@@ -1,7 +1,7 @@
1
  # Configuration file for the Meme Search Engine project
2
 
3
  vk_parser:
4
- api_token: "YOUR_TOKEN_HERE"
5
  meme_pages:
6
  - "textmeme"
7
  # - "badtextmeme"
@@ -18,6 +18,6 @@ index_folders:
18
  semantic: "indexes/semantic"
19
 
20
  semantic_search:
21
- model: "intfloat/multilingual-e5-small"
22
  query_prefix: "query: "
23
  document_prefix: "passage: "
 
1
  # Configuration file for the Meme Search Engine project
2
 
3
  vk_parser:
4
+ api_token: "<REDACTED_VK_API_TOKEN>"
5
  meme_pages:
6
  - "textmeme"
7
  # - "badtextmeme"
 
18
  semantic: "indexes/semantic"
19
 
20
  semantic_search:
21
+ model: "intfloat/multilingual-e5-large"
22
  query_prefix: "query: "
23
  document_prefix: "passage: "
data/raw/textmeme.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96ef95553f897ae10cbe0fcca505091ff41aef726840f989dd2d944ffefcc5d0
3
- size 6394668
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58f854e7a463d4610ddd2788776333974719c452de24dc81a8a96646e358aa6a
3
+ size 6423617
indexes/bm25/bm25_index.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ca6b40f2349502f78f852b68e1c6589d5777cc34ffafb3935c12f2ea6b931e5
3
- size 3973591
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f37a55508023ba875370fa96dfa6d461a926244092ea277ff23682042767861
3
+ size 3992705
indexes/semantic/embeddings.npy CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5509dd737152797eee7d5fc2e6ba90ce059ecb7c8c1632d5d7ff17bb38dddbd
3
- size 10537088
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19467fef960bb15db9617947fe3700de84de22224461695f204440306f382f8a
3
+ size 28213376
logs/build_bm25s_index.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 2024-10-26 16:38:47.256 | INFO | __main__:get_meme_corpus:25 - Retrieved 6888 memes from the database
2
+ 2024-10-26 16:39:13.340 | INFO | __main__:build_bm25_index:47 - BM25S index created and saved in indexes/bm25
3
+ 2024-10-26 16:39:13.349 | INFO | __main__:main:72 - BM25S index building completed
logs/build_semantic_index.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 2024-10-26 16:39:34.195 | INFO | __main__:get_meme_corpus:25 - Retrieved 6888 memes from the database
2
+ 2024-10-26 17:38:28.945 | INFO | __main__:build_semantic_index:46 - Semantic index created and saved in indexes/semantic
3
+ 2024-10-26 17:38:28.960 | INFO | __main__:main:69 - Semantic index building completed
logs/data_collector.log ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ 2024-10-26 16:28:56.857 | INFO | __main__:process_public:23 - Processing public: textmeme
2
+ 2024-10-26 16:35:09.840 | INFO | __main__:process_public:23 - Processing public: textmeme
3
+ 2024-10-26 16:37:32.331 | INFO | __main__:process_public:49 - Created new JSON file for textmeme with 6888 posts
4
+ 2024-10-26 16:37:32.333 | INFO | __main__:main:70 - Data collection process completed
logs/make_db.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 2024-10-26 16:38:07.602 | INFO | __main__:process_json_files:47 - Processed file: textmeme.json, found 6888 memes
2
+ 2024-10-26 16:38:08.888 | INFO | __main__:main:89 - Added 1 publics and 6888 memes to the database
3
+ 2024-10-26 16:38:08.888 | INFO | __main__:main:93 - Database population completed
meme_search.db CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbe21217d65e17c68c806d6baed8ac9da166fd65d34e332f93033f357b8d65ad
3
- size 6897664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c9813efc5bf009587922861bdf630762a3c4eb36ea5848870788e0d437ecaec
3
+ size 6930432
src/db/__pycache__/crud.cpython-311.pyc ADDED
Binary file (11.2 kB). View file
 
src/db/__pycache__/models.cpython-311.pyc ADDED
Binary file (1.66 kB). View file
 
src/indexing/__pycache__/bm25_indexer.cpython-311.pyc ADDED
Binary file (2.36 kB). View file
 
src/indexing/__pycache__/semantic_indexer.cpython-311.pyc ADDED
Binary file (2.58 kB). View file
 
src/parsing/__pycache__/vk_meme_parser.cpython-311.pyc ADDED
Binary file (6.33 kB). View file
 
src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc CHANGED
Binary files a/src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc and b/src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc differ