Spaces: Sleeping
Futyn-Maker committed
Commit 7e1f5f6 • Parent(s): f2e0a2c
Deploy the app
- .gitattributes +2 -0
- README.md +1 -1
- config.yaml +23 -0
- data/raw/textmeme.json +3 -0
- indexes/bm25/bm25_index.pkl +3 -0
- indexes/semantic/embeddings.npy +3 -0
- meme_search.db +3 -0
- requirements.txt +169 -0
- scripts/build_bm25_index.py +79 -0
- scripts/build_semantic_index.py +76 -0
- scripts/data_collector.py +77 -0
- scripts/make_db.py +100 -0
- src/db/crud.py +210 -0
- src/db/models.py +28 -0
- src/indexing/bm25_indexer.py +40 -0
- src/indexing/semantic_indexer.py +52 -0
- src/interface.py +124 -0
- src/main.py +104 -0
- src/parsing/vk_meme_parser.py +139 -0
- src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc +0 -0
- src/preprocessing/mystem_tokenizer.py +46 -0
- src/search/bm25_search.py +89 -0
- src/search/semantic_search.py +85 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.db filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
 colorTo: pink
 sdk: gradio
 sdk_version: 5.1.0
-app_file: interface.py
+app_file: src/interface.py
 pinned: false
 license: wtfpl
 short_description: Search for Russian-language memes by their text descriptions
config.yaml
ADDED
@@ -0,0 +1,23 @@
+# Configuration file for the Meme Search Engine project
+
+vk_parser:
+  api_token: "YOUR_TOKEN_HERE"
+  meme_pages:
+    - "textmeme"
+    # - "badtextmeme"
+
+data_folders:
+  raw_data: "data/raw"
+  # images: "data/images"
+
+database:
+  url: "sqlite:///./meme_search.db"
+
+index_folders:
+  bm25: "indexes/bm25"
+  semantic: "indexes/semantic"
+
+semantic_search:
+  model: "intfloat/multilingual-e5-small"
+  query_prefix: "query: "
+  document_prefix: "passage: "
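For context, every script in this commit consumes this file through OmegaConf. A minimal sketch of that shared load pattern, assuming it is run from the repository root (the printed value simply echoes the YAML above):

    from omegaconf import OmegaConf

    # Load the YAML config and convert it to a plain dict, as the scripts here do
    config = OmegaConf.to_container(OmegaConf.load('config.yaml'))
    print(config['semantic_search']['model'])  # -> "intfloat/multilingual-e5-small"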
data/raw/textmeme.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96ef95553f897ae10cbe0fcca505091ff41aef726840f989dd2d944ffefcc5d0
+size 6394668
indexes/bm25/bm25_index.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ca6b40f2349502f78f852b68e1c6589d5777cc34ffafb3935c12f2ea6b931e5
+size 3973591
indexes/semantic/embeddings.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5509dd737152797eee7d5fc2e6ba90ce059ecb7c8c1632d5d7ff17bb38dddbd
+size 10537088
meme_search.db
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cbe21217d65e17c68c806d6baed8ac9da166fd65d34e332f93033f357b8d65ad
+size 6897664
requirements.txt
ADDED
@@ -0,0 +1,169 @@
+aiofiles==23.2.1
+annotated-types==0.7.0
+antlr4-python3-runtime==4.9.3
+anyio==4.6.2.post1
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.1.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+eval_type_backport==0.2.0
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fire==0.7.0
+fqdn==1.5.1
+fsspec==2024.9.0
+gradio==5.1.0
+gradio_client==1.4.0
+greenlet==3.1.1
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.25.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.28.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+json5==0.9.25
+jsonpath-python==1.0.6
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+loguru==0.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistralai==1.1.0
+mistune==3.0.2
+mpmath==1.3.0
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+notebook==7.2.2
+notebook_shim==0.2.4
+numpy==2.1.2
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+omegaconf==2.3.0
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+pillow==10.4.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+Pygments==2.18.0
+pymystem3==0.2.0
+python-dateutil==2.8.2
+python-json-logger==2.0.7
+python-multipart==0.0.12
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+rank-bm25==0.2.2
+referencing==0.35.1
+regex==2024.9.11
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+ruff==0.7.0
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+semantic-version==2.10.0
+Send2Trash==1.8.3
+sentence-transformers==3.2.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.6
+SQLAlchemy==2.0.36
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+termcolor==2.5.0
+terminado==0.18.1
+threadpoolctl==3.5.0
+tinycss2==1.3.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+torch==2.5.0
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+vk-api==11.9.9
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+websockets==12.0
+widgetsnbextension==4.0.13
scripts/build_bm25_index.py
ADDED
@@ -0,0 +1,79 @@
+import os
+import sys
+from pathlib import Path
+from typing import Dict, Any, List
+
+from loguru import logger
+from omegaconf import OmegaConf
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+
+def get_meme_corpus(db, crud) -> List[str]:
+    """
+    Retrieve all meme texts from the database.
+
+    Args:
+        db: Database session.
+        crud: CRUD operations module.
+
+    Returns:
+        List[str]: List of meme texts.
+    """
+    memes = crud.get_all_memes(db)
+    corpus = [meme.text for meme in memes]
+    logger.info(f"Retrieved {len(corpus)} memes from the database")
+    return corpus
+
+
+def build_bm25_index(corpus: List[str],
+                     config: Dict[str, Any],
+                     mystem_tokenizer,
+                     BM25Indexer):
+    """
+    Build and save the BM25 index.
+
+    Args:
+        corpus (List[str]): List of meme texts.
+        config (Dict[str, Any]): Configuration dictionary.
+        mystem_tokenizer: MystemTokenizer instance.
+        BM25Indexer: BM25Indexer class.
+    """
+    indexer = BM25Indexer(corpus, mystem_tokenizer.tokenize)
+    bm25_index_folder = config['index_folders']['bm25']
+    os.makedirs(bm25_index_folder, exist_ok=True)
+    indexer.create_index(bm25_index_folder)
+    logger.info(f"BM25S index created and saved in {bm25_index_folder}")
+
+
+def main():
+    from src.db import crud
+    from src.preprocessing.mystem_tokenizer import MystemTokenizer
+    from src.indexing.bm25_indexer import BM25Indexer
+
+    logger.add("logs/build_bm25s_index.log", rotation="10 MB")
+
+    # Load configuration
+    config = OmegaConf.load('config.yaml')
+    config = OmegaConf.to_container(config)
+
+    engine = create_engine(config['database']['url'])
+    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+    db = SessionLocal()
+
+    try:
+        corpus = get_meme_corpus(db, crud)
+        mystem_tokenizer = MystemTokenizer()
+        build_bm25_index(corpus, config, mystem_tokenizer, BM25Indexer)
+    finally:
+        db.close()
+
+    logger.info("BM25S index building completed")
+
+
+if __name__ == "__main__":
+    # Set up project root path
+    project_root = Path(__file__).resolve().parents[1]
+    sys.path.insert(0, str(project_root))
+    main()
scripts/build_semantic_index.py
ADDED
@@ -0,0 +1,76 @@
+import os
+import sys
+from pathlib import Path
+from typing import Dict, Any, List
+
+from loguru import logger
+from omegaconf import OmegaConf
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+
+def get_meme_corpus(db, crud) -> List[str]:
+    """
+    Retrieve all meme texts from the database.
+
+    Args:
+        db: Database session.
+        crud: CRUD operations module.
+
+    Returns:
+        List[str]: List of meme texts.
+    """
+    memes = crud.get_all_memes(db)
+    corpus = [meme.text for meme in memes]
+    logger.info(f"Retrieved {len(corpus)} memes from the database")
+    return corpus
+
+
+def build_semantic_index(
+        corpus: List[str], config: Dict[str, Any], SemanticIndexer):
+    """
+    Build and save the semantic index.
+
+    Args:
+        corpus (List[str]): List of meme texts.
+        config (Dict[str, Any]): Configuration dictionary.
+        SemanticIndexer: SemanticIndexer class.
+    """
+    model = config['semantic_search']['model']
+    prefix = config['semantic_search']['document_prefix']
+    indexer = SemanticIndexer(corpus, model=model, prefix=prefix)
+
+    semantic_index_folder = config['index_folders']['semantic']
+    os.makedirs(semantic_index_folder, exist_ok=True)
+    indexer.create_index(semantic_index_folder)
+    logger.info(f"Semantic index created and saved in {semantic_index_folder}")
+
+
+def main():
+    from src.db import crud
+    from src.indexing.semantic_indexer import SemanticIndexer
+
+    logger.add("logs/build_semantic_index.log", rotation="10 MB")
+
+    # Load configuration
+    config = OmegaConf.load('config.yaml')
+    config = OmegaConf.to_container(config)
+
+    engine = create_engine(config['database']['url'])
+    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+    db = SessionLocal()
+
+    try:
+        corpus = get_meme_corpus(db, crud)
+        build_semantic_index(corpus, config, SemanticIndexer)
+    finally:
+        db.close()
+
+    logger.info("Semantic index building completed")
+
+
+if __name__ == "__main__":
+    # Set up project root path
+    project_root = Path(__file__).resolve().parents[1]
+    sys.path.insert(0, str(project_root))
+    main()
scripts/data_collector.py
ADDED
@@ -0,0 +1,77 @@
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Dict, Any
+
+from loguru import logger
+from omegaconf import OmegaConf
+
+
+def process_public(parser, public_id: str, config: Dict[str, Any]) -> None:
+    """
+    Process a single public page, updating or creating its JSON file.
+
+    Args:
+        parser: VK meme parser instance.
+        public_id (str): ID or short name of the public page.
+        config (Dict[str, Any]): Configuration dictionary.
+    """
+    raw_data_path = config['data_folders']['raw_data']
+    json_file_path = os.path.join(raw_data_path, f"{public_id}.json")
+
+    logger.info(f"Processing public: {public_id}")
+
+    memes_data = parser.get_memes(public_id)
+
+    if os.path.exists(json_file_path):
+        # Update existing JSON file
+        with open(json_file_path, 'r', encoding='utf-8') as file:
+            existing_data = json.load(file)
+
+        existing_posts = {post['id']: post for post in existing_data['posts']}
+        new_posts = [post for post in memes_data['posts']
+                     if post['id'] not in existing_posts]
+
+        # Add new posts to the beginning of the list
+        existing_data['posts'] = new_posts + existing_data['posts']
+
+        with open(json_file_path, 'w', encoding='utf-8') as file:
+            json.dump(existing_data, file, ensure_ascii=False, indent=2)
+
+        logger.info(f"Updated {len(new_posts)} new posts for {public_id}")
+
+    else:
+        # Create new JSON file
+        with open(json_file_path, 'w', encoding='utf-8') as file:
+            json.dump(memes_data, file, ensure_ascii=False, indent=2)
+
+        logger.info(
+            f"Created new JSON file for {public_id} with {len(memes_data['posts'])} posts")
+
+
+def main():
+    from src.parsing.vk_meme_parser import VKMemeParser
+
+    logger.add("logs/data_collector.log", rotation="10 MB")
+
+    # Load configuration
+    config = OmegaConf.load('config.yaml')
+    config = OmegaConf.to_container(config)
+
+    parser = VKMemeParser(config['vk_parser']['api_token'])
+
+    for folder in config['data_folders'].values():
+        os.makedirs(folder, exist_ok=True)
+
+    for public_id in config['vk_parser']['meme_pages']:
+        process_public(parser, public_id, config)
+
+    logger.info("Data collection process completed")
+
+
+if __name__ == "__main__":
+    # Set up project root path
+    project_root = Path(__file__).resolve().parents[1]
+    sys.path.insert(0, str(project_root))
+    main()
scripts/make_db.py
ADDED
@@ -0,0 +1,100 @@
+import json
+import os
+import sys
+from pathlib import Path
+from typing import Dict, Any, List
+
+from loguru import logger
+from omegaconf import OmegaConf
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+
+def process_json_files(
+        raw_data_path: str) -> tuple[List[Dict[str, str]], List[Dict[str, Any]]]:
+    """
+    Process all JSON files in the raw data folder.
+
+    Args:
+        raw_data_path (str): Path to the folder containing JSON files.
+
+    Returns:
+        tuple: Lists of public and meme data to be added to the database.
+    """
+    publics_to_add: List[Dict[str, str]] = []
+    memes_to_add: List[Dict[str, Any]] = []
+
+    for filename in os.listdir(raw_data_path):
+        if filename.endswith('.json'):
+            public_vk = filename[:-5]  # Remove .json extension
+            file_path = os.path.join(raw_data_path, filename)
+
+            with open(file_path, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+
+            publics_to_add.append({
+                "public_vk": public_vk,
+                "public_name": data['name']
+            })
+
+            for post in data['posts']:
+                memes_to_add.append({
+                    "public_vk": public_vk,
+                    "text": post['text'],
+                    "image_url": post['image_url']
+                })
+
+            logger.info(
+                f"Processed file: {filename}, found {len(data['posts'])} memes")
+
+    return publics_to_add, memes_to_add
+
+
+def main():
+    from src.db.models import Base
+    from src.db import crud
+
+    logger.add("logs/make_db.log", rotation="10 MB")
+
+    # Load configuration
+    config = OmegaConf.load('config.yaml')
+    config = OmegaConf.to_container(config)
+
+    engine = create_engine(config['database']['url'])
+
+    # Drop all existing tables and create new ones
+    Base.metadata.drop_all(bind=engine)
+    Base.metadata.create_all(bind=engine)
+
+    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+    db = SessionLocal()
+
+    raw_data_path = config['data_folders']['raw_data']
+
+    publics_to_add, memes_to_add = process_json_files(raw_data_path)
+
+    # Add all publics to the database
+    added_publics = crud.add_publics(db, publics_to_add)
+
+    # Create a mapping of public_vk to public_id
+    public_vk_to_id = {public.public_vk: public.id for public in added_publics}
+
+    # Update memes with correct public_id
+    for meme in memes_to_add:
+        meme['public_id'] = public_vk_to_id[meme.pop('public_vk')]
+
+    # Add all memes to the database
+    crud.add_memes(db, memes_to_add)
+
+    logger.info(
+        f"Added {len(added_publics)} publics and {len(memes_to_add)} memes to the database")
+
+    db.close()
+    logger.info("Database population completed")
+
+
+if __name__ == "__main__":
+    # Set up project root path
+    project_root = Path(__file__).resolve().parents[1]
+    sys.path.insert(0, str(project_root))
+    main()
src/db/crud.py
ADDED
@@ -0,0 +1,210 @@
+from typing import List, Dict, Any
+
+from sqlalchemy.orm import Session
+
+from . import models
+
+
+def add_publics(
+        db: Session, publics: List[Dict[str, str]]) -> List[models.Public]:
+    """
+    Add multiple public pages to the database.
+
+    Args:
+        db (Session): The database session.
+        publics (List[Dict[str, str]]): List of public page data.
+
+    Returns:
+        List[models.Public]: List of added public page objects.
+    """
+    db_publics = [models.Public(**public) for public in publics]
+    db.add_all(db_publics)
+    db.commit()
+    for public in db_publics:
+        db.refresh(public)
+    return db_publics
+
+
+def add_memes(db: Session, memes: List[Dict[str, Any]]) -> List[models.Meme]:
+    """
+    Add multiple memes to the database.
+
+    Args:
+        db (Session): The database session.
+        memes (List[Dict[str, Any]]): List of meme data.
+
+    Returns:
+        List[models.Meme]: List of added meme objects.
+    """
+    db_memes = [models.Meme(**meme) for meme in memes]
+    db.add_all(db_memes)
+    db.commit()
+    for meme in db_memes:
+        db.refresh(meme)
+    return db_memes
+
+
+def get_memes_by_publics(db: Session,
+                         public_ids: List[int]) -> List[models.Meme]:
+    """
+    Retrieve memes associated with specific public pages.
+
+    Args:
+        db (Session): The database session.
+        public_ids (List[int]): List of public page IDs.
+
+    Returns:
+        List[models.Meme]: List of meme objects.
+    """
+    return db.query(models.Meme).filter(
+        models.Meme.public_id.in_(public_ids)).all()
+
+
+def get_all_memes(db: Session) -> List[models.Meme]:
+    """
+    Retrieve all memes from the database.
+
+    Args:
+        db (Session): The database session.
+
+    Returns:
+        List[models.Meme]: List of all meme objects.
+    """
+    return db.query(models.Meme).all()
+
+
+def get_all_publics(db: Session) -> List[models.Public]:
+    """
+    Retrieve all public pages from the database.
+
+    Args:
+        db (Session): The database session.
+
+    Returns:
+        List[models.Public]: List of all public page objects.
+    """
+    return db.query(models.Public).all()
+
+
+def get_memes_by_ids(db: Session, meme_ids: List[int]) -> List[models.Meme]:
+    """
+    Retrieve memes by their IDs.
+
+    Args:
+        db (Session): The database session.
+        meme_ids (List[int]): List of meme IDs.
+
+    Returns:
+        List[models.Meme]: List of meme objects.
+    """
+    return db.query(models.Meme).filter(models.Meme.id.in_(meme_ids)).all()
+
+
+def get_publics_by_ids(db: Session,
+                       public_ids: List[int]) -> List[models.Public]:
+    """
+    Retrieve public pages by their IDs.
+
+    Args:
+        db (Session): The database session.
+        public_ids (List[int]): List of public page IDs.
+
+    Returns:
+        List[models.Public]: List of public page objects.
+    """
+    return db.query(models.Public).filter(
+        models.Public.id.in_(public_ids)).all()
+
+
+def delete_memes(db: Session, meme_ids: List[int]) -> int:
+    """
+    Delete memes by their IDs.
+
+    Args:
+        db (Session): The database session.
+        meme_ids (List[int]): List of meme IDs to delete.
+
+    Returns:
+        int: Number of deleted memes.
+    """
+    deleted_count = db.query(models.Meme).filter(
+        models.Meme.id.in_(meme_ids)).delete(synchronize_session='fetch')
+    db.commit()
+    return deleted_count
+
+
+def delete_publics(db: Session, public_ids: List[int]) -> int:
+    """
+    Delete public pages and their associated memes.
+
+    Args:
+        db (Session): The database session.
+        public_ids (List[int]): List of public page IDs to delete.
+
+    Returns:
+        int: Number of deleted public pages.
+    """
+    # First, delete associated memes
+    db.query(models.Meme).filter(
+        models.Meme.public_id.in_(public_ids)).delete(
+        synchronize_session='fetch')
+
+    # Then delete the publics
+    deleted_count = db.query(models.Public).filter(
+        models.Public.id.in_(public_ids)).delete(synchronize_session='fetch')
+    db.commit()
+    return deleted_count
+
+
+def get_memes_with_public_info(
+        db: Session, meme_ids: List[int]) -> List[tuple[models.Meme, models.Public]]:
+    """
+    Retrieve memes with their associated public page information.
+
+    Args:
+        db (Session): The database session.
+        meme_ids (List[int]): List of meme IDs.
+
+    Returns:
+        List[tuple[models.Meme, models.Public]]: List of tuples containing meme and public page objects.
+    """
+    return db.query(models.Meme, models.Public).\
+        join(models.Public, models.Meme.public_id == models.Public.id).\
+        filter(models.Meme.id.in_(meme_ids)).all()
+
+
+def update_memes(db: Session, meme_updates: List[Dict[str, Any]]) -> None:
+    """
+    Update multiple memes in the database.
+
+    Args:
+        db (Session): The database session.
+        meme_updates (List[Dict[str, Any]]): List of meme update data.
+    """
+    for update in meme_updates:
+        meme_id = update.pop('id')
+        db.query(models.Meme).filter(models.Meme.id == meme_id).update(
+            {getattr(models.Meme, k): v for k, v in update.items()},
+            synchronize_session='fetch')
+    db.commit()
+
+
+def update_publics(db: Session, public_updates: List[Dict[str, Any]]) -> None:
+    """
+    Update multiple public pages in the database.
+
+    Args:
+        db (Session): The database session.
+        public_updates (List[Dict[str, Any]]): List of public page update data.
+    """
+    for update in public_updates:
+        public_id = update.pop('id')
+        db.query(models.Public).filter(models.Public.id == public_id).update(
+            {getattr(models.Public, k): v for k, v in update.items()},
+            synchronize_session='fetch')
+    db.commit()
src/db/models.py
ADDED
@@ -0,0 +1,28 @@
+from sqlalchemy import Column, Integer, String, ForeignKey
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import relationship
+
+
+Base = declarative_base()
+
+
+class Public(Base):
+    __tablename__ = "publics"
+
+    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
+    public_vk = Column(String, unique=True, index=True)
+    public_name = Column(String)
+
+    memes = relationship("Meme", back_populates="public")
+
+
+class Meme(Base):
+    __tablename__ = "memes"
+
+    id = Column(Integer, primary_key=True, index=True, autoincrement=True)
+    public_id = Column(Integer, ForeignKey("publics.id"))
+    text = Column(String)
+    image_url = Column(String)
+    local_image_path = Column(String)
+
+    public = relationship("Public", back_populates="memes")
src/indexing/bm25_indexer.py
ADDED
@@ -0,0 +1,40 @@
+import os
+import pickle
+from typing import List, Callable
+
+from rank_bm25 import BM25Okapi
+
+
+class BM25Indexer:
+    def __init__(self, corpus: List[str],
+                 tokenizer: Callable[[str], List[str]]):
+        """
+        Initialize the BM25Indexer.
+
+        Args:
+            corpus (List[str]): The corpus to be indexed.
+            tokenizer (Callable[[str], List[str]]): A function to tokenize the text.
+        """
+        self.corpus = corpus
+        self.tokenizer = tokenizer
+        self.bm25 = None
+
+    def create_index(self, save_dir: str) -> None:
+        """
+        Create and save the BM25 index.
+
+        Args:
+            save_dir (str): Directory to save the index.
+        """
+        # Ensure the save directory exists
+        os.makedirs(save_dir, exist_ok=True)
+
+        # Tokenize the corpus
+        tokenized_corpus = [self.tokenizer(doc) for doc in self.corpus]
+
+        # Create the BM25 model
+        self.bm25 = BM25Okapi(tokenized_corpus)
+
+        # Save the BM25 index
+        with open(os.path.join(save_dir, 'bm25_index.pkl'), 'wb') as f:
+            pickle.dump(self.bm25, f)
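As a usage note, a minimal sketch of driving BM25Indexer directly, assuming it runs from the repository root. The two-document corpus and the naive whitespace tokenizer are illustrative assumptions; the real pipeline in scripts/build_bm25_index.py passes MystemTokenizer.tokenize and the corpus from the database:

    from src.indexing.bm25_indexer import BM25Indexer

    # Toy corpus and tokenizer, for illustration only
    corpus = ["кот сидит на окне", "собака бежит по парку"]
    indexer = BM25Indexer(corpus, tokenizer=str.split)
    indexer.create_index("indexes/bm25")  # writes indexes/bm25/bm25_index.pkl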
src/indexing/semantic_indexer.py
ADDED
@@ -0,0 +1,52 @@
+import os
+from typing import List, Optional
+
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+
+class SemanticIndexer:
+    def __init__(
+            self,
+            corpus: List[str],
+            model: str,
+            prefix: Optional[str] = None
+    ):
+        """
+        Initialize the SemanticIndexer.
+
+        Args:
+            corpus (List[str]): The corpus to be indexed.
+            model (str): The name or path of the SentenceTransformer model to use.
+            prefix (Optional[str], optional): A prefix to add to each text in the corpus. Defaults to None.
+        """
+        self.corpus = corpus
+        self.model = SentenceTransformer(model)
+        self.prefix = prefix
+        self.embeddings = None
+
+    def create_index(self, save_dir: str) -> None:
+        """
+        Create and save the semantic index.
+
+        Args:
+            save_dir (str): Directory to save the embeddings.
+        """
+        # Ensure the save directory exists
+        os.makedirs(save_dir, exist_ok=True)
+
+        # Prepare texts with prefix if provided
+        texts = [
+            f"{self.prefix}{text}" if self.prefix else text
+            for text in self.corpus]
+
+        # Create embeddings
+        self.embeddings = self.model.encode(
+            texts,
+            show_progress_bar=True,
+            convert_to_numpy=True,
+            normalize_embeddings=True
+        )
+
+        # Save embeddings
+        embeddings_file = os.path.join(save_dir, "embeddings.npy")
+        np.save(embeddings_file, self.embeddings)
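One design point worth spelling out: because create_index normalizes the embeddings, cosine similarity at query time reduces to a plain dot product, which is exactly what src/search/semantic_search.py computes (documents get the "passage: " prefix from config.yaml, queries the "query: " prefix). A minimal sketch of that relationship, using a stored row as a stand-in query vector rather than real model output:

    import numpy as np

    # With L2-normalized rows, cosine similarity is just a dot product
    doc_embeddings = np.load("indexes/semantic/embeddings.npy")  # shape (n_docs, dim)
    query_embedding = doc_embeddings[0]           # stand-in for an encoded "query: ..." text
    scores = doc_embeddings @ query_embedding     # cosine scores in [-1, 1]
    print(scores.argsort()[-3:][::-1])            # indices of the top-3 documents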
src/interface.py
ADDED
@@ -0,0 +1,124 @@
+import sys
+from pathlib import Path
+
+import gradio as gr
+from omegaconf import OmegaConf
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+
+def initialize_search_engines(db, config):
+    """
+    Initialize both BM25 and Semantic search engines.
+
+    Args:
+        db: Database session.
+        config: Configuration dictionary.
+
+    Returns:
+        tuple: Initialized BM25Search and SemanticSearch engines.
+    """
+    from search.bm25_search import BM25Search
+    from search.semantic_search import SemanticSearch
+    from preprocessing.mystem_tokenizer import MystemTokenizer
+
+    custom_tokenizer = MystemTokenizer()
+    bm25_search = BM25Search(
+        db,
+        config['index_folders']['bm25'],
+        custom_tokenizer.tokenize
+    )
+    semantic_search = SemanticSearch(
+        db,
+        model=config['semantic_search']['model'],
+        embeddings_file=f"{config['index_folders']['semantic']}/embeddings.npy",
+        prefix=config['semantic_search']['query_prefix'])
+
+    return bm25_search, semantic_search
+
+
+def search_memes(query: str, search_type: str, num_results: int):
+    """
+    Search for memes using the specified search method.
+
+    Args:
+        query (str): The search query.
+        search_type (str): The type of search to perform. Either 'BM25' or 'Семантический'.
+        num_results (int): The number of results to return.
+
+    Returns:
+        tuple: A tuple containing the search results and search time.
+    """
+    if search_type == "BM25":
+        results = bm25_search.search(query, num_results)
+    else:
+        results = semantic_search.search(query, num_results)
+
+    output = []
+    for result in results['results']:
+        output.append((result['image_url'], result['text']))
+
+    return output, f"Время поиска: {results['search_time']:.4f} секунд"
+
+
+def main():
+    global bm25_search, semantic_search
+
+    # Load configuration
+    config = OmegaConf.load('config.yaml')
+    config = OmegaConf.to_container(config)
+
+    # Initialize database session
+    engine = create_engine(config['database']['url'])
+    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+    db = SessionLocal()
+
+    # Initialize search engines
+    bm25_search, semantic_search = initialize_search_engines(db, config)
+
+    # Gradio interface
+    with gr.Blocks() as demo:
+        gr.Markdown("# Поиск мемов")
+        gr.Markdown(
+            "Добро пожаловать в приложение для поиска мемов! Введите запрос, выберите тип поиска и количество результатов."
+        )
+
+        with gr.Row():
+            query = gr.Textbox(label="Запрос")
+            search_type = gr.Radio(
+                ["BM25", "Семантический"],
+                label="Тип поиска",
+                value="BM25"
+            )
+            num_results = gr.Slider(
+                minimum=1,
+                maximum=10,
+                step=1,
+                value=1,
+                label="Количество результатов"
+            )
+
+        search_button = gr.Button("Найти")
+
+        output_gallery = gr.Gallery(
+            label="Результаты",
+            show_label=False,
+            columns=3,
+            height=400
+        )
+        output_time = gr.Markdown()
+
+        search_button.click(
+            fn=search_memes,
+            inputs=[query, search_type, num_results],
+            outputs=[output_gallery, output_time]
+        )
+
+    demo.launch()
+
+
+if __name__ == "__main__":
+    # Set up project root path
+    project_root = Path(__file__).resolve().parents[1]
+    sys.path.insert(0, str(project_root))
+    main()
src/main.py
ADDED
@@ -0,0 +1,104 @@
+import sys
+from pathlib import Path
+
+import fire
+from omegaconf import OmegaConf
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+
+
+def initialize_bm25_search(db, config):
+    """
+    Initialize BM25 search engine.
+
+    Args:
+        db: Database session.
+        config: Configuration dictionary.
+
+    Returns:
+        BM25Search: Initialized BM25 search engine.
+    """
+    from search.bm25_search import BM25Search
+    from preprocessing.mystem_tokenizer import MystemTokenizer
+
+    custom_tokenizer = MystemTokenizer()
+    return BM25Search(
+        db,
+        config['index_folders']['bm25'],
+        custom_tokenizer.tokenize
+    )
+
+
+def initialize_semantic_search(db, config):
+    """
+    Initialize semantic search engine.
+
+    Args:
+        db: Database session.
+        config: Configuration dictionary.
+
+    Returns:
+        SemanticSearch: Initialized semantic search engine.
+    """
+    from search.semantic_search import SemanticSearch
+    return SemanticSearch(
+        db,
+        model=config['semantic_search']['model'],
+        embeddings_file=f"{config['index_folders']['semantic']}/embeddings.npy",
+        prefix=config['semantic_search']['query_prefix'])
+
+
+def search_memes(query: str, search_type: str = 'bm25', num: int = 1):
+    """
+    Search for memes using the specified search method.
+
+    Args:
+        query (str): The search query.
+        search_type (str): The type of search to perform. Either 'bm25' or 'semantic'. Defaults to 'bm25'.
+        num (int): The number of results to return. Defaults to 1.
+
+    Returns:
+        None: Prints the results to the console.
+    """
+    if not query:
+        print("Error: Query is required.")
+        return
+    if search_type not in ['bm25', 'semantic']:
+        print("Error: Invalid search type. Use 'bm25' or 'semantic'.")
+        return
+    if num < 1:
+        print("Error: Number of results must be at least 1.")
+        return
+
+    # Load configuration
+    config = OmegaConf.load('config.yaml')
+    config = OmegaConf.to_container(config)
+
+    # Initialize database session
+    engine = create_engine(config['database']['url'])
+    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+    db = SessionLocal()
+
+    try:
+        # Initialize search engine
+        if search_type == 'bm25':
+            search_engine = initialize_bm25_search(db, config)
+        elif search_type == 'semantic':
+            search_engine = initialize_semantic_search(db, config)
+
+        # Perform search
+        results = search_engine.search(query, num)
+
+        # Print results
+        for result in results['results']:
+            print(result['text'])
+        print(f"\nSearch time: {results['search_time']:.4f} seconds")
+    finally:
+        db.close()
+
+
+if __name__ == "__main__":
+    # Set up project root path
+    project_root = Path(__file__).resolve().parents[1]
+    sys.path.insert(0, str(project_root))
+    fire.Fire(search_memes)
src/parsing/vk_meme_parser.py
ADDED
@@ -0,0 +1,139 @@
+import os
+from typing import Optional, Dict, Any
+from urllib.parse import urlparse
+
+import requests
+import vk_api
+
+
+class VKMemeParser:
+    def __init__(self, token: str):
+        """
+        Initialize the VK Meme Parser.
+
+        Args:
+            token (str): VK API access token.
+        """
+        self.vk_session = vk_api.VkApi(token=token)
+        self.vk = self.vk_session.get_api()
+
+    def _process_post(self, post: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Process a single post and extract relevant information.
+
+        Args:
+            post (Dict[str, Any]): A dictionary containing post data.
+
+        Returns:
+            Optional[Dict[str, Any]]: A dictionary with post ID, text, and image URL if valid,
+                None otherwise.
+        """
+        # Check if the post is valid
+        if (post.get("marked_as_ads") or
+                "is_pinned" in post or
+                "copy_history" in post or
+                len(post.get("attachments", [])) != 1 or
+                post["attachments"][0]["type"] != "photo"):
+            return None
+
+        post_id = post["id"]
+        text = post["text"].strip()
+
+        # Get the largest available photo
+        photo_sizes = post["attachments"][0]["photo"]["sizes"]
+        largest_photo = max(
+            photo_sizes,
+            key=lambda x: x["width"] * x["height"])
+        image_url = largest_photo["url"]
+
+        return {
+            "id": post_id,
+            "text": text,
+            "image_url": image_url
+        }
+
+    def get_memes(self, public_id: str) -> Dict[str, Any]:
+        """
+        Retrieve and process all meme posts from a specified public page.
+
+        Args:
+            public_id (str): ID or short name of the public page.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the public's name and processed meme posts.
+        """
+        memes = []
+
+        # Determine whether to use domain or owner_id
+        if public_id.isdigit() or (public_id.startswith("-")
+                                   and public_id[1:].isdigit()):
+            params: Dict[str, Any] = {"owner_id": int(public_id)}
+        else:
+            params: Dict[str, Any] = {"domain": public_id}
+
+        # Fetch public's name
+        group_info = self.vk.groups.getById(group_id=public_id)[0]
+        group_name = group_info['name']
+
+        # Process posts
+        offset = 0
+        while True:
+            # Fetch 100 posts at a time
+            params["count"] = 100
+            params["offset"] = offset
+            response = self.vk.wall.get(**params)
+
+            posts = response["items"]
+
+            for post in posts:
+                processed_post = self._process_post(post)
+                if processed_post:
+                    memes.append(processed_post)
+
+            # Check if we've reached the end of posts
+            if len(posts) < 100:
+                break
+
+            # Advance to the next page; wall.get is paginated by offset and
+            # does not return a "next_from" cursor
+            offset += 100
+
+        return {
+            "name": group_name,
+            "posts": memes
+        }
+
+    def download_image(
+            self,
+            image_url: str,
+            folder_path: str) -> Optional[str]:
+        """
+        Download an image from the given URL and save it to the specified folder.
+
+        Args:
+            image_url (str): The URL of the image to download.
+            folder_path (str): The path to the folder where the image should be saved.
+
+        Returns:
+            Optional[str]: The filename of the saved image, or None if the download failed.
+        """
+        try:
+            # Create the folder if it doesn't exist
+            os.makedirs(folder_path, exist_ok=True)
+
+            filename = os.path.basename(urlparse(image_url).path)
+            if not os.path.splitext(filename)[1]:
+                return None
+
+            image_path = os.path.join(folder_path, filename)
+
+            response = requests.get(image_url, stream=True)
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            with open(image_path, 'wb') as file:
+                for chunk in response.iter_content(chunk_size=8192):
+                    file.write(chunk)
+
+            return filename
+
+        except Exception as e:
+            print(f"Error downloading image from {image_url}: {str(e)}")
+            return None
src/preprocessing/__pycache__/mystem_tokenizer.cpython-311.pyc
ADDED
Binary file (2.47 kB).
src/preprocessing/mystem_tokenizer.py
ADDED
@@ -0,0 +1,46 @@
+from typing import List, Union
+
+import nltk
+from pymystem3 import Mystem
+
+
+class MystemTokenizer:
+    def __init__(self, stopwords: Union[List[str], str] = "ru"):
+        """
+        Initialize the MystemTokenizer.
+
+        Args:
+            stopwords (Union[List[str], str]): Either a list of stopwords or "ru" for Russian stopwords.
+        """
+        if stopwords == "ru":
+            try:
+                self.stopwords = nltk.corpus.stopwords.words("russian")
+            except LookupError:
+                # Download stopwords if not available
+                nltk.download("stopwords")
+                self.stopwords = nltk.corpus.stopwords.words("russian")
+        else:
+            self.stopwords = stopwords
+
+        self.mystem = Mystem()
+
+    def tokenize(self, text: str) -> List[str]:
+        """
+        Tokenize and lemmatize the input text, removing stopwords.
+
+        Args:
+            text (str): The input text to tokenize.
+
+        Returns:
+            List[str]: A list of lemmatized tokens.
+        """
+        # Lemmatize and tokenize using Mystem
+        lemmas = self.mystem.lemmatize(text.lower())
+
+        # Filter out non-letter tokens and stopwords
+        tokens = [
+            lemma for lemma in lemmas
+            if lemma.isalpha() and lemma not in self.stopwords
+        ]
+
+        return tokens
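For a concrete sense of what the BM25 pipeline indexes, a sketch of the tokenizer on a sample sentence, assuming the Mystem binary and NLTK stopwords are available; the exact lemmas depend on Mystem, so the output shown is an expectation, not a guarantee:

    from src.preprocessing.mystem_tokenizer import MystemTokenizer

    tokenizer = MystemTokenizer()
    print(tokenizer.tokenize("Коты сидели на окнах!"))
    # Expected: lowercase lemmas with stopwords and punctuation removed,
    # e.g. ['кот', 'сидеть', 'окно']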
src/search/bm25_search.py
ADDED
@@ -0,0 +1,89 @@
+import os
+import pickle
+import time
+from typing import List, Dict, Any, Callable
+
+import numpy as np
+from sqlalchemy.orm import Session
+from rank_bm25 import BM25Okapi
+
+from src.db import crud
+
+
+class BM25Search:
+    def __init__(
+            self,
+            db: Session,
+            index_folder: str,
+            tokenizer: Callable[[str], List[str]]
+    ):
+        """
+        Initialize the BM25Search.
+
+        Args:
+            db (Session): The database session.
+            index_folder (str): The folder containing the BM25 index.
+            tokenizer (Callable[[str], List[str]]): A function to tokenize the text.
+        """
+        self.db = db
+        self.tokenizer = tokenizer
+        self.bm25 = self._load_index(index_folder)
+
+    def _load_index(self, index_folder: str) -> BM25Okapi:
+        """
+        Load the BM25 index from a file.
+
+        Args:
+            index_folder (str): The folder containing the BM25 index.
+
+        Returns:
+            BM25Okapi: The loaded BM25 index.
+        """
+        with open(os.path.join(index_folder, 'bm25_index.pkl'), 'rb') as f:
+            return pickle.load(f)
+
+    def search(self, query: str, n: int = 3) -> Dict[str, Any]:
+        """
+        Perform a search using BM25.
+
+        Args:
+            query (str): The search query.
+            n (int, optional): The number of results to return. Defaults to 3.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing search results and search time.
+        """
+        start_time = time.time()
+
+        # Tokenize the query
+        query_tokens = self.tokenizer(query)
+
+        # Retrieve scores for all documents
+        scores = self.bm25.get_scores(query_tokens)
+
+        # Get top n document indices
+        top_n_indices = np.argsort(scores)[-n:][::-1]
+        top_n_scores = scores[top_n_indices]
+
+        # Adjust indices to match database IDs (assuming IDs start from 1)
+        db_ids = top_n_indices + 1
+
+        # Retrieve memes from the database
+        memes = crud.get_memes_by_ids(self.db, db_ids.tolist())
+
+        # Format the results
+        results = [
+            {
+                "id": meme.id,
+                "public_id": meme.public_id,
+                "text": meme.text,
+                "image_url": meme.image_url,
+                "score": top_n_scores[db_ids.tolist().index(meme.id)]
+            }
+            for meme in memes
+        ]
+
+        return {
+            "results": results,
+            "search_time": time.time() - start_time
+        }
src/search/semantic_search.py
ADDED
@@ -0,0 +1,85 @@
+import time
+from typing import List, Dict, Any, Optional
+
+import numpy as np
+from sqlalchemy.orm import Session
+from sentence_transformers import SentenceTransformer
+
+from src.db import crud
+
+
+class SemanticSearch:
+    def __init__(
+            self,
+            db: Session,
+            model: str,
+            embeddings_file: str,
+            prefix: Optional[str] = None
+    ):
+        """
+        Initialize the SemanticSearch.
+
+        Args:
+            db (Session): The database session.
+            model (str): The name or path of the SentenceTransformer model to use.
+            embeddings_file (str): Path to the file containing pre-computed embeddings.
+            prefix (Optional[str], optional): A prefix to add to each query. Defaults to None.
+        """
+        self.db = db
+        self.model = SentenceTransformer(model)
+        self.prefix = prefix
+        self.embeddings = np.load(embeddings_file)
+
+    def search(self, query: str, n: int = 3) -> Dict[str, Any]:
+        """
+        Perform a semantic search.
+
+        Args:
+            query (str): The search query.
+            n (int, optional): The number of results to return. Defaults to 3.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing search results and search time.
+        """
+        start_time = time.time()
+
+        # Prepare query with prefix if provided
+        query_text = f"{self.prefix}{query}" if self.prefix else query
+
+        # Encode the query
+        query_embedding = self.model.encode(
+            [query_text],
+            convert_to_numpy=True,
+            normalize_embeddings=True
+        )[0]
+
+        # Compute similarity scores
+        scores = np.dot(self.embeddings, query_embedding)
+
+        # Get top n results
+        top_n_indices = np.argsort(scores)[-n:][::-1]
+        top_n_scores = scores[top_n_indices]
+
+        # Adjust indices to match database IDs
+        db_ids = top_n_indices + 1
+
+        # Retrieve memes from the database
+        memes = crud.get_memes_by_ids(self.db, db_ids.tolist())
+
+        # Format the results
+        results = [
+            {
+                "id": meme.id,
+                "public_id": meme.public_id,
+                "text": meme.text,
+                "image_url": meme.image_url,
+                "local_image_path": meme.local_image_path,
+                "score": top_n_scores[db_ids.tolist().index(meme.id)]
+            }
+            for meme in memes
+        ]
+
+        return {
+            "results": results,
+            "search_time": time.time() - start_time
+        }