Spaces:

DocSA
/

Legal_Position_hybrid_search_without_AI

Runtime error

File size: 6,834 Bytes

920001b

import re
import unicodedata
from pathlib import Path

import bm25s
import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
from llama_index.core.vector_stores.utils import node_to_metadata_dict
from llama_index.retrievers.bm25 import BM25Retriever



# Root directory under which the BM25 index is persisted (joined with INDEX_NAME below).
PERSIST_PATH = Path("Save_Index_Local")
# Excel workbook read with pandas to obtain the legal-position rows.
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"

# Active configuration: sub-directory name of the persisted index, and whether
# the category metadata branch is used when building the documents.
INDEX_NAME = "bm25_retriever"
USE_META = False

# Alternative configuration (metadata-enriched index); swap with the pair above to use it.
# INDEX_NAME = "bm25_retriever_meta"
# USE_META = True


def clean_string(text: pd.Series) -> pd.Series:
    """
    Normalize one column of raw text before it is indexed.

    The steps run in this order:
      1. Missing values are replaced with empty strings.
      2. Guillemets («, ») become straight double quotes.
      3. Non-breaking spaces (U+00A0) become regular spaces.
      4. "§" is replaced with "№".
      5. Every value is NFKC-normalized (e.g. "½" becomes "1" + U+2044 + "2").
      6. The fraction slash (U+2044) becomes a plain "/".

    Note that NFKC itself rewrites "№" as "No", so after step 5 both an
    original "№" and a replaced "§" end up as "No" in the output.

    Args:
        text: Series of strings, possibly containing missing values.

    Returns:
        pd.Series: The cleaned strings, same index as the input.
    """
    text = text.fillna("")
    text = text.str.replace(r"«|»", '"', regex=True)
    # Match the real NBSP character literally; a raw r"\xa0" only works when
    # pandas happens to interpret the pattern as a regular expression.
    text = text.str.replace("\xa0", " ", regex=False)
    text = text.str.replace("§", "№", regex=False)

    # Handle unicode fractions: NFKC expands them to digit + U+2044 + digit.
    text = text.apply(lambda t: unicodedata.normalize("NFKC", t))  # type: ignore
    text = text.str.replace("\u2044", "/", regex=False)

    return text


def find_matching_pattern(categories):
    """
    Search for matching patterns in the categories and return the first match found.

    Categories are scanned in order; for each category the known patterns are
    tried in their declared order, and the first pattern that occurs as a
    substring is returned.

    Args:
        categories: A string, a list of strings, or a list of lists of strings.
            Any other iterable is scanned as-is. Non-iterable values
            (e.g. None or NaN) and non-string items are treated as "no match".

    Returns:
        str: Matching pattern or empty string if no match found
    """
    patterns = (
        "Велика Палата",
        "Касаційний кримінальний суд",
        "Касаційний адміністративний суд",
        "Касаційний господарський суд",
        "Касаційний цивільний суд",
    )

    if isinstance(categories, str):
        candidates = [categories]
    elif isinstance(categories, (list, tuple)):
        # Flatten one level of nesting only; plain strings must stay whole,
        # otherwise they would be split into single characters.
        candidates = []
        for item in categories:
            if isinstance(item, (list, tuple)):
                candidates.extend(item)
            else:
                candidates.append(item)
    else:
        try:
            candidates = list(categories)
        except TypeError:
            # None, NaN and other non-iterables carry no category text.
            return ""

    for category in candidates:
        if not isinstance(category, str):
            continue
        for pattern in patterns:
            if pattern in category:
                return pattern
    return ""


# First hand-collected list of Ukrainian stop words. It contains a few repeated
# entries; duplicates are removed when the lists are merged below.
# NOTE(review): multi-word and hyphenated entries (e.g. "тому що",
# "коли-небудь") presumably never equal a single token produced by the
# tokenizer - confirm against the bm25s tokenization rules.
ukrainian_stopwords_1 = [
    "я",
    "ти",
    "він",
    "вона",
    "воно",
    "ми",
    "ви",
    "вони",
    "це",
    "той",
    "така",
    "таке",
    "такі",
    "цей",
    "моя",
    "твоя",
    "його",
    "її",
    "наш",
    "ваш",
    "їх",
    "де",
    "чи",
    "а",
    "але",
    "і",
    "або",
    "так",
    "ні",
    "чи",
    "в",
    "на",
    "з",
    "до",
    "під",
    "через",
    "після",
    "між",
    "серед",
    "без",
    "для",
    "про",
    "о",
    "за",
    "від",
    "до",
    "як",
    "якби",
    "коли",
    "де",
    "тому",
    "тому що",
    "що",
    "чому",
    "хто",
    "що",
    "якось",
    "коли-небудь",
    "де-небудь",
    "чимало",
]

# Second list, grouped by part of speech. It overlaps with the first list.
ukrainian_stopwords_2 = [
    # Short conjunctions and prepositions (Ukrainian has no articles)
    "і",
    "й",
    "у",
    "в",
    "та",
    "і",
    # Pronouns
    "я",
    "ти",
    "він",
    "вона",
    "воно",
    "ми",
    "ви",
    "вони",
    "мене",
    "тебе",
    "його",
    "її",
    "нас",
    "вас",
    "їх",
    "мій",
    "твій",
    "наш",
    "ваш",
    "свій",
    # Prepositions
    "з",
    "до",
    "від",
    "біля",
    "над",
    "під",
    "через",
    "для",
    "без",
    "між",
    "серед",
    "крізь",
    "понад",
    "поза",
    "крім",
    # Conjunctions
    "та",
    "і",
    "але",
    "або",
    "однак",
    "проте",
    "тому",
    "тому що",
    "оскільки",
    "якщо",
    "коли",
    "хоча",
    # Auxiliary words
    "так",
    "ні",
    "не",
    "бути",
    "мати",
    "можна",
    "треба",
    # Common filler words
    "цей",
    "той",
    "це",
    "те",
    "такий",
    "який",
    "котрий",
    # Modal words
    "мабуть",
    "напевно",
    "звичайно",
    "можливо",
    # Particles
    "ось",
    "ніби",
    "майже",
    "майже що",
    "саме",
    "лише",
    "тільки",
]

# Union of both lists without duplicates. Sorting makes the order identical on
# every run; iterating a bare set of strings depends on per-process hashing.
ukrainian_stopwords = sorted(set(ukrainian_stopwords_1 + ukrainian_stopwords_2))


# Load the legal-position table; each row becomes one Document below.
final_df = pd.read_excel(LP_INFO_FILE)

if USE_META:
    # Columns named category_<number> hold the individual category values.
    category_columns = [
        col
        for col in final_df.columns
        if isinstance(col, str) and re.match(r"category_\d+$", col)
    ]

    text_columns = ["title", "text_lp", "category_all"] + category_columns
    final_df[text_columns] = final_df[text_columns].apply(clean_string)

    # clean_string has already turned missing cells into "", so a NaN check
    # alone would let blank categories through and produce ", , " runs.
    final_df["category_search"] = final_df[category_columns].apply(
        lambda row: ", ".join(
            [str(val) for val in row if pd.notna(val) and str(val) != ""]
        ),
        axis=1,
    )
    final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)

    legal_position_title_category = [
        Document(
            text=row["text_lp"],  # type: ignore
            metadata={  # type: ignore
                "lp_id": row["id"],
                "title": row["title"],
                "doc_id": row["document_ids"],
                # Reuse the column computed above instead of matching again per row.
                "category_filter": row["category_filter"],
                "category_search": row["category_search"],
            },
            excluded_embed_metadata_keys=["doc_id", "category_filter"],
            excluded_llm_metadata_keys=["doc_id", "category_filter"],
        )
        for _, row in final_df.iterrows()
    ]
else:
    # Without metadata only the title is attached, and it is excluded from
    # both the embed and the LLM views of each document.
    final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
    legal_position_title_category = [
        Document(
            text=row["text_lp"],  # type: ignore
            metadata={  # type: ignore
                "title": row["title"],
            },
            excluded_embed_metadata_keys=["title"],
            excluded_llm_metadata_keys=["title"],
        )
        for _, row in final_df.iterrows()
    ]


# The corpus/token preparation follows BM25Retriever.__init__ (per the original
# author's note); the serialized corpus is not pretty and its effect on
# retrieval quality still needs checking.
corpus = [node_to_metadata_dict(doc) for doc in legal_position_title_category]

# Text of every document as seen in EMBED metadata mode, tokenized with the
# Ukrainian stop-word list.
embed_texts = [
    doc.get_content(metadata_mode=MetadataMode.EMBED)
    for doc in legal_position_title_category
]
corpus_tokens = bm25s.tokenize(embed_texts, stopwords=ukrainian_stopwords)

# The corpus has to be handed to the constructor: without it nothing is saved
# and a "TypeError: 'NoneType' object is not subscriptable" is raised.
existing_bm25 = bm25s.BM25(
    corpus=corpus,
    method="robertson",
    k1=1.88,
    b=1.25,
    delta=0.5,
)
existing_bm25.index(corpus=corpus_tokens)

bm25_retriever = BM25Retriever(
    existing_bm25=existing_bm25,
    similarity_top_k=20,
)

# Write the index to disk, then load it back; loading raises an error when the
# persisted corpus is invalid.
index_dir = str(PERSIST_PATH / INDEX_NAME)
bm25_retriever.persist(index_dir)

loaded_retriever = BM25Retriever.from_persist_dir(index_dir)