i-d-lytvynenko's picture
Add basic BM25 search and corpus generation
920001b
raw
history blame
6.83 kB
import re
import unicodedata
from pathlib import Path
import bm25s
import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
from llama_index.core.vector_stores.utils import node_to_metadata_dict
from llama_index.retrievers.bm25 import BM25Retriever
# Directory where the serialized BM25 index is written and read back from.
PERSIST_PATH = Path("Save_Index_Local")
# Source spreadsheet: legal positions, with optional category_* columns.
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"
# Index name + metadata toggle. Swap to the commented pair below to build the
# metadata-enriched variant of the index instead.
INDEX_NAME = "bm25_retriever"
USE_META = False
# INDEX_NAME = "bm25_retriever_meta"
# USE_META = True
def clean_string(text: pd.Series):
text = text.fillna("")
text = text.str.replace(r"«|»", '"', regex=True)
text = text.str.replace(r"\xa0", " ")
text = text.str.replace(r"§", "№")
# Handle unicode fractions
text = text.apply(lambda t: unicodedata.normalize("NFKC", t)) # type: ignore
text = text.str.replace("⁄", "/")
return text
def find_matching_pattern(categories):
    """
    Return the first court-chamber pattern found in *categories*.

    Args:
        categories: A string, a list of strings, or a list that may contain
            nested lists of strings.

    Returns:
        str: The first matching pattern, or "" if no match is found
        (including for empty input).
    """
    patterns = [
        "Велика Палата",
        "Касаційний кримінальний суд",
        "Касаційний адміністративний суд",
        "Касаційний господарський суд",
        "Касаційний цивільний суд",
    ]
    # Handle both string and list inputs
    if isinstance(categories, str):
        categories = [categories]
    elif isinstance(categories, list):
        # BUGFIX: flatten only nested lists. The previous unconditional
        # flatten iterated plain strings character-by-character, so a flat
        # list of category strings could never match any pattern.
        flat = []
        for item in categories:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        categories = flat
    # Search for patterns
    for category in categories:
        for pattern in patterns:
            if pattern in category:
                return pattern
    return ""
# Short hand-picked Ukrainian stopword list. Duplicates ("чи", "до", "де",
# "що") are harmless: the lists are de-duplicated via set() before use.
ukrainian_stopwords_1 = [
    # pronouns / demonstratives
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "це", "той", "така", "таке", "такі", "цей",
    "моя", "твоя", "його", "її", "наш", "ваш", "їх",
    # question words / particles / conjunctions
    "де", "чи", "а", "але", "і", "або", "так", "ні", "чи",
    # prepositions
    "в", "на", "з", "до", "під", "через", "після", "між",
    "серед", "без", "для", "про", "о", "за", "від", "до",
    # connectives / interrogatives / misc
    "як", "якби", "коли", "де", "тому", "тому що", "що",
    "чому", "хто", "що", "якось", "коли-небудь", "де-небудь",
    "чимало",
]
# Extended Ukrainian stopword list, grouped by part of speech. Overlap with
# ukrainian_stopwords_1 is resolved by the set() union below.
ukrainian_stopwords_2 = [
    # Articles
    "і", "й", "у", "в", "та", "і",
    # Pronouns
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "мене", "тебе", "його", "її", "нас", "вас", "їх",
    "мій", "твій", "наш", "ваш", "свій",
    # Prepositions
    "з", "до", "від", "біля", "над", "під", "через", "для",
    "без", "між", "серед", "крізь", "понад", "поза", "крім",
    # Conjunctions
    "та", "і", "але", "або", "однак", "проте", "тому",
    "тому що", "оскільки", "якщо", "коли", "хоча",
    # Auxiliary words
    "так", "ні", "не", "бути", "мати", "можна", "треба",
    # Common filler words
    "цей", "той", "це", "те", "такий", "який", "котрий",
    # Modal words
    "мабуть", "напевно", "звичайно", "можливо",
    # Particles
    "ось", "ніби", "майже", "майже що", "саме", "лише", "тільки",
]
# Union of both stopword lists; set() de-duplicates (order is irrelevant
# to bm25s tokenization).
ukrainian_stopwords = list(set(ukrainian_stopwords_1 + ukrainian_stopwords_2))
# One row per legal position. Assumes columns: id, title, text_lp,
# document_ids, category_all, and optional category_<N> columns — TODO
# confirm against the spreadsheet schema.
final_df = pd.read_excel(LP_INFO_FILE)
if USE_META:
    # Metadata-enriched variant: categories are cleaned, concatenated into a
    # searchable string, and attached to each Document.
    category_columns = [
        col for col in final_df.columns if re.match(r"category_\d+$", col)
    ]
    text_columns = ["title", "text_lp", "category_all"] + category_columns
    final_df[text_columns] = final_df[text_columns].apply(clean_string)
    # Comma-joined non-null categories per row, used as searchable metadata.
    final_df["category_search"] = final_df[category_columns].apply(
        lambda row: ", ".join([str(val) for val in row if pd.notna(val)]), axis=1
    )
    # Court chamber extracted from category_all ("" when nothing matches).
    final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)
    legal_position_title_category = [
        Document(
            text=row["text_lp"], # type: ignore
            metadata={ # type: ignore
                "lp_id": row["id"],
                "title": row["title"],
                "doc_id": row["document_ids"],
                "category_filter": find_matching_pattern(row["category_all"]),
                "category_search": row["category_search"],
            },
            # Keep ID/filter fields out of both the embedded text and the
            # LLM-visible text; title and category_search remain searchable.
            excluded_embed_metadata_keys=["doc_id", "category_filter"],
            excluded_llm_metadata_keys=["doc_id", "category_filter"],
        )
        for _, row in final_df.iterrows()
    ]
else:
    # Plain variant: only the legal-position text; title is metadata but is
    # excluded from both embedding and LLM views.
    final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
    legal_position_title_category = [
        Document(
            text=row["text_lp"], # type: ignore
            metadata={ # type: ignore
                "title": row["title"],
            },
            excluded_embed_metadata_keys=["title"],
            excluded_llm_metadata_keys=["title"],
        )
        for _, row in final_df.iterrows()
    ]
# Copied from BM25Retriever __init__ method, but note that output looks awful and might work worse (this needs checking)
corpus = [node_to_metadata_dict(node) for node in legal_position_title_category]
# Tokenize the content each node exposes in EMBED mode (i.e. with the
# excluded_embed_metadata_keys stripped), removing Ukrainian stopwords.
corpus_tokens = bm25s.tokenize(
    [
        node.get_content(metadata_mode=MetadataMode.EMBED)
        for node in legal_position_title_category
    ],
    stopwords=ukrainian_stopwords,
)
# NOTE(review): b=1.25 is outside the conventional 0..1 range for BM25 length
# normalization — presumably deliberate tuning, but worth confirming.
existing_bm25 = bm25s.BM25(
    k1=1.88,
    b=1.25,
    delta=0.5,
    method="robertson",
    # No corpus is saved without this line:
    corpus=corpus, # prevents TypeError: 'NoneType' object is not subscriptable
)
existing_bm25.index(corpus=corpus_tokens)
bm25_retriever = BM25Retriever(
    existing_bm25=existing_bm25,
    similarity_top_k=20,
)
# Persist the index, then reload it immediately as a round-trip sanity check.
bm25_retriever.persist(str(PERSIST_PATH / INDEX_NAME))
# Returns an error on invalid corpus
loaded_retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))