i-d-lytvynenko's picture
Add basic BM25 search and corpus generation
920001b
raw
history blame
6.83 kB
import re
import unicodedata
from pathlib import Path
import bm25s
import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
from llama_index.core.vector_stores.utils import node_to_metadata_dict
from llama_index.retrievers.bm25 import BM25Retriever
# Directory where the serialized BM25 index is written and read back from.
PERSIST_PATH = Path("Save_Index_Local")
# Source spreadsheet: legal positions, with optional category_* columns.
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"
# Index name + metadata toggle. Swap to the commented pair below to build the
# metadata-enriched variant of the index instead.
INDEX_NAME = "bm25_retriever"
USE_META = False
# INDEX_NAME = "bm25_retriever_meta"
# USE_META = True
def clean_string(text: pd.Series):
text = text.fillna("")
text = text.str.replace(r"«|»", '"', regex=True)
text = text.str.replace(r"\xa0", " ")
text = text.str.replace(r"§", "№")
# Handle unicode fractions
text = text.apply(lambda t: unicodedata.normalize("NFKC", t)) # type: ignore
text = text.str.replace("⁄", "/")
return text
def find_matching_pattern(categories):
    """
    Return the first court-chamber pattern found in *categories*.

    Args:
        categories: A string, a list of strings, or a list that may contain
            nested lists of strings.

    Returns:
        str: The first matching pattern, or "" if no match is found
        (including for empty input).
    """
    patterns = [
        "Велика Палата",
        "Касаційний кримінальний суд",
        "Касаційний адміністративний суд",
        "Касаційний господарський суд",
        "Касаційний цивільний суд",
    ]
    # Handle both string and list inputs
    if isinstance(categories, str):
        categories = [categories]
    elif isinstance(categories, list):
        # BUGFIX: flatten only nested lists. The previous unconditional
        # flatten iterated plain strings character-by-character, so a flat
        # list of category strings could never match any pattern.
        flat = []
        for item in categories:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        categories = flat
    # Search for patterns
    for category in categories:
        for pattern in patterns:
            if pattern in category:
                return pattern
    return ""
# Short hand-picked Ukrainian stopword list. Duplicates ("чи", "до", "де",
# "що") are harmless: the lists are de-duplicated via set() before use.
ukrainian_stopwords_1 = [
    # pronouns / demonstratives
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "це", "той", "така", "таке", "такі", "цей",
    "моя", "твоя", "його", "її", "наш", "ваш", "їх",
    # question words / particles / conjunctions
    "де", "чи", "а", "але", "і", "або", "так", "ні", "чи",
    # prepositions
    "в", "на", "з", "до", "під", "через", "після", "між",
    "серед", "без", "для", "про", "о", "за", "від", "до",
    # connectives / interrogatives / misc
    "як", "якби", "коли", "де", "тому", "тому що", "що",
    "чому", "хто", "що", "якось", "коли-небудь", "де-небудь",
    "чимало",
]
# Extended Ukrainian stopword list, grouped by part of speech. Overlap with
# ukrainian_stopwords_1 is resolved by the set() union below.
ukrainian_stopwords_2 = [
    # Articles
    "і", "й", "у", "в", "та", "і",
    # Pronouns
    "я", "ти", "він", "вона", "воно", "ми", "ви", "вони",
    "мене", "тебе", "його", "її", "нас", "вас", "їх",
    "мій", "твій", "наш", "ваш", "свій",
    # Prepositions
    "з", "до", "від", "біля", "над", "під", "через", "для",
    "без", "між", "серед", "крізь", "понад", "поза", "крім",
    # Conjunctions
    "та", "і", "але", "або", "однак", "проте", "тому",
    "тому що", "оскільки", "якщо", "коли", "хоча",
    # Auxiliary words
    "так", "ні", "не", "бути", "мати", "можна", "треба",
    # Common filler words
    "цей", "той", "це", "те", "такий", "який", "котрий",
    # Modal words
    "мабуть", "напевно", "звичайно", "можливо",
    # Particles
    "ось", "ніби", "майже", "майже що", "саме", "лише", "тільки",
]
# Union of both stopword lists; set() de-duplicates (order is irrelevant
# to bm25s tokenization).
ukrainian_stopwords = list(set(ukrainian_stopwords_1 + ukrainian_stopwords_2))
# One row per legal position. Assumes columns: id, title, text_lp,
# document_ids, category_all, and optional category_<N> columns — TODO
# confirm against the spreadsheet schema.
final_df = pd.read_excel(LP_INFO_FILE)
if USE_META:
    # Metadata-enriched variant: categories are cleaned, concatenated into a
    # searchable string, and attached to each Document.
    category_columns = [
        col for col in final_df.columns if re.match(r"category_\d+$", col)
    ]
    text_columns = ["title", "text_lp", "category_all"] + category_columns
    final_df[text_columns] = final_df[text_columns].apply(clean_string)
    # Comma-joined non-null categories per row, used as searchable metadata.
    final_df["category_search"] = final_df[category_columns].apply(
        lambda row: ", ".join([str(val) for val in row if pd.notna(val)]), axis=1
    )
    # Court chamber extracted from category_all ("" when nothing matches).
    final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)
    legal_position_title_category = [
        Document(
            text=row["text_lp"], # type: ignore
            metadata={ # type: ignore
                "lp_id": row["id"],
                "title": row["title"],
                "doc_id": row["document_ids"],
                "category_filter": find_matching_pattern(row["category_all"]),
                "category_search": row["category_search"],
            },
            # Keep ID/filter fields out of both the embedded text and the
            # LLM-visible text; title and category_search remain searchable.
            excluded_embed_metadata_keys=["doc_id", "category_filter"],
            excluded_llm_metadata_keys=["doc_id", "category_filter"],
        )
        for _, row in final_df.iterrows()
    ]
else:
    # Plain variant: only the legal-position text; title is metadata but is
    # excluded from both embedding and LLM views.
    final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
    legal_position_title_category = [
        Document(
            text=row["text_lp"], # type: ignore
            metadata={ # type: ignore
                "title": row["title"],
            },
            excluded_embed_metadata_keys=["title"],
            excluded_llm_metadata_keys=["title"],
        )
        for _, row in final_df.iterrows()
    ]
# Copied from BM25Retriever __init__ method, but note that output looks awful and might work worse (this needs checking)
corpus = [node_to_metadata_dict(node) for node in legal_position_title_category]
# Tokenize the content each node exposes in EMBED mode (i.e. with the
# excluded_embed_metadata_keys stripped), removing Ukrainian stopwords.
corpus_tokens = bm25s.tokenize(
    [
        node.get_content(metadata_mode=MetadataMode.EMBED)
        for node in legal_position_title_category
    ],
    stopwords=ukrainian_stopwords,
)
# NOTE(review): b=1.25 is outside the conventional 0..1 range for BM25 length
# normalization — presumably deliberate tuning, but worth confirming.
existing_bm25 = bm25s.BM25(
    k1=1.88,
    b=1.25,
    delta=0.5,
    method="robertson",
    # No corpus is saved without this line:
    corpus=corpus, # prevents TypeError: 'NoneType' object is not subscriptable
)
existing_bm25.index(corpus=corpus_tokens)
bm25_retriever = BM25Retriever(
    existing_bm25=existing_bm25,
    similarity_top_k=20,
)
# Persist the index, then reload it immediately as a round-trip sanity check.
bm25_retriever.persist(str(PERSIST_PATH / INDEX_NAME))
# Returns an error on invalid corpus
loaded_retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))