# Script: build and persist a BM25 retriever for Ukrainian legal positions.
import re
import unicodedata
from pathlib import Path

import bm25s
import pandas as pd
from llama_index.core import Document
from llama_index.core.schema import MetadataMode
from llama_index.core.vector_stores.utils import node_to_metadata_dict
from llama_index.retrievers.bm25 import BM25Retriever

# Directory the persisted retriever is written to.
PERSIST_PATH = Path("Save_Index_Local")
# Spreadsheet holding the legal positions and their category columns.
LP_INFO_FILE = "legal_position_with_categories_documents_all.xlsx"
INDEX_NAME = "bm25_retriever"
USE_META = False
# Switch to these two lines to build the metadata-enriched index instead:
# INDEX_NAME = "bm25_retriever_meta"
# USE_META = True
def clean_string(text: pd.Series) -> pd.Series:
    """Normalize a column of raw text for indexing.

    Replaces guillemets with straight quotes, applies NFKC normalization
    (collapses non-breaking spaces, expands unicode fractions like "½"),
    maps the section sign to "№", and rewrites the fraction slash as "/".

    Args:
        text: Column of raw strings; NaN values become "".

    Returns:
        The cleaned column.
    """
    text = text.fillna("")
    # NFKC first: turns \xa0 into a plain space and "½" into "1⁄2".
    # (The old code replaced r"\xa0" before NFKC, which is a no-op under
    # pandas >= 2.0 where str.replace defaults to regex=False.)
    text = text.apply(lambda t: unicodedata.normalize("NFKC", t))
    # Literal (non-regex) single-character rewrites.
    text = text.str.replace("«", '"', regex=False)
    text = text.str.replace("»", '"', regex=False)
    # § -> № must run AFTER NFKC: NFKC decomposes "№" (U+2116) into "No",
    # which silently destroyed the replacement in the old ordering.
    text = text.str.replace("§", "№", regex=False)
    text = text.str.replace("⁄", "/", regex=False)  # U+2044 fraction slash
    return text
def find_matching_pattern(categories) -> str:
    """Return the first court-chamber pattern found in *categories*.

    Args:
        categories: A string, a list of strings, or a list mixing strings
            and nested lists of strings. Any other input (e.g. NaN coming
            from the spreadsheet) yields "".

    Returns:
        str: The first matching chamber name, or "" if nothing matches.
    """
    patterns = (
        "Велика Палата",
        "Касаційний кримінальний суд",
        "Касаційний адміністративний суд",
        "Касаційний господарський суд",
        "Касаційний цивільний суд",
    )
    if isinstance(categories, str):
        flat = [categories]
    elif isinstance(categories, list):
        # Flatten one level of nesting. Only genuine sub-lists are expanded;
        # plain strings pass through whole (the old unconditional flatten
        # iterated strings character by character, so matching always failed
        # on a flat list of strings).
        flat = []
        for item in categories:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
    else:
        # NaN / unexpected type from pandas: nothing to match.
        return ""
    for category in flat:
        if not isinstance(category, str):
            continue
        for pattern in patterns:
            if pattern in category:
                return pattern
    return ""
# First hand-collected stopword list (contains a few duplicates such as
# "чи", "де", "до", "що" — deduplicated when the lists are merged below).
ukrainian_stopwords_1 = [
    "я",
    "ти",
    "він",
    "вона",
    "воно",
    "ми",
    "ви",
    "вони",
    "це",
    "той",
    "така",
    "таке",
    "такі",
    "цей",
    "моя",
    "твоя",
    "його",
    "її",
    "наш",
    "ваш",
    "їх",
    "де",
    "чи",
    "а",
    "але",
    "і",
    "або",
    "так",
    "ні",
    "чи",
    "в",
    "на",
    "з",
    "до",
    "під",
    "через",
    "після",
    "між",
    "серед",
    "без",
    "для",
    "про",
    "о",
    "за",
    "від",
    "до",
    "як",
    "якби",
    "коли",
    "де",
    "тому",
    "тому що",
    "що",
    "чому",
    "хто",
    "що",
    "якось",
    "коли-небудь",
    "де-небудь",
    "чимало",
]
# Second list, grouped by part of speech; overlaps with the first list.
ukrainian_stopwords_2 = [
    # Articles
    "і",
    "й",
    "у",
    "в",
    "та",
    "і",
    # Pronouns
    "я",
    "ти",
    "він",
    "вона",
    "воно",
    "ми",
    "ви",
    "вони",
    "мене",
    "тебе",
    "його",
    "її",
    "нас",
    "вас",
    "їх",
    "мій",
    "твій",
    "наш",
    "ваш",
    "свій",
    # Prepositions
    "з",
    "до",
    "від",
    "біля",
    "над",
    "під",
    "через",
    "для",
    "без",
    "між",
    "серед",
    "крізь",
    "понад",
    "поза",
    "крім",
    # Conjunctions
    "та",
    "і",
    "але",
    "або",
    "однак",
    "проте",
    "тому",
    "тому що",
    "оскільки",
    "якщо",
    "коли",
    "хоча",
    # Auxiliary words
    "так",
    "ні",
    "не",
    "бути",
    "мати",
    "можна",
    "треба",
    # Common filler words
    "цей",
    "той",
    "це",
    "те",
    "такий",
    "який",
    "котрий",
    # Modal words
    "мабуть",
    "напевно",
    "звичайно",
    "можливо",
    # Particles
    "ось",
    "ніби",
    "майже",
    "майже що",
    "саме",
    "лише",
    "тільки",
]
# Merge and deduplicate. sorted() (instead of the old list(set(...))) makes
# the order deterministic across runs — string hashing is randomized per
# process, so the persisted index artifact would otherwise differ run to run.
ukrainian_stopwords = sorted(set(ukrainian_stopwords_1 + ukrainian_stopwords_2))
# Load the legal positions and turn each row into a llama-index Document.
final_df = pd.read_excel(LP_INFO_FILE)

if USE_META:
    # Columns named category_1, category_2, ... hold the individual categories.
    category_columns = [
        col for col in final_df.columns if re.match(r"category_\d+$", col)
    ]
    text_columns = ["title", "text_lp", "category_all"] + category_columns
    final_df[text_columns] = final_df[text_columns].apply(clean_string)
    # Comma-joined category values, used as searchable metadata.
    final_df["category_search"] = final_df[category_columns].apply(
        lambda row: ", ".join(str(val) for val in row if pd.notna(val)), axis=1
    )
    final_df["category_filter"] = final_df["category_all"].apply(find_matching_pattern)
    legal_position_title_category = [
        Document(
            text=row["text_lp"],  # type: ignore
            metadata={  # type: ignore
                "lp_id": row["id"],
                "title": row["title"],
                "doc_id": row["document_ids"],
                # Reuse the column computed above instead of re-running
                # find_matching_pattern for every row (old code did both).
                "category_filter": row["category_filter"],
                "category_search": row["category_search"],
            },
            # doc_id / category_filter are for filtering only — keep them out
            # of the embedded/LLM-visible text.
            excluded_embed_metadata_keys=["doc_id", "category_filter"],
            excluded_llm_metadata_keys=["doc_id", "category_filter"],
        )
        for _, row in final_df.iterrows()
    ]
else:
    final_df[["title", "text_lp"]] = final_df[["title", "text_lp"]].apply(clean_string)
    legal_position_title_category = [
        Document(
            text=row["text_lp"],  # type: ignore
            metadata={  # type: ignore
                "title": row["title"],
            },
            excluded_embed_metadata_keys=["title"],
            excluded_llm_metadata_keys=["title"],
        )
        for _, row in final_df.iterrows()
    ]
# Build the BM25 index manually (copied from the BM25Retriever __init__
# method) so Ukrainian stopwords can be supplied; note that the tokenized
# output looks awful and might work worse (this needs checking).
corpus = [node_to_metadata_dict(node) for node in legal_position_title_category]
corpus_tokens = bm25s.tokenize(
    [
        node.get_content(metadata_mode=MetadataMode.EMBED)
        for node in legal_position_title_category
    ],
    stopwords=ukrainian_stopwords,
)
existing_bm25 = bm25s.BM25(
    # NOTE(review): b is normally in [0, 1] (0.75 is the common default) —
    # b=1.25 looks like a tuning experiment; confirm it is intentional.
    # NOTE(review): delta is presumably only used by the bm25+/bm25l methods,
    # not "robertson" — verify against the bm25s docs.
    k1=1.88,
    b=1.25,
    delta=0.5,
    method="robertson",
    # No corpus is saved without this line:
    corpus=corpus,  # prevents TypeError: 'NoneType' object is not subscriptable
)
existing_bm25.index(corpus=corpus_tokens)
bm25_retriever = BM25Retriever(
    existing_bm25=existing_bm25,
    similarity_top_k=20,
)
bm25_retriever.persist(str(PERSIST_PATH / INDEX_NAME))
# Smoke test: reload the persisted index — returns an error on invalid corpus.
loaded_retriever = BM25Retriever.from_persist_dir(str(PERSIST_PATH / INDEX_NAME))