# LastingBench/utils/util.py
import os
import re
from collections import Counter

import nltk
nltk.download('punkt_tab')  # tokenizer models needed by sent_tokenize/word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
from langchain_text_splitters import NLTKTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
def replace_case_insensitive(text: str, old: str, new: str) -> str:
    pattern = re.compile(re.escape(old), re.IGNORECASE)
    return pattern.sub(new, text)
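# Illustrative usage (a sketch, not in the original module): every case
# variant of the needle is replaced in a single pass.
#   replace_case_insensitive("Foo met a FOO", "foo", "bar")  ->  "bar met a bar"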
def get_word_list(s1):
    # Tokenize mixed text: English word by word, Chinese character by character,
    # splitting on non-word characters (whitespace, punctuation).
    non_word = re.compile(r'\W')
    cjk = re.compile(r'([\u4e00-\u9fa5])')  # CJK Unified Ideographs range
    tokens = []
    for part in non_word.split(s1.lower()):
        # re.split never returns None, so the original None-check was dead code;
        # the capturing group keeps each Chinese character as its own token.
        tokens.extend(cjk.split(part))
    return [w for w in tokens if len(w.strip()) > 0]
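# Illustrative usage (a sketch, not in the original module): Chinese characters
# become individual tokens while English words and digit runs stay whole.
#   get_word_list("Hello 世界 123")  ->  ["hello", "世", "界", "123"]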
def get_word_len(s1):
    return len(get_word_list(s1))
# Sentence-delimiter pattern: fullwidth (。?!;) and ASCII (.!?;) punctuation
# plus newline, each with any trailing whitespace; the capturing group keeps
# the delimiters when used with re.split.
regex = r'([。?!;\n.!?;]\s*)'
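# Illustrative usage (a sketch, not in the original module): because the
# delimiter class is in a capturing group, re.split keeps the punctuation
# as separate items.
#   re.split(regex, "He left. 他走了。Why?")
#   ->  ["He left", ". ", "他走了", "。", "Why", "?", ""]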
def retriveDoc(text, query, top_k=3):
    """Embed the sentences of `text` and return the `top_k` most similar to `query`."""
    sentences = sent_tokenize(text)
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        base_url=os.environ.get("OPENAI_BASE_URL"),
        api_key=os.environ.get("OPENAI_API_KEY"),
    )
    # Build an in-memory FAISS vector store from the sentence list.
    vector_store = FAISS.from_texts(sentences, embeddings)
    retrieved_docs = vector_store.similarity_search(query, k=top_k)
    print("Retrieved sentences:", retrieved_docs)
    # Returns a list of langchain Documents; adjust the return structure as needed.
    return retrieved_docs
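# Illustrative usage (a sketch, not in the original module). Requires
# OPENAI_API_KEY (and optionally OPENAI_BASE_URL) in the environment plus
# network access, so it is left commented out:
#   docs = retriveDoc("Paris is in France. Berlin is in Germany. "
#                     "Tokyo is in Japan.", "Which city is in Japan?", top_k=1)
#   # docs[0].page_content should be the sentence about Tokyo.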
def most_similar_sentence_bm25(paragraph, target_sentence):
    """
    Use the BM25 algorithm to find the sentence in `paragraph` most similar
    to `target_sentence`, and return that sentence.
    """
    # 1. Split the paragraph into a list of sentences.
    sentences = sent_tokenize(paragraph)
    # 2. Tokenize each sentence.
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]
    # 3. Build a BM25Okapi index over the tokenized sentences.
    bm25 = BM25Okapi(tokenized_sentences)
    # 4. Tokenize the target sentence.
    target_tokens = word_tokenize(target_sentence)
    # 5. Score every sentence against the target; scores.shape == (len(sentences),).
    scores = bm25.get_scores(target_tokens)
    # 6. Return the sentence with the highest score.
    max_idx = scores.argmax()
    return sentences[max_idx]
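# Illustrative usage (a sketch, not in the original module): the sentence
# sharing the most informative tokens with the query should win. Note that
# matching is exact (no lowercasing or stemming).
#   para = ("The quick brown fox jumps over the lazy dog. "
#           "Stock prices fell sharply on Monday. "
#           "She prefers green tea in the morning.")
#   most_similar_sentence_bm25(para, "a quick fox jumped over a dog")
#   ->  "The quick brown fox jumps over the lazy dog."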
def f1_score_text(pred, gold):
    # Token-level F1: overlap is counted with multiplicity via Counter intersection.
    pred_tokens = word_tokenize(pred)
    gold_tokens = word_tokenize(gold)
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
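# Worked example (not in the original module): "the cat sat" and
# "the cat sat down" share 3 tokens, so precision = 3/3 = 1.0,
# recall = 3/4 = 0.75, and F1 = 2 * 1.0 * 0.75 / 1.75 ≈ 0.857.
#   f1_score_text("the cat sat", "the cat sat down")  ->  ~0.857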
def compute_best_sentence_f1(pred_text, gold_text):
    # Average, over predicted sentences, of each one's best F1 against any
    # gold sentence.
    pred_sentences = sent_tokenize(pred_text)
    gold_sentences = sent_tokenize(gold_text)
    f1_scores = []
    for pred in pred_sentences:
        best_f1 = 0.0
        for gold in gold_sentences:
            f1 = f1_score_text(pred, gold)
            if f1 > best_f1:
                best_f1 = f1
        f1_scores.append(best_f1)
    return sum(f1_scores) / len(pred_sentences) if pred_sentences else 0.0
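

if __name__ == "__main__":
    # Minimal smoke test for the offline helpers; a sketch added for
    # illustration, not part of the original module. retriveDoc is skipped
    # because it needs an OpenAI API key and network access.
    print(replace_case_insensitive("Foo met a FOO", "foo", "bar"))
    print(get_word_list("Hello 世界 123"))
    pred = "The quick brown fox jumps."
    gold = "A quick brown fox jumped over the dog. It was fast."
    print(round(f1_score_text(pred, gold), 3))
    print(round(compute_best_sentence_f1(pred, gold), 3))
    para = "Cats sleep all day. Dogs chase balls. Birds sing at dawn."
    print(most_similar_sentence_bm25(para, "Dogs chase the ball"))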