import os
import re
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from rank_bm25 import BM25Okapi
from langchain_text_splitters import NLTKTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

nltk.download('punkt_tab')


def replace_case_insensitive(text: str, old: str, new: str) -> str:
    """Replace every case-insensitive occurrence of `old` in `text` with `new`."""
    pattern = re.compile(re.escape(old), re.IGNORECASE)
    return pattern.sub(new, text)
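
# Illustrative usage (not part of the original code): the search for `old` is
# case-insensitive, while `new` is inserted verbatim, e.g.
#   replace_case_insensitive("Hello World", "world", "there")  # -> "Hello there"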


def get_word_list(s1):
    """Tokenize a mixed Chinese/English string: split on non-word characters
    and treat every CJK character as its own token."""
    non_word = re.compile(r'[\W]')
    cjk_char = re.compile(r"([\u4e00-\u9fa5])")

    pieces = non_word.split(s1.lower())
    str1_list = []
    for piece in pieces:
        # The capturing group makes re.split keep each CJK character as a
        # separate element; non-CJK pieces come back unchanged.
        for ch in cjk_char.split(piece):
            str1_list.append(ch)

    # Drop the empty strings left over from the splits.
    list_word1 = [w for w in str1_list if len(w.strip()) > 0]

    return list_word1


def get_word_len(s1):
    return len(get_word_list(s1))
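
# Illustrative behaviour (example strings are made up): English words are kept
# whole while each Chinese character becomes its own token, e.g.
#   get_word_list("Hello, 世界!")  # -> ['hello', '世', '界']
#   get_word_len("Hello, 世界!")   # -> 3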


# Sentence-delimiter characters (Chinese and English punctuation plus newline),
# each optionally followed by whitespace.
regex = r'([。?!;\n.!?;]\s*)'


def retriveDoc(text, query, top_k=3):
    """Split `text` into sentences, embed them with OpenAI embeddings, and
    return the `top_k` sentences most similar to `query` via FAISS search."""
    sentences = sent_tokenize(text)
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        base_url=os.environ.get("OPENAI_BASE_URL"),
        api_key=os.environ.get("OPENAI_API_KEY"),
    )

    vector_store = FAISS.from_texts(sentences, embeddings)

    retrieved_docs = vector_store.similarity_search(query, k=top_k)
    print("Retrieved sentences:", retrieved_docs)

    return retrieved_docs
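
# Illustrative call (not part of the original code): requires a valid
# OPENAI_API_KEY (and optionally OPENAI_BASE_URL) in the environment plus
# network access; the result is a list of LangChain Document objects, e.g.
#   retriveDoc("Paris is the capital of France. Berlin is the capital of Germany.",
#              "What is the capital of France?", top_k=1)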


def most_similar_sentence_bm25(paragraph, target_sentence):
    """Use the BM25 algorithm to find the sentence in `paragraph` that is most
    similar to `target_sentence`, and return that sentence."""
    sentences = sent_tokenize(paragraph)
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]

    bm25 = BM25Okapi(tokenized_sentences)

    target_tokens = word_tokenize(target_sentence)
    scores = bm25.get_scores(target_tokens)

    # get_scores returns a numpy array; argmax picks the best-matching sentence.
    max_idx = scores.argmax()

    return sentences[max_idx]
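
# Illustrative usage (example strings are made up): the sentence sharing the
# most informative tokens with the query should score highest, e.g.
#   most_similar_sentence_bm25(
#       "The cat sat on the mat. Dogs bark loudly. Paris is in France.",
#       "Dogs bark at night.")  # -> "Dogs bark loudly."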


def f1_score_text(pred, gold):
    """Token-level F1 between a predicted string and a gold string."""
    pred_tokens = word_tokenize(pred)
    gold_tokens = word_tokenize(gold)
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    f1 = 2 * precision * recall / (precision + recall)
    return f1
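
# Worked example (not part of the original code):
#   f1_score_text("the cat sat", "the cat ran")
#   -> 2 shared tokens, precision = recall = 2/3, so F1 = 2/3 ≈ 0.667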


def compute_best_sentence_f1(pred_text, gold_text):
    """For each predicted sentence, take its best token F1 against any gold
    sentence, then average those scores over all predicted sentences."""
    pred_sentences = sent_tokenize(pred_text)
    gold_sentences = sent_tokenize(gold_text)
    f1_scores = []
    for pred in pred_sentences:
        best_f1 = 0.0
        for gold in gold_sentences:
            f1 = f1_score_text(pred, gold)
            if f1 > best_f1:
                best_f1 = f1
        f1_scores.append(best_f1)
    avg_f1 = sum(f1_scores) / len(pred_sentences) if pred_sentences else 0.0
    return avg_f1
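

# Minimal smoke test (illustrative, not part of the original module); the toy
# prediction/reference strings below are made up for demonstration only.
if __name__ == "__main__":
    pred = "The cat sat on the mat. It was sunny."
    gold = "The cat sat on the rug. The weather was sunny."
    # Each predicted sentence is matched to its best-scoring gold sentence,
    # then the per-sentence F1 values are averaged.
    print("sentence-level F1:", compute_best_sentence_f1(pred, gold))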