Update app.py
Browse files
app.py
CHANGED
@@ -540,57 +540,43 @@ def load_and_process_documents(path):
|
|
540 |
|
541 |
doc_texts = load_and_process_documents(folder_path)
|
542 |
|
543 |
-
|
544 |
-
stop_words =
|
545 |
-
|
546 |
-
"یا", "از", "بر", "همچنین", "می", "باید", "شود", "شد", "گفت", "گویا", "داشت", "داشتن", "کنند", "کنیم",
|
547 |
-
"کرد", "کردن", "نیز", "یا", "اگر", "ای", "اینکه", "نه", "باشید", "باشم", "باشی", "در حالی که", "مگر", "چرا"
|
548 |
-
]
|
549 |
-
|
550 |
-
# تابعی برای پاکسازی کلمات اضافی از سوال
|
551 |
def remove_stop_words(text, stop_words):
|
552 |
words = text.split()
|
553 |
return " ".join([word for word in words if word not in stop_words])
|
554 |
|
555 |
-
# تابعی برای استخراج کلمات از متن
|
556 |
def extract_keywords_from_text(text, query_words):
|
557 |
matched_lines = []
|
558 |
lines = text.split("\n")
|
559 |
|
560 |
-
# جستجو برای هر کلمه در هر خط
|
561 |
for line in lines:
|
562 |
if any(query_word in line for query_word in query_words):
|
563 |
matched_lines.append(line)
|
564 |
return matched_lines
|
565 |
|
566 |
-
# تابعی برای پاکسازی متن
|
567 |
def clean_text(text):
|
568 |
return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
|
569 |
|
570 |
-
# تابعی برای پیدا کردن نزدیکترین خطوط به سوال
|
571 |
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
|
572 |
-
# حذف کلمات اضافی از سوال
|
573 |
cleaned_query = remove_stop_words(query, stop_words)
|
574 |
query_words = cleaned_query.split()
|
575 |
|
576 |
all_matched_lines = []
|
577 |
|
578 |
-
# بررسی محتوای فایلها
|
579 |
for filename, text in doc_texts.items():
|
580 |
matched_lines = extract_keywords_from_text(text, query_words)
|
581 |
for line in matched_lines:
|
582 |
similarity = fuzz.partial_ratio(query, line) # محاسبه شباهت خط با سوال
|
583 |
all_matched_lines.append((line, similarity))
|
584 |
|
585 |
-
# مرتب سازی بر اساس شباهت
|
586 |
all_matched_lines.sort(key=lambda x: x[1], reverse=True)
|
587 |
|
588 |
-
# انتخاب ۱۰ خط نزدیکتر
|
589 |
closest_lines = [line for line, _ in all_matched_lines[:top_n]]
|
590 |
|
591 |
return closest_lines
|
592 |
|
593 |
-
# تابعی برای حذف کلمات توقف از یک لیست از خطوط
|
594 |
def remove_stop_words_from_lines(lines, stop_words):
|
595 |
cleaned_lines = []
|
596 |
for line in lines:
|
@@ -599,12 +585,9 @@ def remove_stop_words_from_lines(lines, stop_words):
|
|
599 |
cleaned_lines.append(" ".join(cleaned_words))
|
600 |
return cleaned_lines
|
601 |
|
602 |
-
# حالا این رو در کد اصلی استفاده میکنیم:
|
603 |
if query:
|
604 |
-
|
605 |
-
closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=3)
|
606 |
|
607 |
-
# حذف کلمات توقف از خطوط نزدیک
|
608 |
cleaned_closest_lines = remove_stop_words_from_lines(closest_lines, stop_words)
|
609 |
|
610 |
if cleaned_closest_lines:
|
|
|
540 |
|
541 |
doc_texts = load_and_process_documents(folder_path)

# Load the stop-word list (one word per line) into a set for O(1) membership tests
# in remove_stop_words / find_closest_lines below.
# NOTE(review): hard-coded absolute Windows path — move to config/env var; confirm
# the file exists on the deployment machine (open() raises FileNotFoundError otherwise).
with open('C:/Users/ici/Downloads/Telegram Desktop/45/stopwords.txt', 'r', encoding='utf-8') as file:
    stop_words = set(file.read().splitlines())
|
545 |
+
|
|
|
|
|
|
|
|
|
|
|
546 |
def remove_stop_words(text, stop_words):
    """Return *text* with every stop word removed.

    The text is tokenized on whitespace; tokens found in *stop_words*
    are dropped and the survivors are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)
|
549 |
|
|
|
550 |
def extract_keywords_from_text(text, query_words):
    """Return the lines of *text* containing at least one query word.

    Matching is plain substring containment; line order from *text*
    is preserved in the result.
    """
    return [
        line
        for line in text.split("\n")
        if any(word in line for word in query_words)
    ]
|
558 |
|
|
|
559 |
def clean_text(text):
    """Strip *text* down to its allowed character set.

    Keeps Persian letters, Persian and ASCII digits, the punctuation
    marks ، . ؟ ! ؛ + - * and spaces; every other character is removed.
    """
    disallowed = re.compile(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+')
    return disallowed.sub('', text)
|
561 |
|
|
|
562 |
def find_closest_lines(query, doc_texts, stop_words, top_n=10):
    """Return up to *top_n* document lines most similar to *query*, best first.

    The query is first stripped of stop words; every line (across all
    documents in *doc_texts*) that contains at least one remaining query
    word is scored with ``fuzz.partial_ratio`` against the full query,
    and the highest-scoring lines are returned.

    Parameters:
        query: the user's question text.
        doc_texts: mapping of filename -> document text.
        stop_words: collection of words to ignore when matching.
        top_n: maximum number of lines to return (default 10).
    """
    # Drop stop words so candidate matching keys on content-bearing terms only.
    cleaned_query = remove_stop_words(query, stop_words)
    query_words = cleaned_query.split()

    all_matched_lines = []

    # The mapping key (filename) was never used, so iterate over values only.
    for text in doc_texts.values():
        for line in extract_keywords_from_text(text, query_words):
            # Fuzzy similarity of the candidate line against the full query.
            similarity = fuzz.partial_ratio(query, line)
            all_matched_lines.append((line, similarity))

    # Highest similarity first; the stable sort preserves document order on ties.
    all_matched_lines.sort(key=lambda x: x[1], reverse=True)

    return [line for line, _ in all_matched_lines[:top_n]]
|
579 |
|
|
|
580 |
def remove_stop_words_from_lines(lines, stop_words):
|
581 |
cleaned_lines = []
|
582 |
for line in lines:
|
|
|
585 |
cleaned_lines.append(" ".join(cleaned_words))
|
586 |
return cleaned_lines
|
587 |
|
|
|
588 |
if query:
|
589 |
+
closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=10)
|
|
|
590 |
|
|
|
591 |
cleaned_closest_lines = remove_stop_words_from_lines(closest_lines, stop_words)
|
592 |
|
593 |
if cleaned_closest_lines:
|