Update app.py
app.py CHANGED
@@ -517,7 +517,10 @@ import concurrent.futures
 from hazm import Normalizer
 from rapidfuzz import fuzz
 from langchain.schema import SystemMessage, HumanMessage
+from collections import Counter
+import heapq
 
+# Path to the documents folder
 folder_path = '46'
 normalizer = Normalizer()
 
@@ -543,31 +546,34 @@ def load_and_process_documents(path):
 
     return doc_texts
 
+# Process the files
 doc_texts = load_and_process_documents(folder_path)
 
+# Read the stop words
 with open("stopwords.txt", "r", encoding="utf-8") as f:
     stop_words = set(line.strip() for line in f if line.strip())
 
+# Remove stop words from a text
 def remove_stop_words(text, stop_words):
     words = text.split()
     return " ".join([word for word in words if word not in stop_words])
 
+# Remove stop phrases
+def remove_stop_phrases(text, stop_words):
+    for phrase in stop_words:
+        text = text.replace(phrase, "")
+    return text
+
+# Extract lines containing the query words
 def extract_keywords_from_text(text, query_words):
     matched_lines = []
     lines = text.split("\n")
-
     for line in lines:
         if any(query_word in line for query_word in query_words):
             matched_lines.append(line)
     return matched_lines
 
-
-    return re.sub(r'[^آ-ی۰-۹0-9،.؟!؛+\-* ]+', '', text)
-
-
-from collections import Counter
-import heapq
-
+# Summarize by word frequency
 def summarize_text_by_frequency(text, num_sentences=1):
     sentences = text.split('\n')
     word_freq = Counter()
@@ -586,47 +592,42 @@ def summarize_text_by_frequency(text, num_sentences=1):
     summarized_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
     return "\n".join(summarized_sentences)
 
-
-
-
-
+# Find the closest lines
 def find_closest_lines(query, doc_texts, stop_words, top_n=15):
     cleaned_query = remove_stop_words(query, stop_words)
     query_words = cleaned_query.split()
 
     all_matched_lines = []
-
     for filename, text in doc_texts.items():
         matched_lines = extract_keywords_from_text(text, query_words)
         for line in matched_lines:
-            similarity = fuzz.partial_ratio(query, line)
+            similarity = fuzz.partial_ratio(query, line)
             all_matched_lines.append((line, similarity))
-
-    all_matched_lines.sort(key=lambda x: x[1], reverse=True)
 
+    all_matched_lines.sort(key=lambda x: x[1], reverse=True)
     closest_lines = [line for line, _ in all_matched_lines[:top_n]]
 
     return closest_lines
 
-
-
-
-
+# Streamlit user interface
+st.title("پاسخ‌دهی به سوالات بر اساس اسناد بارگذاری‌شده")
+
+query = st.text_input("سوال خود را وارد کنید:")
 
 if query:
     closest_lines = find_closest_lines(query, doc_texts, stop_words, top_n=15)
-
-    # Remove
+
+    # Remove stop phrases from the matched lines
     cleaned_closest_lines = [
-
-
+        remove_stop_phrases(line, stop_words)
+        for line in closest_lines
    ]
-
+
+    # Summarize
     summarized_text = summarize_text_by_frequency("\n".join(cleaned_closest_lines), num_sentences=1)
-
-    summarized_cleaned = remove_stop_phrases(summarized_text, stop_words)
-    st.markdown(summarized_text)
 
+    # Display the summary
+    st.markdown(summarized_text)
 
     if summarized_text:
         prompt = f"""
@@ -639,18 +640,16 @@ if query:
         {summarized_text}
         پاسخ نهایی:
         """
-
-        # Send the message to the model in the correct form
+
         response = llm([
-            SystemMessage(content="تو رزم یار ارتش هستی و
+            SystemMessage(content="تو رزم یار ارتش هستی و از کتاب و دیتای موجود به سوالات پاسخ می‌دی."),
             HumanMessage(content=prompt)
         ])
-
-
-
-
+
+        rewritten = response.content.strip()
+
         # Display the result
         st.markdown(f'<div class="chat-message">{rewritten}</div>', unsafe_allow_html=True)
-
+
     else:
         st.warning("هیچ خط مرتبطی پیدا نشد.")
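One note on the new remove_stop_phrases: it deletes each stop entry as a raw substring, so a short entry can also be clipped out of the middle of a longer word. If whole-word behaviour is wanted, a word-boundary regex is a small change. The sketch below is an alternative, not what this commit ships:

import re

def remove_stop_phrases_bounded(text, stop_words):
    # Hypothetical whole-word variant of remove_stop_phrases.
    # Longest entries go first so multi-word phrases win over their prefixes;
    # \b is Unicode-aware in Python, so it also respects Persian word edges.
    if not stop_words:
        return text
    ordered = sorted(stop_words, key=len, reverse=True)
    pattern = r"\b(?:" + "|".join(re.escape(p) for p in ordered) + r")\b"
    return re.sub(pattern, "", text)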
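The body of summarize_text_by_frequency between the Counter setup and the heapq.nlargest call sits outside these hunks, so only the scoring skeleton is visible. A plausible completion of the same frequency idea, as a sketch (the scoring details are assumed, not taken from the commit):

import heapq
from collections import Counter

def summarize_text_by_frequency(text, num_sentences=1):
    # The app's corpus is line-based, so "sentences" here are lines
    sentences = [s for s in text.split('\n') if s.strip()]

    # Corpus-wide word frequencies
    word_freq = Counter()
    for sentence in sentences:
        word_freq.update(sentence.split())

    # Score each sentence by the summed frequency of its words
    sentence_scores = {s: sum(word_freq[w] for w in s.split()) for s in sentences}

    # Same selection the diff shows: top-k sentences by score
    summarized = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return "\n".join(summarized)

print(summarize_text_by_frequency("alpha beta\nbeta gamma\nbeta beta delta"))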
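The retrieval this commit finishes is a two-stage match: extract_keywords_from_text keeps any line that shares a word with the cleaned query, and fuzz.partial_ratio then re-ranks those lines. A minimal sketch of the ranking step in isolation, with made-up sample lines, assuming only that rapidfuzz is installed:

from rapidfuzz import fuzz

query = "برنامه آموزش رزمی"
candidate_lines = [
    "برنامه آموزش رزمی هفته آینده اعلام شد",
    "گزارش تدارکات ماهانه",
    "آموزش تیراندازی در میدان تیر",
]

# partial_ratio scores the best-matching substring, 0-100
scored = [(line, fuzz.partial_ratio(query, line)) for line in candidate_lines]

# Best match first, as find_closest_lines does before taking top_n
scored.sort(key=lambda pair: pair[1], reverse=True)
for line, score in scored:
    print(f"{score:5.1f}  {line}")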
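llm itself is constructed outside the lines shown here. The call shape llm([SystemMessage(...), HumanMessage(...)]) matches LangChain's legacy chat-model interface; a minimal sketch assuming ChatOpenAI (the model name, temperature, and prompt text are placeholders, not from the commit):

from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage

# Assumed construction; app.py defines llm elsewhere
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

response = llm([
    SystemMessage(content="Answer only from the loaded documents."),
    HumanMessage(content="سوال نمونه"),
])

# The diff reads the reply the same way before rendering it
rewritten = response.content.strip()
print(rewritten)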