| import gradio as gr
|
| import pdfplumber
|
| import fitz
|
| from sentence_transformers import SentenceTransformer, util
|
| import faiss
|
| import numpy as np
|
| import re
|
|
|
|
|
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        file: Either a filesystem path (``str`` or ``os.PathLike``) or an
            object that exposes the path through a ``name`` attribute, such
            as an uploaded-file wrapper.

    Returns:
        The text of all pages that produced any, each followed by a newline.
        An empty string when no page has extractable text.
    """
    # Plain paths are used as-is. ``pathlib.Path`` also has a ``name``
    # attribute (the final path component only), so path-like objects must be
    # recognised before falling back to ``file.name``.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = file.name

    with pdfplumber.open(path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages that yield no text (None or "") are skipped. The join runs
        # inside the ``with`` block so pages are read while the PDF is open.
        return "".join(f"{page_text}\n" for page_text in page_texts if page_text)
|
|
|
|
|
def clean_text(text):
    """Normalise whitespace in extracted text.

    Runs of newlines collapse to a single newline, runs of spaces and tabs
    collapse to a single space, and leading/trailing whitespace is removed.
    """
    substitutions = (
        (r'\n+', '\n'),    # squeeze consecutive line breaks
        (r'[ \t]+', ' '),  # squeeze horizontal whitespace
    )
    normalised = text
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)
    return normalised.strip()
|
|
|
|
|
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into sentence-aligned chunks of roughly ``chunk_size`` chars.

    Sentences are packed greedily, joined by single spaces, until adding the
    next one would push the chunk past ``chunk_size``. Each new chunk then
    starts with the trailing sentences of the previous chunk, as long as they
    total at most ``overlap`` characters and still leave room for the next
    sentence, so context is shared across chunk boundaries.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length in characters. A single
            sentence longer than this is kept whole as its own chunk.
        overlap: Maximum number of characters of trailing sentences repeated
            at the start of the following chunk. Zero or a negative value
            disables overlap.

    Returns:
        A list of non-empty chunk strings; empty when ``text`` is blank.
    """
    # Split after sentence punctuation on any whitespace, so sentences that
    # end at a line break are separated as well. Empty pieces are dropped.
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

    chunks = []
    current = []     # sentences of the chunk being built
    current_len = 0  # len(" ".join(current))
    for sentence in sentences:
        # The +1 accounts for the space that joins the sentence to the chunk.
        if current and current_len + 1 + len(sentence) > chunk_size:
            chunks.append(" ".join(current))
            room = chunk_size - len(sentence) - 1
            current, current_len = _overlap_tail(current, min(overlap, room))
        current_len += len(sentence) + (1 if current else 0)
        current.append(sentence)

    if current:
        chunks.append(" ".join(current))
    return chunks


def _overlap_tail(sentences, budget):
    """Return the trailing sentences that fit in ``budget`` chars, and their joined length."""
    tail = []
    tail_len = 0
    for sentence in reversed(sentences):
        extra = len(sentence) + (1 if tail else 0)
        if tail_len + extra > budget:
            break
        tail.append(sentence)
        tail_len += extra
    tail.reverse()
    return tail, tail_len
|
|
|
|
|
# Sentence-embedding model, constructed once when the module is executed and
# reused for encoding both the document chunks and the question.
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
|
|
|
|
|
def answer_question(pdf_file, question):
    """Return the PDF passages most relevant to a question.

    The PDF text is extracted, cleaned and chunked. The chunks and the
    question are embedded with the module-level ``model``, and the closest
    chunks by L2 distance are looked up in a FAISS flat index.

    Args:
        pdf_file: The uploaded PDF as accepted by ``extract_text_from_pdf``,
            or ``None`` when nothing was uploaded.
        question: The user's question.

    Returns:
        Up to three matching chunks separated by ``---`` lines, or a short
        user-facing message when the file, the question or the extracted
        text is missing.
    """
    if pdf_file is None:
        # "Please upload a PDF file."
        return "لطفا یک فایل PDF آپلود کنید."
    if not question or not question.strip():
        # "Please enter your question."
        return "لطفا پرسش خود را وارد کنید."

    cleaned_text = clean_text(extract_text_from_pdf(pdf_file))
    # Empty chunks carry no information and would only pollute the index.
    chunks = [chunk for chunk in chunk_text(cleaned_text) if chunk]
    if not chunks:
        # "No text could be extracted from this PDF file."
        return "متنی از این فایل PDF استخراج نشد."

    # FAISS works on contiguous float32 matrices.
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    query = np.ascontiguousarray(model.encode([question]), dtype=np.float32)

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, which as a list index would silently select the last
    # chunk again.
    top_k = min(3, len(chunks))
    _, neighbours = index.search(query, k=top_k)

    answers = [chunks[i] for i in neighbours[0] if i >= 0]
    return "\n\n---\n\n".join(answers)
|
|
|
|
|
# Gradio UI: a PDF upload and a question box are passed to answer_question,
# and its return value is rendered as plain text.
iface = gr.Interface(
    title="پاسخ به پرسشها از روی فایل PDF",
    description="یک سیستم RAG ساده برای پاسخ به پرسشها از روی محتوای فایل PDF",
    fn=answer_question,
    inputs=[
        gr.File(label="آپلود فایل PDF", file_types=[".pdf"]),
        gr.Textbox(label="پرسش خود را وارد کنید"),
    ],
    outputs="text",
)

# Start serving the interface as soon as the module is executed.
iface.launch()
|
|
|