import json
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from PyPDF2 import PdfReader
import spacy

# Load the spaCy English model, downloading it on first run if necessary.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Sentence-embedding model used to index and retrieve text chunks.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Fine-tuned T5 question-generation model, loaded from a local checkpoint directory.
tokenizer = T5Tokenizer.from_pretrained("./T5base_Question_Generation")
t5_model = T5ForConditionalGeneration.from_pretrained("./T5base_Question_Generation")


def extract_text_from_pdf(pdf_path):
    """Concatenate the extractable text of every page in the PDF."""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()  # extract each page once instead of twice
        if page_text:
            text += page_text + "\n"
    return text


def split_into_sentences(text):
    """Segment the text into sentences with spaCy, dropping empty ones."""
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]


def create_chunks(sentences, window_size=2):
    """Build overlapping chunks by sliding a window of `window_size` sentences.

    Returns an empty list if there are fewer sentences than `window_size`.
    """
    return [" ".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]


def generate_embeddings(chunks):
    """Encode each chunk into a dense vector with the SentenceTransformer model."""
    return embedding_model.encode(chunks, show_progress_bar=True)


def create_faiss_index(embeddings):
    """Store the chunk embeddings in a flat (exact) L2 FAISS index."""
    embeddings = np.asarray(embeddings, dtype="float32")  # FAISS expects float32 vectors
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def retrieve_relevant_chunks(query, chunks, index, top_k=30):
    """Return the `top_k` chunks closest to the query embedding, with their distances."""
    top_k = min(top_k, len(chunks))  # FAISS returns -1 indices when k exceeds the index size
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.asarray(query_embedding, dtype="float32"), top_k)
    return [chunks[i] for i in indices[0]], distances[0]


def get_questions(tag, difficulty, context, num_questions=3, max_length=150):
    """Sample `num_questions` questions from the fine-tuned T5 model.

    The prompt marks the tag, difficulty, and context fields with T5 sentinel
    tokens (<extra_id_97/98/99>), matching the format expected by the checkpoint.
    """
    input_text = f"<extra_id_97>{tag} <extra_id_98>{difficulty} <extra_id_99> {context}"
    features = tokenizer([input_text], return_tensors='pt')
    output = t5_model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=max_length,
        num_return_sequences=num_questions,
        do_sample=True,  # top-p / top-k sampling so the returned questions vary
        top_p=0.95,
        top_k=50,
    )
    return [tokenizer.decode(out, skip_special_tokens=True) for out in output]


def process_pdf(pdf_file, tag, difficulty, query):
    """End-to-end pipeline: extract, chunk, index, retrieve, and generate questions."""
    if pdf_file is None:
        return "Please upload a PDF file."

    text = extract_text_from_pdf(pdf_file.name)
    sentences = split_into_sentences(text)
    chunks = create_chunks(sentences)
    if not chunks:
        return "Could not extract enough text from the PDF."

    embeddings = generate_embeddings(chunks)
    index = create_faiss_index(embeddings)
    relevant_chunks, _ = retrieve_relevant_chunks(query, chunks, index)

    # Keep only reasonably long chunks (> 20 words) and use at most three as context.
    filtered_chunks = [chunk for chunk in relevant_chunks if len(chunk.split()) > 20][:3]
    if not filtered_chunks:
        return "No sufficiently long chunks found. Try another query."

    context = " ".join(filtered_chunks)
    questions = get_questions(tag, difficulty, context)
    return "\n".join(f"Question {i+1}: {q}" for i, q in enumerate(questions))
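

# Hypothetical usage sketch (not part of the original pipeline): `process_pdf` expects an
# object with a `.name` attribute holding the PDF path, as e.g. a Gradio file upload would
# provide. The file name, tag, difficulty, and query below are placeholders.
if __name__ == "__main__":
    from types import SimpleNamespace

    uploaded = SimpleNamespace(name="sample.pdf")  # stand-in for an uploaded file object
    print(process_pdf(uploaded, tag="biology", difficulty="medium", query="photosynthesis"))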