import requests
import PyPDF2
import gradio as gr

# Replace with your Hugging Face API token
api_token = "YOUR_HUGGING_FACE_TOKEN"
mistral_model_id = "NousResearch/Hermes-2-Pro-Mistral-7B"  # Choose an appropriate model version
api_url = f"https://api-inference.huggingface.co/models/{mistral_model_id}"

# The prompt is sent to the hosted Inference API as plain text, so there is no
# need to load the tokenizer or model locally. (AutoModelForSeq2SeqLM would be
# the wrong class here anyway: Mistral-family models are causal LMs.)


def extract_paragraphs(pdf_file):
    """Extract all text from the PDF and split it into ~200-word chunks."""
    with open(pdf_file, "rb") as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without a text layer
            text += page.extract_text() or ""

    words = text.split()
    paragraphs = []
    paragraph = ""
    count = 0
    for word in words:
        paragraph += word + " "
        count += 1
        if count == 200 or word == words[-1]:
            paragraphs.append(paragraph)
            count = 0
            paragraph = ""
    return paragraphs


def generate_mcq_from_pdf(pdf_file):
    paragraphs = extract_paragraphs(pdf_file)
    headers = {"Authorization": f"Bearer {api_token}"}
    questions = []
    for para in paragraphs:
        template = (
            "Generate only one MCQ question based on text "
            "that is delimited by triple backticks "
            "with {pattern} pattern. "
            "text: ```{text}```"
        )
        prompt = template.format(pattern="IIT GATE", text=para)
        # The Inference API expects a JSON payload containing the raw prompt
        # string, not tokenized tensors (which are not JSON-serializable).
        payload = {
            "inputs": prompt,
            "parameters": {"max_new_tokens": 256, "return_full_text": False},
        }
        response = requests.post(api_url, headers=headers, json=payload)
        response.raise_for_status()  # Raise an error if the request fails
        # The text-generation endpoint returns a list of dicts with a
        # "generated_text" field, not token ids to decode.
        questions.append(response.json()[0]["generated_text"])

    # Collect the questions for every chunk and write them out once, rather
    # than overwriting the file on each iteration.
    output_text = "\n\n".join(questions)
    output_file = "questions.txt"
    with open(output_file, "w") as f:
        f.write(output_text)
    return output_text, output_file


app = gr.Interface(
    fn=generate_mcq_from_pdf,
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs=[gr.Textbox(label="Questions"), gr.File(label="Output File")],
)
app.launch()
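
# Optional: if you would rather run generation locally (as the unused
# transformers import in the original suggested) instead of calling the
# hosted Inference API, a minimal sketch is below. Assumptions: Mistral-family
# models are causal LMs, so AutoModelForCausalLM is the right class (not
# AutoModelForSeq2SeqLM); `accelerate` is installed for device_map="auto";
# and the machine has enough memory for 7B bfloat16 weights.
#
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
#
# local_tokenizer = AutoTokenizer.from_pretrained(mistral_model_id)
# local_model = AutoModelForCausalLM.from_pretrained(
#     mistral_model_id, torch_dtype=torch.bfloat16, device_map="auto"
# )
#
# def generate_locally(prompt):
#     inputs = local_tokenizer(prompt, return_tensors="pt").to(local_model.device)
#     output_ids = local_model.generate(**inputs, max_new_tokens=256)
#     return local_tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]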