import requests
import PyPDF2
import gradio as gr

# Replace with your Hugging Face API token.
api_token = "YOUR_HUGGING_FACE_TOKEN"

# The model is called through the hosted Inference API, so nothing is
# downloaded or loaded locally.
mistral_model_id = "NousResearch/Hermes-2-Pro-Mistral-7B"  # Choose an appropriate model version
api_url = f"https://api-inference.huggingface.co/models/{mistral_model_id}"
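# A safer alternative to hard-coding the token (sketch; HF_API_TOKEN is a
# hypothetical environment variable name, not one this app defines):
# import os
# api_token = os.getenv("HF_API_TOKEN", "")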
def extract_paragraphs(pdf_file):
    """Read a PDF and split its text into chunks of roughly 200 words."""
    with open(pdf_file, "rb") as f:
        pdf_reader = PyPDF2.PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            # extract_text() can return None for pages without extractable text.
            text += page.extract_text() or ""

    words = text.split()
    # Group the words into ~200-word "paragraphs" for prompting.
    paragraphs = []
    for i in range(0, len(words), 200):
        paragraphs.append(" ".join(words[i:i + 200]))
    return paragraphs
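# Quick sanity check for the splitter (sketch; "sample.pdf" is a placeholder
# file name, not part of this app):
# for chunk in extract_paragraphs("sample.pdf")[:2]:
#     print(len(chunk.split()), chunk[:80])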
def generate_mcq_from_pdf(pdf_file):
    """Generate one MCQ per ~200-word chunk of the PDF via the Inference API."""
    paragraphs = extract_paragraphs(pdf_file)
    template = """Generate only one MCQ based on the text \
that is delimited by triple backticks, \
following the {pattern} pattern. \
text: ```{text}```"""
    headers = {"Authorization": f"Bearer {api_token}"}
    questions = []
    for para in paragraphs:
        prompt = template.format(pattern="IIT GATE", text=para)
        # The Inference API takes a JSON payload and, for text-generation
        # models, returns a list of {"generated_text": ...} dicts.
        response = requests.post(api_url, headers=headers, json={"inputs": prompt})
        response.raise_for_status()  # Raise an error if the request fails
        questions.append(response.json()[0]["generated_text"])

    output_text = "\n\n".join(questions)
    output_file = "questions.txt"
    with open(output_file, "w") as f:
        f.write(output_text)
    return output_text, output_file
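# The hosted Inference API returns HTTP 503 while a model is still loading.
# One mitigation (sketch) is to ask the API to block until the model is ready
# by extending the payload:
# response = requests.post(
#     api_url,
#     headers=headers,
#     json={"inputs": prompt, "options": {"wait_for_model": True}},
# )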
app = gr.Interface(
    fn=generate_mcq_from_pdf,
    inputs=gr.File(type="filepath", file_types=[".pdf"]),
    outputs=[gr.Textbox(label="Questions"), gr.File(label="Output File")],
)
app.launch()
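# When running outside a Hugging Face Space, app.launch(share=True) can expose
# a temporary public URL for testing.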