# Bloomsphere-app / helper.py
import pdfplumber
from langchain.text_splitter import CharacterTextSplitter
from openai import OpenAI
import json
import numpy as np
import time
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
import os
load_dotenv()
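
# load_dotenv() pulls the generator configuration from a local .env file.
# A minimal sketch of the variables this module reads via os.getenv below
# (the values shown are placeholders, not real credentials):
#
#   API_KEY=<provider API key>
#   GENERATOR_BASE_URL=<OpenAI-compatible endpoint, e.g. https://api.openai.com/v1>
#   MODEL_NAME=<chat model name>
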
def process_file(filepath, query=""):
    """Extract text from a PDF, split it into chunks, and (optionally) rank the
    chunks against a query using TF-IDF cosine similarity."""
    content = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # Avoid NoneType errors on pages with no extractable text
                content.append(text)

    # Join extracted text with proper spacing
    full_text = "\n\n".join(content)

    # Apply chunking
    text_splitter = CharacterTextSplitter(
        chunk_size=50000,
        chunk_overlap=10
    )
    chunks = text_splitter.split_text(full_text)

    # Without a meaningful query there is nothing to rank against,
    # so return every chunk as-is.
    if len(query) < 5:
        return chunks

    # Vectorize the query and chunks, then rank chunks by cosine similarity
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    # Select top chunks: keep the top half when there are many chunks
    top_n = int(len(chunks) / 2) if len(chunks) > 8 else len(chunks)
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [chunks[i] for i in top_indices]
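
# Hypothetical usage sketch, relying on the optional `query` parameter of
# process_file above: rank the PDF's chunks against an invented topic query
# and keep only the most relevant ones. The query string is illustrative only.
def _example_topical_chunks(filepath="data/eco.pdf"):
    return process_file(filepath, query="causes and effects of inflation")
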
def givemcqquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    if level == 1:
        difficulty = "easy and non-tricky, with simple options"
    elif level == 2:
        difficulty = "tricky, medium-level, and fairly lengthy"
    elif level == 3:
        difficulty = "hard, tricky, and lengthy"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality multiple-choice questions (MCQs) for educational assessments, following Bloom’s Taxonomy. The questions should be {difficulty}. Your task is to create a well-structured question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.
Instructions:
For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
- **Knowledge (Remembering)**: Formulate a factual or recall-based question.
- **Comprehension (Understanding)**: Create a question that requires explanation or interpretation.
- **Application (Applying)**: Develop a question that applies knowledge to a new situation.
- **Analysis (Analyzing)**: Design a question that encourages breaking down concepts.
- **Synthesis (Creating)**: Construct a question requiring idea combination or new approaches.
- **Evaluation (Evaluating)**: Generate a question that involves judgment or assessment.
STRICT RULES:
- Generate exactly **{questions} MCQ** based on the given context and Bloom’s Taxonomy level.
- Return the response as a **structured JSON object** without any additional text.
- The question should reflect the complexity required for the given cognitive level.
- Options should be **plausible, with only one correct answer** clearly identifiable.
- Ensure a structured rubric to evaluate student responses.
Input Parameters:
- **Context**: {chunks} (Relevant learning material)
- **Bloom’s Taxonomy Distribution**:
- Understanding: {understand*100}%
- Analysis: {analyze*100}%
- Evaluation: {evaluate*100}%
- Synthesis: {create*100}%
- Application: {apply*100}%
- Knowledge: {remember*100}%
Expected JSON Output Format:
{{
"question": "<Your MCQ Question>",
"options": {{
"A": "<Option A>",
"B": "<Option B>",
"C": "<Option C>",
"D": "<Option D>"
}},
"correct_answer": "<Correct Option>",
"rubric": {{
"key_concept_assessed": "<Briefly explain what is being tested>",
"criteria_for_correct_answer": "<Explain why the correct answer is correct>",
"common_misconceptions": "<List potential incorrect assumptions>",
"cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"
}}
}}
"""
    # Debug output for the generator configuration; the raw API key is not printed.
    print("API KEY set:", bool(os.getenv("API_KEY")))
    print("BASE URL", os.getenv("GENERATOR_BASE_URL"))
    print("MODEL NAME", os.getenv("MODEL_NAME"))
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )
    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )
    mcq = response.choices[0].message.content

    if "```json" in mcq:
        mcq = mcq.replace("```json", "")
        mcq = mcq.replace("```", "")
    mcq = mcq.replace("\n", "")
    mcq = json.loads(mcq)
    return mcq, prompt
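
# Hypothetical usage sketch for givemcqquestion: one level-2 MCQ from a single
# invented chunk, using the same even Bloom's weights as the __main__ block
# below. Note the function returns both the parsed MCQ and the prompt it sent.
def _example_single_mcq():
    chunk = "Inflation is a sustained rise in the general price level."
    mcq, _prompt = givemcqquestion(chunk, create=0.2, evaluate=0.2, analyze=0.2,
                                   apply=0.2, understand=0.2, remember=0.2,
                                   level=2, questions=1)
    return mcq
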
def givetruefalsequestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    if level == 1:
        difficulty = "easy and straightforward"
    elif level == 2:
        difficulty = "of moderate complexity, with slight trickiness"
    elif level == 3:
        difficulty = "complex and tricky, requiring deep understanding"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality **True/False** questions for educational assessments, following Bloom’s Taxonomy. The statements should be {difficulty}. Your task is to create a well-structured **True/False** question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.
### **Instructions:**
For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
- **Knowledge (Remembering)**: Generate a straightforward fact-based statement.
- **Comprehension (Understanding)**: Formulate a statement that requires explanation or interpretation.
- **Application (Applying)**: Develop a statement that applies knowledge to a new situation.
- **Analysis (Analyzing)**: Design a statement that involves breaking down concepts.
- **Synthesis (Creating)**: Construct a statement requiring combining ideas or new approaches.
- **Evaluation (Evaluating)**: Generate a statement requiring judgment or assessment.
### **STRICT RULES:**
- Generate exactly **{questions}** True/False question.
- Return the response as a **structured JSON object** without any additional text.
- The question should reflect the complexity required for the given cognitive level.
- Ensure a structured rubric to evaluate student responses.
### **Input Parameters:**
- **Context**: {chunks} (Relevant learning material)
- **Bloom’s Taxonomy Distribution**:
- Understanding: {understand*100}%
- Analysis: {analyze*100}%
- Evaluation: {evaluate*100}%
- Synthesis: {create*100}%
- Application: {apply*100}%
- Knowledge: {remember*100}%
### **Expected JSON Output Format:**
```json
{{
"statement": "<Your True/False Statement>",
"correct_answer": "<True or False>",
"rubric": {{
"key_concept_assessed": "<Briefly explain what is being tested>",
"criteria_for_correct_answer": "<Explain why the correct answer is correct>",
"common_misconceptions": "<List potential incorrect assumptions>",
"cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"
}}
}}
```
"""
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )
    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )
    tf_question = response.choices[0].message.content

    if "```json" in tf_question:
        tf_question = tf_question.replace("```json", "").replace("```", "")
    tf_question = tf_question.replace("\n", "")
    tf_question = json.loads(tf_question)
    return tf_question
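
# The two generators above strip Markdown code fences with str.replace before
# json.loads, while giveopenquestion below extracts the fenced block with a
# regex. A minimal shared helper sketch (hypothetical, not wired into the
# existing functions) that handles both styles of model output:
def _extract_json_sketch(raw):
    match = re.search(r"```json(.*?)```", raw, re.DOTALL)
    payload = match.group(1) if match else raw.replace("```json", "").replace("```", "")
    return json.loads(payload.strip())
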
def giveopenquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    # Validate input parameters
    bloom_params = {
        'create': create,
        'evaluate': evaluate,
        'analyze': analyze,
        'apply': apply,
        'understand': understand,
        'remember': remember
    }
    if not all(0 <= val <= 1 for val in bloom_params.values()):
        raise ValueError("All Bloom's parameters must be between 0 and 1")
    if level not in [1, 2, 3]:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    # Complexity description
    complexity_levels = {
        1: "simple recall-based questions",
        2: "moderate explanation questions",
        3: "complex analytical questions"
    }
    complexity = complexity_levels.get(level)

    prompt = f"""Generate {questions} open-ended question(s) based on the provided context, strictly following these requirements:
### CONTEXT:
{chunks}
### BLOOM'S TAXONOMY DISTRIBUTION:
- Creating: {create*100}%
- Evaluating: {evaluate*100}%
- Analyzing: {analyze*100}%
- Applying: {apply*100}%
- Understanding: {understand*100}%
- Remembering: {remember*100}%
### COGNITIVE LEVEL:
{complexity} (Level {level})
### OUTPUT REQUIREMENTS:
- Return ONLY valid JSON format
- Include detailed rubric with cognitive skill mapping
- For each question, specify which Bloom's level it primarily targets
### RESPONSE FORMAT:
```json
{{
"metadata": {{
"blooms_distribution": {{
"create": {create},
"evaluate": {evaluate},
"analyze": {analyze},
"apply": {apply},
"understand": {understand},
"remember": {remember}
}},
"complexity_level": {level}
}},
"questions": [
{{
"question": "Question text",
"primary_blooms_level": "create|evaluate|analyze|apply|understand|remember",
"rubric": {{
"key_concept": "...",
"criteria": "...",
"misconceptions": "...",
"cognitive_skills": {{
"primary": "...",
"secondary": ["...", "..."]
}}
}}
}}
]
}}
```
"""
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )
    try:
        response = client.chat.completions.create(
            model=os.getenv("MODEL_NAME"),
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=1200
        )
        raw_response = response.choices[0].message.content

        # Extract JSON from response
        json_match = re.search(r'```json(.*?)```', raw_response, re.DOTALL)
        json_str = json_match.group(1).strip() if json_match else raw_response.strip()

        # Parse and validate JSON
        result = json.loads(json_str)

        # Validate structure
        if not all(key in result for key in ['metadata', 'questions']):
            raise ValueError("Response missing required fields")

        return result
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Problematic response:\n{raw_response}")
        raise
    except Exception as e:
        print(f"API Error: {e}")
        raise
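
# Hypothetical consumption sketch: iterate over the structure giveopenquestion
# is asked to return (see the RESPONSE FORMAT in the prompt above) and print
# each question with its primary Bloom's level. The chunk text is invented.
def _example_print_open_questions(chunk="Supply and demand jointly determine market prices."):
    result = giveopenquestion(chunk, create=0.2, evaluate=0.2, analyze=0.2,
                              apply=0.2, understand=0.2, remember=0.2,
                              level=2, questions=2)
    for q in result["questions"]:
        print(q["primary_blooms_level"], "-", q["question"])
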
def generate_questions_from_file(filepath, mcq, tf, qna, create, evaluate, analyze, apply, understand, remember, level):
    """Main function to generate questions from file"""
    # Process file first
    chunks = process_file(filepath)

    # Generate questions using existing functionality
    MAX_RETRIES = 3
    RETRY_DELAY = 1

    def get_random_chunk():
        return chunks[np.random.randint(len(chunks))] if chunks else ""

    def generate_questions(q_type, count, generator):
        results = []
        for _ in range(count):
            for attempt in range(MAX_RETRIES):
                try:
                    chunk = get_random_chunk()
                    question = generator(chunk, create, evaluate, analyze, apply,
                                         understand, remember, level, questions=1)
                    results.append(question)
                    break
                except Exception as e:
                    print(f"Error generating {q_type} question (attempt {attempt+1}): {str(e)}")
                    if attempt == MAX_RETRIES - 1:
                        results.append({"error": f"Failed to generate {q_type} question"})
                    time.sleep(RETRY_DELAY)
        return results

    return {
        'mcq': generate_questions("MCQ", mcq, givemcqquestion),
        'tf': generate_questions("True/False", tf, givetruefalsequestion),
        'qna': generate_questions("Q&A", qna, giveopenquestion)
    }
if __name__ == "__main__":
    # Example usage
    filepath = "data/eco.pdf"
    mcq = 1
    tf = 1
    qna = 1
    level = 1
    create = 0.2
    evaluate = 0.2
    analyze = 0.2
    apply = 0.2
    understand = 0.2
    remember = 0.2

    questions = generate_questions_from_file(
        filepath, mcq, tf, qna, create, evaluate,
        analyze, apply, understand, remember, level
    )
    print(questions)