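"""Generate MCQ, True/False, and open-ended questions from a PDF.

The pipeline extracts text with pdfplumber, splits it into chunks with
LangChain's CharacterTextSplitter, and prompts an OpenAI-compatible chat
model to produce questions that follow a requested Bloom's Taxonomy
distribution and difficulty level, returned as structured JSON.
"""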
import json
import os
import re
import time

import numpy as np
import pdfplumber
from dotenv import load_dotenv
from langchain.text_splitter import CharacterTextSplitter
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()
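# load_dotenv() above reads configuration from a local .env file. The keys
# this script expects (values are placeholders, not real credentials):
#
#   API_KEY=<your provider API key>
#   GENERATOR_BASE_URL=<OpenAI-compatible endpoint URL>
#   MODEL_NAME=<chat model identifier>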

def process_file(filepath, query=""):
    """Extract text from a PDF and split it into chunks.

    If a meaningful query is supplied, rank the chunks by TF-IDF cosine
    similarity to it and return only the most relevant ones; otherwise
    return every chunk.
    """
    content = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:  # skip pages with no extractable text (avoids NoneType errors)
                content.append(text)

    # Join extracted pages with paragraph spacing
    full_text = "\n\n".join(content)

    # Split into overlapping chunks
    text_splitter = CharacterTextSplitter(
        chunk_size=50000,
        chunk_overlap=10
    )
    chunks = text_splitter.split_text(full_text)

    # With no (or a trivially short) query there is nothing to rank against
    if len(query) < 5:
        return chunks

    # Vectorize the query and chunks, then score each chunk by cosine similarity
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([query] + chunks)
    cosine_similarities = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    # Keep only the top half of the chunks when there are many of them
    top_n = len(chunks) // 2 if len(chunks) > 8 else len(chunks)
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [chunks[i] for i in top_indices]
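
# Example usage (path and query are illustrative):
#   chunks = process_file("data/eco.pdf")
#   relevant = process_file("data/eco.pdf", query="supply and demand elasticity")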

def givemcqquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    if level == 1:
        difficulty = "easy and non-tricky with simple options"
    elif level == 2:
        difficulty = "tricky and medium-level lengthy questions"
    elif level == 3:
        difficulty = "hard and tricky and lengthy questions"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality {difficulty} level multiple-choice questions (MCQs) for educational assessments, following Bloom’s Taxonomy. Your task is to create a well-structured question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.
Instructions:
For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
- **Knowledge (Remembering)**: Formulate a factual or recall-based question.
- **Comprehension (Understanding)**: Create a question that requires explanation or interpretation.
- **Application (Applying)**: Develop a question that applies knowledge to a new situation.
- **Analysis (Analyzing)**: Design a question that encourages breaking down concepts.
- **Synthesis (Creating)**: Construct a question requiring idea combination or new approaches.
- **Evaluation (Evaluating)**: Generate a question that involves judgment or assessment.

STRICT RULES:
- Generate exactly **{questions} MCQ** based on the given context and Bloom’s Taxonomy level.
- Return the response as a **structured JSON object** without any additional text.
- The question should reflect the complexity required for the given cognitive level.
- Options should be **plausible, with only one correct answer** clearly identifiable.
- Ensure a structured rubric to evaluate student responses.

Input Parameters:
- **Context**: {chunks} (Relevant learning material)
- **Bloom’s Taxonomy Distribution**:
  - Understanding: {understand*100}%
  - Analysis: {analyze*100}%
  - Evaluation: {evaluate*100}%
  - Synthesis: {create*100}%
  - Application: {apply*100}%
  - Knowledge: {remember*100}%

Expected JSON Output Format:
{{
    "question": "<Your MCQ Question>",
    "options": {{
        "A": "<Option A>",
        "B": "<Option B>",
        "C": "<Option C>",
        "D": "<Option D>"
    }},
    "correct_answer": "<Correct Option>",
    "rubric": {{
        "key_concept_assessed": "<Briefly explain what is being tested>",
        "criteria_for_correct_answer": "<Explain why the correct answer is correct>",
        "common_misconceptions": "<List potential incorrect assumptions>",
        "cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"
    }}
}}
"""
| print("API KEY",os.getenv("API_KEY")) | |
| print("BASE URL",os.getenv("GENERATOR_BASE_URL")) | |
| print("MODEL NAME",os.getenv("MODEL_NAME")) | |
| client = OpenAI( | |
| api_key=os.getenv("API_KEY"), | |
| base_url=os.getenv("GENERATOR_BASE_URL") | |
| ) | |
| response = client.chat.completions.create( | |
| model=os.getenv("MODEL_NAME"), | |
| messages=[{"role": "user", "content": prompt}] | |
| ) | |
| mcq = response.choices[0].message.content | |
| if "```json" in mcq: | |
| mcq = mcq.replace("```json","") | |
| mcq = mcq.replace("```","") | |
| mcq = mcq.replace("\n","") | |
| mcq = json.loads(mcq) | |
| return mcq,prompt | |
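
# Example call (argument values are illustrative):
#   question = givemcqquestion(chunks, create=0.2, evaluate=0.2, analyze=0.2,
#                              apply=0.2, understand=0.2, remember=0.2,
#                              level=1, questions=1)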

def givetruefalsequestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    if level == 1:
        difficulty = "easy and straightforward statements"
    elif level == 2:
        difficulty = "moderate complexity with slight trickiness"
    elif level == 3:
        difficulty = "complex and tricky statements requiring deep understanding"
    else:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    prompt = f"""You are an AI designed to generate high-quality {difficulty} level **True/False** questions for educational assessments, following Bloom’s Taxonomy. Your task is to create a well-structured **True/False** question that aligns with the given cognitive level, ensuring it accurately reflects the required depth of understanding.
### **Instructions:**
For each Bloom’s Taxonomy level, follow the guidelines below to ensure appropriate question complexity:
- **Knowledge (Remembering)**: Generate a straightforward fact-based statement.
- **Comprehension (Understanding)**: Formulate a statement that requires explanation or interpretation.
- **Application (Applying)**: Develop a statement that applies knowledge to a new situation.
- **Analysis (Analyzing)**: Design a statement that involves breaking down concepts.
- **Synthesis (Creating)**: Construct a statement requiring combining ideas or new approaches.
- **Evaluation (Evaluating)**: Generate a statement requiring judgment or assessment.

### **STRICT RULES:**
- Generate exactly **{questions}** True/False question.
- Return the response as a **structured JSON object** without any additional text.
- The question should reflect the complexity required for the given cognitive level.
- Ensure a structured rubric to evaluate student responses.

### **Input Parameters:**
- **Context**: {chunks} (Relevant learning material)
- **Bloom’s Taxonomy Distribution**:
  - Understanding: {understand*100}%
  - Analysis: {analyze*100}%
  - Evaluation: {evaluate*100}%
  - Synthesis: {create*100}%
  - Application: {apply*100}%
  - Knowledge: {remember*100}%

### **Expected JSON Output Format:**
```json
{{
    "statement": "<Your True/False Statement>",
    "correct_answer": "<True or False>",
    "rubric": {{
        "key_concept_assessed": "<Briefly explain what is being tested>",
        "criteria_for_correct_answer": "<Explain why the correct answer is correct>",
        "common_misconceptions": "<List potential incorrect assumptions>",
        "cognitive_skill_tested": "<Describe how it aligns with Bloom’s Taxonomy>"
    }}
}}
```
"""
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )
    response = client.chat.completions.create(
        model=os.getenv("MODEL_NAME"),
        messages=[{"role": "user", "content": prompt}]
    )
    tf_question = response.choices[0].message.content

    # Strip Markdown code fences before parsing the JSON payload
    if "```json" in tf_question:
        tf_question = tf_question.replace("```json", "").replace("```", "")
    tf_question = tf_question.replace("\n", "")
    tf_question = json.loads(tf_question)
    return tf_question

def giveopenquestion(chunks, create, evaluate, analyze, apply, understand, remember, level, questions):
    # Validate input parameters
    bloom_params = {
        'create': create,
        'evaluate': evaluate,
        'analyze': analyze,
        'apply': apply,
        'understand': understand,
        'remember': remember
    }
    if not all(0 <= val <= 1 for val in bloom_params.values()):
        raise ValueError("All Bloom's parameters must be between 0 and 1")
    if level not in [1, 2, 3]:
        raise ValueError("Invalid level. Choose 1 (easy), 2 (medium), or 3 (hard).")

    # Complexity description used in the prompt
    complexity_levels = {
        1: "simple recall-based questions",
        2: "moderate explanation questions",
        3: "complex analytical questions"
    }
    complexity = complexity_levels.get(level)
| prompt = f"""Generate {questions} open-ended question(s) based on the provided context, strictly following these requirements: | |
| ### CONTEXT: | |
| {chunks} | |
| ### BLOOM'S TAXONOMY DISTRIBUTION: | |
| - Creating: {create*100}% | |
| - Evaluating: {evaluate*100}% | |
| - Analyzing: {analyze*100}% | |
| - Applying: {apply*100}% | |
| - Understanding: {understand*100}% | |
| - Remembering: {remember*100}% | |
| ### COGNITIVE LEVEL: | |
| {complexity} (Level {level}) | |
| ### OUTPUT REQUIREMENTS: | |
| - Return ONLY valid JSON format | |
| - Include detailed rubric with cognitive skill mapping | |
| - For each question, specify which Bloom's level it primarily targets | |
| ### RESPONSE FORMAT: | |
| ```json | |
| {{ | |
| "metadata": {{ | |
| "blooms_distribution": {{ | |
| "create": {create}, | |
| "evaluate": {evaluate}, | |
| "analyze": {analyze}, | |
| "apply": {apply}, | |
| "understand": {understand}, | |
| "remember": {remember} | |
| }}, | |
| "complexity_level": {level} | |
| }}, | |
| "questions": [ | |
| {{ | |
| "question": "Question text", | |
| "primary_blooms_level": "create|evaluate|analyze|apply|understand|remember", | |
| "rubric": {{ | |
| "key_concept": "...", | |
| "criteria": "...", | |
| "misconceptions": "...", | |
| "cognitive_skills": {{ | |
| "primary": "...", | |
| "secondary": ["...", "..."] | |
| }} | |
| }} | |
| }} | |
| ] | |
| }} | |
| ``` | |
| """ | |
    client = OpenAI(
        api_key=os.getenv("API_KEY"),
        base_url=os.getenv("GENERATOR_BASE_URL")
    )
    try:
        response = client.chat.completions.create(
            model=os.getenv("MODEL_NAME"),
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
            max_tokens=1200
        )
        raw_response = response.choices[0].message.content

        # Extract JSON from the response, with or without code fences
        json_match = re.search(r'```json(.*?)```', raw_response, re.DOTALL)
        json_str = json_match.group(1).strip() if json_match else raw_response.strip()

        # Parse and validate JSON
        result = json.loads(json_str)

        # Validate structure
        if not all(key in result for key in ['metadata', 'questions']):
            raise ValueError("Response missing required fields")
        return result
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        print(f"Problematic response:\n{raw_response}")
        raise
    except Exception as e:
        print(f"API Error: {e}")
        raise

def generate_questions_from_file(filepath, mcq, tf, qna, create, evaluate, analyze, apply, understand, remember, level):
    """Main function to generate questions from a file."""
    # Process the file into text chunks first
    chunks = process_file(filepath)

    # Generate questions using the generators above
    MAX_RETRIES = 3
    RETRY_DELAY = 1  # seconds between retry attempts

    def get_random_chunk():
        return chunks[np.random.randint(len(chunks))] if chunks else ""

    def generate_questions(q_type, count, generator):
        results = []
        for _ in range(count):
            for attempt in range(MAX_RETRIES):
                try:
                    chunk = get_random_chunk()
                    question = generator(chunk, create, evaluate, analyze, apply,
                                         understand, remember, level, questions=1)
                    results.append(question)
                    break
                except Exception as e:
                    print(f"Error generating {q_type} question (attempt {attempt+1}): {str(e)}")
                    if attempt == MAX_RETRIES - 1:
                        results.append({"error": f"Failed to generate {q_type} question"})
                    else:
                        time.sleep(RETRY_DELAY)  # back off only when another attempt remains
        return results

    return {
        'mcq': generate_questions("MCQ", mcq, givemcqquestion),
        'tf': generate_questions("True/False", tf, givetruefalsequestion),
        'qna': generate_questions("Q&A", qna, giveopenquestion)
    }

if __name__ == "__main__":
    # Example usage
    filepath = "data/eco.pdf"
    mcq = 1   # number of MCQs to generate
    tf = 1    # number of True/False questions
    qna = 1   # number of open-ended questions
    level = 1
    create = 0.2
    evaluate = 0.2
    analyze = 0.2
    apply = 0.2
    understand = 0.2
    remember = 0.2
    questions = generate_questions_from_file(
        filepath, mcq, tf, qna, create, evaluate,
        analyze, apply, understand, remember, level
    )
    print(questions)
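
# The result is a dict keyed by question type (shape only, not real output):
#   {'mcq': [<parsed MCQ dict>], 'tf': [<parsed T/F dict>], 'qna': [<parsed Q&A dict>]}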