File size: 8,381 Bytes
4f591e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import re
import pandas as pd
from typing import Dict, List, Union, Tuple
from cefrpy import CEFRSpaCyAnalyzer, CEFRLevel
import spacy

def extract_feedback_with_clean_quotes(feedback_str: str) -> pd.Series:
    """Parse a JSON-like feedback string into named sections plus cleaned quotes.

    Extracts each known ``"Header": "content"`` section from *feedback_str*
    and, for the four feedback sections, additionally collects the phrases
    wrapped in single quotes inside the content (whitespace-trimmed and with
    trailing punctuation stripped).

    Args:
        feedback_str: Raw feedback text. It is roughly JSON-shaped but not
            guaranteed to be valid JSON, hence the regex-based parsing.

    Returns:
        pd.Series with keys TR/CC/LR/GRA_feedback, is_off_topic, word_limit,
        Corrected_essay, plus one ``<key>_quotes`` list per feedback section.
        Missing sections are None (or [] for the quote lists).
    """
    section_map = {
        "Task Response feedback": "TR_feedback",
        "Coherence and Cohesion feedback": "CC_feedback",
        "Lexical Resource feedback": "LR_feedback",
        "Grammatical Range and Accuracy feedback": "GRA_feedback",
        "Is off topic": "is_off_topic",
        "Word limit satisfied": "word_limit",
        "Corrected essay": "Corrected_essay"
    }

    result = {v: None for v in section_map.values()}
    quote_results = {f"{v}_quotes": [] for v in section_map.values() if v.endswith('_feedback')}

    # BUG FIX: the previous pattern '(?:[^"]|\\")*' let [^"] consume the
    # backslash of an escaped quote, so the match stopped at the first \" and
    # truncated the content. Trying the escape first and excluding the bare
    # backslash from the character class handles \" (and any \x) correctly.
    section_pattern = r'"(?P<header>(?:\\.|[^"\\])+)"\s*:\s*"(?P<content>(?:\\.|[^"\\])*)"'

    for match in re.finditer(section_pattern, feedback_str):
        header = match.group('header')
        content = match.group('content').replace('\\"', '"')

        if header in section_map:
            key = section_map[header]
            result[key] = content

            # For feedback sections, pull out single-quoted phrases and
            # strip trailing punctuation from each.
            if key.endswith('_feedback'):
                clean_quotes = []
                for quote in re.findall(r"'(.*?)'", content):
                    cleaned = re.sub(r'[.,;:!?]+$', '', quote.strip())
                    if cleaned:  # drop phrases that were only punctuation
                        clean_quotes.append(cleaned)
                quote_results[f"{key}_quotes"] = clean_quotes

    # Fallback for the two flag fields when the header was not double-quoted.
    for orig, new in [("Is off topic", "is_off_topic"),
                      ("Word limit satisfied", "word_limit")]:
        if result[new] is None:
            match = re.search(rf'{orig}\s*:\s*"([^"]+)"', feedback_str)
            if match:
                result[new] = match.group(1)

    # Fallback for the corrected essay, which may span multiple lines.
    if result["Corrected_essay"] is None:
        essay_match = re.search(
            r'"Corrected essay"\s*:\s*"(.*?)"(?=\s*[,\]}]|$)',
            feedback_str,
            re.DOTALL
        )
        if essay_match:
            result["Corrected_essay"] = essay_match.group(1).replace('\\"', '"')

    return pd.Series({**result, **quote_results})


def extract_feedback_keys_values(feedback_str):
    """Extract the five known feedback sections from a raw feedback string.

    Instead of parsing JSON, this scans for the quoted section headers and
    slices the text between consecutive headers, so it tolerates malformed
    payloads.

    Args:
        feedback_str: Raw feedback text containing '"Header": "value"' pairs.

    Returns:
        pd.Series mapping TR/CC/LR/GRA_feedback and Corrected_essay to their
        string values; None for absent sections, all-None on a parse failure.
    """
    # BUG FIX: defined before the try block so the except handler can always
    # build its all-None fallback (previously section_map lived inside the
    # try, so a failure before its assignment would raise NameError there).
    section_map = {
        '"Task Response feedback"': 'TR_feedback',
        '"Coherence and Cohesion feedback"': 'CC_feedback',
        '"Lexical Resource feedback"': 'LR_feedback',
        '"Grammatical Range and Accuracy feedback"': 'GRA_feedback',
        '"Corrected essay"': 'Corrected_essay'
    }
    try:
        result = {v: None for v in section_map.values()}  # Initialize with None
        for original_section, new_key in section_map.items():
            # Find the start of the section; skip if absent.
            start = feedback_str.find(original_section)
            if start == -1:
                continue
            # The section runs until the nearest following header, or EOF.
            end = len(feedback_str)
            for other_section in section_map:
                if other_section != original_section:
                    other_start = feedback_str.find(other_section, start + 1)
                    if other_start != -1 and other_start < end:
                        end = other_start
            section_content = feedback_str[start:end].strip()
            key_end = section_content.find(':')
            if key_end == -1:
                continue
            # Trim separators, then the surrounding double quotes if present.
            value = section_content[key_end+1:].strip().strip(' ,')
            if value.startswith('"') and value.endswith('"'):
                value = value[1:-1]
            result[new_key] = value
        return pd.Series(result)  # Return as Series for DataFrame expansion
    except Exception as e:
        # Best-effort parser: report and fall back to an all-None row rather
        # than aborting a whole DataFrame .apply().
        print(f"Error processing feedback: {e}")
        return pd.Series({k: None for k in section_map.values()})
    

def create_train_input(row):
    """Build the training prompt for one essay row.

    Concatenates the topic, the essay, its corrected version and the
    feedback block (four rubric sections plus length and CEFR stats) under
    {{...}} markers, one item per line.
    """
    feedback_block = "\n".join([
        f"Task Response Feedback: {row['TR_feedback']}",
        f"Coherence and Cohesion Feedback: {row['CC_feedback']}",
        f"Lexical Resource Feedback: {row['LR_feedback']}",
        f"Grammatical Range and Accuracy Feedback: {row['GRA_feedback']}",
        f"The essay has {row['word_count']} words and {row['paragraph_count']} paragraphs.",
        f"The CEFR statistics of this essay: {row['cefr_stat']}",
    ])

    segments = [
        ("{{TOPIC}}", row['topic']),
        ("{{ESSAY}}", row['essay']),
        ("{{CORRECTED_ESSAY}}", row['Corrected_essay']),
        ("{{FEEDBACK}}", feedback_block),
    ]
    # Interleave marker/body pairs, newline-separated, no trailing newline.
    return "\n".join(part for marker, body in segments for part in (marker, body))

# Maps IELTS rubric criterion names to the score column names used downstream.
column_mapping = {
    'Task Response': 'TR_score',
    'Coherence and Cohesion': 'CC_score',
    'Lexical Resource': 'LR_score',
    'Grammatical Range and Accuracy': 'GRA_score'
}


# Module-level spaCy pipeline, loaded once at import time and shared by
# get_cefr_stats (loading the model on every call would be far too slow).
# NOTE(review): this requires the 'en_core_web_sm' model to be installed.
nlp = spacy.load("en_core_web_sm")

def get_cefr_stats(text):
    """Compute CEFR vocabulary-level statistics for an essay.

    Runs the module-level spaCy pipeline plus cefrpy's analyzer over *text*
    and counts how many tokens fall into each CEFR level (A1..C2), plus
    unknowns, totals and percentages.

    Args:
        text: Essay text. Non-string or blank input yields zero counts.

    Returns:
        str: the stats dict rendered with str() — presumably so it can be
        stored in a plain-text DataFrame column; TODO confirm callers rely
        on the string form rather than the dict.
    """
    # Shared zero-count fallback for blank input and analysis failures.
    empty_stats = {f'{level}_words': 0
                   for level in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'unknown']} | {'total_words': 0}
    if not isinstance(text, str) or not text.strip():
        # BUG FIX: this branch used to return the dict itself while every
        # other path returned str(...); normalized to str so the function
        # has a consistent return type.
        return str(empty_stats)

    # Expand clitic contractions so the CEFR lookup sees full word forms.
    ABBREVIATION_MAPPING = {
        "'m": "am",
        "'s": "is",
        "'re": "are",
        "'ve": "have",
        "'d": "had",
        "n't": "not",
        "'ll": "will"
    }

    # Named-entity tokens of these types are excluded from CEFR scoring
    # (proper nouns, quantities etc. say nothing about vocabulary level).
    ENTITY_TYPES_TO_SKIP_CEFR = {
        'QUANTITY', 'MONEY', 'LANGUAGE', 'LAW',
        'WORK_OF_ART', 'PRODUCT', 'GPE',
        'ORG', 'FAC', 'PERSON'
    }

    def get_word_level_count_statistic(level_tokens: List[Tuple[str, str, bool, float, int, int]]) -> dict:
        """Count tokens per CEFR level and derive percentages.

        Each token tuple carries its CEFR level at index 3 (float or None).
        Malformed tokens are counted as unknown rather than raising.
        """
        difficulty_levels_count = [0] * 6
        unknown_count = 0
        result = {}

        for token in level_tokens:
            try:
                level = token[3]
                if level is None:
                    unknown_count += 1
                    continue

                # Round to the nearest integer level; anything outside 1..6
                # or non-numeric counts as unknown.
                try:
                    level_round = round(float(level))
                    if 1 <= level_round <= 6:
                        difficulty_levels_count[level_round - 1] += 1
                    else:
                        unknown_count += 1
                except (ValueError, TypeError):
                    unknown_count += 1

            except Exception as e:
                print(f"Error processing token: {e}")
                unknown_count += 1

        # str(CEFRLevel(i)) is assumed to render as 'A1'..'C2', matching the
        # zero-count fallback keys — TODO confirm against cefrpy docs.
        for i in range(1, 7):
            result[f'{CEFRLevel(i)}_words'] = difficulty_levels_count[i - 1]
        result['unknown_words'] = unknown_count
        result['total_words'] = sum(difficulty_levels_count) + unknown_count

        # Percentages; all 0.0 when there were no tokens at all.
        if result['total_words'] > 0:
            for i in range(1, 7):
                result[f'{CEFRLevel(i)}_pct'] = (difficulty_levels_count[i - 1] / result['total_words']) * 100
            result['unknown_pct'] = (unknown_count / result['total_words']) * 100
        else:
            for i in range(1, 7):
                result[f'{CEFRLevel(i)}_pct'] = 0.0
            result['unknown_pct'] = 0.0

        return result

    try:
        # Drop non-ASCII characters up front to sidestep encoding issues in
        # the downstream analysis.
        clean_text = text.encode('ascii', errors='ignore').decode('ascii')
        doc = nlp(clean_text)
        text_analyzer = CEFRSpaCyAnalyzer(
            entity_types_to_skip=ENTITY_TYPES_TO_SKIP_CEFR,
            abbreviation_mapping=ABBREVIATION_MAPPING
        )
        tokens = text_analyzer.analize_doc(doc)  # (sic: cefrpy spells it 'analize')
        return str(get_word_level_count_statistic(tokens))

    except Exception as e:
        # Best-effort: report and return zero counts rather than aborting a
        # whole DataFrame .apply(). Note this fallback has no *_pct keys,
        # matching the original behavior.
        print(f"Error analyzing text: {e}")
        return str(empty_stats)
    
    

def replace_single_newlines(text):
    """Replace each isolated newline with the literal text ``\\n\\n``.

    A newline that is neither preceded nor followed by another newline is
    swapped for the four-character sequence backslash-n-backslash-n (escaped
    text, not actual newlines). Runs of two or more newlines are untouched.
    """
    isolated_newline = re.compile(r'(?<!\n)\n(?!\n)')
    # A callable replacement avoids re's template-escape processing, so the
    # backslashes are emitted literally.
    return isolated_newline.sub(lambda _match: '\\n\\n', text)
# feedback_data = extract_feedback_with_clean_quotes(feedback_text)
# print(feedback_data["LR_feedback_quotes"])