# language_checker.py
import re
import traceback
from typing import List, Dict, Any
import language_tool_python
import logging  # For recording errors more persistently than console prints
from text_utils import convert_markdown_to_plain_text
# config.py (setting JAVA_HOME) should be imported early in app.py
# Import SpanMarkerModel
try:
    from span_marker import SpanMarkerModel
    SPAN_MARKER_AVAILABLE = True
except ImportError:
    SPAN_MARKER_AVAILABLE = False
    SpanMarkerModel = None  # Placeholder if not available
    print("LT_Checker: Warning: span_marker library not found. Acronym filtering will be disabled.")
    print("LT_Checker: Please install it via 'pip install span_marker'")

# --- Global SpanMarker Model for Acronyms ---
_span_marker_model_acronyms = None
_span_marker_model_loaded_successfully = False
_span_marker_model_load_attempted = False
SPAN_MARKER_ACRONYM_MODEL_NAME = "tomaarsen/span-marker-bert-base-uncased-acronyms"

def _load_span_marker_model_if_needed():
    global _span_marker_model_acronyms, _span_marker_model_loaded_successfully, _span_marker_model_load_attempted
    if not SPAN_MARKER_AVAILABLE or _span_marker_model_load_attempted:
        return
    _span_marker_model_load_attempted = True
    try:
        print(f"LT_Checker: Attempting to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' for acronym detection...")
        # Ensure you have torch installed, or the appropriate backend for SpanMarkerModel
        _span_marker_model_acronyms = SpanMarkerModel.from_pretrained(SPAN_MARKER_ACRONYM_MODEL_NAME)
        _span_marker_model_loaded_successfully = True
        print(f"LT_Checker: SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}' loaded successfully.")
    except Exception as e:
        _span_marker_model_loaded_successfully = False
        print(f"LT_Checker: CRITICAL ERROR loading SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}")
        print("LT_Checker: Acronym filtering will be disabled. Please check your installation and model availability.")
        logging.error(f"Failed to load SpanMarker model '{SPAN_MARKER_ACRONYM_MODEL_NAME}': {e}", exc_info=True)


# Attempt to load the model when the module is first imported.
# This might slightly delay the initial import if the model is large.
_load_span_marker_model_if_needed()

def _is_text_acronym_related(text_to_check: str, acronym_entities: List[Dict[str, Any]]) -> bool:
    """
    Checks whether text_to_check contains any of the acronyms (long or short form)
    identified by the SpanMarker model.
    """
    if not acronym_entities or not text_to_check:
        return False
    text_to_check_lower = text_to_check.lower()
    for entity in acronym_entities:
        acronym_span = entity.get('span', '')
        if acronym_span:  # Ensure span is not empty
            # Check if the identified acronym span is present in the text flagged by LanguageTool
            if acronym_span.lower() in text_to_check_lower:
                # print(f"Debug AcronymFilter: Text '{text_to_check}' (from LT) contains detected acronym '{acronym_span}'. Filtering.")
                return True
    return False
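
# For reference, a sketch of the entity shape _is_text_acronym_related expects.
# With the acronyms model, SpanMarkerModel.predict(text) returns a list of dicts
# roughly like this (example values invented for illustration):
#     {'span': 'NLP', 'label': 'short', 'score': 0.99,
#      'char_start_index': 27, 'char_end_index': 30}
# Only 'span' is used above; the character offsets would allow a stricter,
# position-based overlap check instead of the substring test if ever needed.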

def perform_language_checks(markdown_text_from_filtered_pdf: str) -> Dict[str, Any]:
    """
    Performs LanguageTool checks on plain text derived from font-filtered Markdown.
    Only issues located between "abstract" and "references"/"bibliography" within
    this text are kept, and issues related to acronyms identified by SpanMarker
    are filtered out.
    """
    if not markdown_text_from_filtered_pdf or not markdown_text_from_filtered_pdf.strip():
        print("LT_Checker: Input Markdown text is empty.")
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    plain_text_from_markdown = convert_markdown_to_plain_text(markdown_text_from_filtered_pdf)
    text_for_lt_analysis = plain_text_from_markdown.replace('\n', ' ')
    text_for_lt_analysis = re.sub(r'\s+', ' ', text_for_lt_analysis).strip()
    if not text_for_lt_analysis:
        print("LT_Checker: Plain text derived from Markdown is empty after cleaning.")
        return {"total_issues": 0, "issues_list": [], "text_used_for_analysis": ""}

    # --- Acronym Detection using SpanMarker ---
    acronym_entities = []
    if _span_marker_model_loaded_successfully and _span_marker_model_acronyms:
        try:
            # print(f"LT_Checker: Running SpanMarker on text of length {len(text_for_lt_analysis)} for acronyms.")
            acronym_entities = _span_marker_model_acronyms.predict(text_for_lt_analysis)
            # if acronym_entities:
            #     print(f"LT_Checker: SpanMarker found {len(acronym_entities)} acronym entities. Examples: {[e['span'] for e in acronym_entities[:3]]}")
        except Exception as sm_e:
            print(f"LT_Checker: Error during SpanMarker prediction: {sm_e}")
            logging.warning(f"SpanMarker prediction failed: {sm_e}", exc_info=True)
            # Proceed without acronym filtering if prediction fails
            acronym_entities = []
    elif SPAN_MARKER_AVAILABLE and not _span_marker_model_loaded_successfully:
        print("LT_Checker: span_marker is available but its model failed to load. Acronym filtering disabled for this run.")

    # --- Abstract/references boundary detection ---
    text_for_lt_analysis_lower = text_for_lt_analysis.lower()
    abstract_match = re.search(r'\babstract\b', text_for_lt_analysis_lower)
    content_start_index = abstract_match.start() if abstract_match else 0
    if abstract_match:
        print(f"LT_Checker: Found 'abstract' at index {content_start_index} in the analyzed text.")
    else:
        print("LT_Checker: Did not find 'abstract'; LT analysis starts at index 0.")

    references_match = re.search(r'\breferences\b', text_for_lt_analysis_lower)
    bibliography_match = re.search(r'\bbibliography\b', text_for_lt_analysis_lower)
    content_end_index = len(text_for_lt_analysis)
    if references_match and bibliography_match:
        content_end_index = min(references_match.start(), bibliography_match.start())
        print(f"LT_Checker: Found 'references' at {references_match.start()} and 'bibliography' at {bibliography_match.start()}. Using {content_end_index} as end boundary.")
    elif references_match:
        content_end_index = references_match.start()
        print(f"LT_Checker: Found 'references' at {content_end_index}. Using it as end boundary.")
    elif bibliography_match:
        content_end_index = bibliography_match.start()
        print(f"LT_Checker: Found 'bibliography' at {content_end_index}. Using it as end boundary.")
    else:
        print(f"LT_Checker: Did not find 'references' or 'bibliography'. LT analysis runs to the end of the text (index {content_end_index}).")

    if content_start_index >= content_end_index:
        print(f"LT_Checker: Warning: content start index ({content_start_index}) is not before end index ({content_end_index}); no LT issues will be reported from this range.")

    tool = None
    processed_lt_issues: List[Dict[str, Any]] = []
    try:
        tool = language_tool_python.LanguageTool('en-US')
        raw_lt_matches = tool.check(text_for_lt_analysis)

        lt_issues_in_range = 0
        filtered_acronym_issues = 0
        for idx, match in enumerate(raw_lt_matches):
            if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
                continue  # Common rule to ignore
            # --- Acronym Filtering Step ---
            if acronym_entities and _is_text_acronym_related(match.matchedText, acronym_entities):
                filtered_acronym_issues += 1
                continue  # Skip this LanguageTool match as it's related to a detected acronym
            if not (content_start_index <= match.offset < content_end_index):
                continue
            lt_issues_in_range += 1

            error_text_verbatim = match.matchedText
            words_around = 1
            pre_error_text = text_for_lt_analysis[:match.offset]
            words_before = pre_error_text.split()[-words_around:]
            post_error_text = text_for_lt_analysis[match.offset + match.errorLength:]
            words_after = post_error_text.split()[:words_around]
            context_parts = []
            if words_before:
                context_parts.append(" ".join(words_before))
            context_parts.append(error_text_verbatim)
            if words_after:
                context_parts.append(" ".join(words_after))
            wider_context_str = " ".join(context_parts)
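            # Example with words_around = 1: if the text reads "... we fixed teh bug ..."
            # and LT flags "teh", words_before == ["fixed"], words_after == ["bug"],
            # and wider_context_str == "fixed teh bug".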
            processed_lt_issues.append({
                '_internal_id': f"lt_{idx}",
                'ruleId': match.ruleId,
                'message': match.message,
                'context_text': wider_context_str,
                'error_text_verbatim': error_text_verbatim,
                'offset_in_text': match.offset,
                'error_length': match.errorLength,
                'replacements_suggestion': match.replacements[:3] if match.replacements else [],
                'category_name': match.category,
                'source_check_type': 'LanguageTool',
                'is_mapped_to_pdf': False,
                'pdf_coordinates_list': [],
                'mapped_page_number': -1
            })

        print(f"LT_Checker: LanguageTool found {len(raw_lt_matches)} raw issues.")
        if acronym_entities:
            print(f"LT_Checker: Filtered out {filtered_acronym_issues} LT issues due to acronym detection.")
        print(f"LT_Checker: {lt_issues_in_range} LT issues within defined content range (after acronym filtering).")

        return {
            "total_issues": len(processed_lt_issues),
            "issues_list": processed_lt_issues,
            "text_used_for_analysis": text_for_lt_analysis
        }
    except Exception as e:
        print(f"Error in perform_language_checks: {e}\n{traceback.format_exc()}")
        return {"error": str(e), "total_issues": 0, "issues_list": [], "text_used_for_analysis": text_for_lt_analysis}
    finally:
        if tool:
            tool.close()
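

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the pipeline proper): assumes
    # LanguageTool can start (Java available) and that text_utils is importable.
    # The sample Markdown below is invented purely for illustration.
    sample_md = (
        "# A Sample Paper\n\n"
        "Abstract\n\n"
        "This papper studies NLP (Natural Language Processing) systems.\n\n"
        "References\n\n"
        "[1] A. Author, 2020."
    )
    result = perform_language_checks(sample_md)
    print(f"Total issues: {result['total_issues']}")
    for issue in result["issues_list"]:
        print(f"  [{issue['ruleId']}] {issue['message']} -> '{issue['context_text']}'")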