textmetric-stramlit-1

Sleeping

App Files Files Community

samyak152002 committed on Nov 3, 2024

Commit

97905e6

verified ·

1 Parent(s): eb90936

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -347

app.py CHANGED Viewed

@@ -4,386 +4,221 @@ import fitz  # PyMuPDF
 from pdfminer.high_level import extract_text
 from pdfminer.layout import LAParams
 import language_tool_python
-from typing import List, Dict, Any, Tuple
 from collections import Counter
 import json
 import traceback
 import io
 import tempfile
 import os
 # Set JAVA_HOME environment variable
 os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
-# Optional: Verify Java installation
-# try:
-#     java_version = subprocess.check_output(['java', '-version'], stderr=subprocess.STDOUT).decode()
-#     st.write(f"Java Version: {java_version}")
-# except Exception as e:
-#     st.error("Java is not installed correctly.")
 # ------------------------------
-# Analysis Functions
 # ------------------------------
 def extract_pdf_text_by_page(file) -> List[str]:
     """Extracts text from a PDF file, page by page, using PyMuPDF."""
-    file.seek(0)
-    with fitz.open(stream=file.read(), filetype="pdf") as doc:
-        return [page.get_text("text") for page in doc]
 def extract_pdf_text(file) -> str:
     """Extracts text from a PDF file using pdfminer."""
-    file.seek(0)
-    return extract_text(file, laparams=LAParams())
-def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
-    """Checks for the presence of required terms in the text."""
-    return {term: term.lower() in full_text.lower() for term in search_terms}
-def label_authors(full_text: str) -> str:
-    """Label authors in the text with 'Authors:' if not already labeled."""
-    author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
-    match = re.search(author_line_regex, full_text, re.MULTILINE)
-    if match:
-        authors = match.group(1).strip()
-        return full_text.replace(authors, f"Authors: {authors}")
-    return full_text
-def check_metadata(full_text: str) -> Dict[str, Any]:
-    """Check for metadata elements."""
-    return {
-        "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
-        "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
-        "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
-        "word_count": len(full_text.split()) or "Missing"
-    }
-def check_disclosures(full_text: str) -> Dict[str, bool]:
-    """Check for disclosure statements."""
-    search_terms = [
-        "author contributions statement",
-        "conflict of interest statement",
-        "ethics statement",
-        "funding statement",
-        "data access statement"
-    ]
-    return check_text_presence(full_text, search_terms)
-def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
-    """Check for figures and tables."""
-    return {
-        "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', full_text, re.IGNORECASE)),
-        "figures_legends": bool(re.search(r'Figure \d+.*?legend', full_text, re.IGNORECASE)),
-        "tables_legends": bool(re.search(r'Table \d+.*?legend', full_text, re.IGNORECASE))
-    }
-def check_references(full_text: str) -> Dict[str, Any]:
-    """Check for references."""
-    return {
-        "old_references": bool(re.search(r'\b19[0-9]{2}\b', full_text)),
-        "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', full_text[:1000], re.IGNORECASE)),
-        "reference_count": len(re.findall(r'\[.*?\]', full_text)),
-        "self_citations": bool(re.search(r'Self-citation', full_text, re.IGNORECASE))
-    }
-def check_structure(full_text: str) -> Dict[str, bool]:
-    """Check document structure."""
-    return {
-        "imrad_structure": all(section in full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
-        "abstract_structure": "structured abstract" in full_text.lower()
-    }
-def check_language_issues(full_text: str) -> Dict[str, Any]:
-    """Check for issues with capitalization, hyphenation, punctuation, spacing, etc."""
-    language_tool = language_tool_python.LanguageTool('en-US')
-    matches = language_tool.check(full_text)
-    word_count = len(full_text.split())
-    issues_count = len(matches)
-    issues_per_1000 = (issues_count / word_count) * 1000 if word_count else 0
-    serializable_matches = [
-        {
-            "message": match.message,
-            "replacements": match.replacements,
-            "offset": match.offset,
-            "errorLength": match.errorLength,
-            "category": match.category,
-            "ruleIssueType": match.ruleIssueType,
-            "sentence": match.sentence
         }
-        for match in matches
-    ]
-    return {
-        "issues_count": issues_count,
-        "issues_per_1000": issues_per_1000,
-        "failed": issues_per_1000 > 20,
-        "matches": serializable_matches
-    }
-def check_language(full_text: str) -> Dict[str, Any]:
-    """Check language quality."""
-    return {
-        "plain_language": bool(re.search(r'plain language summary', full_text, re.IGNORECASE)),
-        "readability_issues": False,  # Placeholder for future implementation
-        "language_issues": check_language_issues(full_text)
-    }
-def check_figure_order(full_text: str) -> Dict[str, Any]:
-    """Check if figures are referred to in sequential order."""
-    figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
-    figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
-    figure_numbers = sorted(set(int(num) for num in figure_references))
-    is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))
-    if figure_numbers:
-        expected_figures = set(range(1, max(figure_numbers) + 1))
-        missing_figures = list(expected_figures - set(figure_numbers))
-    else:
-        missing_figures = None
-    duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
-    duplicate_numbers = [int(num) for num in duplicates]
-    not_mentioned = list(set(figure_references) - set(duplicates))
-    return {
-        "sequential_order": is_sequential,
-        "figure_count": len(figure_numbers),
-        "missing_figures": missing_figures,
-        "figure_order": figure_numbers,
-        "duplicate_references": duplicates,
-        "not_mentioned": not_mentioned
-    }
-def check_reference_order(full_text: str) -> Dict[str, Any]:
-    """Check if references in the main body text are in order."""
-    reference_pattern = r'\[(\d+)\]'
-    references = re.findall(reference_pattern, full_text)
-    ref_numbers = [int(ref) for ref in references]
-    max_ref = 0
-    out_of_order = []
-    for i, ref in enumerate(ref_numbers):
-        if ref > max_ref + 1:
-            out_of_order.append((i+1, ref))
-        max_ref = max(max_ref, ref)
-    all_refs = set(range(1, max_ref + 1))
-    used_refs = set(ref_numbers)
-    missing_refs = list(all_refs - used_refs)
-    return {
-        "max_reference": max_ref,
-        "out_of_order": out_of_order,
-        "missing_references": missing_refs,
-        "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
-    }
-def check_reference_style(full_text: str) -> Dict[str, Any]:
-    """Check the reference style used in the paper and identify inconsistencies."""
-    reference_section_match = re.search(r'References\b([\s\S]*?)(?:\n\S|\Z)', full_text, re.IGNORECASE)
-    if not reference_section_match:
-        return {"style": "Unknown", "reason": "References section not found", "inconsistent_refs": []}
-    references_text = reference_section_match.group(1)
-    reference_list = re.split(r'\n(?=\[\d+\]|\d+\.\s|\(\w+,\s*\d{4}\))', references_text)
-    references = [ref.strip() for ref in reference_list if ref.strip()]
-    styles = []
-    inconsistent_refs = []
-    patterns = {
-        "IEEE": r'^\[\d+\]',
-        "Harvard": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
-        "APA": r'^[A-Z][a-z]+,?\s[A-Z]\.\s\(?\d{4}\)?',
-        "MLA": r'^[A-Z][a-z]+,\s[A-Z][a-z]+\.',
-        "Vancouver": r'^\d+\.\s',
-        "Chicago": r'^\d+\s[A-Z][a-z]+\s[A-Z]',
-    }
-    for i, ref in enumerate(references, 1):
-        matched = False
-        for style, pattern in patterns.items():
-            if re.match(pattern, ref):
-                styles.append(style)
-                matched = True
-                break
-        if not matched:
-            styles.append("Unknown")
-            inconsistent_refs.append((i, ref, "Unknown"))
-    if not styles:
-        return {"style": "Unknown", "reason": "No references found", "inconsistent_refs": []}
-    style_counts = Counter(styles)
-    majority_style, majority_count = style_counts.most_common(1)[0]
-    for i, style in enumerate(styles, 1):
-        if style != majority_style and style != "Unknown":
-            inconsistent_refs.append((i, references[i-1], style))
-    consistency = majority_count / len(styles)
-    return {
-        "majority_style": majority_style,
-        "inconsistent_refs": inconsistent_refs,
-        "consistency": consistency
-    }
-# ------------------------------
-# Annotation Functions
-# ------------------------------
-def highlight_text(page, words, text, annotation):
-    """Highlight text and add annotation."""
-    text_instances = find_text_instances(words, text)
-    highlighted = False
-    for inst in text_instances:
-        highlight = page.add_highlight_annot(inst)
-        highlight.update()
-        comment = page.add_text_annot(inst[:2], annotation)
-        comment.update()
-        highlighted = True
-    return highlighted
-def find_text_instances(words, text):
-    """Find all instances of text in words."""
-    text_lower = text.lower()
-    text_words = text_lower.split()
-    instances = []
-    for i in range(len(words) - len(text_words) + 1):
-        if all(words[i+j][4].lower() == text_words[j] for j in range(len(text_words))):
-            inst = fitz.Rect(words[i][:4])
-            for j in range(1, len(text_words)):
-                inst = inst | fitz.Rect(words[i+j][:4])
-            instances.append(inst)
-    return instances
-def highlight_issues_in_pdf(file, inconsistent_refs: List[Tuple[int, str, str]], language_matches: List[Dict[str, Any]]) -> bytes:
-    """Highlight inconsistent references and add notes for language issues in a single PDF."""
-    try:
-        file.seek(0)
-        doc = fitz.open(stream=file.read(), filetype="pdf")
-        added_notes = set()
-        for page_number, page in enumerate(doc, start=1):
-            words = page.get_text("words")
-            if inconsistent_refs:
-                for ref_num, ref_text, ref_style in inconsistent_refs:
-                    annotation_text = f"Reference {ref_num}: Inconsistent style ({ref_style}). Should be consolidated to {ref_style}."
-                    highlight_text(page, words, ref_text, annotation_text)
-            if language_matches:
-                for match in language_matches:
-                    issue_text = match['sentence']
-                    error_message = f"{match['message']}\nSuggested correction: {match['replacements'][0] if match['replacements'] else 'No suggestion'}"
-                    issue_key = (issue_text, error_message)
-                    if issue_key not in added_notes:
-                        if highlight_text(page, words, issue_text, error_message):
-                            added_notes.add(issue_key)
-        annotated_pdf_bytes = doc.write()
-        doc.close()
-        return annotated_pdf_bytes
-    except Exception as e:
-        print(f"An error occurred while annotating the PDF: {str(e)}")
-        traceback.print_exc()
-        return b""
-# ------------------------------
-# Main Analysis Function
-# ------------------------------
-def analyze_pdf(file) -> Tuple[Dict[str, Any], bytes]:
-    """
-    Analyze the uploaded PDF and return analysis results and annotated PDF bytes.
-    Returns:
-        Tuple containing:
-            - Analysis results as a dictionary.
-            - Annotated PDF as bytes.
-    """
-    try:
-        # The 'file' is a BytesIO object provided by Streamlit
-        file.seek(0)
-        pages_text = extract_pdf_text_by_page(file)
-        full_text = extract_pdf_text(file)
-        full_text = label_authors(full_text)
-        # Perform analyses
-        metadata = check_metadata(full_text)
-        disclosures = check_disclosures(full_text)
-        figures_and_tables = check_figures_and_tables(full_text)
-        figure_order = check_figure_order(full_text)
-        references = check_references(full_text)
-        reference_order = check_reference_order(full_text)
-        reference_style = check_reference_style(full_text)
-        structure = check_structure(full_text)
-        language = check_language(full_text)
-        # Compile results
-        results = {
-            "metadata": metadata,
-            "disclosures": disclosures,
-            "figures_and_tables": figures_and_tables,
-            "figure_order": figure_order,
-            "references": references,
-            "reference_order": reference_order,
-            "reference_style": reference_style,
-            "structure": structure,
-            "language": language
-        }
-        # Handle annotations
-        inconsistent_refs = reference_style.get("inconsistent_refs", [])
-        language_matches = language.get("language_issues", {}).get("matches", [])
-        if inconsistent_refs or language_matches:
-            annotated_pdf_bytes = highlight_issues_in_pdf(file, inconsistent_refs, language_matches)
-        else:
-            annotated_pdf_bytes = None
-        return results, annotated_pdf_bytes
-    except Exception as e:
-        error_message = {
-            "error": str(e),
-            "traceback": traceback.format_exc()
-        }
-        return error_message, None
-# ------------------------------
-# Streamlit Interface
-# ------------------------------
 def main():
-    st.title("PDF Analyzer")
-    st.write("Upload a PDF document to analyze its structure, references, language, and more.")
-    uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
     if uploaded_file is not None:
-        with st.spinner("Analyzing PDF..."):
-            results, annotated_pdf = analyze_pdf(uploaded_file)
-        st.subheader("Analysis Results")
-        st.json(results)
-        if annotated_pdf:
-            st.subheader("Download Annotated PDF")
-            st.download_button(
-                label="Download Annotated PDF",
-                data=annotated_pdf,
-                file_name="annotated.pdf",
-                mime="application/pdf"
-            )
-        else:
-            st.success("No issues found. No annotated PDF to download.")
-if __name__ == "__main__":
-    main()

 from pdfminer.high_level import extract_text
 from pdfminer.layout import LAParams
 import language_tool_python
+from typing import List, Dict, Any, Tuple, Optional
 from collections import Counter
 import json
 import traceback
 import io
 import tempfile
 import os
+import base64
+from dataclasses import dataclass
 # Set JAVA_HOME environment variable
 os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
+# ------------------------------
+# Data Classes
+# ------------------------------
+@dataclass
+class Highlight:  # one highlighted rectangle on a PDF page plus the text shown for it
+    page: int  # page index as produced by enumerate(doc), i.e. counting from 0
+    rect: Tuple[float, float, float, float]  # bounding box stored as (x0, y0, x1, y1)
+    color: str  # inserted as the CSS background-color of the overlay div
+    message: str  # inserted as the overlay div's title attribute
+    category: str  # free-form label; find_text_location sets it to "text"
+@dataclass
+class AnalysisResult:  # bundle read by display_analysis_results
+    highlights: List[Highlight]  # Highlight records belonging to the analysed document
+    messages: List[Dict[str, Any]]  # dicts read via "category" (optional, defaults to "Other"), "title" and "description"
+    summary: Dict[str, Any]  # each key/value pair is rendered with st.sidebar.metric
 # ------------------------------
+# PDF Processing Functions
 # ------------------------------
 def extract_pdf_text_by_page(file) -> List[str]:
     """Extract the text of every page of a PDF using PyMuPDF.

     Args:
         file: A filesystem path (``str``), the raw PDF content (``bytes``),
             or an ``io.BytesIO`` (or subclass) holding the PDF.

     Returns:
         One string per page, in page order. An empty list is returned when
         ``file`` is none of the supported types.
     """
     if isinstance(file, str):
         # A str cannot serve as an in-memory stream; treat it as a path,
         # which is also how the pdfminer call in extract_pdf_text reads it.
         open_kwargs = {"filename": file}
     elif isinstance(file, bytes):
         open_kwargs = {"stream": file, "filetype": "pdf"}
     elif isinstance(file, io.BytesIO):
         # Rewind first: an earlier reader may have left the cursor at EOF,
         # in which case read() would yield b"" and opening would fail.
         file.seek(0)
         open_kwargs = {"stream": file.read(), "filetype": "pdf"}
     else:
         return []
     # The context manager closes the document even if get_text() raises.
     with fitz.open(**open_kwargs) as doc:
         return [page.get_text("text") for page in doc]
 def extract_pdf_text(file) -> str:
     """Extract the full text of a PDF using pdfminer.

     Args:
         file: A filesystem path (``str``), the raw PDF content (``bytes``),
             or an ``io.BytesIO`` (or subclass) holding the PDF.

     Returns:
         The extracted text, or an empty string when ``file`` is none of the
         supported types.
     """
     if isinstance(file, bytes):
         # pdfminer expects a path or an open binary file, not raw bytes,
         # so wrap the content in an in-memory stream.
         file = io.BytesIO(file)
     elif isinstance(file, io.BytesIO):
         # Rewind so parsing does not depend on where an earlier reader
         # left the cursor.
         file.seek(0)
     elif not isinstance(file, str):
         return ""
     return extract_text(file, laparams=LAParams())
+# TODO: keep the existing analysis functions here (the check_* helpers and analyze_pdf); main() calls analyze_pdf, so it must remain defined.
+# ------------------------------
+# Highlight Processing Functions
+# ------------------------------
def get_word_coordinates(doc: fitz.Document) -> Dict[int, List[Dict[str, Any]]]:
    """Collect per-page word records from an open PyMuPDF document.

    Returns:
        A dict keyed by page index (counting from 0, as yielded by
        ``enumerate(doc)``). Each value is a list with one record per entry
        of ``page.get_text("words")``, holding:

        * ``"text"``   - item 4 of the entry (the word string),
        * ``"rect"``   - a ``fitz.Rect`` built from items 0-3,
        * ``"origin"`` - the remaining items from index 5 onwards.
    """
    def describe(entry):
        # Convert one raw "words" entry into the record layout above.
        return {
            "text": entry[4],
            "rect": fitz.Rect(entry[:4]),
            "origin": entry[5:],
        }

    return {
        index: [describe(entry) for entry in page.get_text("words")]
        for index, page in enumerate(doc)
    }
def find_text_location(text: str, word_coordinates: Dict[int, List[Dict[str, Any]]]) -> Optional[Highlight]:
    """Locate the first occurrence of ``text`` among the extracted PDF words.

    ``text`` is split on whitespace and compared case-insensitively, word by
    word, against consecutive word records on each page. Only a run of words
    that matches the whole phrase counts as a hit; a single word that merely
    appears somewhere inside ``text`` does not.

    Args:
        text: The phrase to look for.
        word_coordinates: Mapping of page index to word records, each with a
            ``"text"`` string and a ``"rect"`` that supports ``|`` (union)
            and exposes ``x0``, ``y0``, ``x1``, ``y1``.

    Returns:
        A yellow ``Highlight`` (category ``"text"``, message ``text``) whose
        rectangle is the union of the matched words' rectangles, or ``None``
        when ``text`` is empty/whitespace or is not found on any page.
    """
    wanted = text.lower().split()
    if not wanted:
        # Nothing to search for; avoid producing a zero-word "match".
        return None
    span = len(wanted)

    for page_num, words in word_coordinates.items():
        tokens = [word["text"].lower() for word in words]
        for start in range(len(tokens) - span + 1):
            if tokens[start:start + span] != wanted:
                continue
            # Grow the bounding box over exactly the matched words.
            rect = words[start]["rect"]
            for word in words[start + 1:start + span]:
                rect = rect | word["rect"]
            return Highlight(
                page=page_num,
                rect=(rect.x0, rect.y0, rect.x1, rect.y1),
                color="yellow",
                message=text,
                category="text"
            )
    return None
+# ------------------------------
+# Streamlit Interface
+# ------------------------------
def create_sidebar():
    """Build the sidebar: title, PDF uploader, and analysis toggles.

    Returns:
        A ``(uploaded_file, options)`` pair. ``options`` maps
        ``"check_language"``, ``"check_references"`` and
        ``"check_structure"`` to the value returned by the matching checkbox
        (each checkbox starts ticked).
    """
    sidebar = st.sidebar
    sidebar.title("PDF Analyzer")
    uploaded_file = sidebar.file_uploader("Upload PDF", type=['pdf'])

    # (option key, checkbox label) in display order.
    toggles = (
        ("check_language", "Language Analysis"),
        ("check_references", "Reference Analysis"),
        ("check_structure", "Structure Analysis"),
    )
    with sidebar.expander("Analysis Options", expanded=False):
        options = {key: st.checkbox(label, value=True) for key, label in toggles}
    return uploaded_file, options
def display_pdf_viewer(pdf_bytes: bytes, highlights: List[Highlight]):
    """Render the PDF in an embedded iframe together with overlay markup.

    The PDF bytes are inlined as a base64 ``data:`` URI, the overlay markup
    comes from ``generate_highlight_overlays(highlights)``, and the assembled
    HTML is passed to ``st.components.v1.html`` with a height of 800.
    """
    encoded_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
    overlay_markup = generate_highlight_overlays(highlights)
    # Doubled braces in the <style> section are literal CSS braces.
    viewer_html = f"""
        <div style="position: relative; width: 100%; height: 800px;">
            <iframe src="data:application/pdf;base64,{encoded_pdf}"
                    width="100%"
                    height="100%"
                    style="border: none;">
            </iframe>
            <div id="highlight-container">
                {overlay_markup}
            </div>
        </div>
        <style>
            .highlight {{
                position: absolute;
                opacity: 0.3;
                pointer-events: all;
                cursor: pointer;
                transition: opacity 0.2s;
            }}
            .highlight:hover {{
                opacity: 0.5;
            }}
        </style>
    """
    st.components.v1.html(viewer_html, height=800)
def generate_highlight_overlays(highlights: List["Highlight"]) -> str:
    """Generate the HTML ``div`` overlays for a list of highlights.

    Each highlight becomes one absolutely positioned ``div`` whose position
    and size come from ``highlight.rect`` (``x0, y0, x1, y1``), whose
    background is ``highlight.color`` and whose tooltip is
    ``highlight.message``.

    The colour and message are HTML-escaped (quotes included) before being
    placed inside attribute values, so document-derived text containing
    ``"``, ``<`` or ``&`` cannot break out of the attribute.

    Args:
        highlights: Objects exposing ``rect``, ``color`` and ``message``.

    Returns:
        The concatenated overlay markup; an empty string for no highlights.
    """
    import html  # local import: only this function needs it

    fragments = []
    for i, highlight in enumerate(highlights):
        rect = highlight.rect
        color = html.escape(str(highlight.color), quote=True)
        message = html.escape(str(highlight.message), quote=True)
        # NOTE(review): showMessage is not defined in the HTML assembled by
        # display_pdf_viewer; the handler is kept as-is for whoever adds it.
        fragments.append(f"""
            <div class="highlight"
                 style="left: {rect[0]}px;
                        top: {rect[1]}px;
                        width: {rect[2] - rect[0]}px;
                        height: {rect[3] - rect[1]}px;
                        background-color: {color};"
                 onclick="showMessage({i})"
                 title="{message}">
            </div>
        """)
    return "".join(fragments)
def display_analysis_results(results: AnalysisResult):
    """Show the analysis summary and the grouped messages in the sidebar.

    Every ``results.summary`` item is rendered as a sidebar metric. The
    entries of ``results.messages`` are then grouped by their ``"category"``
    (``"Other"`` when absent), and each group gets its own sidebar expander
    listing every message's ``"title"`` and ``"description"``.
    """
    sidebar = st.sidebar
    sidebar.markdown("## Analysis Results")

    # Summary figures, one metric per entry.
    sidebar.markdown("### Summary")
    for label, figure in results.summary.items():
        sidebar.metric(label, figure)

    # Bucket messages by category, keeping first-seen category order.
    grouped = {}
    for entry in results.messages:
        grouped.setdefault(entry.get("category", "Other"), []).append(entry)

    for group_name, entries in grouped.items():
        with sidebar.expander(f"{group_name} ({len(entries)})"):
            for entry in entries:
                st.markdown(f"**{entry['title']}**")
                st.markdown(entry['description'])
                st.markdown("---")
 def main():
+    st.set_page_config(
+        page_title="PDF Analyzer",
+        page_icon="📄",
+        layout="wide",
+        initial_sidebar_state="expanded"
+    )
+    # Create sidebar and get user input
+    uploaded_file, options = create_sidebar()
     if uploaded_file is not None:
+        # Read PDF file
+        pdf_bytes = uploaded_file.read()
+        # Analyze PDF
+        try:
+            results, annotated_pdf = analyze_pdf(io.BytesIO(pdf_bytes))
+            # Create two columns
+            col1, col2 = st.columns([0.7, 0.3])
+            with col1:
+                st.markdown("### Document Preview")
+                # Display PDF with highlights
+                if annotated_pdf:
+                    display_pdf_viewer(annotated_pdf, results.get("highlights", []))
+                else:
+                    display_pdf_viewer(pdf_bytes, [])
+            with col2:
+                st.markdown("