samyak152002 committed on
Commit
f6f6476
1 Parent(s): 044ae3a

Update app.py

Files changed (1)
  1. app.py +423 -474
app.py CHANGED
@@ -1,527 +1,476 @@
1
- # import re
2
- # import fitz # PyMuPDF
3
- # from pdfminer.high_level import extract_text
4
- # from pdfminer.layout import LAParams
5
- # import language_tool_python
6
- # from typing import List, Dict, Any, Tuple
7
- # from collections import Counter
8
- # import json
9
- # import traceback
10
- # import io
11
- # import tempfile
12
- # import os
13
- # import gradio as gr
14
-
15
- # # Set JAVA_HOME environment variable
16
- # os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
17
-
18
- # # ------------------------------
19
- # # Analysis Functions
20
- # # ------------------------------
21
-
22
- # # def extract_pdf_text_by_page(file) -> List[str]:
23
- # # """Extracts text from a PDF file, page by page, using PyMuPDF."""
24
- # # if isinstance(file, str):
25
- # # with fitz.open(file) as doc:
26
- # # return [page.get_text("text") for page in doc]
27
- # # else:
28
- # # with fitz.open(stream=file.read(), filetype="pdf") as doc:
29
- # # return [page.get_text("text") for page in doc]
30
-
31
- # def extract_pdf_text(file) -> str:
32
- # """Extracts full text from a PDF file using PyMuPDF."""
33
- # try:
34
- # doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
- # full_text = ""
36
 
37
- # for page_number in range(len(doc)):
38
- # page = doc[page_number]
39
- # words = page.get_text("word")
40
- # full_text += words
41
-
42
- # print(full_text)
43
- # doc.close()
44
- # print(f"Total extracted text length: {len(full_text)} characters.")
45
- # return full_text
46
 
47
- # except Exception as e:
48
- # print(f"Error extracting text from PDF: {e}")
49
- # return ""
50
-
51
- # def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
52
- # """Checks for the presence of required terms in the text."""
53
- # return {term: term.lower() in full_text.lower() for term in search_terms}
54
-
55
- # def label_authors(full_text: str) -> str:
56
- # """Label authors in the text with 'Authors:' if not already labeled."""
57
- # author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
58
- # match = re.search(author_line_regex, full_text, re.MULTILINE)
59
- # if match:
60
- # authors = match.group(1).strip()
61
- # return full_text.replace(authors, f"Authors: {authors}")
62
- # return full_text
63
-
64
- # def check_metadata(full_text: str) -> Dict[str, Any]:
65
- # """Check for metadata elements."""
66
- # return {
67
- # "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
68
- # "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
69
- # "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
70
- # "word_count": len(full_text.split()) or "Missing"
71
- # }
72
-
73
- # def check_disclosures(full_text: str) -> Dict[str, bool]:
74
- # """Check for disclosure statements."""
75
- # search_terms = [
76
- # "author contributions statement",
77
- # "conflict of interest statement",
78
- # "ethics statement",
79
- # "funding statement",
80
- # "data access statement"
81
- # ]
82
- # return check_text_presence(full_text, search_terms)
83
-
84
- # def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
85
- # """Check for figures and tables."""
86
- # return {
87
- # "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', full_text, re.IGNORECASE)),
88
- # "figures_legends": bool(re.search(r'Figure \d+.*?legend', full_text, re.IGNORECASE)),
89
- # "tables_legends": bool(re.search(r'Table \d+.*?legend', full_text, re.IGNORECASE))
90
- # }
91
-
92
- # def check_references(full_text: str) -> Dict[str, Any]:
93
- # """Check for references."""
94
- # return {
95
- # "old_references": bool(re.search(r'\b19[0-9]{2}\b', full_text)),
96
- # "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', full_text[:1000], re.IGNORECASE)),
97
- # "reference_count": len(re.findall(r'\[.*?\]', full_text)),
98
- # "self_citations": bool(re.search(r'Self-citation', full_text, re.IGNORECASE))
99
- # }
100
-
101
- # def check_structure(full_text: str) -> Dict[str, bool]:
102
- # """Check document structure."""
103
- # return {
104
- # "imrad_structure": all(section in full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
105
- # "abstract_structure": "structured abstract" in full_text.lower()
106
- # }
107
-
108
- # def check_language_issues(full_text: str) -> Dict[str, Any]:
109
- # """Check for language issues using LanguageTool and additional regex patterns."""
110
- # try:
111
- # language_tool = language_tool_python.LanguageTool('en-US')
112
- # matches = language_tool.check(full_text)
113
- # issues = []
114
 
115
- # # Process LanguageTool matches
116
- # for match in matches:
117
- # # Ignore issues with rule_id 'EN_SPLIT_WORDS_HYPHEN'
118
- # if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
119
- # continue
120
 
121
- # issues.append({
122
- # "message": match.message,
123
- # "context": match.context.strip(),
124
- # "suggestions": match.replacements[:3] if match.replacements else [],
125
- # "category": match.category,
126
- # "rule_id": match.ruleId,
127
- # "offset": match.offset,
128
- # "length": match.errorLength,
129
- # "coordinates": [],
130
- # "page": 0
131
- # })
132
- # print(f"Total language issues found: {len(issues)}")
133
 
134
- # # -----------------------------------
135
- # # Additions: Regex-based Issue Detection
136
- # # -----------------------------------
137
 
138
- # # Define regex pattern to find words immediately followed by '[' without space
139
- # regex_pattern = r'\b(\w+)\[(\d+)\]'
140
- # regex_matches = list(re.finditer(regex_pattern, full_text))
141
- # print(f"Total regex issues found: {len(regex_matches)}")
142
 
143
- # # Process regex matches
144
- # for match in regex_matches:
145
- # word = match.group(1)
146
- # number = match.group(2)
147
- # start = match.start()
148
- # end = match.end()
149
- # issues.append({
150
- # "message": f"Missing space before '[' in '{word}[{number}]'. Should be '{word} [{number}]'.",
151
- # "context": full_text[max(match.start() - 30, 0):min(match.end() + 30, len(full_text))].strip(),
152
- # "suggestions": [f"{word} [{number}]", f"{word} [`{number}`]", f"{word} [number {number}]"],
153
- # "category": "Formatting",
154
- # "rule_id": "SPACE_BEFORE_BRACKET",
155
- # "offset": match.start(),
156
- # "length": match.end() - match.start(),
157
- # "coordinates": [],
158
- # "page": 0
159
- # })
160
 
161
- # print(f"Total combined issues found: {len(issues)}")
162
 
163
- # return {
164
- # "total_issues": len(issues),
165
- # "issues": issues
166
- # }
167
- # except Exception as e:
168
- # print(f"Error checking language issues: {e}")
169
- # return {"error": str(e)}
170
-
171
- # def check_language(full_text: str) -> Dict[str, Any]:
172
- # """Check language quality."""
173
- # return {
174
- # "plain_language": bool(re.search(r'plain language summary', full_text, re.IGNORECASE)),
175
- # "readability_issues": False, # Placeholder for future implementation
176
- # "language_issues": check_language_issues(full_text)
177
- # }
178
-
179
- # def check_figure_order(full_text: str) -> Dict[str, Any]:
180
- # """Check if figures are referred to in sequential order."""
181
- # figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
182
- # figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
183
- # figure_numbers = sorted(set(int(num) for num in figure_references))
184
 
185
- # is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))
186
 
187
- # if figure_numbers:
188
- # expected_figures = set(range(1, max(figure_numbers) + 1))
189
- # missing_figures = list(expected_figures - set(figure_numbers))
190
- # else:
191
- # missing_figures = None
192
 
193
- # duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
194
- # duplicate_numbers = [int(num) for num in duplicates]
195
- # not_mentioned = list(set(figure_references) - set(duplicates))
196
 
197
- # return {
198
- # "sequential_order": is_sequential,
199
- # "figure_count": len(figure_numbers),
200
- # "missing_figures": missing_figures,
201
- # "figure_order": figure_numbers,
202
- # "duplicate_references": duplicates,
203
- # "not_mentioned": not_mentioned
204
- # }
205
-
206
- # def check_reference_order(full_text: str) -> Dict[str, Any]:
207
- # """Check if references in the main body text are in order."""
208
- # reference_pattern = r'\[(\d+)\]'
209
- # references = re.findall(reference_pattern, full_text)
210
- # ref_numbers = [int(ref) for ref in references]
211
 
212
- # max_ref = 0
213
- # out_of_order = []
214
- # for i, ref in enumerate(ref_numbers):
215
- # if ref > max_ref + 1:
216
- # out_of_order.append((i+1, ref))
217
- # max_ref = max(max_ref, ref)
218
 
219
- # all_refs = set(range(1, max_ref + 1))
220
- # used_refs = set(ref_numbers)
221
- # missing_refs = list(all_refs - used_refs)
222
 
223
- # return {
224
- # "max_reference": max_ref,
225
- # "out_of_order": out_of_order,
226
- # "missing_references": missing_refs,
227
- # "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
228
- # }
229
-
230
- # def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> bytes:
231
- # """
232
- # Highlights language issues in the PDF and returns the annotated PDF as bytes.
233
- # This function maps LanguageTool matches to specific words in the PDF
234
- # and highlights those words.
235
- # """
236
- # try:
237
- # # Open the PDF
238
- # doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
239
- # # print(f"Opened PDF with {len(doc)} pages.")
240
- # # print(language_matches)
241
- # # Extract words with positions from each page
242
- # word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
243
- # for page_number in range(len(doc)):
244
- # page = doc[page_number]
245
- # print(page.get_text("words"))
246
- # words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
247
- # for w in words:
248
- # # print(w)
249
- # word_text = w[4]
250
- # # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
251
- # # if '[' in word_text:
252
- # # word_text = word_text.replace('[', ' [')
253
- # word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
254
- # # print(f"Total words extracted: {len(word_list)}")
255
-
256
- # # Concatenate all words to form the full text
257
- # concatenated_text=""
258
- # concatenated_text = " ".join([w[1] for w in word_list])
259
 
260
- # # print(f"Concatenated text length: {concatenated_text} characters.")
261
 
262
- # # Find "Abstract" section and set the processing start point
263
- # abstract_start = concatenated_text.lower().find("abstract")
264
- # abstract_offset = 0 if abstract_start == -1 else abstract_start
265
 
266
- # # Find "References" section and exclude from processing
267
- # references_start = concatenated_text.lower().find("references")
268
- # references_offset = len(concatenated_text) if references_start == -1 else references_start
269
 
270
- # # Iterate over each language issue
271
- # for idx, issue in enumerate(language_matches, start=1):
272
- # offset = issue["offset"] # offset+line_no-1
273
- # length = issue["length"]
274
 
275
- # # Skip issues in the references section
276
- # if offset < abstract_offset or offset >= references_offset:
277
- # continue
278
 
279
 
280
- # error_text = concatenated_text[offset:offset+length]
281
- # print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
282
-
283
- # # Find the words that fall within the error span
284
- # current_pos = 0
285
- # target_words = []
286
- # for word in word_list:
287
- # word_text = word[1]
288
- # word_length = len(word_text) + 1 # +1 for the space
289
-
290
- # if current_pos + word_length > offset and current_pos < offset + length:
291
- # target_words.append(word)
292
- # current_pos += word_length
293
-
294
- # if not target_words:
295
- # # print("No matching words found for this issue.")
296
- # continue
297
-
298
- # initial_x = target_words[0][2]
299
- # initial_y = target_words[0][3]
300
- # final_x = target_words[len(target_words)-1][4]
301
- # final_y = target_words[len(target_words)-1][5]
302
- # issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
303
- # issue["page"] = target_words[0][0] + 1
304
- # # Add highlight annotations to the target words
305
- # print()
306
- # print("issue", issue)
307
- # print("error text", error_text)
308
- # print(target_words)
309
- # print()
310
- # for target in target_words:
311
- # page_num, word_text, x0, y0, x1, y1 = target
312
- # page = doc[page_num]
313
- # # Define a rectangle around the word with some padding
314
- # rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
315
- # # Add a highlight annotation
316
- # highlight = page.add_highlight_annot(rect)
317
- # highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
318
- # highlight.update()
319
- # # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
320
 
321
 
322
- # # Save annotated PDF to bytes
323
- # byte_stream = io.BytesIO()
324
- # doc.save(byte_stream)
325
- # annotated_pdf_bytes = byte_stream.getvalue()
326
- # doc.close()
327
 
328
- # # Save annotated PDF locally for verification
329
- # with open("annotated_temp.pdf", "wb") as f:
330
- # f.write(annotated_pdf_bytes)
331
- # # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
332
 
333
- # return language_matches, annotated_pdf_bytes
334
- # except Exception as e:
335
- # print(f"Error in highlighting PDF: {e}")
336
- # return b""
337
 
338
 
339
 
340
 
341
- # # ------------------------------
342
- # # Main Analysis Function
343
- # # ------------------------------
344
 
345
- # # server/gradio_client.py
346
 
347
- # def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
348
- # """Analyzes the PDF for language issues and returns results and annotated PDF."""
349
- # try:
350
- # full_text = extract_pdf_text(filepath)
351
- # if not full_text:
352
- # return {"error": "Failed to extract text from PDF."}, None
353
 
354
- # # Create the results structure
355
- # results = {
356
- # "issues": [], # Initialize as empty array
357
- # "regex_checks": {
358
- # "metadata": check_metadata(full_text),
359
- # "disclosures": check_disclosures(full_text),
360
- # "figures_and_tables": check_figures_and_tables(full_text),
361
- # "references": check_references(full_text),
362
- # "structure": check_structure(full_text),
363
- # "figure_order": check_figure_order(full_text),
364
- # "reference_order": check_reference_order(full_text)
365
- # }
366
- # }
367
-
368
- # # Handle language issues
369
- # language_issues = check_language_issues(full_text)
370
- # if "error" in language_issues:
371
- # return {"error": language_issues["error"]}, None
372
-
373
- # issues = language_issues.get("issues", [])
374
- # if issues:
375
- # language_matches, annotated_pdf = highlight_issues_in_pdf(filepath, issues)
376
- # results["issues"] = language_matches # This is already an array from check_language_issues
377
- # return results, annotated_pdf
378
- # else:
379
- # # Keep issues as empty array if none found
380
- # return results, None
381
-
382
- # except Exception as e:
383
- # return {"error": str(e)}, None
384
- # # ------------------------------
385
- # # Gradio Interface
386
- # # ------------------------------
387
-
388
- # def process_upload(file):
389
- # """
390
- # Process the uploaded PDF file and return analysis results and annotated PDF.
391
- # """
392
- # # print(file.name)
393
- # if file is None:
394
- # return json.dumps({"error": "No file uploaded"}, indent=2), None
395
-
396
- # # # Create a temporary file to work with
397
 
398
- # # with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
399
- # # temp_input.write(file)
400
- # # temp_input_path = temp_input.name
401
- # # print(temp_input_path)
402
 
403
- # temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
404
- # temp_input.write(file)
405
- # temp_input_path = temp_input.name
406
- # print(temp_input_path)
407
- # # Analyze the PDF
408
 
409
- # results, annotated_pdf = analyze_pdf(temp_input_path)
410
 
411
- # print(results)
412
- # results_json = json.dumps(results, indent=2)
413
 
414
- # # Clean up the temporary input file
415
- # os.unlink(temp_input_path)
416
 
417
- # # If we have an annotated PDF, save it temporarily
418
- # if annotated_pdf:
419
- # with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
420
- # tmp_file.write(annotated_pdf)
421
- # return results_json, tmp_file.name
422
 
423
- # return results_json, None
424
 
425
- # # except Exception as e:
426
- # # error_message = json.dumps({
427
- # # "error": str(e),
428
- # # "traceback": traceback.format_exc()
429
- # # }, indent=2)
430
- # # return error_message, None
431
 
432
 
433
- # def create_interface():
434
- # with gr.Blocks(title="PDF Analyzer") as interface:
435
- # gr.Markdown("# PDF Analyzer")
436
- # gr.Markdown("Upload a PDF document to analyze its structure, references, language, and more.")
437
 
438
- # with gr.Row():
439
- # file_input = gr.File(
440
- # label="Upload PDF",
441
- # file_types=[".pdf"],
442
- # type="binary"
443
- # )
444
 
445
- # with gr.Row():
446
- # analyze_btn = gr.Button("Analyze PDF")
447
 
448
- # with gr.Row():
449
- # results_output = gr.JSON(
450
- # label="Analysis Results",
451
- # show_label=True
452
- # )
453
 
454
- # with gr.Row():
455
- # pdf_output = gr.File(
456
- # label="Annotated PDF",
457
- # show_label=True
458
- # )
459
 
460
- # analyze_btn.click(
461
- # fn=process_upload,
462
- # inputs=[file_input],
463
- # outputs=[results_output, pdf_output]
464
- # )
465
 
466
- # return interface
467
 
468
- # if __name__ == "__main__":
469
- # interface = create_interface()
470
- # interface.launch(
471
- # share=False, # Set to False in production
472
- # # server_name="0.0.0.0",
473
- # server_port=None
474
- # )
475
-
476
-
477
- import os
478
- import requests
479
- from flask import Flask, jsonify
480
-
481
- app = Flask(__name__)
482
-
483
- # Directory and file configuration
484
- NGRAM_DATA_DIR = "./ngram_data"
485
- NGRAM_FILE_NAME = "ngrams-en-20150817.zip"
486
- NGRAM_FILE_PATH = os.path.join(NGRAM_DATA_DIR, NGRAM_FILE_NAME)
487
- NGRAM_DOWNLOAD_URL = "https://languagetool.org/download/ngram-data/ngrams-en-20150817.zip"
488
-
489
- # Ensure the directory exists
490
- def ensure_directory_exists():
491
- if not os.path.exists(NGRAM_DATA_DIR):
492
- os.makedirs(NGRAM_DATA_DIR)
493
-
494
- # Download the n-gram data if not already downloaded
495
- def download_ngram_data():
496
- if os.path.exists(NGRAM_FILE_PATH):
497
- print(f"File already exists at {NGRAM_FILE_PATH}, skipping download.")
498
- return
499
-
500
- print(f"Downloading n-gram data from {NGRAM_DOWNLOAD_URL}...")
501
- response = requests.get(NGRAM_DOWNLOAD_URL, stream=True)
502
-
503
- if response.status_code == 200:
504
- with open(NGRAM_FILE_PATH, "wb") as f:
505
- for chunk in response.iter_content(chunk_size=8192):
506
- f.write(chunk)
507
- print(f"Downloaded and saved to {NGRAM_FILE_PATH}.")
508
- else:
509
- raise Exception(f"Failed to download n-gram data. HTTP Status Code: {response.status_code}")
510
-
511
- @app.route('/')
512
- def home():
513
- return jsonify({"message": "Welcome to the LanguageTool n-gram downloader!"})
514
 
515
- @app.route('/download-ngram', methods=['GET'])
516
- def download_ngram():
517
- try:
518
- ensure_directory_exists()
519
- download_ngram_data()
520
- return jsonify({"message": "N-gram data is downloaded and saved.", "path": NGRAM_FILE_PATH})
521
- except Exception as e:
522
- return jsonify({"error": str(e)}), 500
523
 
524
- if __name__ == "__main__":
525
- ensure_directory_exists()
526
- download_ngram_data()
527
- app.run(debug=True)
 
1
+ import re
2
+ import fitz # PyMuPDF
3
+ from pdfminer.high_level import extract_text
4
+ from pdfminer.layout import LAParams
5
+ import language_tool_python
6
+ from typing import List, Dict, Any, Tuple
7
+ from collections import Counter
8
+ import json
9
+ import traceback
10
+ import io
11
+ import tempfile
12
+ import os
13
+ import gradio as gr
14
+
15
+ # Set JAVA_HOME environment variable
16
+ os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-11-openjdk-amd64'
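+ # language_tool_python starts a local LanguageTool server that needs a Java runtime;
+ # the path above assumes OpenJDK 11 at the usual Debian/Ubuntu install location.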
17
+
18
+ # ------------------------------
19
+ # Analysis Functions
20
+ # ------------------------------
21
+
22
+ # def extract_pdf_text_by_page(file) -> List[str]:
23
+ # """Extracts text from a PDF file, page by page, using PyMuPDF."""
24
+ # if isinstance(file, str):
25
+ # with fitz.open(file) as doc:
26
+ # return [page.get_text("text") for page in doc]
27
+ # else:
28
+ # with fitz.open(stream=file.read(), filetype="pdf") as doc:
29
+ # return [page.get_text("text") for page in doc]
30
+
31
+ def extract_pdf_text(file) -> str:
32
+ """Extracts full text from a PDF file using PyMuPDF."""
33
+ try:
34
+ doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
35
+ full_text = ""
36
 
37
+ for page_number in range(len(doc)):
38
+ page = doc[page_number]
39
+ page_text = page.get_text("text")
40
+ full_text += page_text
41
+
42
+ print(full_text)
43
+ doc.close()
44
+ print(f"Total extracted text length: {len(full_text)} characters.")
45
+ return full_text
46
 
47
+ except Exception as e:
48
+ print(f"Error extracting text from PDF: {e}")
49
+ return ""
50
+
51
+ def check_text_presence(full_text: str, search_terms: List[str]) -> Dict[str, bool]:
52
+ """Checks for the presence of required terms in the text."""
53
+ return {term: term.lower() in full_text.lower() for term in search_terms}
54
+
55
+ def label_authors(full_text: str) -> str:
56
+ """Label authors in the text with 'Authors:' if not already labeled."""
57
+ author_line_regex = r"^(?:.*\n)(.*?)(?:\n\n)"
58
+ match = re.search(author_line_regex, full_text, re.MULTILINE)
59
+ if match:
60
+ authors = match.group(1).strip()
61
+ return full_text.replace(authors, f"Authors: {authors}")
62
+ return full_text
63
+
64
+ def check_metadata(full_text: str) -> Dict[str, Any]:
65
+ """Check for metadata elements."""
66
+ return {
67
+ "author_email": bool(re.search(r'\b[\w.-]+?@\w+?\.\w+?\b', full_text)),
68
+ "list_of_authors": bool(re.search(r'Authors?:', full_text, re.IGNORECASE)),
69
+ "keywords_list": bool(re.search(r'Keywords?:', full_text, re.IGNORECASE)),
70
+ "word_count": len(full_text.split()) or "Missing"
71
+ }
72
+
73
+ def check_disclosures(full_text: str) -> Dict[str, bool]:
74
+ """Check for disclosure statements."""
75
+ search_terms = [
76
+ "author contributions statement",
77
+ "conflict of interest statement",
78
+ "ethics statement",
79
+ "funding statement",
80
+ "data access statement"
81
+ ]
82
+ return check_text_presence(full_text, search_terms)
83
+
84
+ def check_figures_and_tables(full_text: str) -> Dict[str, bool]:
85
+ """Check for figures and tables."""
86
+ return {
87
+ "figures_with_citations": bool(re.search(r'Figure \d+.*?citation', full_text, re.IGNORECASE)),
88
+ "figures_legends": bool(re.search(r'Figure \d+.*?legend', full_text, re.IGNORECASE)),
89
+ "tables_legends": bool(re.search(r'Table \d+.*?legend', full_text, re.IGNORECASE))
90
+ }
91
+
92
+ def check_references(full_text: str) -> Dict[str, Any]:
93
+ """Check for references."""
94
+ return {
95
+ "old_references": bool(re.search(r'\b19[0-9]{2}\b', full_text)),
96
+ "citations_in_abstract": bool(re.search(r'\b(citation|reference)\b', full_text[:1000], re.IGNORECASE)),
97
+ "reference_count": len(re.findall(r'\[.*?\]', full_text)),
98
+ "self_citations": bool(re.search(r'Self-citation', full_text, re.IGNORECASE))
99
+ }
100
+
101
+ def check_structure(full_text: str) -> Dict[str, bool]:
102
+ """Check document structure."""
103
+ return {
104
+ "imrad_structure": all(section in full_text for section in ["Introduction", "Methods", "Results", "Discussion"]),
105
+ "abstract_structure": "structured abstract" in full_text.lower()
106
+ }
107
+
108
+ def check_language_issues(full_text: str) -> Dict[str, Any]:
109
+ """Check for language issues using LanguageTool and additional regex patterns."""
110
+ try:
111
+ language_tool = language_tool_python.LanguageTool('en-US')
112
+ matches = language_tool.check(full_text)
113
+ issues = []
114
 
115
+ # Process LanguageTool matches
116
+ for match in matches:
117
+ # Ignore issues with rule_id 'EN_SPLIT_WORDS_HYPHEN'
118
+ if match.ruleId == "EN_SPLIT_WORDS_HYPHEN":
119
+ continue
120
 
121
+ issues.append({
122
+ "message": match.message,
123
+ "context": match.context.strip(),
124
+ "suggestions": match.replacements[:3] if match.replacements else [],
125
+ "category": match.category,
126
+ "rule_id": match.ruleId,
127
+ "offset": match.offset,
128
+ "length": match.errorLength,
129
+ "coordinates": [],
130
+ "page": 0
131
+ })
132
+ print(f"Total language issues found: {len(issues)}")
133
 
134
+ # -----------------------------------
135
+ # Additions: Regex-based Issue Detection
136
+ # -----------------------------------
137
 
138
+ # Define regex pattern to find words immediately followed by '[' without space
139
+ regex_pattern = r'\b(\w+)\[(\d+)\]'
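+ # e.g. matches "globally[2]" so it can be flagged and suggested as "globally [2]"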
140
+ regex_matches = list(re.finditer(regex_pattern, full_text))
141
+ print(f"Total regex issues found: {len(regex_matches)}")
142
 
143
+ # Process regex matches
144
+ for match in regex_matches:
145
+ word = match.group(1)
146
+ number = match.group(2)
147
+ start = match.start()
148
+ end = match.end()
149
+ issues.append({
150
+ "message": f"Missing space before '[' in '{word}[{number}]'. Should be '{word} [{number}]'.",
151
+ "context": full_text[max(match.start() - 30, 0):min(match.end() + 30, len(full_text))].strip(),
152
+ "suggestions": [f"{word} [{number}]", f"{word} [`{number}`]", f"{word} [number {number}]"],
153
+ "category": "Formatting",
154
+ "rule_id": "SPACE_BEFORE_BRACKET",
155
+ "offset": match.start(),
156
+ "length": match.end() - match.start(),
157
+ "coordinates": [],
158
+ "page": 0
159
+ })
160
 
161
+ print(f"Total combined issues found: {len(issues)}")
162
 
163
+ return {
164
+ "total_issues": len(issues),
165
+ "issues": issues
166
+ }
167
+ except Exception as e:
168
+ print(f"Error checking language issues: {e}")
169
+ return {"error": str(e)}
170
+
171
+ def check_language(full_text: str) -> Dict[str, Any]:
172
+ """Check language quality."""
173
+ return {
174
+ "plain_language": bool(re.search(r'plain language summary', full_text, re.IGNORECASE)),
175
+ "readability_issues": False, # Placeholder for future implementation
176
+ "language_issues": check_language_issues(full_text)
177
+ }
178
+
179
+ def check_figure_order(full_text: str) -> Dict[str, Any]:
180
+ """Check if figures are referred to in sequential order."""
181
+ figure_pattern = r'(?:Fig(?:ure)?\.?|Figure)\s*(\d+)'
182
+ figure_references = re.findall(figure_pattern, full_text, re.IGNORECASE)
183
+ figure_numbers = sorted(set(int(num) for num in figure_references))
184
 
185
+ is_sequential = all(a + 1 == b for a, b in zip(figure_numbers, figure_numbers[1:]))
186
 
187
+ if figure_numbers:
188
+ expected_figures = set(range(1, max(figure_numbers) + 1))
189
+ missing_figures = list(expected_figures - set(figure_numbers))
190
+ else:
191
+ missing_figures = None
192
 
193
+ duplicates = [num for num, count in Counter(figure_references).items() if count > 1]
194
+ duplicate_numbers = [int(num) for num in duplicates]
195
+ not_mentioned = list(set(figure_references) - set(duplicates))
196
 
197
+ return {
198
+ "sequential_order": is_sequential,
199
+ "figure_count": len(figure_numbers),
200
+ "missing_figures": missing_figures,
201
+ "figure_order": figure_numbers,
202
+ "duplicate_references": duplicates,
203
+ "not_mentioned": not_mentioned
204
+ }
205
+
206
+ def check_reference_order(full_text: str) -> Dict[str, Any]:
207
+ """Check if references in the main body text are in order."""
208
+ reference_pattern = r'\[(\d+)\]'
209
+ references = re.findall(reference_pattern, full_text)
210
+ ref_numbers = [int(ref) for ref in references]
211
 
212
+ max_ref = 0
213
+ out_of_order = []
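+ # A citation is flagged when it jumps past the highest number seen so far plus one,
+ # e.g. [1] followed directly by [4] flags reference 4 at position 2.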
214
+ for i, ref in enumerate(ref_numbers):
215
+ if ref > max_ref + 1:
216
+ out_of_order.append((i+1, ref))
217
+ max_ref = max(max_ref, ref)
218
 
219
+ all_refs = set(range(1, max_ref + 1))
220
+ used_refs = set(ref_numbers)
221
+ missing_refs = list(all_refs - used_refs)
222
 
223
+ return {
224
+ "max_reference": max_ref,
225
+ "out_of_order": out_of_order,
226
+ "missing_references": missing_refs,
227
+ "is_ordered": len(out_of_order) == 0 and len(missing_refs) == 0
228
+ }
229
+
230
+ def highlight_issues_in_pdf(file, language_matches: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], bytes]:
231
+ """
232
+ Highlights language issues in the PDF and returns the updated language matches together with the annotated PDF as bytes.
233
+ This function maps LanguageTool matches to specific words in the PDF
234
+ and highlights those words.
235
+ """
236
+ try:
237
+ # Open the PDF
238
+ doc = fitz.open(stream=file.read(), filetype="pdf") if not isinstance(file, str) else fitz.open(file)
239
+ # print(f"Opened PDF with {len(doc)} pages.")
240
+ # print(language_matches)
241
+ # Extract words with positions from each page
242
+ word_list = [] # List of tuples: (page_number, word, x0, y0, x1, y1)
243
+ for page_number in range(len(doc)):
244
+ page = doc[page_number]
245
+ print(page.get_text("words"))
246
+ words = page.get_text("words") # List of tuples: (x0, y0, x1, y1, "word", block_no, line_no, word_no)
247
+ for w in words:
248
+ # print(w)
249
+ word_text = w[4]
250
+ # **Fix:** Insert a space before '[' to ensure "globally [2]" instead of "globally[2]"
251
+ # if '[' in word_text:
252
+ # word_text = word_text.replace('[', ' [')
253
+ word_list.append((page_number, word_text, w[0], w[1], w[2], w[3]))
254
+ # print(f"Total words extracted: {len(word_list)}")
255
+
256
+ # Concatenate all words to form the full text
257
+ concatenated_text=""
258
+ concatenated_text = " ".join([w[1] for w in word_list])
259
 
260
+ # print(f"Concatenated text length: {concatenated_text} characters.")
261
 
262
+ # Find "Abstract" section and set the processing start point
263
+ abstract_start = concatenated_text.lower().find("abstract")
264
+ abstract_offset = 0 if abstract_start == -1 else abstract_start
265
 
266
+ # Find "References" section and exclude from processing
267
+ references_start = concatenated_text.lower().find("references")
268
+ references_offset = len(concatenated_text) if references_start == -1 else references_start
269
 
270
+ # Iterate over each language issue
271
+ for idx, issue in enumerate(language_matches, start=1):
272
+ offset = issue["offset"] # offset+line_no-1
273
+ length = issue["length"]
274
 
275
+ # Skip issues in the references section
276
+ if offset < abstract_offset or offset >= references_offset:
277
+ continue
278
 
279
 
280
+ error_text = concatenated_text[offset:offset+length]
281
+ print(f"\nIssue {idx}: '{error_text}' at offset {offset} with length {length}")
282
+
283
+ # Find the words that fall within the error span
284
+ current_pos = 0
285
+ target_words = []
286
+ for word in word_list:
287
+ word_text = word[1]
288
+ word_length = len(word_text) + 1 # +1 for the space
289
+
290
+ if current_pos + word_length > offset and current_pos < offset + length:
291
+ target_words.append(word)
292
+ current_pos += word_length
293
+
294
+ if not target_words:
295
+ # print("No matching words found for this issue.")
296
+ continue
297
+
298
+ initial_x = target_words[0][2]
299
+ initial_y = target_words[0][3]
300
+ final_x = target_words[len(target_words)-1][4]
301
+ final_y = target_words[len(target_words)-1][5]
302
+ issue["coordinates"] = [initial_x, initial_y, final_x, final_y]
303
+ issue["page"] = target_words[0][0] + 1
304
+ # Add highlight annotations to the target words
305
+ print()
306
+ print("issue", issue)
307
+ print("error text", error_text)
308
+ print(target_words)
309
+ print()
310
+ for target in target_words:
311
+ page_num, word_text, x0, y0, x1, y1 = target
312
+ page = doc[page_num]
313
+ # Define a rectangle around the word with some padding
314
+ rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
315
+ # Add a highlight annotation
316
+ highlight = page.add_highlight_annot(rect)
317
+ highlight.set_colors(stroke=(1, 1, 0)) # Yellow color
318
+ highlight.update()
319
+ # print(f"Highlighted '{word_text}' on page {page_num + 1} at position ({x0}, {y0}, {x1}, {y1})")
320
 
321
 
322
+ # Save annotated PDF to bytes
323
+ byte_stream = io.BytesIO()
324
+ doc.save(byte_stream)
325
+ annotated_pdf_bytes = byte_stream.getvalue()
326
+ doc.close()
327
 
328
+ # Save annotated PDF locally for verification
329
+ with open("annotated_temp.pdf", "wb") as f:
330
+ f.write(annotated_pdf_bytes)
331
+ # print("Annotated PDF saved as 'annotated_temp.pdf' for manual verification.")
332
 
333
+ return language_matches, annotated_pdf_bytes
334
+ except Exception as e:
335
+ print(f"Error in highlighting PDF: {e}")
336
+ return language_matches, b""
337
 
338
 
339
 
340
 
341
+ # ------------------------------
342
+ # Main Analysis Function
343
+ # ------------------------------
344
 
345
+ # server/gradio_client.py
346
 
347
+ def analyze_pdf(filepath: str) -> Tuple[Dict[str, Any], bytes]:
348
+ """Analyzes the PDF for language issues and returns results and annotated PDF."""
349
+ try:
350
+ full_text = extract_pdf_text(filepath)
351
+ if not full_text:
352
+ return {"error": "Failed to extract text from PDF."}, None
353
 
354
+ # Create the results structure
355
+ results = {
356
+ "issues": [], # Initialize as empty array
357
+ "regex_checks": {
358
+ "metadata": check_metadata(full_text),
359
+ "disclosures": check_disclosures(full_text),
360
+ "figures_and_tables": check_figures_and_tables(full_text),
361
+ "references": check_references(full_text),
362
+ "structure": check_structure(full_text),
363
+ "figure_order": check_figure_order(full_text),
364
+ "reference_order": check_reference_order(full_text)
365
+ }
366
+ }
367
+
368
+ # Handle language issues
369
+ language_issues = check_language_issues(full_text)
370
+ if "error" in language_issues:
371
+ return {"error": language_issues["error"]}, None
372
+
373
+ issues = language_issues.get("issues", [])
374
+ if issues:
375
+ language_matches, annotated_pdf = highlight_issues_in_pdf(filepath, issues)
376
+ results["issues"] = language_matches # This is already an array from check_language_issues
377
+ return results, annotated_pdf
378
+ else:
379
+ # Keep issues as empty array if none found
380
+ return results, None
381
+
382
+ except Exception as e:
383
+ return {"error": str(e)}, None
384
+ # ------------------------------
385
+ # Gradio Interface
386
+ # ------------------------------
387
+
388
+ def process_upload(file):
389
+ """
390
+ Process the uploaded PDF file and return analysis results and annotated PDF.
391
+ """
392
+ # print(file.name)
393
+ if file is None:
394
+ return json.dumps({"error": "No file uploaded"}, indent=2), None
395
+
396
+ # # Create a temporary file to work with
397
 
398
+ # with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_input:
399
+ # temp_input.write(file)
400
+ # temp_input_path = temp_input.name
401
+ # print(temp_input_path)
402
 
403
+ temp_input = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
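+ # gr.File(type="binary") delivers the upload as raw bytes, so they are written to a temp file for analyze_pdf.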
404
+ temp_input.write(file)
+ temp_input.close()
405
+ temp_input_path = temp_input.name
406
+ print(temp_input_path)
407
+ # Analyze the PDF
408
 
409
+ results, annotated_pdf = analyze_pdf(temp_input_path)
410
 
411
+ print(results)
412
+ results_json = json.dumps(results, indent=2)
413
 
414
+ # Clean up the temporary input file
415
+ os.unlink(temp_input_path)
416
 
417
+ # If we have an annotated PDF, save it temporarily
418
+ if annotated_pdf:
419
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
420
+ tmp_file.write(annotated_pdf)
421
+ return results_json, tmp_file.name
422
 
423
+ return results_json, None
424
 
425
+ # except Exception as e:
426
+ # error_message = json.dumps({
427
+ # "error": str(e),
428
+ # "traceback": traceback.format_exc()
429
+ # }, indent=2)
430
+ # return error_message, None
431
 
432
 
433
+ def create_interface():
434
+ with gr.Blocks(title="PDF Analyzer") as interface:
435
+ gr.Markdown("# PDF Analyzer")
436
+ gr.Markdown("Upload a PDF document to analyze its structure, references, language, and more.")
437
 
438
+ with gr.Row():
439
+ file_input = gr.File(
440
+ label="Upload PDF",
441
+ file_types=[".pdf"],
442
+ type="binary"
443
+ )
444
 
445
+ with gr.Row():
446
+ analyze_btn = gr.Button("Analyze PDF")
447
 
448
+ with gr.Row():
449
+ results_output = gr.JSON(
450
+ label="Analysis Results",
451
+ show_label=True
452
+ )
453
 
454
+ with gr.Row():
455
+ pdf_output = gr.File(
456
+ label="Annotated PDF",
457
+ show_label=True
458
+ )
459
 
460
+ analyze_btn.click(
461
+ fn=process_upload,
462
+ inputs=[file_input],
463
+ outputs=[results_output, pdf_output]
464
+ )
465
 
466
+ return interface
467
 
468
+ if __name__ == "__main__":
469
+ interface = create_interface()
470
+ interface.launch(
471
+ share=False, # Set to False in production
472
+ # server_name="0.0.0.0",
473
+ server_port=None
474
+ )
 