Upload 6 files
Browse files- Base-RCNN-FPN.yml +69 -0
- bibtest.ipynb +195 -0
- cascade_dit_base.yml +20 -0
- imagesummary_fun.py +56 -0
- pdfextract_fun.py +192 -0
- pdfsummary_fun.py +61 -0
Base-RCNN-FPN.yml
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
MASK_ON: True
|
3 |
+
META_ARCHITECTURE: "GeneralizedRCNN"
|
4 |
+
PIXEL_MEAN: [123.675, 116.280, 103.530]
|
5 |
+
PIXEL_STD: [58.395, 57.120, 57.375]
|
6 |
+
BACKBONE:
|
7 |
+
NAME: "build_vit_fpn_backbone"
|
8 |
+
VIT:
|
9 |
+
OUT_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
|
10 |
+
DROP_PATH: 0.1
|
11 |
+
IMG_SIZE: [224,224]
|
12 |
+
POS_TYPE: "abs"
|
13 |
+
FPN:
|
14 |
+
IN_FEATURES: ["layer3", "layer5", "layer7", "layer11"]
|
15 |
+
ANCHOR_GENERATOR:
|
16 |
+
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
|
17 |
+
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
|
18 |
+
RPN:
|
19 |
+
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
|
20 |
+
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
|
21 |
+
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
|
22 |
+
# Detectron1 uses 2000 proposals per-batch,
|
23 |
+
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
|
24 |
+
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
|
25 |
+
POST_NMS_TOPK_TRAIN: 1000
|
26 |
+
POST_NMS_TOPK_TEST: 1000
|
27 |
+
ROI_HEADS:
|
28 |
+
NAME: "StandardROIHeads"
|
29 |
+
IN_FEATURES: ["p2", "p3", "p4", "p5"]
|
30 |
+
NUM_CLASSES: 5
|
31 |
+
ROI_BOX_HEAD:
|
32 |
+
NAME: "FastRCNNConvFCHead"
|
33 |
+
NUM_FC: 2
|
34 |
+
POOLER_RESOLUTION: 7
|
35 |
+
ROI_MASK_HEAD:
|
36 |
+
NAME: "MaskRCNNConvUpsampleHead"
|
37 |
+
NUM_CONV: 4
|
38 |
+
POOLER_RESOLUTION: 14
|
39 |
+
DATASETS:
|
40 |
+
TRAIN: ("publaynet_train",)
|
41 |
+
TEST: ("publaynet_val",)
|
42 |
+
SOLVER:
|
43 |
+
LR_SCHEDULER_NAME: "WarmupCosineLR"
|
44 |
+
AMP:
|
45 |
+
ENABLED: True
|
46 |
+
OPTIMIZER: "ADAMW"
|
47 |
+
BACKBONE_MULTIPLIER: 1.0
|
48 |
+
CLIP_GRADIENTS:
|
49 |
+
ENABLED: True
|
50 |
+
CLIP_TYPE: "full_model"
|
51 |
+
CLIP_VALUE: 1.0
|
52 |
+
NORM_TYPE: 2.0
|
53 |
+
WARMUP_FACTOR: 0.01
|
54 |
+
BASE_LR: 0.0004
|
55 |
+
WEIGHT_DECAY: 0.05
|
56 |
+
IMS_PER_BATCH: 32
|
57 |
+
INPUT:
|
58 |
+
CROP:
|
59 |
+
ENABLED: True
|
60 |
+
TYPE: "absolute_range"
|
61 |
+
SIZE: (384, 600)
|
62 |
+
MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
|
63 |
+
FORMAT: "RGB"
|
64 |
+
DATALOADER:
|
65 |
+
FILTER_EMPTY_ANNOTATIONS: False
|
66 |
+
VERSION: 2
|
67 |
+
AUG:
|
68 |
+
DETR: True
|
69 |
+
SEED: 42
|
bibtest.ipynb
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pdf2bib"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 2,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"def extract_metadata(file_path):\n",
|
19 |
+
" pdfextractdata = pdf2bib.pdf2bib(file_path)\n",
|
20 |
+
" #st.write(pdfextractdata)\n",
|
21 |
+
" pdfextractdata_metadata = {} if pdfextractdata.get('metadata', {}) is None else pdfextractdata.get('metadata', {})\n",
|
22 |
+
"\n",
|
23 |
+
" return pdfextractdata_metadata"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"cell_type": "code",
|
28 |
+
"execution_count": 43,
|
29 |
+
"metadata": {},
|
30 |
+
"outputs": [
|
31 |
+
{
|
32 |
+
"name": "stderr",
|
33 |
+
"output_type": "stream",
|
34 |
+
"text": [
|
35 |
+
"[pdf2bib]: Trying to extract data to generate the BibTeX entry for the file: boiteau-et-al-2024-relating-molecular-properties-to-the-persistence-of-marine-dissolved-organic-matter-with-liquid.pdf\n",
|
36 |
+
"[pdf2bib]: Calling pdf2doi...\n",
|
37 |
+
"[pdf2doi]: Trying to retrieve a DOI/identifier for the file: boiteau-et-al-2024-relating-molecular-properties-to-the-persistence-of-marine-dissolved-organic-matter-with-liquid.pdf\n",
|
38 |
+
"[pdf2doi]: Method #1: Looking for a valid identifier in the document infos...\n",
|
39 |
+
"[pdf2doi]: Validating the possible DOI 10.1021/acs.est.3c08245 via a query to dx.doi.org...\n",
|
40 |
+
"[pdf2doi]: The DOI 10.1021/acs.est.3c08245 is validated by dx.doi.org.\n",
|
41 |
+
"[pdf2doi]: A valid DOI was found in the document info labelled '/prism:doi'.\n",
|
42 |
+
"[pdf2bib]: pdf2doi found a valid identifier for this paper.\n",
|
43 |
+
"[pdf2bib]: Parsing the info returned by dx.doi.org...\n",
|
44 |
+
"[pdf2bib]: A valid BibTeX entry was generated.\n"
|
45 |
+
]
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"source": [
|
49 |
+
"a = pdf2bib.pdf2bib(\"boiteau-et-al-2024-relating-molecular-properties-to-the-persistence-of-marine-dissolved-organic-matter-with-liquid.pdf\")"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 47,
|
55 |
+
"metadata": {},
|
56 |
+
"outputs": [
|
57 |
+
{
|
58 |
+
"data": {
|
59 |
+
"text/plain": [
|
60 |
+
"'@article{boiteau2024relating,\\n\\ttitle = {Relating Molecular Properties to the Persistence of Marine Dissolved Organic Matter with Liquid Chromatography–Ultrahigh-Resolution Mass Spectrometry},\\n\\tpublisher = {American Chemical Society (ACS)},\\n\\turl = {http://dx.doi.org/10.1021/acs.est.3c08245},\\n\\tdoi = {10.1021/acs.est.3c08245},\\n\\tjournal = {Environmental Science & Technology},\\n\\tyear = {2024},\\n\\tmonth = {2},\\n\\tauthor = {Rene M. Boiteau and Yuri E. Corilo and William R. Kew and Christian Dewey and Maria Cristina Alvarez Rodriguez and Craig A. Carlson and Tim M. Conway}\\n}'"
|
61 |
+
]
|
62 |
+
},
|
63 |
+
"execution_count": 47,
|
64 |
+
"metadata": {},
|
65 |
+
"output_type": "execute_result"
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"source": [
|
69 |
+
"a.get(\"bibtex\")"
|
70 |
+
]
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"cell_type": "code",
|
74 |
+
"execution_count": null,
|
75 |
+
"metadata": {},
|
76 |
+
"outputs": [],
|
77 |
+
"source": []
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"cell_type": "code",
|
81 |
+
"execution_count": 12,
|
82 |
+
"metadata": {},
|
83 |
+
"outputs": [],
|
84 |
+
"source": [
|
85 |
+
"import bibtexparser"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": 9,
|
91 |
+
"metadata": {},
|
92 |
+
"outputs": [],
|
93 |
+
"source": [
|
94 |
+
"parser = bibtex.Parser()"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": 45,
|
100 |
+
"metadata": {},
|
101 |
+
"outputs": [],
|
102 |
+
"source": [
|
103 |
+
"parser = bibtexparser.bparser.BibTexParser(common_strings=True)\n",
|
104 |
+
"bib_database = bibtexparser.loads(a.get(\"bibtex\"), parser=parser)"
|
105 |
+
]
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"cell_type": "code",
|
109 |
+
"execution_count": 41,
|
110 |
+
"metadata": {},
|
111 |
+
"outputs": [],
|
112 |
+
"source": [
|
113 |
+
"def format_author_names(authors_str):\n",
|
114 |
+
" authors = authors_str.split(' and ')\n",
|
115 |
+
" formatted_authors = []\n",
|
116 |
+
" for author in authors:\n",
|
117 |
+
" parts = author.split()\n",
|
118 |
+
" if len(parts) == 2: # Simple case: First Last\n",
|
119 |
+
" last, first = parts[1], parts[0]\n",
|
120 |
+
" formatted_authors.append(f\"{last}, {first[0]}.\")\n",
|
121 |
+
" elif len(parts) > 2: # Handling middle names or initials\n",
|
122 |
+
" last = parts[-1]\n",
|
123 |
+
" initials = ''.join(f\"{part[0]}.\" for part in parts[:-1])\n",
|
124 |
+
" formatted_authors.append(f\"{last}, {initials}\")\n",
|
125 |
+
" if len(formatted_authors) > 1:\n",
|
126 |
+
" formatted_authors_str = ', '.join(formatted_authors[:-1]) + ', & ' + formatted_authors[-1]\n",
|
127 |
+
" else:\n",
|
128 |
+
" formatted_authors_str = formatted_authors[0]\n",
|
129 |
+
" return formatted_authors_str"
|
130 |
+
]
|
131 |
+
},
|
132 |
+
{
|
133 |
+
"cell_type": "code",
|
134 |
+
"execution_count": 36,
|
135 |
+
"metadata": {},
|
136 |
+
"outputs": [],
|
137 |
+
"source": [
|
138 |
+
"def format_apa(entry):\n",
|
139 |
+
" author = format_author_names(entry.get('author', ''))\n",
|
140 |
+
" year = entry.get('year', '')\n",
|
141 |
+
" title = entry.get('title', '')\n",
|
142 |
+
" journal = entry.get('journal', '')\n",
|
143 |
+
" volume = entry.get('volume', '')\n",
|
144 |
+
" issue = entry.get('issue', '')\n",
|
145 |
+
" pages = entry.get('page', '').replace('-', '–') # En dash for page range\n",
|
146 |
+
" doi = entry.get('doi', '')\n",
|
147 |
+
" \n",
|
148 |
+
" # Constructing the citation\n",
|
149 |
+
" apa_citation = f\"{author} {title}. {journal} {volume}, {pages} ({year}). https://doi.org/{doi}\"\n",
|
150 |
+
" return apa_citation"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "code",
|
155 |
+
"execution_count": 46,
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [
|
158 |
+
{
|
159 |
+
"data": {
|
160 |
+
"text/plain": [
|
161 |
+
"'Boiteau, R.M., Corilo, Y.E., Kew, W.R., Dewey, C., Rodriguez, M.C.A., Carlson, C.A., & Conway, T.M. Relating Molecular Properties to the Persistence of Marine Dissolved Organic Matter with Liquid Chromatography–Ultrahigh-Resolution Mass Spectrometry. Environmental Science & Technology , (2024). https://doi.org/10.1021/acs.est.3c08245'"
|
162 |
+
]
|
163 |
+
},
|
164 |
+
"execution_count": 46,
|
165 |
+
"metadata": {},
|
166 |
+
"output_type": "execute_result"
|
167 |
+
}
|
168 |
+
],
|
169 |
+
"source": [
|
170 |
+
"format_apa(bib_database.entries[0])"
|
171 |
+
]
|
172 |
+
}
|
173 |
+
],
|
174 |
+
"metadata": {
|
175 |
+
"kernelspec": {
|
176 |
+
"display_name": "Python 3",
|
177 |
+
"language": "python",
|
178 |
+
"name": "python3"
|
179 |
+
},
|
180 |
+
"language_info": {
|
181 |
+
"codemirror_mode": {
|
182 |
+
"name": "ipython",
|
183 |
+
"version": 3
|
184 |
+
},
|
185 |
+
"file_extension": ".py",
|
186 |
+
"mimetype": "text/x-python",
|
187 |
+
"name": "python",
|
188 |
+
"nbconvert_exporter": "python",
|
189 |
+
"pygments_lexer": "ipython3",
|
190 |
+
"version": "3.10.2"
|
191 |
+
}
|
192 |
+
},
|
193 |
+
"nbformat": 4,
|
194 |
+
"nbformat_minor": 2
|
195 |
+
}
|
cascade_dit_base.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_BASE_: "Base-RCNN-FPN.yml"
|
2 |
+
MODEL:
|
3 |
+
PIXEL_MEAN: [ 127.5, 127.5, 127.5 ]
|
4 |
+
PIXEL_STD: [ 127.5, 127.5, 127.5 ]
|
5 |
+
WEIGHTS: "https://layoutlm.blob.core.windows.net/dit/dit-pts/dit-base-224-p16-500k-62d53a.pth"
|
6 |
+
VIT:
|
7 |
+
NAME: "dit_base_patch16"
|
8 |
+
ROI_HEADS:
|
9 |
+
NAME: CascadeROIHeads
|
10 |
+
ROI_BOX_HEAD:
|
11 |
+
CLS_AGNOSTIC_BBOX_REG: True
|
12 |
+
RPN:
|
13 |
+
POST_NMS_TOPK_TRAIN: 2000
|
14 |
+
SOLVER:
|
15 |
+
WARMUP_ITERS: 1000
|
16 |
+
IMS_PER_BATCH: 16
|
17 |
+
MAX_ITER: 60000
|
18 |
+
CHECKPOINT_PERIOD: 2000
|
19 |
+
TEST:
|
20 |
+
EVAL_PERIOD: 2000
|
imagesummary_fun.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import requests
|
3 |
+
import os
|
4 |
+
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
load_dotenv() # This loads the variables from .env
|
7 |
+
openai_api_key = os.getenv('openai_api_key')
|
8 |
+
#openai_api_key = os.getenv('openai_api_key')
|
9 |
+
# Helper that base64-encodes an image file so it can be embedded in the API request
|
10 |
+
def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the encoded bytes
    are decoded as UTF-8 so that a ``str`` is returned.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
# Function to get summary from OpenAI GPT-4 Vision API
|
17 |
+
def get_image_summary(image_path):
    """Ask the OpenAI chat-completions endpoint to explain a figure or table.

    The image at ``image_path`` is base64-encoded, embedded in the request as
    a ``data:image/jpeg`` URL, and sent together with a fixed text
    instruction. The module-level ``openai_api_key`` is used as the bearer
    token.

    Returns:
        The ``content`` of the first choice's message when the HTTP status is
        200; otherwise the fixed string
        ``"Failed to get summary. Please try again."``.

    Raises:
        requests.exceptions.RequestException: on network failure, including
            ``Timeout`` when the server does not answer within the limits
            set below.
    """
    # Encode the selected image
    base64_image = encode_image_to_base64(image_path)

    # OpenAI API URL; the key is read from the environment at import time and
    # must never be hard-coded in this file.
    api_url = "https://api.openai.com/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }

    payload = {
        "model": "gpt-4-vision-preview",  # Update this if the model name changes
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    },
                    {
                        "type": "text",
                        "text": "You have provide an explanation for this figure or table. Consider elements like panels, axis, data and labels and etc.",
                    },
                ],
            }
        ],
        "max_tokens": 1000,
    }

    # (connect, read) timeouts in seconds: without them requests waits
    # forever on a stalled connection.
    response = requests.post(
        api_url, headers=headers, json=payload, timeout=(10, 120)
    )
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"]
    return "Failed to get summary. Please try again."
|
pdfextract_fun.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
3 |
+
import time
|
4 |
+
# Filter warnings about inputs not requiring gradients
|
5 |
+
warnings.filterwarnings("ignore", message="None of the inputs have requires_grad=True. Gradients will be None")
|
6 |
+
warnings.filterwarnings("ignore", message="torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument.")
|
7 |
+
|
8 |
+
import cv2
|
9 |
+
import os
|
10 |
+
import fitz # PyMuPDF
|
11 |
+
import numpy as np
|
12 |
+
import re
|
13 |
+
import pytesseract
|
14 |
+
from PIL import Image
|
15 |
+
from tqdm import tqdm
|
16 |
+
|
17 |
+
from unilm.dit.object_detection.ditod import add_vit_config
|
18 |
+
|
19 |
+
from detectron2.config import CfgNode as CN
|
20 |
+
from detectron2.config import get_cfg
|
21 |
+
from detectron2.utils.visualizer import ColorMode, Visualizer
|
22 |
+
from detectron2.data import MetadataCatalog
|
23 |
+
from detectron2.engine import DefaultPredictor
|
24 |
+
|
25 |
+
|
26 |
+
# Step 1: instantiate config
cfg = get_cfg()
add_vit_config(cfg)
cfg.merge_from_file("cascade_dit_base.yml")

# Step 2: point the config at the local checkpoint file
# (this replaces the WEIGHTS value merged from the YAML file above)
cfg.MODEL.WEIGHTS = "publaynet_dit-b_cascade.pth"

# Step 3: set device -- use the GPU when one is available, otherwise fall
# back to the CPU instead of failing on CUDA-less hosts
import torch

cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Step 4: define model
predictor = DefaultPredictor(cfg)
|
40 |
+
|
41 |
+
def analyze_image(img):
    """Run the module-level ``predictor`` on ``img`` and draw its detections.

    The metadata entry for the first test dataset in ``cfg`` gets its
    ``thing_classes`` set to ``["table"]`` for ``icdar2019_test`` and to the
    five layout classes otherwise.

    Returns:
        A tuple ``(result_image, output, v)``: the rendered image array, the
        predictor's ``"instances"`` output, and the ``Visualizer`` used.
    """
    md = MetadataCatalog.get(cfg.DATASETS.TEST[0])
    is_table_only = cfg.DATASETS.TEST[0] == 'icdar2019_test'
    labels = ["table"] if is_table_only else ["text","title","list","table","figure"]
    md.set(thing_classes=labels)

    output = predictor(img)["instances"]

    # The last axis is reversed on the way in and again on the way out
    # (presumably a BGR <-> RGB swap -- confirm against the caller).
    v = Visualizer(
        img[:, :, ::-1],
        md,
        scale=1.0,
        instance_mode=ColorMode.SEGMENTATION,
    )
    drawn = v.draw_instance_predictions(output.to("cpu"))
    result_image = drawn.get_image()[:, :, ::-1]
    return result_image, output, v
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
def convert_pdf_to_jpg(pdf_path, output_folder, zoom_factor=2):
    """Render every page of a PDF to ``page_<n>.jpg`` inside ``output_folder``.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_folder: Destination folder; created if it does not exist.
        zoom_factor: Scale applied to both axes when rendering, for higher
            resolution output.
    """
    os.makedirs(output_folder, exist_ok=True)

    # The zoom matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(zoom_factor, zoom_factor)

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(matrix=mat)  # Render the page using the matrix
            pix.save(f"{output_folder}/page_{page_num}.jpg")
    finally:
        # Release the file handle even if rendering or saving fails.
        doc.close()
|
72 |
+
|
73 |
+
|
74 |
+
|
75 |
+
def process_jpeg_images(output_folder):
    """Run layout analysis on every ``page_<n>.jpg`` in ``output_folder``.

    Pages are processed in ascending page-number order. For each readable
    page, the detected instances are passed to ``save_extracted_instances``,
    which writes its crops back into ``output_folder``.

    Only files named exactly ``page_<n>.jpg`` are enumerated, so crops,
    sub-folders, or other files already in the folder no longer inflate the
    page count.
    """
    pages = []
    for name in os.listdir(output_folder):
        found = re.fullmatch(r"page_(\d+)\.jpg", name)
        if found:
            pages.append((int(found.group(1)), name))
    pages.sort()

    for page_num, name in tqdm(pages, desc="Processing the pdf"):
        file_path = f"{output_folder}/{name}"
        img = cv2.imread(file_path)
        if img is None:
            print(f"Failed to read {file_path}. Skipping.")
            continue

        # Only the raw predictions are needed here; the rendered overlay and
        # the visualizer returned alongside them are discarded.
        _, output, _ = analyze_image(img)

        # Saving logic
        save_extracted_instances(img, output, page_num, output_folder)
|
86 |
+
|
87 |
+
|
88 |
+
|
89 |
+
def save_extracted_instances(img, output, page_num, dest_folder, confidence_threshold=0.8):
    """Crop confident text/table/figure detections out of ``img`` and save them.

    Args:
        img: Page image array, indexed as ``img[y, x]``.
        output: Predicted instances exposing ``pred_boxes``, ``pred_classes``
            and ``scores`` after ``.to("cpu")``.
        page_num: Page number used in the output file names.
        dest_folder: Folder receiving
            ``page_<page_num>_<class_name>_<counter>.jpg`` files.
        confidence_threshold: Minimum score a detection needs to be saved.
    """
    class_names = {
        0: "text",
        1: "title",
        2: "list",
        3: "table",
        4: "figure",
    }
    # Only these classes are written out; titles and lists are skipped.
    saved_classes = ("figure", "table", "text")

    threshold_value = 0  # Standard deviation threshold
    min_height = 0  # Minimum height threshold

    instances = output.to("cpu")
    boxes = instances.pred_boxes.tensor.numpy()
    class_ids = instances.pred_classes.tolist()
    scores = instances.scores.tolist()  # Get prediction scores

    img_height, img_width = img.shape[:2]

    image_counter = 1
    for box, class_id, score in zip(boxes, class_ids, scores):
        # Check if the prediction score meets the confidence threshold
        if score < confidence_threshold:
            continue

        class_name = class_names.get(class_id, "unknown")
        if class_name not in saved_classes:
            continue

        x1, y1, x2, y2 = map(int, box)
        # Clamp to the image: a negative start index would otherwise wrap
        # around in NumPy slicing and select the wrong region.
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, img_width), min(y2, img_height)
        cropped_image = img[y1:y2, x1:x2]

        # Skip degenerate boxes before computing statistics on them.
        if cropped_image.size == 0:
            continue

        if np.std(cropped_image) > threshold_value and (y2 - y1) > min_height:
            save_path = os.path.join(dest_folder, f"page_{page_num}_{class_name}_{image_counter}.jpg")
            cv2.imwrite(save_path, cropped_image)
            image_counter += 1
|
121 |
+
|
122 |
+
|
123 |
+
def delete_files_in_folder(folder_path):
    """Remove every regular file located directly inside ``folder_path``.

    Entries that are not files (for example sub-directories) are left alone.
    """
    candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
    for candidate in candidates:
        if not os.path.isfile(candidate):
            continue
        os.remove(candidate)
|
128 |
+
|
129 |
+
|
130 |
+
|
131 |
+
def rename_files_sequentially(folder_path):
    """Rename ``page_<p>_<class>_<n>.jpg`` files to ``<class>_<k>.jpg``.

    Matching files are ordered by page number, then lower-cased class name,
    then per-page counter. Each class gets its own running counter ``k``
    starting at 1, so the new numbering follows that order within the class.

    Only file names that match the pattern in full are touched; names such as
    ``page_1_text_1.jpg.bak`` are left as they are.
    """
    # The dot is escaped and the whole name must match, so look-alike names
    # are not renamed by accident.
    pattern = re.compile(r'page_(\d+)_(\w+)_(\d+)\.jpg', re.IGNORECASE)

    # Parse each name once and keep the pieces needed for ordering.
    parsed = []
    for filename in os.listdir(folder_path):
        match = pattern.fullmatch(filename)
        if match:
            page_num, class_name, counter = match.groups()
            parsed.append((int(page_num), class_name.lower(), int(counter), filename))
    parsed.sort(key=lambda item: item[:3])

    counters = {}
    for _, class_name, _, filename in parsed:
        sequence = counters.get(class_name, 0) + 1
        counters[class_name] = sequence

        # New filename format: '{class_name}_{sequential_number}.jpg'
        new_filename = f"{class_name}_{sequence}.jpg"
        os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, new_filename))
|
163 |
+
|
164 |
+
#print(f"Renamed '{filename}' to '{new_filename}'")
|
165 |
+
|
166 |
+
|
167 |
+
def ocr_folder(folder_path):
    """OCR every ``text_<n>.jpg`` in ``folder_path`` into ``ocr_results/``.

    For each matching image the text returned by ``ocr_image`` is written to
    ``<folder_path>/ocr_results/<image stem>.txt``. The sub-folder is created
    when missing.
    """
    # Regex pattern to match 'text_{number}.jpg' (case-insensitive). The whole
    # name must match, so e.g. 'text_1.jpg.bak' is ignored.
    pattern = re.compile(r'text_\d+\.jpg', re.IGNORECASE)

    # Create a subfolder for the OCR text files
    ocr_text_folder = os.path.join(folder_path, "ocr_results")
    os.makedirs(ocr_text_folder, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not pattern.fullmatch(filename):
            continue

        text = ocr_image(os.path.join(folder_path, filename))

        # Swap the extension via splitext so that upper-case '.JPG' names,
        # which the pattern accepts, also end up as '.txt'.
        stem, _ = os.path.splitext(filename)
        text_file_path = os.path.join(ocr_text_folder, stem + '.txt')
        with open(text_file_path, 'w') as file:
            file.write(text)
|
186 |
+
|
187 |
+
#print(f"OCR result for {filename} saved to {text_file_path}\n")
|
188 |
+
|
189 |
+
def ocr_image(image_path):
    """Return the text pytesseract recognises in the image at ``image_path``."""
    # The context manager closes the underlying file once OCR is done,
    # instead of leaving one open handle per processed image.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
|
pdfsummary_fun.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders import DirectoryLoader, TextLoader
|
2 |
+
from langchain_openai import OpenAIEmbeddings
|
3 |
+
from sklearn.cluster import KMeans
|
4 |
+
import numpy as np
|
5 |
+
from sklearn.decomposition import PCA
|
6 |
+
from langchain_core.output_parsers import StrOutputParser
|
7 |
+
from langchain_core.prompts import ChatPromptTemplate
|
8 |
+
from langchain_openai import ChatOpenAI
|
9 |
+
import os
|
10 |
+
|
11 |
+
|
12 |
+
from dotenv import load_dotenv
|
13 |
+
load_dotenv() # This loads the variables from .env
|
14 |
+
openai_api_key = os.getenv('openai_api_key')
|
15 |
+
|
16 |
+
def pdf_summary(ocr_results_folder):
    """Summarise the OCR text files found under ``ocr_results_folder``.

    All ``*.txt`` files are loaded and embedded, then clustered with KMeans.
    For each cluster the text closest to the centroid is kept, and the kept
    texts are joined and sent to the chat model with a fixed summarisation
    prompt.

    Args:
        ocr_results_folder: Folder searched recursively for ``*.txt`` files.

    Returns:
        The model's answer as parsed by ``StrOutputParser``.

    Raises:
        ValueError: if no text documents are found.
    """
    loader = DirectoryLoader(ocr_results_folder, glob="**/*.txt", loader_cls=TextLoader)

    docs = loader.load()
    page_contents = [doc.page_content for doc in docs]
    if not page_contents:
        raise ValueError(
            f"No .txt documents found under {ocr_results_folder!r}; nothing to summarize."
        )

    embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=openai_api_key)
    embeddings = embeddings_model.embed_documents(page_contents)

    X = np.array(embeddings)
    # KMeans cannot fit more clusters than samples, so cap the usual 20 at
    # the number of available texts.
    num_clusters = min(20, len(page_contents))
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)
    centroids = kmeans.cluster_centers_

    # One representative text per cluster: the point nearest each centroid.
    closest_point_indices = find_closest_point_indices(X, centroids, 1)
    extracted_contents = [page_contents[index[0]] for index in closest_point_indices]

    prompt = ChatPromptTemplate.from_template("Summarize the article based on the texts provided from four aspects: Goal, Method, Results, and Conclusion: {topic}")
    model = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=openai_api_key)
    output_parser = StrOutputParser()

    chain = prompt | model | output_parser

    results = chain.invoke({"topic": ' '.join(extracted_contents)})

    return results
|
48 |
+
|
49 |
+
def find_closest_point_indices(X, centroids, num_points=1):
    """Return, per centroid, the indices of its ``num_points`` nearest rows of ``X``.

    Distance is the Euclidean norm of ``X - centroid`` taken along axis 1.
    The result is a list with one index array per centroid, in centroid
    order, each sorted from nearest to farthest.
    """
    def _nearest(center):
        # Rank all rows by distance to this centroid and keep the first few.
        ranking = np.argsort(np.linalg.norm(X - center, axis=1))
        return ranking[:num_points]

    return [_nearest(center) for center in centroids]
|