upload
Browse files- pdf_parser.py +18 -18
- requirements.txt +2 -1
pdf_parser.py
CHANGED
@@ -4,9 +4,9 @@ from scipdf_utils import parse_pdf_to_dict
|
|
4 |
|
5 |
|
6 |
class GrobidSciPDFPaser(AbstractPDFParser):
|
7 |
-
import pysbd
|
8 |
-
seg_en = pysbd.Segmenter(language="en", clean=False)
|
9 |
-
seg_chinese = pysbd.Segmenter(language="zh", clean=False)
|
10 |
|
11 |
def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
|
12 |
"""Initialize the PDF parser
|
@@ -131,18 +131,18 @@ class GrobidSciPDFPaser(AbstractPDFParser):
|
|
131 |
})
|
132 |
return section_pair_list
|
133 |
|
134 |
-
@staticmethod
|
135 |
-
def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
|
|
4 |
|
5 |
|
6 |
class GrobidSciPDFPaser(AbstractPDFParser):
|
7 |
+
# import pysbd
|
8 |
+
# seg_en = pysbd.Segmenter(language="en", clean=False)
|
9 |
+
# seg_chinese = pysbd.Segmenter(language="zh", clean=False)
|
10 |
|
11 |
def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
|
12 |
"""Initialize the PDF parser
|
|
|
131 |
})
|
132 |
return section_pair_list
|
133 |
|
134 |
+
# @staticmethod
|
135 |
+
# def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
|
136 |
+
# """
|
137 |
+
# split based on some magic rules
|
138 |
+
# """
|
139 |
+
# import pysbd
|
140 |
+
# for section_pair in section_pair_list:
|
141 |
+
# if GrobidSciPDFPaser._check_chinese(section_pair["text"]):
|
142 |
+
# seg = GrobidSciPDFPaser.seg_chinese
|
143 |
+
# else:
|
144 |
+
# seg = GrobidSciPDFPaser.seg_en
|
145 |
+
# section_pair["texts"] = seg.segment(section_pair["texts"])
|
146 |
+
# section_pair["texts"] = [
|
147 |
+
# para for para in section_pair["text"] if len(para) > 2]
|
148 |
+
# return section_pair_list
|
requirements.txt
CHANGED
@@ -6,4 +6,5 @@ sentence_transformers
|
|
6 |
bs4
|
7 |
openai
|
8 |
matplotlib
|
9 |
-
plotly
|
|
|
|
6 |
bs4
|
7 |
openai
|
8 |
matplotlib
|
9 |
+
plotly
|
10 |
+
pysbd
|