yiyixin committed on
Commit
9b20476
1 Parent(s): 4b78fd7
Files changed (2) hide show
  1. pdf_parser.py +18 -18
  2. requirements.txt +2 -1
pdf_parser.py CHANGED
@@ -4,9 +4,9 @@ from scipdf_utils import parse_pdf_to_dict
4
 
5
 
6
  class GrobidSciPDFPaser(AbstractPDFParser):
7
- import pysbd
8
- seg_en = pysbd.Segmenter(language="en", clean=False)
9
- seg_chinese = pysbd.Segmenter(language="zh", clean=False)
10
 
11
  def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
12
  """Initialize the PDF parser
@@ -131,18 +131,18 @@ class GrobidSciPDFPaser(AbstractPDFParser):
131
  })
132
  return section_pair_list
133
 
134
- @staticmethod
135
- def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
136
- """
137
- split based on the some magic rules
138
- """
139
- import pysbd
140
- for section_pair in section_pair_list:
141
- if GrobidSciPDFPaser._check_chinese(section_pair["text"]):
142
- seg = GrobidSciPDFPaser.seg_chinese
143
- else:
144
- seg = GrobidSciPDFPaser.seg_en
145
- section_pair["texts"] = seg.segment(section_pair["texts"])
146
- section_pair["texts"] = [
147
- para for para in section_pair["text"] if len(para) > 2]
148
- return section_pair_list
 
4
 
5
 
6
  class GrobidSciPDFPaser(AbstractPDFParser):
7
+ # import pysbd
8
+ # seg_en = pysbd.Segmenter(language="en", clean=False)
9
+ # seg_chinese = pysbd.Segmenter(language="zh", clean=False)
10
 
11
  def __init__(self, pdf_link, db_name="grobid_scipdf", short_thereshold=30) -> None:
12
  """Initialize the PDF parser
 
131
  })
132
  return section_pair_list
133
 
134
+ # @staticmethod
135
+ # def _determine_optimal_split_of_pargraphs(section_pair_list) -> None:
136
+ # """
137
+ # split based on the some magic rules
138
+ # """
139
+ # import pysbd
140
+ # for section_pair in section_pair_list:
141
+ # if GrobidSciPDFPaser._check_chinese(section_pair["text"]):
142
+ # seg = GrobidSciPDFPaser.seg_chinese
143
+ # else:
144
+ # seg = GrobidSciPDFPaser.seg_en
145
+ # section_pair["texts"] = seg.segment(section_pair["texts"])
146
+ # section_pair["texts"] = [
147
+ # para for para in section_pair["text"] if len(para) > 2]
148
+ # return section_pair_list
requirements.txt CHANGED
@@ -6,4 +6,5 @@ sentence_transformers
6
  bs4
7
  openai
8
  matplotlib
9
- plotly
 
 
6
  bs4
7
  openai
8
  matplotlib
9
+ plotly
10
+ pysbd