File size: 6,580 Bytes
fcceb43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2621d77
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!pip install pdfreader
import pdfreader
from pdfreader import PDFDocument, SimplePDFViewer
#!pip install bs4
from bs4 import BeautifulSoup
import requests
from NER import cleanText
#!pip install tabula-py
import tabula
import fitz  # PyMuPDF
import os

class PDF():
  """Download, open, and extract text/tables from a PDF.

  `pdf` may be a local file path, a direct https link to a PDF, or a
  landing page that links to the PDF (resolved via the optional `doi`).

  Attributes:
    pdf: Local path or URL of the PDF / landing page.
    doi: Optional DOI used to select the right PDF link on a landing page.
    saveFolder: Directory where downloaded PDFs are stored.
  """

  def __init__(self, pdf, saveFolder, doi=None):
    self.pdf = pdf
    self.doi = doi
    self.saveFolder = saveFolder

  def _resolveLocalPath(self):
    """Return a local filesystem path for self.pdf, downloading first when
    self.pdf is a URL.  Falls back to self.pdf itself when no download
    target could be determined (keeps the original best-effort behavior)."""
    if "https" in self.pdf:
      name = self.downloadPDF(self.saveFolder)
      if name != "no pdfLink to download":
        return os.path.join(self.saveFolder, name)
    return self.pdf

  def openPDFFile(self):
    """Open the (possibly downloaded) PDF in binary mode.

    Returns:
      An open binary file object.  The caller is responsible for closing it.
    """
    # Fix: the original computed name from the URL and immediately
    # overwrote it with downloadPDF()'s result (dead code, removed).
    return open(self._resolveLocalPath(), "rb")

  def downloadPDF(self, saveFolder):
    """Download the PDF behind self.pdf into saveFolder.

    When self.pdf is a landing page (no ".pdf" and no "https" in it), the
    page is scraped for an <a> whose href contains ".pdf" and, when a DOI
    was supplied, that DOI.

    Args:
      saveFolder: Directory the file is written into.
    Returns:
      The saved file name, or the sentinel "no pdfLink to download".
    """
    pdfLink = ''
    if ".pdf" not in self.pdf and "https" not in self.pdf:
      r = requests.get(self.pdf)
      soup = BeautifulSoup(r.content, 'html.parser')
      for link in soup.find_all("a"):
        href = link.get("href", "")
        # Fix: original did `self.doi in href`, which raised TypeError
        # when doi was None (its documented default).  A missing DOI now
        # matches any PDF link.
        if ".pdf" in href and (self.doi is None or self.doi in href):
          pdfLink = href
          break
    else:
      pdfLink = self.pdf

    if pdfLink == '':
      return "no pdfLink to download"

    response = requests.get(pdfLink)
    name = pdfLink.split("/")[-1]
    print("inside download PDF and name and link are: ", pdfLink, name)
    print("saveFolder is: ", saveFolder)
    with open(os.path.join(saveFolder, name), 'wb') as pdf:
      print("len of response content: ", len(response.content))
      pdf.write(response.content)
    print("pdf downloaded")
    return name

  def extractText(self):
    """Extract plain text with PyMuPDF, falling back to pdfreader when
    PyMuPDF raises or yields almost nothing (< 100 chars — likely a
    scanned/encrypted file).

    Returns:
      The extracted text, or "" when every strategy failed.
    """
    try:
      # Fix: the original called openPDFFile().name, leaking an open
      # file handle; we only need the path here.
      fileToOpen = self._resolveLocalPath()
      try:
        doc = fitz.open(fileToOpen)
        text = ""
        for page in doc:
          text += page.get_text("text") + "\n\n"
        doc.close()

        if len(text.strip()) < 100:
          print("Fallback to PDFReader due to weak text extraction.")
          text = self.extractTextWithPDFReader()
        return text
      except Exception as e:
        print("Failed with PyMuPDF, fallback to PDFReader:", e)
        return self.extractTextWithPDFReader()
    except Exception:
      # Narrowed from a bare `except:` — still best-effort, but no longer
      # swallows KeyboardInterrupt/SystemExit.
      return ""

  def extract_text_excluding_tables(self):
    """Extract text while skipping table-like blocks.

    Heuristic: a text block whose lines average more than one span is
    treated as paragraph-like and kept; sparse blocks (typical of table
    cells) are dropped.  Falls back to pdfreader on failure or when the
    result is suspiciously short (< 100 chars).
    """
    fileToOpen = self._resolveLocalPath()  # avoid leaking a file handle
    text = ""
    try:
      doc = fitz.open(fileToOpen)
      for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for block in blocks:
          if block["type"] != 0:  # 0 == text block; skip images etc.
            continue
          lines = block.get("lines", [])
          if not lines:
            continue
          avg_words_per_line = sum(len(l["spans"]) for l in lines) / len(lines)
          if avg_words_per_line > 1:  # Heuristic: paragraph-like blocks
            for line in lines:
              text += " ".join(span["text"] for span in line["spans"]) + "\n"
      doc.close()
      if len(text.strip()) < 100:
        print("Fallback to PDFReader due to weak text extraction.")
        text = self.extractTextWithPDFReader()
      return text
    except Exception as e:
      print("Failed with PyMuPDF, fallback to PDFReader:", e)
      return self.extractTextWithPDFReader()

  def extractTextWithPDFReader(self):
    """Per-page extraction with pdfreader's SimplePDFViewer.

    Builds a dict keyed by 1-based page number holding the raw text,
    cleaned text, and page resources, then merges it with mergeTextinJson.
    Returns "" (via mergeTextinJson) on any failure.
    """
    jsonPage = {}
    pdf = None
    try:
      pdf = self.openPDFFile()
      print("open pdf file")
      print(pdf)
      doc = PDFDocument(pdf)
      viewer = SimplePDFViewer(pdf)
      all_pages = [p for p in doc.pages()]
      cl = cleanText.cleanGenText()
      # Fix: range(1, len(all_pages)) skipped the LAST page —
      # SimplePDFViewer.navigate() is 1-based, so pages run 1..len.
      for page in range(1, len(all_pages) + 1):
        viewer.navigate(page)
        viewer.render()
        key = str(page)
        if key not in jsonPage:
          jsonPage[key] = {}
        text = "".join(viewer.canvas.strings)
        clean, filteredWord = cl.textPreprocessing(text)
        jsonPage[key]["normalText"] = [text]
        jsonPage[key]["cleanText"] = [' '.join(filteredWord)]
        jsonPage[key]["image"] = [viewer.canvas.images]
        jsonPage[key]["form"] = [viewer.canvas.forms]
        jsonPage[key]["content"] = [viewer.canvas.text_content]
        jsonPage[key]["inline_image"] = [viewer.canvas.inline_images]
    except Exception:
      jsonPage = {}
    finally:
      # Fix: the original leaked the handle when rendering raised.
      if pdf is not None:
        pdf.close()
    return self.mergeTextinJson(jsonPage)

  def extractTable(self, pages="all", saveFile=None, outputFormat=None):
    '''Extract tables with tabula.

    pages (str, int, iterable of int, optional) –
      An optional values specifying pages to extract from. It allows str,`int`, iterable of :int. Default: 1
      Examples: '1-2,3', 'all', [1,2]

    Returns a list of DataFrames, or [] when no tables were found.
    '''
    # Consistent with openPDFFile: resolve (and download if needed) once.
    fileToOpen = self._resolveLocalPath()
    try:
      df = tabula.read_pdf(fileToOpen, pages=pages)
    # saveFile: "/content/drive/MyDrive/CollectData/NER/PDF/tableS1.csv"
    # outputFormat: "csv"
    #tabula.convert_into(self.pdf, saveFile, output_format=outputFormat, pages=pages)
    except Exception:  # tabula raises e.g. ValueError / subprocess errors
      df = []
      print("No tables found in PDF file")
    return df

  def mergeTextinJson(self, jsonPDF):
    """Merge per-page "normalText" fragments into a single string.

    Cleans each fragment, inserts ". " between fragments that do not
    already end with a period, terminates each page with a period, and
    separates pages with a blank line.  Returns "" on any failure.
    """
    try:
      cl = cleanText.cleanGenText()
      pdfText = ""
      if jsonPDF:
        for page in jsonPDF:
          texts = jsonPDF[page]["normalText"]
          if not texts:
            continue
          for i, text in enumerate(texts):
            if text:
              text = cl.removeTabWhiteSpaceNewLine(text)
              text = cl.removeExtraSpaceBetweenWords(text)
            texts[i] = text  # keep original side effect: cleaned in place
            # Fix: original tested `i - 1 > 0` (off-by-one), so the
            # separator between the first two fragments was never added,
            # and an empty previous fragment raised IndexError on [-1].
            if i > 0 and texts[i - 1] and not texts[i - 1].endswith("."):
              pdfText += ". "
            pdfText += text
          # Fix: original relied on the leaked loop variable `i` here;
          # use the last fragment explicitly.
          last = texts[-1]
          if last and not last.endswith("."):
            pdfText += "."
          pdfText += "\n\n"
      return pdfText
    except Exception:
      return ""

  def getReference(self):
    """Not implemented yet: extract the reference section."""
    pass

  def getSupMaterial(self):
    """Not implemented yet: extract supplementary-material links."""
    pass

  def removeHeaders(self):
    """Not implemented yet: strip running page headers."""
    pass

  def removeFooters(self):
    """Not implemented yet: strip running page footers."""
    pass

  def removeReference(self):
    """Not implemented yet: drop the reference section from extracted text."""
    pass