Spaces:
Sleeping
Sleeping
Update document_scrapped.py
Browse files - document_scrapped.py +6 -5
document_scrapped.py
CHANGED
@@ -9,6 +9,7 @@ from io import BytesIO
|
|
9 |
import chardet
|
10 |
from docx import Document
|
11 |
import pandas as pd
|
|
|
12 |
from io import BytesIO
|
13 |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
14 |
from pdfminer.converter import TextConverter
|
@@ -195,17 +196,17 @@ def get_data(url):
|
|
195 |
ext = jo.split(".")[-1]
|
196 |
if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
|
197 |
rs = excel(jo)
|
198 |
-
return rs
|
199 |
elif ext == 'pdf':
|
200 |
rs = pdf(jo)
|
201 |
-
return rs
|
202 |
elif ext == 'docx' or ext == 'doc':
|
203 |
rs = docx(jo)
|
204 |
-
return rs
|
205 |
elif ext == 'csv':
|
206 |
rs = csv(jo)
|
207 |
-
return rs
|
208 |
elif ext == 'pptx' or ext == 'ppt':
|
209 |
rs = pptx(jo)
|
210 |
-
return rs
|
211 |
return "No data returned"
|
|
|
9 |
import chardet
|
10 |
from docx import Document
|
11 |
import pandas as pd
|
12 |
+
from sumarize import summarize
|
13 |
from io import BytesIO
|
14 |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
15 |
from pdfminer.converter import TextConverter
|
|
|
196 |
ext = jo.split(".")[-1]
|
197 |
if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
|
198 |
rs = excel(jo)
|
199 |
+
return summarize.invoke({"input":rs})
|
200 |
elif ext == 'pdf':
|
201 |
rs = pdf(jo)
|
202 |
+
return summarize.invoke({"input":rs})
|
203 |
elif ext == 'docx' or ext == 'doc':
|
204 |
rs = docx(jo)
|
205 |
+
return summarize.invoke({"input":rs})
|
206 |
elif ext == 'csv':
|
207 |
rs = csv(jo)
|
208 |
+
return summarize.invoke({"input":rs})
|
209 |
elif ext == 'pptx' or ext == 'ppt':
|
210 |
rs = pptx(jo)
|
211 |
+
return summarize.invoke({"input":rs})
|
212 |
return "No data returned"
|