Spaces:

tferhan
/

data_gov_ma

Sleeping

tferhan committed on Jun 7, 2024

Commit

2b9b9f9

verified ·

1 Parent(s): d916c61

Update document_scrapped.py

Files changed (1) hide show

document_scrapped.py CHANGED Viewed

@@ -9,6 +9,7 @@ from io import BytesIO
 import chardet
 from docx import Document
 import pandas as pd
 from io import BytesIO
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.converter import TextConverter
@@ -195,17 +196,17 @@ def get_data(url):
   ext = jo.split(".")[-1]
   if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
     rs = excel(jo)
-    return rs
   elif ext == 'pdf':
     rs = pdf(jo)
-    return rs
   elif ext == 'docx' or ext == 'doc':
     rs = docx(jo)
-    return rs
   elif ext == 'csv':
     rs = csv(jo)
-    return rs
   elif ext == 'pptx' or ext == 'ppt':
     rs = pptx(jo)
-    return rs
   return "No data returned"

 import chardet
 from docx import Document
 import pandas as pd
+from sumarize import summarize
 from io import BytesIO
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.converter import TextConverter
   ext = jo.split(".")[-1]
   if ext == 'xlsx' or ext == 'xls' or ext == 'xlsm':
     rs = excel(jo)
+    return summarize.invoke({"input":rs})
   elif ext == 'pdf':
     rs = pdf(jo)
+    return summarize.invoke({"input":rs})
   elif ext == 'docx' or ext == 'doc':
     rs = docx(jo)
+    return summarize.invoke({"input":rs})
   elif ext == 'csv':
     rs = csv(jo)
+    return summarize.invoke({"input":rs})
   elif ext == 'pptx' or ext == 'ppt':
     rs = pptx(jo)
+    return summarize.invoke({"input":rs})
   return "No data returned"