Spaces:

ConradLax
/

document_classifier

Runtime error

App Files Files Community

ConradLax committed on Dec 20, 2023

Commit

3393a89

1 Parent(s): 16d1ed2

feat: downloadable .csv output

Browse files

Files changed (3) hide show

main.py +52 -35
static/index.html +11 -23
static/script.js +24 -54

main.py CHANGED Viewed

@@ -5,6 +5,8 @@ from fastapi import File, UploadFile
 from fastapi.responses import StreamingResponse
 from typing import List
 from pdf2image import convert_from_bytes
 from transformers import pipeline
@@ -21,48 +23,63 @@ def t5(input):
 @app.post("/classify")
 async def classify_doc(files: List[UploadFile] = File(...)):
-    # classificationResults = {}
-    # for file in files:
-    #     try:
-    #         contents = file.file.read()
-    #         filename = file.filename
-    #         if filename.endswith('.pdf'):
-    #             try:
-    #                 pages = convert_from_bytes(open(file, 'rb').read())
-    #                 for pagenum, image in enumerate(pages):
-    #                     if pagenum != 0 and pagenum < len(pages):
-    #                         classificationRes =  classify_acct_dtype_str(contents, filename)
-    #                         classificationResults[f'{pagenum:02d}'] = {
-    #                             'doctype': classificationRes
-    #                         }
-    #             except:
-    #                 return f"Error in opening {filename}"
-    #         else:
-    #             classificationRes = classify_acct_dtype_str(contents, filename)
-    #             classificationResults[f'{0:02d}'] = {
-    #                 'doctype' : classificationRes
-    #             }
-    #     except Exception as err:
-    #         print(Exception, err)
-    #         return {"message": "There was an error in uploading file(s)"}
-    #     finally:
-    #         file.file.close()
-    # return StreamingResponse(
-    #     iter(classificationResults.items()),
-    #     media_type="text/csv",
-    #     headers={"Content-Disposition": f"attachment; filename=data.csv"}
-    # )
-    return {"message": f"{[file.filename for file in files]} : {[classifyFiles(file) for file in files]}"}

 from fastapi.responses import StreamingResponse
 from typing import List
 from pdf2image import convert_from_bytes
+import csv
+import io
 from transformers import pipeline
 @app.post("/classify")
 async def classify_doc(files: List[UploadFile] = File(...)):
+    classificationResults = {}
+    for file in files:
+        try:
+            contents = file.file.read()
+            filename = file.filename
+            if filename.endswith('.pdf'):
+                try:
+                    pages = convert_from_bytes(open(file, 'rb').read())
+                    for pagenum, image in enumerate(pages):
+                        # if pagenum != 0 and pagenum < len(pages):
+                        # if pagenum == 0:
+                            classificationRes =  classify_acct_dtype_str(image, filename)
+                            # add/update classification result dictionary
+                            if (classificationRes in classificationResults):
+                                classificationResults.update({classificationRes : classificationResults[classificationRes] + 1})
+                            else:
+                                classificationResults.update({classificationRes : 1})
+                        # break
+                except:
+                    return f"Error in opening {filename}"
+            else:
+                classificationRes = classify_acct_dtype_str(contents, filename)
+                # add/update classification result dictionary
+                if (classificationRes in classificationResults):
+                    classificationResults.update({classificationRes : classificationResults[classificationRes] + 1})
+                else:
+                    classificationResults.update({classificationRes : 1})
+        except Exception as err:
+            print(Exception, err)
+            return {"message": "There was an error in uploading file(s)"}
+        finally:
+            file.file.close()
+    # Convert dictionary to CSV string
+    csv_data = io.StringIO()
+    csv_writer = csv.writer(csv_data)
+    csv_writer.writerow(["Type", "Count"])  # Header row
+    for key, value in classificationResults.items():
+        csv_writer.writerow([key, value])
+    return StreamingResponse(
+        iter([csv_data.getvalue()]),
+        media_type="text/csv",
+        headers={"Content-Disposition": f"attachment; filename=data.csv"}
+    )
+    # return {"message": f"{[file.filename for file in files]} : {[classifyFiles(file) for file in files]}"}

static/index.html CHANGED Viewed

@@ -9,34 +9,22 @@
   </head>
   <body>
     <main>
-      <section id="text-gen">
-        <h1>Text generation using Flan T5</h1>
         <p>
           Model:
-          <a
-              href="https://huggingface.co/google/flan-t5-small"
-              rel="noreferrer"
-              target="_blank"
-            >google/flan-t5-small</a
-          >
         </p>
-        <form class="text-gen-form">
-          <label for="text-gen-input">Text prompt</label>
-          <input
-            id="text-gen-input"
-            type="text"
-            value="English: Translate There are many ducks. German:"
-          />
-          <button id="text-gen-submit">Submit</button>
-          <p class="text-gen-output"></p>
         </form>
       </section>
-      <section>
-        <input id="img-input" type="file" multiple="multiple" accept="image/jpeg, image/png, image/jpg">
-        <button id="classify-btn"></button>
-        <p id="test-output"></p>
-        <p id="img-output"></p>
-      </section>
     </main>
   </body>
 </html>

   </head>
   <body>
     <main>
+      <section id="doc-classification">
+        <h1>Document classification using donutclassifier_acctdocs_by_doctype</h1>
         <p>
           Model:
+          <a
+          href="https://huggingface.co/calumpianojericho/donutclassifier_acctdocs_by_doctype"
+            rel="noreferrer"
+            target="_blank">donutclassifier_acctdocs_by_doctype</a>
         </p>
+        <form class="doc-classification-form">
+          <input id="doc-input" type="file" multiple="multiple" accept="image/jpeg, image/png, image/jpg">
+          <a id="doc-output-link" style="visibility: hidden;">
+            <button type="button">Download</button>
+          </a>
         </form>
       </section>
     </main>
   </body>
 </html>

static/script.js CHANGED Viewed

@@ -1,70 +1,40 @@
-const textGenForm = document.querySelector('.text-gen-form');
-const translateText = async (text) => {
-    const inferResponse = await fetch(`infer_t5?input=${text}`);
-    const inferJson = await inferResponse.json();
-    return inferJson.output;
-};
-textGenForm.addEventListener('submit', async (event) => {
-  event.preventDefault();
-  const textGenInput = document.getElementById('text-gen-input');
-  const textGenParagraph = document.querySelector('.text-gen-output');
-  try {
-    textGenParagraph.textContent = await translateText(textGenInput.value);
-  } catch (err) {
-    console.error(err);
-  }
-});
-async function submitImages(acceptedFiles) {
   let formData = new FormData();
   for (const file of acceptedFiles){
     formData.append('files', file);
   }
  let classifyResponse = await fetch('classify', {
     method: 'POST',
    body: formData
  });
-  let res = classifyResponse.json();
-  console.log(res);
-  return res;
 }
-const inputImg = document.getElementById('img-input');
-const outputImg = document.getElementById('img-output');
-let imageArray = [];
-inputImg.addEventListener("change", async event =>{
   const files = event.target.files;
-  console.log(files);
-  await submitImages(files).then((resultText) => {
-    outputImg.innerText = "Result: " + resultText["message"];
-  });
-//   outputImg.innerText = "Result is: " + await submitImages(files);
-});
-const classifyBtn = document.getElementById('classify-btn');
-const outputPar = document.getElementById('test-output');
-classifyBtn.addEventListener('onclick', () => {
-  outputPar.innerText = "asa";
 });

+/**
+ * Classify uploaded images using a Huggingface model.
+ *
+ * Appends every entry of `acceptedFiles` to a FormData under the field name
+ * 'files', POSTs it to the relative 'classify' endpoint, and returns the
+ * fetch Response as-is (the body is not read here; the caller decides how
+ * to consume it).
+ */
+async function classifyImages(acceptedFiles) {
   let formData = new FormData();
   for (const file of acceptedFiles){
     formData.append('files', file);
   }
  let classifyResponse = await fetch('classify', {
     method: 'POST',
    body: formData
  });
+  return classifyResponse;
 }
+const inputDoc = document.getElementById('doc-input');
+const outputDocLink = document.getElementById('doc-output-link');
+inputDoc.addEventListener("change", async event =>{
+  /* hide Download button when in classification process */
+  outputDocLink.style.visibility = "hidden";
   const files = event.target.files;
+  /* make Download button's visible, and add the link to download the resulting .csv file */
+  await classifyImages(files).then(async (result) => {
+    outputDocLink.style.visibility = "visible";
+    const disposition = result.headers.get('Content-Disposition');
+    const filenameRegex = /filename[^;=\n]*=((['"]).*?\2|[^;\n]*)/;
+    const matches = filenameRegex.exec(disposition);
+    // const filename = matches != null && matches[1] ? matches[1].replace(/['"]/g, '') : 'data.csv';
+    const filename = 'data.csv';
+    const blob = await result.blob();
+    const url = window.URL.createObjectURL(blob);
+    // set download link to the download button
+    outputDocLink.href = url;
+  });
 });