nickgambirasi committed on
Commit
2703586
1 Parent(s): 30b460c

df experiment

Browse files
Files changed (1) hide show
  1. app.py +4 -2
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import re
2
  import gradio
3
  import torch
 
4
 
5
  from PIL import Image
6
  from transformers import DonutProcessor, VisionEncoderDecoderModel
@@ -38,9 +39,10 @@ def process_document(image):
38
  # postprocess
39
  sequence = processor.batch_decode(outputs.sequences)[0]
40
  sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
41
- sequence = re.sub(r"<.*?>", "", sequence).strip() # remove first task start token
42
 
43
- return {'output': sequence}
 
44
 
45
  demo = gradio.Interface(
46
  fn=process_document,
 
1
  import re
2
  import gradio
3
  import torch
4
+ import pandas as pd
5
 
6
  from PIL import Image
7
  from transformers import DonutProcessor, VisionEncoderDecoderModel
 
39
  # postprocess
40
  sequence = processor.batch_decode(outputs.sequences)[0]
41
  sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
42
+ sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
43
 
44
+ js = processor.token2json(sequence)
45
+ return pd.json_normalize(js)
46
 
47
  demo = gradio.Interface(
48
  fn=process_document,