import gradio as gr
from transformers import MarkupLMTokenizer, MarkupLMForQuestionAnswering
import torch


# Adapted from NielsRogge's MarkupLM code samples and the extractive
# question-answering tutorial from Cloudera Fast Forward Labs.

# Load the fine-tuned MarkupLM question-answering checkpoint from the Hub.
model = MarkupLMForQuestionAnswering.from_pretrained("FuriouslyAsleep/markuplm-large-finetuned-qa")
model.eval()  # inference only; disables dropout

# Build the MarkupLM tokenizer from local vocab/merges files. tags_dict maps
# each HTML/SVG tag name to an integer id used in MarkupLM's xpath embeddings.
tokenizer = MarkupLMTokenizer(
    vocab_file="vocab.json",
    merges_file="merges.txt",
    tags_dict= {"a": 0, "abbr": 1, "acronym": 2, "address": 3, "altGlyph": 4, "altGlyphDef": 5, "altGlyphItem": 6, "animate": 7, "animateColor": 8, "animateMotion": 9, "animateTransform": 10, "applet": 11, "area": 12, "article": 13, "aside": 14, "audio": 15, "b": 16, "base": 17, "basefont": 18, "bdi": 19, "bdo": 20, "bgsound": 21, "big": 22, "blink": 23, "blockquote": 24, "body": 25, "br": 26, "button": 27, "canvas": 28, "caption": 29, "center": 30, "circle": 31, "cite": 32, "clipPath": 33, "code": 34, "col": 35, "colgroup": 36, "color-profile": 37, "content": 38, "cursor": 39, "data": 40, "datalist": 41, "dd": 42, "defs": 43, "del": 44, "desc": 45, "details": 46, "dfn": 47, "dialog": 48, "dir": 49, "div": 50, "dl": 51, "dt": 52, "ellipse": 53, "em": 54, "embed": 55, "feBlend": 56, "feColorMatrix": 57, "feComponentTransfer": 58, "feComposite": 59, "feConvolveMatrix": 60, "feDiffuseLighting": 61, "feDisplacementMap": 62, "feDistantLight": 63, "feFlood": 64, "feFuncA": 65, "feFuncB": 66, "feFuncG": 67, "feFuncR": 68, "feGaussianBlur": 69, "feImage": 70, "feMerge": 71, "feMergeNode": 72, "feMorphology": 73, "feOffset": 74, "fePointLight": 75, "feSpecularLighting": 76, "feSpotLight": 77, "feTile": 78, "feTurbulence": 79, "fieldset": 80, "figcaption": 81, "figure": 82, "filter": 83, "font-face-format": 84, "font-face-name": 85, "font-face-src": 86, "font-face-uri": 87, "font-face": 88, "font": 89, "footer": 90, "foreignObject": 91, "form": 92, "frame": 93, "frameset": 94, "g": 95, "glyph": 96, "glyphRef": 97, "h1": 98, "h2": 99, "h3": 100, "h4": 101, "h5": 102, "h6": 103, "head": 104, "header": 105, "hgroup": 106, "hkern": 107, "hr": 108, "html": 109, "i": 110, "iframe": 111, "image": 112, "img": 113, "input": 114, "ins": 115, "kbd": 116, "keygen": 117, "label": 118, "legend": 119, "li": 120, "line": 121, "linearGradient": 122, "link": 123, "main": 124, "map": 125, "mark": 126, "marker": 127, "marquee": 128, "mask": 129, "math": 130, "menu": 131, "menuitem": 132, "meta": 133, "metadata": 134, "meter": 135, "missing-glyph": 136, "mpath": 137, "nav": 138, "nobr": 139, "noembed": 140, "noframes": 141, "noscript": 142, "object": 143, "ol": 144, "optgroup": 145, "option": 146, "output": 147, "p": 148, "param": 149, "path": 150, "pattern": 151, "picture": 152, "plaintext": 153, "polygon": 154, "polyline": 155, "portal": 156, "pre": 157, "progress": 158, "q": 159, "radialGradient": 160, "rb": 161, "rect": 162, "rp": 163, "rt": 164, "rtc": 165, "ruby": 166, "s": 167, "samp": 168, "script": 169, "section": 170, "select": 171, "set": 172, "shadow": 173, "slot": 174, "small": 175, "source": 176, "spacer": 177, "span": 178, "stop": 179, "strike": 180, "strong": 181, "style": 182, "sub": 183, "summary": 184, "sup": 185, "svg": 186, "switch": 187, "symbol": 188, "table": 189, "tbody": 190, "td": 191, "template": 192, "text": 193, "textPath": 194, "textarea": 195, "tfoot": 196, "th": 197, "thead": 198, "time": 199, "title": 200, "tr": 201, "track": 202, "tref": 203, "tspan": 204, "tt": 205, "u": 206, "ul": 207, "use": 208, "var": 209, "video": 210, "view": 211, "vkern": 212, "wbr": 213, "xmp": 214},
    add_prefix_space=True,
)
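
# Note (assumption): on recent official transformers releases that include
# MarkupLM, the same tokenizer can likely be loaded in one line, e.g.
#
#     tokenizer = MarkupLMTokenizer.from_pretrained("microsoft/markuplm-large")
#
# This file appears to target NielsRogge's pre-release MarkupLM code samples,
# which is why the vocab/merges files and tags_dict are supplied manually above.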

# Optional sanity check: tokenize a saved HTML page and inspect the encoding.
# page_name_1 = "outputTables.html"
# with open(page_name_1) as f:
#     single_html_string = f.read()
#
# encoding = tokenizer(
#     single_html_string,
#     padding="max_length",
#     max_length=512,
#     stride=128,
#     truncation=True,
#     return_overflowing_tokens=True,
#     return_tensors="pt",
# )
# for k, v in encoding.items():
#     print(k, v.shape)
# print(tokenizer.decode(encoding.input_ids[0].tolist()))


# An earlier draft used the generic transformers "question-answering" pipeline
# with gr.Interface.from_pipeline; the MarkupLM model above replaces it so the
# answer span is extracted directly from the HTML string.


def answer_question(htmlString, userQuestion):
    """Extract an answer to userQuestion from htmlString with MarkupLM."""
    # Tokenize the (question, HTML) pair into one 512-token sequence.
    inputs = tokenizer.encode_plus(
        userQuestion,
        htmlString,
        return_tensors="pt",
        padding="max_length",
        max_length=512,
        truncation=True,
    )

    # With return_dict=False the model returns (start_logits, end_logits).
    with torch.no_grad():
        answer_start_scores, answer_end_scores = model(**inputs, return_dict=False)
    answer_start = torch.argmax(answer_start_scores)  # most likely answer start
    answer_end = torch.argmax(answer_end_scores) + 1  # most likely answer end (exclusive)

    # Decode the predicted token span back into a string for Gradio.
    answerForGradio = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start:answer_end])
    )

    return answerForGradio, htmlString
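
# Quick smoke test of answer_question() outside the Gradio UI. The tiny HTML
# snippet here is illustrative sample data, not part of the app itself.
# answer, _ = answer_question(
#     "<h1>Propulsion</h1><table><tr><td><p>Cost Risk</p></td>"
#     "<td><p>Yes, material overrun</p></td></tr></table>",
#     "What is the propulsion cost risk?",
# )
# print(answer)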






# Wire the model into a two-input, two-output Gradio interface.
iface = gr.Interface(
    answer_question,
    [
        gr.inputs.Textbox(
            lines=3,
            default="<h1>Grid Fin</h1><table><tr><td><p>Cost Risk</p></td><td><p>None</p></td></tr><tr><td><p>Schedule Risk</p></td><td><p>None</p></td></tr></table><h1>Propulsion</h1><table><tr><td><p>Cost Risk</p></td><td><p>Yes, material overrun</p></td></tr><tr><td><p>Schedule Risk</p></td><td><p>None</p></td></tr></table><h1>Nose Cone</h1><table><tr><td><p>Cost Risk</p></td><td><p>Labor rate increases</p></td></tr><tr><td><p>Schedule Risk</p></td><td><p>None</p></td></tr></table>",
        ),
        gr.inputs.Textbox(lines=3, default="What is the propulsion risk?"),
    ],
    [
        gr.outputs.Textbox(label="Question Answer"),
        gr.outputs.HTML(label="HTML Render"),
    ],
    title="Test out extractive Question-Answering on a short HTML string",
)
iface.launch()