Francisco Santos committed on
Commit
2bf0a39
1 Parent(s): 270353e

first commit

Browse files
Files changed (1) hide show
  1. app.py +165 -0
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from bs4 import BeautifulSoup
3
+ import json
4
+ import time
5
+ import os
6
+ from transformers import AutoTokenizer, pipeline
7
+
8
# Zero-shot NLI models available for classification.
# Only the first model is currently enabled; the commented-out entries
# appear to be alternatives kept for comparison — confirm before deleting.
models = {
    "model_n1": "sileod/deberta-v3-base-tasksource-nli",
    # "model_n2": "roberta-large-mnli",
    # "model_n3": "facebook/bart-large-mnli",
    # "model_n4": "cross-encoder/nli-deberta-v3-xsmall"
}
14
def open_html(file):
    """Return the full text content of an uploaded HTML file.

    ``file`` must expose a ``.name`` attribute holding the on-disk path,
    as gradio's File component does.
    """
    with open(file.name, "r") as handle:
        return handle.read()
18
+
19
def find_form_fields(html_content):
    """Collect the raw HTML of every <input> and <select> inside <form> tags.

    Returns a list of tag strings: for each form, all of its inputs first,
    then all of its selects, each group in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    fields = []
    for form in soup.find_all('form'):
        # Inputs first, then selects, per form — matches the original
        # two-pass collection order.
        fields.extend(str(tag) for tag in form.find_all('input'))
        fields.extend(str(tag) for tag in form.find_all('select'))

    return fields
41
+
42
def load_json(json_file):
    """Parse the JSON file at path ``json_file`` and return its contents."""
    with open(json_file, 'r') as handle:
        return json.load(handle)
46
+
47
def classify_lines(text, candidate_labels, model_name):
    """Zero-shot classify the visible <input>/<select> lines of an HTML text.

    Parameters
    ----------
    text : str | list[str]
        Raw HTML (split on newlines here) or an already-split list of lines.
    candidate_labels : list[str]
        Labels handed to the zero-shot classifier.
    model_name : str
        Hugging Face model identifier for the classification pipeline.

    Returns
    -------
    tuple
        ``(classified_lines, execution_time)`` where ``classified_lines``
        is a list of ``(line, [(label, score), (label, score)])`` entries
        (top two classifications) and ``execution_time`` is seconds elapsed.
    """
    start_time = time.time()  # Start measuring time
    classifier = pipeline('zero-shot-classification', model=model_name)

    # Accept either a pre-split list of lines or a raw string.
    lines = text if isinstance(text, list) else text.split('\n')

    classified_lines = []
    for line in lines:
        # Hoist strip() (was computed three times per line) and use the
        # tuple form of startswith instead of an `or` chain.
        stripped = line.strip()
        # Only classify visible form fields: skip blank lines, lines that
        # are not <input>/<select> tags, and hidden fields.
        if (stripped
                and stripped.startswith(("<input", "<select"))
                and 'hidden' not in line.lower()):
            results = classifier(line, candidate_labels=candidate_labels)
            top_classifications = results['labels'][:2]  # top two labels
            top_scores = results['scores'][:2]  # top two scores
            classified_lines.append(
                (line, list(zip(top_classifications, top_scores))))

    execution_time = time.time() - start_time  # seconds elapsed
    return classified_lines, execution_time
68
+
69
def classify_lines_json(text, json_content, candidate_labels, model_name, output_file_path):
    """Match visible form-field lines to JSON fields and annotate them.

    Each <input>/<select> line is zero-shot classified against
    ``candidate_labels`` (the JSON field paths); the best match's value
    from ``json_content`` and its score are appended as an HTML comment.
    Annotated lines are written to ``output_file_path`` (note: only the
    matching lines are written, not the whole document).

    Parameters
    ----------
    text : str | list[str]
        Raw HTML (split on newlines here) or an already-split list of lines.
    json_content : dict
        Mapping of field path -> value/prompt (see ``retrieve_fields``).
    candidate_labels : list[str]
        Labels for the classifier — expected to be ``json_content``'s keys.
    model_name : str
        Hugging Face model identifier for the classification pipeline.
    output_file_path : str
        Destination path for the annotated lines.

    Returns
    -------
    tuple
        ``(output_content, execution_time)``: the annotated lines (each
        newline-terminated) and seconds elapsed.
    """
    start_time = time.time()  # Start measuring time
    classifier = pipeline('zero-shot-classification', model=model_name)

    # Accept either a pre-split list of lines or a raw string.
    lines = text if isinstance(text, list) else text.split('\n')

    output_content = []
    with open(output_file_path, 'w') as output_file:
        for line in lines:
            # Hoist strip() (was computed three times per line) and use the
            # tuple form of startswith instead of an `or` chain.
            stripped = line.strip()
            # Only annotate visible form fields: skip blank lines, lines
            # that are not <input>/<select> tags, and hidden fields.
            if (stripped
                    and stripped.startswith(("<input", "<select"))
                    and 'hidden' not in line.lower()):
                results = classifier(line, candidate_labels=candidate_labels)
                top_classifications = results['labels'][:2]  # top two labels
                top_scores = results['scores'][:2]  # top two scores
                # Append the matched JSON value and its confidence as an
                # HTML comment so the annotation survives in the output.
                line = line + f"<!-- Input: {json_content[top_classifications[0]]} with this certainty: {top_scores[0]} -->"
                output_file.write(line + '\n')
                output_content.append(line + '\n')

    execution_time = time.time() - start_time  # seconds elapsed
    return output_content, execution_time
98
+
99
def retrieve_fields(data, path=''):
    """Flatten a JSON structure into a {dotted-path: value-or-prompt} dict.

    Dict keys are joined with '.', list items are addressed as '[index]'.
    Falsy leaf values (empty string, None, 0, ...) are replaced with a
    "Please fill in the ... field." prompt for that path.
    """
    collected = {}

    if isinstance(data, dict):
        # Recurse into each key, extending the dotted path.
        for key, value in data.items():
            child_path = f"{path}.{key}" if path else key
            collected.update(retrieve_fields(value, child_path))
    elif isinstance(data, list):
        # Recurse into each element, addressing it by index.
        for index, item in enumerate(data):
            collected.update(retrieve_fields(item, f"{path}[{index}]"))
    else:
        # Leaf value: keep it, or prompt for it when it is empty/falsy.
        collected[path] = data if data else f"Please fill in the {path} field."

    return collected
122
+
123
def retrieve_fields_from_file(file_path):
    """Load JSON from an uploaded file and flatten it via ``retrieve_fields``.

    ``file_path`` must expose a ``.name`` attribute holding the on-disk
    path, as gradio's File component does.
    """
    # json.load parses straight from the file handle — no need for the
    # intermediate f.read() + json.loads round trip.
    with open(file_path.name, 'r') as f:
        data = json.load(f)

    return retrieve_fields(data)
129
+
130
+
131
def process_files(html_file, json_file):
    """Gradio handler: match the HTML's form fields against the JSON data.

    Extracts the form fields from the uploaded HTML, flattens the uploaded
    JSON into {path: value} entries, then (per model in ``models``)
    classifies the fields by widget type and annotates each visible field
    with its best-matching JSON value, writing the result to
    ``./output.html``.

    Returns the annotated lines joined with newlines, for display.
    """
    output_file_path = "./output.html"

    # Read the uploaded files.
    html_content = open_html(html_file)
    html_inputs = find_form_fields(html_content)

    # Flatten the JSON into {path: value-or-prompt} entries.
    json_content = retrieve_fields_from_file(json_file)

    # Classify the JSON inputs by widget type
    # ["text", "radio", "checkbox", "button", "date"]
    # (translated from the original Portuguese comment).
    for model_name in models.values():
        # NOTE(review): the original also loaded an AutoTokenizer here and
        # never used it — removed, since it only downloaded a tokenizer
        # per model and discarded it.

        # Widget-type classification (result currently unused beyond
        # experimentation — kept to preserve behaviour/timing).
        html_classified_lines, html_execution_time = classify_lines(
            html_inputs,
            ["text", "radio", "checkbox", "button", "date"],
            model_name)

        # Annotate every visible form field with its best JSON match and
        # write the annotated lines to output_file_path.
        json_classified_lines, json_execution_time = classify_lines_json(
            html_content, json_content, list(json_content.keys()),
            model_name, output_file_path)

    return '\n'.join(map(str, json_classified_lines))
160
+
161
# The gr.inputs namespace was deprecated in Gradio 3 and removed in later
# releases; the top-level gr.File component is the supported spelling.
iface = gr.Interface(
    fn=process_files,
    inputs=[gr.File(label="Upload HTML File"), gr.File(label="Upload JSON File")],
    outputs="text",
)

iface.launch()