davidmezzetti committed on
Commit
121ffd6
1 Parent(s): 39f298a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +278 -93
app.py CHANGED
@@ -15,7 +15,7 @@ import streamlit as st
15
 
16
  from txtai.embeddings import Documents, Embeddings
17
  from txtai.pipeline import Segmentation, Summary, Tabular, Translation
18
- from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
19
 
20
 
21
  class Application:
@@ -23,11 +23,14 @@ class Application:
23
  Main application.
24
  """
25
 
26
- def __init__(self):
27
  """
28
  Creates a new application.
29
  """
30
 
 
 
 
31
  # Component options
32
  self.components = {}
33
 
@@ -42,20 +45,177 @@ class Application:
42
  self.documents = None
43
  self.data = None
44
 
45
- def number(self, label):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  """
47
- Extracts a number from a text input field.
48
 
49
  Args:
50
- label: label to use for text input field
 
 
 
51
 
52
  Returns:
53
- numeric input
54
  """
55
 
56
- value = st.sidebar.text_input(label)
57
  return int(value) if value else None
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def split(self, text):
60
  """
61
  Splits text on commas and returns a list.
@@ -69,12 +229,13 @@ class Application:
69
 
70
  return [x.strip() for x in text.split(",")]
71
 
72
- def options(self, component):
73
  """
74
  Extracts component settings into a component configuration dict.
75
 
76
  Args:
77
  component: component type
 
78
 
79
  Returns:
80
  dict with component settings
@@ -82,49 +243,62 @@ class Application:
82
 
83
  options = {"type": component}
84
 
85
- st.sidebar.markdown("---")
86
 
87
- if component == "embeddings":
88
- st.sidebar.markdown("**Embeddings Index** \n*Index workflow output*")
89
- options["path"] = st.sidebar.text_input("Embeddings model path", value="sentence-transformers/nli-mpnet-base-v2")
90
- options["upsert"] = st.sidebar.checkbox("Upsert")
 
 
 
 
 
 
 
91
 
92
- elif component == "summary":
93
- st.sidebar.markdown("**Summary** \n*Abstractive text summarization*")
94
- options["path"] = st.sidebar.text_input("Model", value="sshleifer/distilbart-cnn-12-6")
95
- options["minlength"] = self.number("Min length")
96
- options["maxlength"] = self.number("Max length")
97
-
98
- elif component == "segment":
99
- st.sidebar.markdown("**Segment** \n*Split text into semantic units*")
100
-
101
- options["sentences"] = st.sidebar.checkbox("Split sentences")
102
- options["lines"] = st.sidebar.checkbox("Split lines")
103
- options["paragraphs"] = st.sidebar.checkbox("Split paragraphs")
104
- options["join"] = st.sidebar.checkbox("Join tokenized")
105
- options["minlength"] = self.number("Min section length")
106
 
107
  elif component == "service":
108
- options["url"] = st.sidebar.text_input("URL")
109
- options["method"] = st.sidebar.selectbox("Method", ["get", "post"], index=0)
110
- options["params"] = st.sidebar.text_input("URL parameters")
111
- options["batch"] = st.sidebar.checkbox("Run as batch", value=True)
112
- options["extract"] = st.sidebar.text_input("Subsection(s) to extract")
 
113
 
114
  if options["params"]:
115
  options["params"] = {key: None for key in self.split(options["params"])}
116
  if options["extract"]:
117
  options["extract"] = self.split(options["extract"])
118
 
 
 
 
 
 
 
119
  elif component == "tabular":
120
- options["idcolumn"] = st.sidebar.text_input("Id columns")
121
- options["textcolumns"] = st.sidebar.text_input("Text columns")
 
122
  if options["textcolumns"]:
123
  options["textcolumns"] = self.split(options["textcolumns"])
124
 
125
- elif component == "translate":
126
- st.sidebar.markdown("**Translate** \n*Machine translation*")
127
- options["target"] = st.sidebar.text_input("Target language code", value="en")
128
 
129
  return options
130
 
@@ -137,7 +311,7 @@ class Application:
137
  """
138
 
139
  # Clear application
140
- self.__init__()
141
 
142
  # pylint: disable=W0108
143
  tasks = []
@@ -151,12 +325,12 @@ class Application:
151
  self.documents = Documents()
152
  tasks.append(Task(self.documents.add, unpack=False))
153
 
154
- elif wtype == "segment":
155
- self.pipelines[wtype] = Segmentation(**self.components["segment"])
156
- tasks.append(Task(self.pipelines["segment"]))
157
 
158
  elif wtype == "service":
159
- tasks.append(ServiceTask(**self.components["service"]))
160
 
161
  elif wtype == "summary":
162
  self.pipelines[wtype] = Summary(component.pop("path"))
@@ -164,11 +338,11 @@ class Application:
164
 
165
  elif wtype == "tabular":
166
  self.pipelines[wtype] = Tabular(**self.components["tabular"])
167
- tasks.append(Task(self.pipelines["tabular"]))
168
 
169
- elif wtype == "translate":
170
  self.pipelines[wtype] = Translation()
171
- tasks.append(Task(lambda x: self.pipelines["translate"](x, **self.components["translate"])))
172
 
173
  self.workflow = Workflow(tasks)
174
 
@@ -180,11 +354,11 @@ class Application:
180
  components: list of components to export to YAML
181
 
182
  Returns:
183
- YAML string
184
  """
185
 
186
  # pylint: disable=W0108
187
- data = {}
188
  tasks = []
189
  name = None
190
 
@@ -192,43 +366,35 @@ class Application:
192
  component = dict(component)
193
  name = wtype = component.pop("type")
194
 
195
- if wtype == "summary":
196
- data["summary"] = {"path": component.pop("path")}
197
- tasks.append({"action": "summary"})
198
 
199
- elif wtype == "segment":
200
- data["segmentation"] = component
201
- tasks.append({"action": "segmentation"})
 
 
 
 
 
 
202
 
203
  elif wtype == "service":
204
  config = dict(**component)
205
- config["task"] = "service"
206
  tasks.append(config)
207
 
208
- elif wtype == "tabular":
209
- data["tabular"] = component
210
- tasks.append({"action": "tabular"})
211
-
212
- elif wtype == "textract":
213
- data["textractor"] = component
214
- tasks.append({"action": "textractor", "task": "url"})
215
-
216
- elif wtype == "transcribe":
217
- data["transcription"] = {"path": component.pop("path")}
218
- tasks.append({"action": "transcription", "task": "url"})
219
-
220
- elif wtype == "translate":
221
- data["translation"] = {}
222
- tasks.append({"action": "translation", "args": list(component.values())})
223
-
224
- elif wtype == "embeddings":
225
- upsert = component.pop("upsert")
226
 
227
- data["embeddings"] = component
228
- data["writable"] = True
 
229
 
230
- name = "index"
231
- tasks.append({"action": "upsert" if upsert else "index"})
 
232
 
233
  # Add in workflow
234
  data["workflow"] = {name: {"tasks": tasks}}
@@ -248,12 +414,13 @@ class Application:
248
 
249
  return [text for uid, text, _ in self.data if uid == key][0]
250
 
251
- def process(self, data):
252
  """
253
  Processes the current application action.
254
 
255
  Args:
256
  data: input data
 
257
  """
258
 
259
  if data and self.workflow:
@@ -279,10 +446,16 @@ class Application:
279
  self.documents, self.pipelines, self.workflow = None, None, None
280
 
281
  if self.embeddings and self.data:
 
 
 
282
  # Set query and limit
283
- query = st.text_input("Query")
284
  limit = min(5, len(self.data))
285
 
 
 
 
286
  st.markdown(
287
  """
288
  <style>
@@ -323,23 +496,28 @@ class Application:
323
  Runs Streamlit application.
324
  """
325
 
326
- st.sidebar.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
327
- st.sidebar.markdown("# Workflow builder \n*Build and apply workflows to data* \n[GitHub](https://github.com/neuml/txtai) ")
 
 
328
 
329
- # Get selected components
330
- components = ["embeddings", "segment", "service", "summary", "tabular", "translate"]
331
- selected = st.sidebar.multiselect("Select components", components)
332
 
333
- # Get selected options
334
- components = [self.options(component) for component in selected]
335
- st.sidebar.markdown("---")
336
 
337
- with st.sidebar:
 
 
 
 
338
  col1, col2 = st.columns(2)
339
-
340
- # Build or re-build workflow when build button clicked
341
  build = col1.button("Build", help="Build the workflow and run within this application")
342
- if build:
343
  with st.spinner("Building workflow...."):
344
  self.build(components)
345
 
@@ -349,13 +527,20 @@ class Application:
349
  col2.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
350
 
351
  with st.expander("Data", expanded=not self.data):
352
- data = st.text_area("Input", height=10)
 
 
 
 
 
 
 
353
 
354
  # Parse text items
355
  data = self.parse(data) if data else data
356
 
357
  # Process current action
358
- self.process(data)
359
 
360
 
361
  @st.cache(allow_output_mutation=True)
@@ -367,7 +552,7 @@ def create():
367
  Application
368
  """
369
 
370
- return Application()
371
 
372
 
373
  if __name__ == "__main__":
 
15
 
16
  from txtai.embeddings import Documents, Embeddings
17
  from txtai.pipeline import Segmentation, Summary, Tabular, Translation
18
+ from txtai.workflow import ServiceTask, Task, Workflow
19
 
20
 
21
  class Application:
 
23
  Main application.
24
  """
25
 
26
+ def __init__(self, directory):
27
  """
28
  Creates a new application.
29
  """
30
 
31
+ # Workflow configuration directory
32
+ self.directory = directory
33
+
34
  # Component options
35
  self.components = {}
36
 
 
45
  self.documents = None
46
  self.data = None
47
 
48
def load(self, components):
    """
    Renders a "Load workflow" select box and loads the chosen workflow file.

    Reads config.yml from the application directory. Each entry in that file is
    read for its "name" and "file" keys; the names populate the select box.

    Args:
        components: list of component names that can be selected

    Returns:
        (component names found in the first workflow's tasks, workflow config),
        or (None, None) when no workflow is chosen
    """

    with open(os.path.join(self.directory, "config.yml")) as handle:
        entries = yaml.safe_load(handle)

    names = [entry["name"] for entry in entries]
    files = [entry["file"] for entry in entries]

    choice = st.selectbox("Load workflow", ["--"] + names)
    if choice == "--":
        # Placeholder entry selected, nothing to load
        return (None, None)

    # Resolve the file for the first entry matching the chosen name
    path = os.path.join(self.directory, files[names.index(choice)])
    with open(path) as handle:
        workflow = yaml.safe_load(handle)

    st.markdown("---")

    # Get tasks for first workflow
    tasks = list(workflow["workflow"].values())[0]["tasks"]

    found = []
    for task in tasks:
        # Tasks are identified by their action, falling back to the task type
        name = task.get("action", task.get("task"))
        if name in components:
            found.append(name)
        elif name in ["index", "upsert"]:
            # Indexing actions map to the embeddings component
            found.append("embeddings")

    return (found, workflow)
87
+
88
def state(self, key):
    """
    Reads a variable from the Streamlit session state.

    Args:
        key: variable key

    Returns:
        stored value, None if the key is not present in the session state
    """

    return st.session_state[key] if key in st.session_state else None
103
+
104
def appsetting(self, workflow, name):
    """
    Reads a setting from the "app" section of a workflow configuration.

    Args:
        workflow: workflow configuration, can be None
        name: setting name

    Returns:
        setting value, None if the workflow, its "app" section or the setting is missing
    """

    if not workflow:
        return None

    app = workflow.get("app")
    if not app:
        return None

    return app.get(name)
122
+
123
def setting(self, config, name, default=None):
    """
    Reads a setting from a component configuration.

    Args:
        config: component configuration, can be None or empty
        name: setting name
        default: value returned when config is empty or does not define name

    Returns:
        setting value
    """

    if not config:
        return default

    return config.get(name, default)
137
+
138
def text(self, label, config, name, default=None):
    """
    Renders a text input field pre-filled from a component configuration.

    Args:
        label: field label
        config: component configuration
        name: setting name
        default: value used when config does not define name

    Returns:
        text input field value
    """

    value = self.setting(config, name, default)
    if not value:
        # None, empty strings and empty collections all render as an empty field
        value = ""
    elif isinstance(value, (list, dict)):
        # Collections are shown comma separated, iterating a dict yields its keys
        value = ",".join(value)

    return st.text_input(label, value=value)
161
+
162
def number(self, label, config, name, default=None):
    """
    Renders a text input field and parses its content as an integer.

    Args:
        label: field label
        config: component configuration
        name: setting name
        default: value used when config does not define name

    Returns:
        int value, None when the field is empty
    """

    raw = self.text(label, config, name, default)
    if not raw:
        return None

    return int(raw)
178
 
179
def boolean(self, label, config, name, default=False):
    """
    Renders a checkbox pre-set from a component configuration.

    Args:
        label: field label
        config: component configuration
        name: setting name
        default: value used when config does not define name

    Returns:
        checkbox value
    """

    checked = self.setting(config, name, default)
    return st.checkbox(label, value=checked)
195
+
196
def select(self, label, config, name, options, default=0):
    """
    Renders a select box with the configured option pre-selected.

    Args:
        label: field label
        config: component configuration
        name: setting name
        options: list of dropdown options
        default: index of the option to select when config does not define
                 name or the configured value is not in options

    Returns:
        selected option
    """

    # Match the configured value against the available options. The previous
    # code compared options with the default index, which discarded the
    # configured value and always fell back to the default.
    configured = self.setting(config, name)
    matches = [x for x, option in enumerate(options) if option == configured]

    # Derive the index to pre-select
    index = matches[0] if matches else default

    return st.selectbox(label, options, index=index)
218
+
219
  def split(self, text):
220
  """
221
  Splits text on commas and returns a list.
 
229
 
230
  return [x.strip() for x in text.split(",")]
231
 
232
+ def options(self, component, workflow):
233
  """
234
  Extracts component settings into a component configuration dict.
235
 
236
  Args:
237
  component: component type
238
+ workflow: existing workflow, can be None
239
 
240
  Returns:
241
  dict with component settings
 
243
 
244
  options = {"type": component}
245
 
246
+ st.markdown("---")
247
 
248
+ # Lookup component configuration
249
+ # - Runtime components have config defined within tasks
250
+ # - Pipeline components have config defined at workflow root
251
+ config = None
252
+ if workflow:
253
+ if component in ["service", "translation"]:
254
+ # Service config is found in tasks section
255
+ tasks = list(workflow["workflow"].values())[0]["tasks"]
256
+ config = [task for task in tasks if task.get("task") == component or task.get("action") == component][0]
257
+ else:
258
+ config = workflow.get(component)
259
 
260
+ if component == "embeddings":
261
+ st.markdown("**Embeddings Index** \n*Index workflow output*")
262
+ options["path"] = self.text("Embeddings model path", config, "path", "sentence-transformers/nli-mpnet-base-v2")
263
+ options["upsert"] = self.boolean("Upsert", config, "upsert")
264
+
265
+ elif component == "segmentation":
266
+ st.markdown("**Segment** \n*Split text into semantic units*")
267
+ options["sentences"] = self.boolean("Split sentences", config, "sentences")
268
+ options["lines"] = self.boolean("Split lines", config, "lines")
269
+ options["paragraphs"] = self.boolean("Split paragraphs", config, "paragraphs")
270
+ options["join"] = self.boolean("Join tokenized", config, "join")
271
+ options["minlength"] = self.number("Min section length", config, "minlength")
 
 
272
 
273
  elif component == "service":
274
+ st.markdown("**Service** \n*Extract data from an API*")
275
+ options["url"] = self.text("URL", config, "url")
276
+ options["method"] = self.select("Method", config, "method", ["get", "post"], 0)
277
+ options["params"] = self.text("URL parameters", config, "params")
278
+ options["batch"] = self.boolean("Run as batch", config, "batch", True)
279
+ options["extract"] = self.text("Subsection(s) to extract", config, "extract")
280
 
281
  if options["params"]:
282
  options["params"] = {key: None for key in self.split(options["params"])}
283
  if options["extract"]:
284
  options["extract"] = self.split(options["extract"])
285
 
286
+ elif component == "summary":
287
+ st.markdown("**Summary** \n*Abstractive text summarization*")
288
+ options["path"] = self.text("Model", config, "path", "sshleifer/distilbart-cnn-12-6")
289
+ options["minlength"] = self.number("Min length", config, "minlength")
290
+ options["maxlength"] = self.number("Max length", config, "maxlength")
291
+
292
  elif component == "tabular":
293
+ st.markdown("**Tabular** \n*Split tabular data into rows and columns*")
294
+ options["idcolumn"] = self.text("Id columns", config, "idcolumn")
295
+ options["textcolumns"] = self.text("Text columns", config, "textcolumns")
296
  if options["textcolumns"]:
297
  options["textcolumns"] = self.split(options["textcolumns"])
298
 
299
+ elif component == "translation":
300
+ st.markdown("**Translate** \n*Machine translation*")
301
+ options["target"] = self.text("Target language code", config, "args", "en")
302
 
303
  return options
304
 
 
311
  """
312
 
313
  # Clear application
314
+ self.__init__(self.directory)
315
 
316
  # pylint: disable=W0108
317
  tasks = []
 
325
  self.documents = Documents()
326
  tasks.append(Task(self.documents.add, unpack=False))
327
 
328
+ elif wtype == "segmentation":
329
+ self.pipelines[wtype] = Segmentation(**self.components[wtype])
330
+ tasks.append(Task(self.pipelines[wtype]))
331
 
332
  elif wtype == "service":
333
+ tasks.append(ServiceTask(**self.components[wtype]))
334
 
335
  elif wtype == "summary":
336
  self.pipelines[wtype] = Summary(component.pop("path"))
 
338
 
339
  elif wtype == "tabular":
340
  self.pipelines[wtype] = Tabular(**self.components["tabular"])
341
+ tasks.append(Task(self.pipelines[wtype]))
342
 
343
+ elif wtype == "translation":
344
  self.pipelines[wtype] = Translation()
345
+ tasks.append(Task(lambda x: self.pipelines["translation"](x, **self.components["translation"])))
346
 
347
  self.workflow = Workflow(tasks)
348
 
 
354
  components: list of components to export to YAML
355
 
356
  Returns:
357
+ (workflow name, YAML string)
358
  """
359
 
360
  # pylint: disable=W0108
361
+ data = {"app": {"data": self.state("data"), "query": self.state("query")}}
362
  tasks = []
363
  name = None
364
 
 
366
  component = dict(component)
367
  name = wtype = component.pop("type")
368
 
369
+ if wtype == "embeddings":
370
+ upsert = component.pop("upsert")
 
371
 
372
+ data[wtype] = component
373
+ data["writable"] = True
374
+
375
+ name = "index"
376
+ tasks.append({"action": "upsert" if upsert else "index"})
377
+
378
+ elif wtype == "segmentation":
379
+ data[wtype] = component
380
+ tasks.append({"action": wtype})
381
 
382
  elif wtype == "service":
383
  config = dict(**component)
384
+ config["task"] = wtype
385
  tasks.append(config)
386
 
387
+ elif wtype == "summary":
388
+ data[wtype] = {"path": component.pop("path")}
389
+ tasks.append({"action": wtype})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
+ elif wtype == "tabular":
392
+ data[wtype] = component
393
+ tasks.append({"action": wtype})
394
 
395
+ elif wtype == "translation":
396
+ data[wtype] = {}
397
+ tasks.append({"action": wtype, "args": list(component.values())})
398
 
399
  # Add in workflow
400
  data["workflow"] = {name: {"tasks": tasks}}
 
414
 
415
  return [text for uid, text, _ in self.data if uid == key][0]
416
 
417
+ def process(self, data, workflow):
418
  """
419
  Processes the current application action.
420
 
421
  Args:
422
  data: input data
423
+ workflow: workflow configuration
424
  """
425
 
426
  if data and self.workflow:
 
446
  self.documents, self.pipelines, self.workflow = None, None, None
447
 
448
  if self.embeddings and self.data:
449
+ default = self.appsetting(workflow, "query")
450
+ default = default if default else ""
451
+
452
  # Set query and limit
453
+ query = st.text_input("Query", value=default)
454
  limit = min(5, len(self.data))
455
 
456
+ # Save query state
457
+ st.session_state["query"] = query
458
+
459
  st.markdown(
460
  """
461
  <style>
 
496
  Runs Streamlit application.
497
  """
498
 
499
+ with st.sidebar:
500
+ st.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
501
+ st.markdown("# Workflow builder \n*Build and apply workflows to data* \n\nRead more on [GitHub](https://github.com/neuml/txtai) ")
502
+ st.markdown("---")
503
 
504
+ # Component configuration
505
+ labels = {"segmentation": "segment", "translation": "translate"}
506
+ components = ["embeddings", "segmentation", "service", "summary", "tabular", "translation"]
507
 
508
+ selected, workflow = self.load(components)
509
+ selected = st.multiselect("Select components", components, default=selected, format_func=lambda text: labels.get(text, text))
 
510
 
511
+ # Get selected options
512
+ components = [self.options(component, workflow) for component in selected]
513
+ st.markdown("---")
514
+
515
+ # Export buttons
516
  col1, col2 = st.columns(2)
517
+
518
+ # Build or re-build workflow when build button clicked or new workflow loaded
519
  build = col1.button("Build", help="Build the workflow and run within this application")
520
+ if build or (workflow and workflow != self.state("workflow")):
521
  with st.spinner("Building workflow...."):
522
  self.build(components)
523
 
 
527
  col2.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
528
 
529
  with st.expander("Data", expanded=not self.data):
530
+ default = self.appsetting(workflow, "data")
531
+ default = default if default else ""
532
+
533
+ data = st.text_area("Input", height=10, value=default)
534
+
535
+ # Save data and workflow state
536
+ st.session_state["data"] = data
537
+ st.session_state["workflow"] = workflow
538
 
539
  # Parse text items
540
  data = self.parse(data) if data else data
541
 
542
  # Process current action
543
+ self.process(data, workflow)
544
 
545
 
546
  @st.cache(allow_output_mutation=True)
 
552
  Application
553
  """
554
 
555
+ return Application(".")
556
 
557
 
558
  if __name__ == "__main__":