davidmezzetti committed on
Commit
5ade8fe
1 Parent(s): f689d08

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +283 -217
app.py CHANGED
@@ -5,8 +5,6 @@ Based on this example: https://github.com/neuml/txtai/blob/master/examples/workf
5
  """
6
 
7
  import os
8
- import re
9
- import uuid
10
 
11
  import nltk
12
  import yaml
@@ -19,18 +17,36 @@ from txtai.pipeline import Segmentation, Summary, Tabular, Textractor, Translati
19
  from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
20
 
21
 
22
- class Application:
23
  """
24
- Main application.
25
  """
26
 
27
- def __init__(self, directory):
 
 
28
  """
29
- Creates a new application.
 
 
 
 
 
 
30
  """
31
 
32
- # Workflow configuration directory
33
- self.directory = directory
 
 
 
 
 
 
 
 
 
 
34
 
35
  # Component options
36
  self.components = {}
@@ -46,38 +62,199 @@ class Application:
46
  self.documents = None
47
  self.data = None
48
 
49
- # Workflow run id
50
- self.runid = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def default(self, names):
53
  """
54
  Gets default workflow index.
55
-
56
  Args:
57
  names: list of workflow names
58
-
59
  Returns:
60
  default workflow index
61
  """
62
-
63
  # Get names as lowercase to match case-insensitive
64
  lnames = [name.lower() for name in names]
65
-
66
  # Get default workflow param
67
  params = st.experimental_get_query_params()
68
  index = params.get("default")
69
  index = index[0].lower() if index else 0
70
-
71
  # Lookup index of workflow name, add 1 to account for "--"
72
  if index and index in lnames:
73
  return lnames.index(index) + 1
74
-
75
  # Workflow not found, default to index 0
76
  return 0
77
-
78
  def load(self, components):
79
  """
80
- Load an existing workflow file.
81
 
82
  Args:
83
  components: list of components to load
@@ -86,7 +263,7 @@ class Application:
86
  (names of components loaded, workflow config)
87
  """
88
 
89
- with open(os.path.join(self.directory, "config.yml")) as f:
90
  config = yaml.safe_load(f)
91
 
92
  names = [row["name"] for row in config]
@@ -95,7 +272,7 @@ class Application:
95
  selected = st.selectbox("Load workflow", ["--"] + names, self.default(names))
96
  if selected != "--":
97
  index = [x for x, name in enumerate(names) if name == selected][0]
98
- with open(os.path.join(self.directory, files[index])) as f:
99
  workflow = yaml.safe_load(f)
100
 
101
  st.markdown("---")
@@ -165,12 +342,13 @@ class Application:
165
 
166
  return config.get(name, default) if config else default
167
 
168
- def text(self, label, config, name, default=None):
169
  """
170
  Create a new text input field.
171
 
172
  Args:
173
  label: field label
 
174
  config: component configuration
175
  name: setting name
176
  default: default setting value
@@ -187,14 +365,15 @@ class Application:
187
  elif isinstance(default, dict):
188
  default = ",".join(default.keys())
189
 
190
- return st.text_input(label, value=default)
191
 
192
- def number(self, label, config, name, default=None):
193
  """
194
  Creates a new numeric input field.
195
 
196
  Args:
197
  label: field label
 
198
  config: component configuration
199
  name: setting name
200
  default: default setting value
@@ -203,15 +382,16 @@ class Application:
203
  numeric value
204
  """
205
 
206
- value = self.text(label, config, name, default)
207
  return int(value) if value else None
208
 
209
- def boolean(self, label, config, name, default=False):
210
  """
211
  Creates a new checkbox field.
212
 
213
  Args:
214
  label: field label
 
215
  config: component configuration
216
  name: setting name
217
  default: default setting value
@@ -221,14 +401,15 @@ class Application:
221
  """
222
 
223
  default = self.setting(config, name, default)
224
- return st.checkbox(label, value=default)
225
 
226
- def select(self, label, config, name, options, default=0):
227
  """
228
  Creates a new select box field.
229
 
230
  Args:
231
  label: field label
 
232
  config: component configuration
233
  name: setting name
234
  options: list of dropdown options
@@ -244,7 +425,7 @@ class Application:
244
  # Derive default index
245
  default = index[0] if index else default
246
 
247
- return st.selectbox(label, options, index=default)
248
 
249
  def split(self, text):
250
  """
@@ -274,8 +455,6 @@ class Application:
274
  # pylint: disable=R0912, R0915
275
  options = {"type": component}
276
 
277
- st.markdown("---")
278
-
279
  # Lookup component configuration
280
  # - Runtime components have config defined within tasks
281
  # - Pipeline components have config defined at workflow root
@@ -292,8 +471,9 @@ class Application:
292
 
293
  if component == "embeddings":
294
  st.markdown("**Embeddings Index** \n*Index workflow output*")
295
- options["path"] = self.text("Embeddings model path", config, "path", "sentence-transformers/nli-mpnet-base-v2")
296
- options["upsert"] = self.boolean("Upsert", config, "upsert")
 
297
 
298
  elif component in ("segmentation", "textractor"):
299
  if component == "segmentation":
@@ -301,19 +481,19 @@ class Application:
301
  else:
302
  st.markdown("**Textract** \n*Extract text from documents*")
303
 
304
- options["sentences"] = self.boolean("Split sentences", config, "sentences")
305
- options["lines"] = self.boolean("Split lines", config, "lines")
306
- options["paragraphs"] = self.boolean("Split paragraphs", config, "paragraphs")
307
- options["join"] = self.boolean("Join tokenized", config, "join")
308
- options["minlength"] = self.number("Min section length", config, "minlength")
309
 
310
  elif component == "service":
311
  st.markdown("**Service** \n*Extract data from an API*")
312
- options["url"] = self.text("URL", config, "url")
313
- options["method"] = self.select("Method", config, "method", ["get", "post"], 0)
314
- options["params"] = self.text("URL parameters", config, "params")
315
- options["batch"] = self.boolean("Run as batch", config, "batch", True)
316
- options["extract"] = self.text("Subsection(s) to extract", config, "extract")
317
 
318
  if options["params"]:
319
  options["params"] = {key: None for key in self.split(options["params"])}
@@ -322,71 +502,30 @@ class Application:
322
 
323
  elif component == "summary":
324
  st.markdown("**Summary** \n*Abstractive text summarization*")
325
- options["path"] = self.text("Model", config, "path", "sshleifer/distilbart-cnn-12-6")
326
- options["minlength"] = self.number("Min length", config, "minlength")
327
- options["maxlength"] = self.number("Max length", config, "maxlength")
328
 
329
  elif component == "tabular":
330
  st.markdown("**Tabular** \n*Split tabular data into rows and columns*")
331
- options["idcolumn"] = self.text("Id columns", config, "idcolumn")
332
- options["textcolumns"] = self.text("Text columns", config, "textcolumns")
 
 
333
  if options["textcolumns"]:
334
  options["textcolumns"] = self.split(options["textcolumns"])
335
 
 
 
 
 
 
336
  elif component == "translation":
337
  st.markdown("**Translate** \n*Machine translation*")
338
- options["target"] = self.text("Target language code", config, "args", "en")
339
 
340
  return options
341
 
342
- def build(self, components):
343
- """
344
- Builds a workflow using components.
345
-
346
- Args:
347
- components: list of components to add to workflow
348
- """
349
-
350
- # Clear application
351
- self.__init__(self.directory)
352
-
353
- # pylint: disable=W0108
354
- tasks = []
355
- for component in components:
356
- component = dict(component)
357
- wtype = component.pop("type")
358
- self.components[wtype] = component
359
-
360
- if wtype == "embeddings":
361
- self.embeddings = Embeddings({**component})
362
- self.documents = Documents()
363
- tasks.append(Task(self.documents.add, unpack=False))
364
-
365
- elif wtype == "segmentation":
366
- self.pipelines[wtype] = Segmentation(**self.components[wtype])
367
- tasks.append(Task(self.pipelines[wtype]))
368
-
369
- elif wtype == "service":
370
- tasks.append(ServiceTask(**self.components[wtype]))
371
-
372
- elif wtype == "summary":
373
- self.pipelines[wtype] = Summary(component.pop("path"))
374
- tasks.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))
375
-
376
- elif wtype == "tabular":
377
- self.pipelines[wtype] = Tabular(**self.components[wtype])
378
- tasks.append(Task(self.pipelines[wtype]))
379
-
380
- elif wtype == "textractor":
381
- self.pipelines[wtype] = Textractor(**self.components[wtype])
382
- tasks.append(UrlTask(self.pipelines[wtype]))
383
-
384
- elif wtype == "translation":
385
- self.pipelines[wtype] = Translation()
386
- tasks.append(Task(lambda x: self.pipelines["translation"](x, **self.components["translation"])))
387
-
388
- self.workflow = Workflow(tasks)
389
-
390
  def yaml(self, components):
391
  """
392
  Builds a yaml string for components.
@@ -398,7 +537,6 @@ class Application:
398
  (workflow name, YAML string)
399
  """
400
 
401
- # pylint: disable=W0108
402
  data = {"app": {"data": self.state("data"), "query": self.state("query")}}
403
  tasks = []
404
  name = None
@@ -446,111 +584,75 @@ class Application:
446
 
447
  return (name, yaml.dump(data))
448
 
449
- def find(self, key):
450
  """
451
- Lookup record from cached data by uid key.
452
 
453
  Args:
454
- key: uid to search for
455
 
456
  Returns:
457
- text for matching uid
458
  """
459
 
460
- text = [text for uid, text, _ in self.data if uid == key][0]
461
- if key and key.lower().startswith("http"):
462
- return "<a href='%s' rel='noopener noreferrer' target='blank'>%s</a>" % (key, text)
 
463
 
464
- return text
 
465
 
466
- def process(self, data, workflow):
 
 
 
467
  """
468
- Processes the current application action.
469
 
470
  Args:
471
- data: input data
472
  workflow: workflow configuration
473
- """
474
-
475
- if data and self.workflow:
476
- # Build tuples for embedding index
477
- if self.documents:
478
- data = [(x, element, None) for x, element in enumerate(data)]
479
-
480
- # Process workflow
481
- for result in self.workflow(data):
482
- if not self.documents:
483
- st.write(result)
484
 
485
- # Build embeddings index
486
- if self.documents:
487
- # Cache data
488
- self.data = list(self.documents)
489
-
490
- with st.spinner("Building embedding index...."):
491
- self.embeddings.index(self.documents)
492
- self.documents.close()
493
-
494
- # Clear workflow
495
- self.documents, self.pipelines, self.workflow = None, None, None
496
-
497
- # Generate workflow run id
498
- self.runid = str(uuid.uuid1())
499
- st.session_state["runid"] = self.runid
500
-
501
- if self.runid != self.state("runid"):
502
- st.error("Workflow data changed in another session. Please re-build and re-run workflow.")
503
- elif self.embeddings and self.data:
504
- default = self.appsetting(workflow, "query")
505
- default = default if default else ""
506
 
507
- # Set query and limit
508
- query = st.text_input("Query", value=default)
509
- limit = min(5, len(self.data))
510
 
511
- # Save query state
512
- st.session_state["query"] = query
513
 
514
- st.markdown(
515
- """
516
- <style>
517
- table td:nth-child(1) {
518
- display: none
519
- }
520
- table th:nth-child(1) {
521
- display: none
522
- }
523
- table {text-align: left !important}
524
- </style>
525
- """,
526
- unsafe_allow_html=True,
527
- )
528
 
529
- if query:
530
- df = pd.DataFrame([{"content": self.find(uid), "score": "%.2f" % score} for uid, score in self.embeddings.search(query, limit)])
531
- st.write(df.to_html(escape=False), unsafe_allow_html=True)
532
 
533
- def parse(self, data):
534
  """
535
- Parse input data, splits on new lines depending on type of tasks and format of input.
536
 
537
  Args:
538
- data: input data
539
-
540
- Returns:
541
- parsed data
542
  """
543
 
544
- if re.match(r"^(http|https|file):\/\/", data) or (self.workflow and isinstance(self.workflow.tasks[0], ServiceTask)):
545
- return [x for x in data.split("\n") if x]
546
 
547
- return [data]
 
 
 
 
 
548
 
549
  def run(self):
550
  """
551
  Runs Streamlit application.
552
  """
553
 
 
554
  with st.sidebar:
555
  st.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
556
  st.markdown("# Workflow builder \n*Build and apply workflows to data* ")
@@ -558,68 +660,32 @@ class Application:
558
  st.markdown("---")
559
 
560
  # Component configuration
561
- labels = {"segmentation": "segment", "textractor": "textract", "translation": "translate"}
562
  components = ["embeddings", "segmentation", "service", "summary", "tabular", "textractor", "translation"]
563
 
564
  selected, workflow = self.load(components)
565
- selected = st.multiselect("Select components", components, default=selected, format_func=lambda text: labels.get(text, text))
566
-
567
- # Get selected options
568
- components = [self.options(component, workflow) for component in selected]
569
- st.markdown("---")
570
-
571
- # Export buttons
572
- col1, col2 = st.columns(2)
573
-
574
- # Build or re-build workflow when build button clicked or new workflow loaded
575
- build = col1.button("Build", help="Build the workflow and run within this application")
576
- if build or (workflow and workflow != self.state("workflow")):
577
- with st.spinner("Building workflow...."):
578
- self.build(components)
579
-
580
- # Generate API configuration
581
- _, config = self.yaml(components)
582
-
583
- col2.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
584
-
585
- with st.expander("Data", expanded=not self.data):
586
- default = self.appsetting(workflow, "data")
587
- default = default if default else ""
588
-
589
- data = st.text_area("Input", height=10, value=default)
590
-
591
- # Save data and workflow state
592
- st.session_state["data"] = data
593
- st.session_state["workflow"] = workflow
594
 
595
  if selected:
596
- # Parse text items
597
- data = self.parse(data) if data else data
598
-
599
  # Process current action
600
- self.process(data, workflow)
601
-
602
-
603
- @st.cache(allow_output_mutation=True)
604
- def create():
605
- """
606
- Creates and caches a Streamlit application.
607
-
608
- Returns:
609
- Application
610
- """
611
 
612
- return Application("workflows")
 
 
 
613
 
614
 
615
  if __name__ == "__main__":
616
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
617
 
 
618
  try:
619
  nltk.sent_tokenize("This is a test. Split")
620
  except:
621
  nltk.download("punkt")
622
 
623
  # Create and run application
624
- app = create()
625
  app.run()
 
5
  """
6
 
7
  import os
 
 
8
 
9
  import nltk
10
  import yaml
 
17
  from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
18
 
19
 
20
+ class Process:
21
  """
22
+ Container for an active Workflow process instance.
23
  """
24
 
25
@staticmethod
@st.cache(ttl=30 * 60, max_entries=3, allow_output_mutation=True, show_spinner=False)
def get(components):
    """
    Creates a workflow process for components and builds its workflow.

    The call is wrapped by st.cache, configured with ttl=30 * 60 and max_entries=3.

    Args:
        components: input components

    Returns:
        Process
    """

    # Empty container, populated by build below
    instance = Process()

    # Build workflow while showing a spinner
    with st.spinner("Building workflow...."):
        instance.build(components)

    return instance
45
+
46
+ def __init__(self):
47
+ """
48
+ Creates a new Process.
49
+ """
50
 
51
  # Component options
52
  self.components = {}
 
62
  self.documents = None
63
  self.data = None
64
 
65
def build(self, components):
    """
    Builds a workflow using components.

    Each component is copied, its "type" entry is removed and the remaining settings are
    stored in self.components under that type. A task is then added for each known type.

    Args:
        components: list of components to add to workflow
    """

    # pylint: disable=W0108
    # Pipelines created straight from their settings, paired with the task class wrapping them
    direct = {
        "segmentation": (Segmentation, Task),
        "tabular": (Tabular, Task),
        "textractor": (Textractor, UrlTask),
    }

    steps = []
    for entry in components:
        # Work on a copy, the input component is not modified
        settings = dict(entry)
        kind = settings.pop("type")
        self.components[kind] = settings

        if kind == "embeddings":
            self.embeddings = Embeddings({**settings})
            self.documents = Documents()
            steps.append(Task(self.documents.add, unpack=False))

        elif kind in direct:
            factory, wrapper = direct[kind]
            self.pipelines[kind] = factory(**settings)
            steps.append(wrapper(self.pipelines[kind]))

        elif kind == "service":
            steps.append(ServiceTask(**settings))

        elif kind == "summary":
            # "path" is removed here, so only the remaining settings reach the call below
            self.pipelines[kind] = Summary(settings.pop("path"))
            steps.append(Task(lambda x: self.pipelines["summary"](x, **self.components["summary"])))

        elif kind == "translation":
            self.pipelines[kind] = Translation()
            steps.append(Task(lambda x: self.pipelines["translation"](x, **self.components["translation"])))

    self.workflow = Workflow(steps)
109
+
110
def run(self, data):
    """
    Runs a workflow using data as input.

    Without an embeddings component, each workflow result is written to the page.
    With one, the workflow output is cached in self.data and indexed, then the
    workflow is cleared.

    Args:
        data: input data
    """

    # Nothing to do without input or without a workflow
    if not (data and self.workflow):
        return

    # Build tuples for embedding index
    if self.documents:
        data = [(position, item, None) for position, item in enumerate(data)]

    # Process workflow
    for output in self.workflow(data):
        if not self.documents:
            st.write(output)

    # Build embeddings index
    if self.documents:
        # Cache data
        self.data = list(self.documents)

        with st.spinner("Building embedding index...."):
            self.embeddings.index(self.documents)
            self.documents.close()

        # Clear workflow
        self.documents, self.pipelines, self.workflow = None, None, None
139
+
140
def search(self, query):
    """
    Runs a search and writes the results to the page as an HTML table.

    Nothing is rendered unless an embeddings index exists, a query was provided and
    indexed data has been cached in self.data.

    Args:
        query: input query
    """

    # self.data stays None until documents have been indexed, len(self.data) would fail
    if not (self.embeddings and query and self.data):
        return

    # Hide the first table column (the DataFrame index) and left align the table
    st.markdown(
        """
        <style>
        table td:nth-child(1) {
            display: none
        }
        table th:nth-child(1) {
            display: none
        }
        table {text-align: left !important}
        </style>
        """,
        unsafe_allow_html=True,
    )

    limit = min(5, len(self.data))

    results = []
    for result in self.embeddings.search(query, limit):
        # Tuples are returned when an index doesn't have stored content
        if isinstance(result, tuple):
            uid, score = result
            results.append({"text": self.find(uid), "score": f"{score:.2f}"})
        else:
            if "id" in result and "text" in result:
                result["text"] = self.content(result.pop("id"), result["text"])

            # Fixed two decimal places; a bare ".2" is two significant digits
            if "score" in result and result["score"]:
                result["score"] = f'{result["score"]:.2f}'

            results.append(result)

    df = pd.DataFrame(results)

    # escape=False keeps links built by content() clickable
    st.write(df.to_html(escape=False), unsafe_allow_html=True)
182
+
183
def find(self, key):
    """
    Lookup record from cached data by uid key.

    Args:
        key: id to search for

    Returns:
        content reference for the first cached record with a matching id

    Raises:
        IndexError: if no cached record has this id
    """

    # Stop at the first match instead of collecting every matching row
    for uid, text, _ in self.data:
        if uid == key:
            return self.content(key, text)

    # Same exception type as indexing an empty match list, with a usable message
    raise IndexError(f"No cached record found for id {key!r}")
197
+
198
def content(self, uid, text):
    """
    Builds a content reference for uid and text.

    Args:
        uid: record id
        text: record text

    Returns:
        an HTML link to uid wrapping text when uid is a string starting with "http"
        (case-insensitive), otherwise text unchanged
    """

    # pylint: disable=C0415
    import html

    # Ids are not always strings (run assigns integer positions), only strings can be urls
    if isinstance(uid, str) and uid.lower().startswith("http"):
        # Escape the url so quotes or ampersands cannot break out of the href attribute
        href = html.escape(uid, quote=True)
        return f"<a href='{href}' rel='noopener noreferrer' target='blank'>{text}</a>"

    return text
214
+
215
+
216
+ class Application:
217
+ """
218
+ Main application.
219
+ """
220
+
221
+ def __init__(self, directory):
222
+ """
223
+ Creates a new application.
224
+ """
225
+
226
+ # Workflow configuration directory
227
+ self.directory = directory
228
 
229
def default(self, names):
    """
    Gets default workflow index.

    The workflow name is read from the "default" query parameter and matched
    case-insensitively against names.

    Args:
        names: list of workflow names

    Returns:
        default workflow index, 0 when the parameter is missing or doesn't match
    """

    # Lowercase names for case-insensitive matching
    lowered = [name.lower() for name in names]

    # Query parameter values are lists, only the first value is used
    requested = st.experimental_get_query_params().get("default")
    if requested:
        target = requested[0].lower()

        # Add 1 to account for the leading "--" entry
        if target and target in lowered:
            return lowered.index(target) + 1

    # Workflow not found, default to index 0
    return 0
254
+
255
  def load(self, components):
256
  """
257
+ Load an existing workflow file.
258
 
259
  Args:
260
  components: list of components to load
 
263
  (names of components loaded, workflow config)
264
  """
265
 
266
+ with open(os.path.join(self.directory, "config.yml"), encoding="utf-8") as f:
267
  config = yaml.safe_load(f)
268
 
269
  names = [row["name"] for row in config]
 
272
  selected = st.selectbox("Load workflow", ["--"] + names, self.default(names))
273
  if selected != "--":
274
  index = [x for x, name in enumerate(names) if name == selected][0]
275
+ with open(os.path.join(self.directory, files[index]), encoding="utf-8") as f:
276
  workflow = yaml.safe_load(f)
277
 
278
  st.markdown("---")
 
342
 
343
  return config.get(name, default) if config else default
344
 
345
+ def text(self, label, component, config, name, default=None):
346
  """
347
  Create a new text input field.
348
 
349
  Args:
350
  label: field label
351
+ component: component name
352
  config: component configuration
353
  name: setting name
354
  default: default setting value
 
365
  elif isinstance(default, dict):
366
  default = ",".join(default.keys())
367
 
368
+ return st.text_input(label, value=default, key=component + name, disabled=True)
369
 
370
+ def number(self, label, component, config, name, default=None):
371
  """
372
  Creates a new numeric input field.
373
 
374
  Args:
375
  label: field label
376
+ component: component name
377
  config: component configuration
378
  name: setting name
379
  default: default setting value
 
382
  numeric value
383
  """
384
 
385
+ value = self.text(label, component, config, name, default)
386
  return int(value) if value else None
387
 
388
+ def boolean(self, label, component, config, name, default=False):
389
  """
390
  Creates a new checkbox field.
391
 
392
  Args:
393
  label: field label
394
+ component: component name
395
  config: component configuration
396
  name: setting name
397
  default: default setting value
 
401
  """
402
 
403
  default = self.setting(config, name, default)
404
+ return st.checkbox(label, value=default, key=component + name, disabled=True)
405
 
406
+ def select(self, label, component, config, name, options, default=0):
407
  """
408
  Creates a new select box field.
409
 
410
  Args:
411
  label: field label
412
+ component: component name
413
  config: component configuration
414
  name: setting name
415
  options: list of dropdown options
 
425
  # Derive default index
426
  default = index[0] if index else default
427
 
428
+ return st.selectbox(label, options, index=default, key=component + name, disabled=True)
429
 
430
  def split(self, text):
431
  """
 
455
  # pylint: disable=R0912, R0915
456
  options = {"type": component}
457
 
 
 
458
  # Lookup component configuration
459
  # - Runtime components have config defined within tasks
460
  # - Pipeline components have config defined at workflow root
 
471
 
472
  if component == "embeddings":
473
  st.markdown("**Embeddings Index** \n*Index workflow output*")
474
+ options["path"] = self.text("Embeddings model path", component, config, "path", "sentence-transformers/nli-mpnet-base-v2")
475
+ options["upsert"] = self.boolean("Upsert", component, config, "upsert")
476
+ options["content"] = self.boolean("Content", component, config, "content")
477
 
478
  elif component in ("segmentation", "textractor"):
479
  if component == "segmentation":
 
481
  else:
482
  st.markdown("**Textract** \n*Extract text from documents*")
483
 
484
+ options["sentences"] = self.boolean("Split sentences", component, config, "sentences")
485
+ options["lines"] = self.boolean("Split lines", component, config, "lines")
486
+ options["paragraphs"] = self.boolean("Split paragraphs", component, config, "paragraphs")
487
+ options["join"] = self.boolean("Join tokenized", component, config, "join")
488
+ options["minlength"] = self.number("Min section length", component, config, "minlength")
489
 
490
  elif component == "service":
491
  st.markdown("**Service** \n*Extract data from an API*")
492
+ options["url"] = self.text("URL", component, config, "url")
493
+ options["method"] = self.select("Method", component, config, "method", ["get", "post"], 0)
494
+ options["params"] = self.text("URL parameters", component, config, "params")
495
+ options["batch"] = self.boolean("Run as batch", component, config, "batch", True)
496
+ options["extract"] = self.text("Subsection(s) to extract", component, config, "extract")
497
 
498
  if options["params"]:
499
  options["params"] = {key: None for key in self.split(options["params"])}
 
502
 
503
  elif component == "summary":
504
  st.markdown("**Summary** \n*Abstractive text summarization*")
505
+ options["path"] = self.text("Model", component, config, "path", "sshleifer/distilbart-cnn-12-6")
506
+ options["minlength"] = self.number("Min length", component, config, "minlength")
507
+ options["maxlength"] = self.number("Max length", component, config, "maxlength")
508
 
509
  elif component == "tabular":
510
  st.markdown("**Tabular** \n*Split tabular data into rows and columns*")
511
+ options["idcolumn"] = self.text("Id columns", component, config, "idcolumn")
512
+ options["textcolumns"] = self.text("Text columns", component, config, "textcolumns")
513
+ options["content"] = self.text("Content", component, config, "content")
514
+
515
  if options["textcolumns"]:
516
  options["textcolumns"] = self.split(options["textcolumns"])
517
 
518
+ if options["content"]:
519
+ options["content"] = self.split(options["content"])
520
+ if len(options["content"]) == 1 and options["content"][0] == "1":
521
+ options["content"] = options["content"][0]
522
+
523
  elif component == "translation":
524
  st.markdown("**Translate** \n*Machine translation*")
525
+ options["target"] = self.text("Target language code", component, config, "args", "en")
526
 
527
  return options
528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  def yaml(self, components):
530
  """
531
  Builds a yaml string for components.
 
537
  (workflow name, YAML string)
538
  """
539
 
 
540
  data = {"app": {"data": self.state("data"), "query": self.state("query")}}
541
  tasks = []
542
  name = None
 
584
 
585
  return (name, yaml.dump(data))
586
 
587
def data(self, workflow):
    """
    Gets input data.

    The workflow's configured "data" setting is used as-is when the workflow also
    configures a "query". Otherwise an input box is shown, pre-filled with that setting.

    Args:
        workflow: workflow configuration

    Returns:
        input data wrapped as a single element list
    """

    # Get default data setting
    data = self.appsetting(workflow, "data")
    if not self.appsetting(workflow, "query"):
        # Use an empty string when no default is configured, same as the query input
        data = st.text_input("Input", value=data if data else "")

    # Save data state
    st.session_state["data"] = data

    # Wrap data as list for workflow processing
    return [data]
608
+
609
def query(self, workflow):
    """
    Gets input query.

    Args:
        workflow: workflow configuration

    Returns:
        input query
    """

    # Pre-fill with the workflow's configured query, empty when there is none
    initial = self.appsetting(workflow, "query") or ""

    text = st.text_input("Query", value=initial)

    # Save query state
    st.session_state["query"] = text

    return text
 
 
630
 
631
def process(self, workflow, components):
    """
    Processes the current application action.

    Runs the workflow process for components against the input data, then runs a
    search when the process has an embeddings index.

    Args:
        workflow: workflow configuration
        components: workflow components
    """

    # Get workflow process
    instance = Process.get(components)

    # Run workflow process
    inputs = self.data(workflow)
    instance.run(inputs)

    # Search is only available with an embeddings index
    if not instance.embeddings:
        return

    instance.search(self.query(workflow))
649
 
650
  def run(self):
651
  """
652
  Runs Streamlit application.
653
  """
654
 
655
+ # Load configuration
656
  with st.sidebar:
657
  st.image("https://github.com/neuml/txtai/raw/master/logo.png", width=256)
658
  st.markdown("# Workflow builder \n*Build and apply workflows to data* ")
 
660
  st.markdown("---")
661
 
662
  # Component configuration
 
663
  components = ["embeddings", "segmentation", "service", "summary", "tabular", "textractor", "translation"]
664
 
665
  selected, workflow = self.load(components)
666
+ if selected:
667
+ # Get selected options
668
+ components = [self.options(component, workflow) for component in selected]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
669
 
670
  if selected:
 
 
 
671
  # Process current action
672
+ self.process(workflow, components)
 
 
 
 
 
 
 
 
 
 
673
 
674
+ with st.sidebar:
675
+ # Generate export button after workflow is complete
676
+ _, config = self.yaml(components)
677
+ st.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
678
 
679
 
680
if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Probe the sentence tokenizer. nltk raises LookupError when a required resource
    # is not installed, in which case it is downloaded.
    try:
        nltk.sent_tokenize("This is a test. Split")
    except LookupError:
        nltk.download("punkt")

    # Create and run application
    app = Application("workflows")
    app.run()