kasand committed on
Commit
2467132
1 Parent(s): df5d7af

test out with txtai app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -62
app.py CHANGED
@@ -1,3 +1,9 @@
 
 
 
 
 
 
1
  import os
2
 
3
  import nltk
@@ -10,42 +16,65 @@ from txtai.embeddings import Documents, Embeddings
10
  from txtai.pipeline import Segmentation, Summary, Tabular, Textractor, Translation
11
  from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
12
 
 
13
  class Process:
 
 
 
14
 
15
  @staticmethod
16
  @st.cache(ttl=60 * 60, max_entries=3, allow_output_mutation=True, show_spinner=False)
17
  def get(components, data):
18
  """
19
- Lookup or creates a new workflow process instance
 
 
 
 
 
 
 
20
  """
21
 
22
  process = Process(data)
23
 
 
24
  with st.spinner("Building workflow...."):
25
  process.build(components)
26
 
27
  return process
28
-
29
  def __init__(self, data):
30
  """
31
- Create new Process
 
 
 
32
  """
33
 
 
34
  self.components = {}
35
 
 
36
  self.pipelines = {}
37
 
38
- self. workflow = []
 
39
 
 
40
  self.embeddings = None
41
  self.documents = None
42
  self.data = data
43
 
44
  def build(self, components):
45
  """
46
- Builds a workflow using components
 
 
 
47
  """
48
 
 
49
  tasks = []
50
  for component in components:
51
  component = dict(component)
@@ -84,11 +113,14 @@ class Process:
84
 
85
  def run(self, data):
86
  """
87
- Runs a workflow using data as input
 
 
 
88
  """
89
 
90
  if data and self.workflow:
91
- # Builds tuples for embedding index
92
  if self.documents:
93
  data = [(x, element, None) for x, element in enumerate(data)]
94
 
@@ -97,7 +129,7 @@ class Process:
97
  if not self.documents:
98
  st.write(result)
99
 
100
- # Build embedding index
101
  if self.documents:
102
  # Cache data
103
  self.data = list(self.documents)
@@ -105,14 +137,18 @@ class Process:
105
  with st.spinner("Building embedding index...."):
106
  self.embeddings.index(self.documents)
107
  self.documents.close()
108
-
109
  # Clear workflow
110
  self.documents, self.pipelines, self.workflow = None, None, None
111
 
112
  def search(self, query):
113
  """
114
- Runs a search for query
 
 
 
115
  """
 
116
  if self.embeddings and query:
117
  st.markdown(
118
  """
@@ -150,31 +186,45 @@ class Process:
150
 
151
  def find(self, key):
152
  """
153
- Lookup record from cached data by uid key
 
 
 
 
 
 
154
  """
155
 
156
  # Lookup text by id
157
  text = [text for uid, text, _ in self.data if uid == key][0]
158
  return self.content(key, text)
159
-
160
  def content(self, uid, text):
161
  """
162
- Builds a content reference for uid and text
 
 
 
 
 
 
 
163
  """
164
 
165
  if uid and uid.lower().startswith("http"):
166
  return f"<a href='{uid}' rel='noopener noreferrer' target='blank'>{text}</a>"
167
-
168
  return text
169
-
 
170
  class Application:
171
  """
172
- Main application
173
  """
174
 
175
  def __init__(self, directory):
176
  """
177
- Creates a new application
178
  """
179
 
180
  # Workflow configuration directory
@@ -182,10 +232,16 @@ class Application:
182
 
183
  def default(self, names):
184
  """
185
- Gets default workflow index
 
 
 
 
 
 
186
  """
187
 
188
- # Gets names as lowercase to match case sensitive
189
  lnames = [name.lower() for name in names]
190
 
191
  # Get default workflow param
@@ -196,13 +252,19 @@ class Application:
196
  # Lookup index of workflow name, add 1 to account for "--"
197
  if index and index in lnames:
198
  return lnames.index(index) + 1
199
-
200
  # Workflow not found, default to index 0
201
  return 0
202
-
203
  def load(self, components):
204
  """
205
- Load an existing workflow file
 
 
 
 
 
 
206
  """
207
 
208
  with open(os.path.join(self.directory, "config.yml"), encoding="utf-8") as f:
@@ -231,41 +293,72 @@ class Application:
231
  selected.append("embeddings")
232
 
233
  return (selected, workflow)
234
-
235
  return (None, None)
236
-
237
  def state(self, key):
238
  """
239
- Lookup a session state variable
 
 
 
 
 
 
240
  """
241
 
242
  if key in st.session_state:
243
  return st.session_state[key]
244
-
245
  return None
246
-
247
  def appsetting(self, workflow, name):
248
  """
249
- Looks up an application configuration setting
 
 
 
 
 
 
 
250
  """
251
 
252
  if workflow:
253
  config = workflow.get("app")
254
  if config:
255
  return config.get(name)
256
-
257
  return None
258
-
259
  def setting(self, config, name, default=None):
260
  """
261
- Looks up a component configuration settings
 
 
 
 
 
 
 
 
262
  """
263
 
264
  return config.get(name, default) if config else default
265
-
266
  def text(self, label, component, config, name, default=None):
267
  """
268
- Create a new text input field
 
 
 
 
 
 
 
 
 
 
269
  """
270
 
271
  default = self.setting(config, name, default)
@@ -279,18 +372,38 @@ class Application:
279
  st.caption(label)
280
  st.code(default, language="yaml")
281
  return default
282
-
283
  def number(self, label, component, config, name, default=None):
284
  """
285
- Creates a new numeric input field
 
 
 
 
 
 
 
 
 
 
286
  """
287
 
288
  value = self.text(label, component, config, name, default)
289
  return int(value) if value else None
290
-
291
- def boolean(self, label, component, config, name, default=None):
292
  """
293
- Creates a new checkbox field
 
 
 
 
 
 
 
 
 
 
294
  """
295
 
296
  default = self.setting(config, name, default)
@@ -298,10 +411,21 @@ class Application:
298
  st.caption(label)
299
  st.markdown(":white_check_mark:" if default else ":white_large_square:")
300
  return default
301
-
302
  def select(self, label, component, config, name, options, default=0):
303
  """
304
- Creates a new select box field
 
 
 
 
 
 
 
 
 
 
 
305
  """
306
 
307
  index = self.setting(config, name)
@@ -313,24 +437,43 @@ class Application:
313
  st.caption(label)
314
  st.code(options[default], language="yaml")
315
  return options[default]
316
-
317
  def split(self, text):
318
  """
319
- Splits text on commas and returns a list
 
 
 
 
 
 
320
  """
321
 
322
  return [x.strip() for x in text.split(",")]
323
-
324
  def options(self, component, workflow, index):
325
  """
326
- Extracts component settings into a component configuration dict
 
 
 
 
 
 
 
 
327
  """
328
 
 
329
  options = {"type": component}
330
 
 
 
 
331
  config = None
332
  if workflow:
333
  if component in ["service", "translation"]:
 
334
  tasks = list(workflow["workflow"].values())[0]["tasks"]
335
  tasks = [task for task in tasks if task.get("task") == component or task.get("action") == component]
336
  if tasks:
@@ -348,12 +491,12 @@ class Application:
348
  if component == "segmentation":
349
  st.markdown(f"** {index + 1}.) Segment** \n*Split text into semantic units*")
350
  else:
351
- st.markdown(f"** {index + 1}.) Textract** \n*Extract text from documents")
352
 
353
  options["sentences"] = self.boolean("Split sentences", component, config, "sentences")
354
  options["lines"] = self.boolean("Split lines", component, config, "lines")
355
  options["paragraphs"] = self.boolean("Split paragraphs", component, config, "paragraphs")
356
- options["joint"] = self.boolean("Join tokenized", component, config, "join")
357
  options["minlength"] = self.number("Min section length", component, config, "minlength")
358
 
359
  elif component == "service":
@@ -396,10 +539,16 @@ class Application:
396
  st.markdown("---")
397
 
398
  return options
399
-
400
  def yaml(self, components):
401
  """
402
- Builds yaml string for components
 
 
 
 
 
 
403
  """
404
 
405
  data = {"app": {"data": self.state("data"), "query": self.state("query")}}
@@ -438,20 +587,26 @@ class Application:
438
 
439
  elif wtype == "textractor":
440
  data[wtype] = component
441
- tasks.append({"action": wtype, "tasks": "url"})
442
 
443
  elif wtype == "translation":
444
- data[wtype] = component
445
  tasks.append({"action": wtype, "args": list(component.values())})
446
 
447
  # Add in workflow
448
  data["workflow"] = {name: {"tasks": tasks}}
449
 
450
  return (name, yaml.dump(data))
451
-
452
  def data(self, workflow):
453
  """
454
- Gets input data
 
 
 
 
 
 
455
  """
456
 
457
  # Get default data setting
@@ -464,10 +619,17 @@ class Application:
464
 
465
  # Wrap data as list for workflow processing
466
  return [data]
467
-
468
  def query(self, workflow, index):
469
  """
470
- Gets input query
 
 
 
 
 
 
 
471
  """
472
 
473
  default = self.appsetting(workflow, "query")
@@ -480,10 +642,15 @@ class Application:
480
  st.session_state["query"] = query
481
 
482
  return query
483
-
484
  def process(self, workflow, components, index):
485
  """
486
- Processes the current application action
 
 
 
 
 
487
  """
488
 
489
  # Get input data and initialize query
@@ -502,12 +669,12 @@ class Application:
502
 
503
  def run(self):
504
  """
505
- Runs Streamlit application
506
  """
507
 
508
  with st.sidebar:
509
- st.markdown("# Workflow builder for Station \n*Build and apply workflows to data about articles* ")
510
- st.markdown("This is a demo for Station and the data used is from [Hugging Face](https://huggingface.co/datasets/ag_news/viewer/default/train).")
511
  st.markdown("---")
512
 
513
  # Component configuration
@@ -525,13 +692,15 @@ class Application:
525
  with st.sidebar:
526
  # Generate export button after workflow is complete
527
  _, config = self.yaml(components)
528
- st.download_button("Export", config, file_name="workflow.yaml", help="Export the API workflow as YAML")
529
  else:
530
- st.info("Selected a workflow from the sidebar")
 
531
 
532
  if __name__ == "__main__":
533
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
534
 
 
535
  try:
536
  nltk.sent_tokenize("This is a test. Split")
537
  except:
@@ -539,4 +708,4 @@ if __name__ == "__main__":
539
 
540
  # Create and run application
541
  app = Application("workflows")
542
- app.run()
 
1
+ """
2
+ Build txtai workflows.
3
+
4
+ Based on this example: https://github.com/neuml/txtai/blob/master/examples/workflows.py
5
+ """
6
+
7
  import os
8
 
9
  import nltk
 
16
  from txtai.pipeline import Segmentation, Summary, Tabular, Textractor, Translation
17
  from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
18
 
19
+
20
  class Process:
21
+ """
22
+ Container for an active Workflow process instance.
23
+ """
24
 
25
  @staticmethod
26
  @st.cache(ttl=60 * 60, max_entries=3, allow_output_mutation=True, show_spinner=False)
27
  def get(components, data):
28
  """
29
+ Looks up or creates a new workflow process instance.
30
+
31
+ Args:
32
+ components: input components
33
+ data: initial data, only passed when indexing
34
+
35
+ Returns:
36
+ Process
37
  """
38
 
39
  process = Process(data)
40
 
41
+ # Build workflow
42
  with st.spinner("Building workflow...."):
43
  process.build(components)
44
 
45
  return process
46
+
47
  def __init__(self, data):
48
  """
49
+ Creates a new Process.
50
+
51
+ Args:
52
+ data: initial data, only passed when indexing
53
  """
54
 
55
+ # Component options
56
  self.components = {}
57
 
58
+ # Defined pipelines
59
  self.pipelines = {}
60
 
61
+ # Current workflow
62
+ self.workflow = []
63
 
64
+ # Embeddings index params
65
  self.embeddings = None
66
  self.documents = None
67
  self.data = data
68
 
69
  def build(self, components):
70
  """
71
+ Builds a workflow using components.
72
+
73
+ Args:
74
+ components: list of components to add to workflow
75
  """
76
 
77
+ # pylint: disable=W0108
78
  tasks = []
79
  for component in components:
80
  component = dict(component)
 
113
 
114
  def run(self, data):
115
  """
116
+ Runs a workflow using data as input.
117
+
118
+ Args:
119
+ data: input data
120
  """
121
 
122
  if data and self.workflow:
123
+ # Build tuples for embedding index
124
  if self.documents:
125
  data = [(x, element, None) for x, element in enumerate(data)]
126
 
 
129
  if not self.documents:
130
  st.write(result)
131
 
132
+ # Build embeddings index
133
  if self.documents:
134
  # Cache data
135
  self.data = list(self.documents)
 
137
  with st.spinner("Building embedding index...."):
138
  self.embeddings.index(self.documents)
139
  self.documents.close()
140
+
141
  # Clear workflow
142
  self.documents, self.pipelines, self.workflow = None, None, None
143
 
144
  def search(self, query):
145
  """
146
+ Runs a search.
147
+
148
+ Args:
149
+ query: input query
150
  """
151
+
152
  if self.embeddings and query:
153
  st.markdown(
154
  """
 
186
 
187
  def find(self, key):
188
  """
189
+ Looks up a record from cached data by uid key.
190
+
191
+ Args:
192
+ key: id to search for
193
+
194
+ Returns:
195
+ text for matching id
196
  """
197
 
198
  # Lookup text by id
199
  text = [text for uid, text, _ in self.data if uid == key][0]
200
  return self.content(key, text)
201
+
202
  def content(self, uid, text):
203
  """
204
+ Builds a content reference for uid and text.
205
+
206
+ Args:
207
+ uid: record id
208
+ text: record text
209
+
210
+ Returns:
211
+ content
212
  """
213
 
214
  if uid and uid.lower().startswith("http"):
215
  return f"<a href='{uid}' rel='noopener noreferrer' target='blank'>{text}</a>"
216
+
217
  return text
218
+
219
+
220
  class Application:
221
  """
222
+ Main application.
223
  """
224
 
225
  def __init__(self, directory):
226
  """
227
+ Creates a new application.
228
  """
229
 
230
  # Workflow configuration directory
 
232
 
233
  def default(self, names):
234
  """
235
+ Gets default workflow index.
236
+
237
+ Args:
238
+ names: list of workflow names
239
+
240
+ Returns:
241
+ default workflow index
242
  """
243
 
244
+ # Lowercase the names so the lookup is case-insensitive
245
  lnames = [name.lower() for name in names]
246
 
247
  # Get default workflow param
 
252
  # Lookup index of workflow name, add 1 to account for "--"
253
  if index and index in lnames:
254
  return lnames.index(index) + 1
255
+
256
  # Workflow not found, default to index 0
257
  return 0
258
+
259
  def load(self, components):
260
  """
261
+ Loads an existing workflow file.
262
+
263
+ Args:
264
+ components: list of components to load
265
+
266
+ Returns:
267
+ (names of components loaded, workflow config)
268
  """
269
 
270
  with open(os.path.join(self.directory, "config.yml"), encoding="utf-8") as f:
 
293
  selected.append("embeddings")
294
 
295
  return (selected, workflow)
296
+
297
  return (None, None)
298
+
299
  def state(self, key):
300
  """
301
+ Looks up a session state variable.
302
+
303
+ Args:
304
+ key: variable key
305
+
306
+ Returns:
307
+ variable value
308
  """
309
 
310
  if key in st.session_state:
311
  return st.session_state[key]
312
+
313
  return None
314
+
315
  def appsetting(self, workflow, name):
316
  """
317
+ Looks up an application configuration setting.
318
+
319
+ Args:
320
+ workflow: workflow configuration
321
+ name: setting name
322
+
323
+ Returns:
324
+ app setting value
325
  """
326
 
327
  if workflow:
328
  config = workflow.get("app")
329
  if config:
330
  return config.get(name)
331
+
332
  return None
333
+
334
  def setting(self, config, name, default=None):
335
  """
336
+ Looks up a component configuration setting.
337
+
338
+ Args:
339
+ config: component configuration
340
+ name: setting name
341
+ default: default setting value
342
+
343
+ Returns:
344
+ setting value
345
  """
346
 
347
  return config.get(name, default) if config else default
348
+
349
  def text(self, label, component, config, name, default=None):
350
  """
351
+ Creates a new text input field.
352
+
353
+ Args:
354
+ label: field label
355
+ component: component name
356
+ config: component configuration
357
+ name: setting name
358
+ default: default setting value
359
+
360
+ Returns:
361
+ text input field value
362
  """
363
 
364
  default = self.setting(config, name, default)
 
372
  st.caption(label)
373
  st.code(default, language="yaml")
374
  return default
375
+
376
  def number(self, label, component, config, name, default=None):
377
  """
378
+ Creates a new numeric input field.
379
+
380
+ Args:
381
+ label: field label
382
+ component: component name
383
+ config: component configuration
384
+ name: setting name
385
+ default: default setting value
386
+
387
+ Returns:
388
+ numeric value
389
  """
390
 
391
  value = self.text(label, component, config, name, default)
392
  return int(value) if value else None
393
+
394
+ def boolean(self, label, component, config, name, default=False):
395
  """
396
+ Creates a new checkbox field.
397
+
398
+ Args:
399
+ label: field label
400
+ component: component name
401
+ config: component configuration
402
+ name: setting name
403
+ default: default setting value
404
+
405
+ Returns:
406
+ boolean value
407
  """
408
 
409
  default = self.setting(config, name, default)
 
411
  st.caption(label)
412
  st.markdown(":white_check_mark:" if default else ":white_large_square:")
413
  return default
414
+
415
  def select(self, label, component, config, name, options, default=0):
416
  """
417
+ Creates a new select box field.
418
+
419
+ Args:
420
+ label: field label
421
+ component: component name
422
+ config: component configuration
423
+ name: setting name
424
+ options: list of dropdown options
425
+ default: default setting value
426
+
427
+ Returns:
428
+ selected option value
429
  """
430
 
431
  index = self.setting(config, name)
 
437
  st.caption(label)
438
  st.code(options[default], language="yaml")
439
  return options[default]
440
+
441
  def split(self, text):
442
  """
443
+ Splits text on commas and returns a list.
444
+
445
+ Args:
446
+ text: input text
447
+
448
+ Returns:
449
+ list
450
  """
451
 
452
  return [x.strip() for x in text.split(",")]
453
+
454
  def options(self, component, workflow, index):
455
  """
456
+ Extracts component settings into a component configuration dict.
457
+
458
+ Args:
459
+ component: component type
460
+ workflow: existing workflow, can be None
461
+ index: task index
462
+
463
+ Returns:
464
+ dict with component settings
465
  """
466
 
467
+ # pylint: disable=R0912, R0915
468
  options = {"type": component}
469
 
470
+ # Lookup component configuration
471
+ # - Runtime components have config defined within tasks
472
+ # - Pipeline components have config defined at workflow root
473
  config = None
474
  if workflow:
475
  if component in ["service", "translation"]:
476
+ # Service config is found in tasks section
477
  tasks = list(workflow["workflow"].values())[0]["tasks"]
478
  tasks = [task for task in tasks if task.get("task") == component or task.get("action") == component]
479
  if tasks:
 
491
  if component == "segmentation":
492
  st.markdown(f"** {index + 1}.) Segment** \n*Split text into semantic units*")
493
  else:
494
+ st.markdown(f"** {index + 1}.) Textract** \n*Extract text from documents*")
495
 
496
  options["sentences"] = self.boolean("Split sentences", component, config, "sentences")
497
  options["lines"] = self.boolean("Split lines", component, config, "lines")
498
  options["paragraphs"] = self.boolean("Split paragraphs", component, config, "paragraphs")
499
+ options["join"] = self.boolean("Join tokenized", component, config, "join")
500
  options["minlength"] = self.number("Min section length", component, config, "minlength")
501
 
502
  elif component == "service":
 
539
  st.markdown("---")
540
 
541
  return options
542
+
543
  def yaml(self, components):
544
  """
545
+ Builds a yaml string for components.
546
+
547
+ Args:
548
+ components: list of components to export to YAML
549
+
550
+ Returns:
551
+ (workflow name, YAML string)
552
  """
553
 
554
  data = {"app": {"data": self.state("data"), "query": self.state("query")}}
 
587
 
588
  elif wtype == "textractor":
589
  data[wtype] = component
590
+ tasks.append({"action": wtype, "task": "url"})
591
 
592
  elif wtype == "translation":
593
+ data[wtype] = {}
594
  tasks.append({"action": wtype, "args": list(component.values())})
595
 
596
  # Add in workflow
597
  data["workflow"] = {name: {"tasks": tasks}}
598
 
599
  return (name, yaml.dump(data))
600
+
601
  def data(self, workflow):
602
  """
603
+ Gets input data.
604
+
605
+ Args:
606
+ workflow: workflow configuration
607
+
608
+ Returns:
609
+ input data
610
  """
611
 
612
  # Get default data setting
 
619
 
620
  # Wrap data as list for workflow processing
621
  return [data]
622
+
623
  def query(self, workflow, index):
624
  """
625
+ Gets input query.
626
+
627
+ Args:
628
+ workflow: workflow configuration
629
+ index: True if this is an indexing workflow
630
+
631
+ Returns:
632
+ input query
633
  """
634
 
635
  default = self.appsetting(workflow, "query")
 
642
  st.session_state["query"] = query
643
 
644
  return query
645
+
646
  def process(self, workflow, components, index):
647
  """
648
+ Processes the current application action.
649
+
650
+ Args:
651
+ workflow: workflow configuration
652
+ components: workflow components
653
+ index: True if this is an indexing workflow
654
  """
655
 
656
  # Get input data and initialize query
 
669
 
670
  def run(self):
671
  """
672
+ Runs Streamlit application.
673
  """
674
 
675
  with st.sidebar:
676
+ st.markdown("# Workflow builder \n*Build and apply workflows to data* ")
677
+ st.markdown("Test workflows for Station. Read more about used data on [Hugging Face](https://huggingface.co/datasets/ag_news) and in the [Docs](https://neuml.github.io/txtai/workflow/).")
678
  st.markdown("---")
679
 
680
  # Component configuration
 
692
  with st.sidebar:
693
  # Generate export button after workflow is complete
694
  _, config = self.yaml(components)
695
+ st.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
696
  else:
697
+ st.info("Select a workflow from the sidebar")
698
+
699
 
700
  if __name__ == "__main__":
701
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
702
 
703
+ # pylint: disable=W0702
704
  try:
705
  nltk.sent_tokenize("This is a test. Split")
706
  except:
 
708
 
709
  # Create and run application
710
  app = Application("workflows")
711
+ app.run()