Spaces:

K2K5
/

Station_Demo

Runtime error

App Files Files Community

kasand committed on Jun 6, 2023

Commit

2467132

•

1 Parent(s): df5d7af

test out with txtai app.py

Browse files

Files changed (1) hide show

app.py +231 -62

app.py CHANGED Viewed

@@ -1,3 +1,9 @@
 import os
 import nltk
@@ -10,42 +16,65 @@ from txtai.embeddings import Documents, Embeddings
 from txtai.pipeline import Segmentation, Summary, Tabular, Textractor, Translation
 from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
 class Process:
     @staticmethod
     @st.cache(ttl=60 * 60, max_entries=3, allow_output_mutation=True, show_spinner=False)
     def get(components, data):
         """
-        Lookup or creates a new workflow process instance
         """
         process = Process(data)
         with st.spinner("Building workflow...."):
             process.build(components)
         return process
     def __init__(self, data):
         """
-        Create new Process
         """
         self.components = {}
         self.pipelines = {}
-        self. workflow = []
         self.embeddings = None
         self.documents = None
         self.data = data
     def build(self, components):
         """
-        Builds a workflow using components
         """
         tasks = []
         for component in components:
             component = dict(component)
@@ -84,11 +113,14 @@ class Process:
     def run(self, data):
         """
-        Runs a workflow using data as input
         """
         if data and self.workflow:
-            # Builds tuples for embedding index
             if self.documents:
                 data = [(x, element, None) for x, element in enumerate(data)]
@@ -97,7 +129,7 @@ class Process:
                 if not self.documents:
                     st.write(result)
-            # Build embedding index
             if self.documents:
                 # Cache data
                 self.data = list(self.documents)
@@ -105,14 +137,18 @@ class Process:
                 with st.spinner("Building embedding index...."):
                     self.embeddings.index(self.documents)
                     self.documents.close()
                 # Clear workflow
                 self.documents, self.pipelines, self.workflow = None, None, None
     def search(self, query):
         """
-        Runs a search for query
         """
         if self.embeddings and query:
             st.markdown(
                 """
@@ -150,31 +186,45 @@ class Process:
     def find(self, key):
         """
-        Lookup record from cached data by uid key
         """
         # Lookup text by id
         text = [text for uid, text, _ in self.data if uid == key][0]
         return self.content(key, text)
     def content(self, uid, text):
         """
-        Builds a content reference for uid and text
         """
         if uid and uid.lower().startswith("http"):
             return f"<a href='{uid}' rel='noopener noreferrer' target='blank'>{text}</a>"
         return text
 class Application:
     """
-    Main application
     """
     def __init__(self, directory):
         """
-        Creates a new application
         """
         # Workflow configuration directory
@@ -182,10 +232,16 @@ class Application:
     def default(self, names):
         """
-        Gets default workflow index
         """
-        # Gets names as lowercase to match case sensitive
         lnames = [name.lower() for name in names]
         # Get default workflow param
@@ -196,13 +252,19 @@ class Application:
         # Lookup index of workflow name, add 1 to account for "--"
         if index and index in lnames:
             return lnames.index(index) + 1
         # Workflow not found, default to index 0
         return 0
     def load(self, components):
         """
-        Load an existing workflow file
         """
         with open(os.path.join(self.directory, "config.yml"), encoding="utf-8") as f:
@@ -231,41 +293,72 @@ class Application:
                     selected.append("embeddings")
             return (selected, workflow)
         return (None, None)
     def state(self, key):
         """
-        Lookup a session state variable
         """
         if key in st.session_state:
             return st.session_state[key]
         return None
     def appsetting(self, workflow, name):
         """
-        Looks up an application configuration setting
         """
         if workflow:
             config = workflow.get("app")
             if config:
                 return config.get(name)
         return None
     def setting(self, config, name, default=None):
         """
-        Looks up a component configuration settings
         """
         return config.get(name, default) if config else default
     def text(self, label, component, config, name, default=None):
         """
-        Create a new text input field
         """
         default = self.setting(config, name, default)
@@ -279,18 +372,38 @@ class Application:
         st.caption(label)
         st.code(default, language="yaml")
         return default
     def number(self, label, component, config, name, default=None):
         """
-        Creates a new numeric input field
         """
         value = self.text(label, component, config, name, default)
         return int(value) if value else None
-    def boolean(self, label, component, config, name, default=None):
         """
-        Creates a new checkbox field
         """
         default = self.setting(config, name, default)
@@ -298,10 +411,21 @@ class Application:
         st.caption(label)
         st.markdown(":white_check_mark:" if default else ":white_large_square:")
         return default
     def select(self, label, component, config, name, options, default=0):
         """
-        Creates a new select box field
         """
         index = self.setting(config, name)
@@ -313,24 +437,43 @@ class Application:
         st.caption(label)
         st.code(options[default], language="yaml")
         return options[default]
     def split(self, text):
         """
-        Splits text on commas and returns a list
         """
         return [x.strip() for x in text.split(",")]
     def options(self, component, workflow, index):
         """
-        Extracts component settings into a component configuration dict
         """
         options = {"type": component}
         config = None
         if workflow:
             if component in ["service", "translation"]:
                 tasks = list(workflow["workflow"].values())[0]["tasks"]
                 tasks = [task for task in tasks if task.get("task") == component or task.get("action") == component]
                 if tasks:
@@ -348,12 +491,12 @@ class Application:
             if component == "segmentation":
                 st.markdown(f"** {index + 1}.) Segment**  \n*Split text into semantic units*")
             else:
-                st.markdown(f"** {index + 1}.) Textract**  \n*Extract text from documents")
             options["sentences"] = self.boolean("Split sentences", component, config, "sentences")
             options["lines"] = self.boolean("Split lines", component, config, "lines")
             options["paragraphs"] = self.boolean("Split paragraphs", component, config, "paragraphs")
-            options["joint"] = self.boolean("Join tokenized", component, config, "join")
             options["minlength"] = self.number("Min section length", component, config, "minlength")
         elif component == "service":
@@ -396,10 +539,16 @@ class Application:
         st.markdown("---")
         return options
     def yaml(self, components):
         """
-        Builds yaml string for components
         """
         data = {"app": {"data": self.state("data"), "query": self.state("query")}}
@@ -438,20 +587,26 @@ class Application:
             elif wtype == "textractor":
                 data[wtype] = component
-                tasks.append({"action": wtype, "tasks": "url"})
             elif wtype == "translation":
-                data[wtype] = component
                 tasks.append({"action": wtype, "args": list(component.values())})
         # Add in workflow
         data["workflow"] = {name: {"tasks": tasks}}
         return (name, yaml.dump(data))
     def data(self, workflow):
         """
-        Gets input data
         """
         # Get default data setting
@@ -464,10 +619,17 @@ class Application:
         # Wrap data as list for workflow processing
         return [data]
     def query(self, workflow, index):
         """
-        Gets input query
         """
         default = self.appsetting(workflow, "query")
@@ -480,10 +642,15 @@ class Application:
         st.session_state["query"] = query
         return query
     def process(self, workflow, components, index):
         """
-        Processes the current application action
         """
         # Get input data and initialize query
@@ -502,12 +669,12 @@ class Application:
     def run(self):
         """
-        Runs Streamlit application
         """
         with st.sidebar:
-            st.markdown("# Workflow builder for Station  \n*Build and apply workflows to data about articles*  ")
-            st.markdown("This is a demo for Station and the data used is from [Hugging Face](https://huggingface.co/datasets/ag_news/viewer/default/train).")
             st.markdown("---")
             # Component configuration
@@ -525,13 +692,15 @@ class Application:
             with st.sidebar:
                 # Generate export button after workflow is complete
                 _, config = self.yaml(components)
-                st.download_button("Export", config, file_name="workflow.yaml", help="Export the API workflow as YAML")
         else:
-            st.info("Selected a workflow from the sidebar")
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     try:
         nltk.sent_tokenize("This is a test. Split")
     except:
@@ -539,4 +708,4 @@ if __name__ == "__main__":
     # Create and run application
     app = Application("workflows")
-    app.run()

+"""
+Build txtai workflows.
+Based on this example: https://github.com/neuml/txtai/blob/master/examples/workflows.py
+"""
 import os
 import nltk
 from txtai.pipeline import Segmentation, Summary, Tabular, Textractor, Translation
 from txtai.workflow import ServiceTask, Task, UrlTask, Workflow
 class Process:
+    """
+    Container for an active Workflow process instance.
+    """
     @staticmethod
     @st.cache(ttl=60 * 60, max_entries=3, allow_output_mutation=True, show_spinner=False)
     def get(components, data):
         """
         Creates a Process for data and builds its workflow from components.

         The function is wrapped with st.cache (one hour ttl, at most three entries,
         output mutation allowed), so the returned instance may come from that cache.

         Args:
             components: input components
             data: initial data, only passed when indexing

         Returns:
             Process
         """

         # NOTE(review): st.cache is deprecated in newer Streamlit releases in favour of
         # st.cache_resource / st.cache_data - confirm against the pinned Streamlit version.
         instance = Process(data)

         # Assemble the workflow while a spinner is displayed
         with st.spinner("Building workflow...."):
             instance.build(components)

         return instance
     def __init__(self, data):
         """
+        Creates a new Process.
+        Args:
+            data: initial data, only passed when indexing
         """
+        # Component options
         self.components = {}
+        # Defined pipelines
         self.pipelines = {}
+        # Current workflow
+        self.workflow = []
+        # Embeddings index params
         self.embeddings = None
         self.documents = None
         self.data = data
     def build(self, components):
         """
+        Builds a workflow using components.
+        Args:
+            components: list of components to add to workflow
         """
+        # pylint: disable=W0108
         tasks = []
         for component in components:
             component = dict(component)
     def run(self, data):
         """
+        Runs a workflow using data as input.
+        Args:
+            data: input data
         """
         if data and self.workflow:
+            # Build tuples for embedding index
             if self.documents:
                 data = [(x, element, None) for x, element in enumerate(data)]
                 if not self.documents:
                     st.write(result)
+            # Build embeddings index
             if self.documents:
                 # Cache data
                 self.data = list(self.documents)
                 with st.spinner("Building embedding index...."):
                     self.embeddings.index(self.documents)
                     self.documents.close()
                 # Clear workflow
                 self.documents, self.pipelines, self.workflow = None, None, None
     def search(self, query):
         """
+        Runs a search.
+        Args:
+            query: input query
         """
         if self.embeddings and query:
             st.markdown(
                 """
     def find(self, key):
         """
+        Lookup record from cached data by uid key.
+        Args:
+            key: id to search for
+        Returns:
+            text for matching id
         """
         # Lookup text by id
         text = [text for uid, text, _ in self.data if uid == key][0]
         return self.content(key, text)
     def content(self, uid, text):
         """
         Builds a content reference for uid and text.

         A string uid that starts with "http" (case-insensitive) is rendered as an HTML
         anchor wrapping text. Any other uid - None, an empty string, or a non-string id
         such as an integer position - returns text unchanged.

         Args:
             uid: record id
             text: record text

         Returns:
             anchor markup when uid is a string starting with "http", otherwise text
         """

         # Ids are not guaranteed to be strings (records can be keyed by enumerate()
         # positions), so check the type before calling str methods rather than failing
         # with AttributeError on a truthy non-string id
         if isinstance(uid, str) and uid.lower().startswith("http"):
             return f"<a href='{uid}' rel='noopener noreferrer' target='blank'>{text}</a>"

         return text
 class Application:
     """
+    Main application.
     """
     def __init__(self, directory):
         """
+        Creates a new application.
         """
         # Workflow configuration directory
     def default(self, names):
         """
+        Gets default workflow index.
+        Args:
+            names: list of workflow names
+        Returns:
+           default workflow index
         """
+        # Get names as lowercase to match case-insensitive
         lnames = [name.lower() for name in names]
         # Get default workflow param
         # Lookup index of workflow name, add 1 to account for "--"
         if index and index in lnames:
             return lnames.index(index) + 1
         # Workflow not found, default to index 0
         return 0
     def load(self, components):
         """
+        Load an existing workflow file.
+        Args:
+            components: list of components to load
+        Returns:
+            (names of components loaded, workflow config)
         """
         with open(os.path.join(self.directory, "config.yml"), encoding="utf-8") as f:
                     selected.append("embeddings")
             return (selected, workflow)
         return (None, None)
     def state(self, key):
         """
         Reads a variable from the Streamlit session state.

         Args:
             key: variable key

         Returns:
             variable value, None when key is not present in the session state
         """

         return st.session_state[key] if key in st.session_state else None
     def appsetting(self, workflow, name):
         """
         Reads a setting from the "app" section of a workflow configuration.

         Args:
             workflow: workflow configuration
             name: setting name

         Returns:
             app setting value, None when the workflow, its "app" section or the
             setting itself is missing or empty
         """

         # No workflow configuration, nothing to look up
         if not workflow:
             return None

         section = workflow.get("app")
         return section.get(name) if section else None
     def setting(self, config, name, default=None):
         """
         Reads a single setting from a component configuration.

         Args:
             config: component configuration
             name: setting name
             default: default setting value

         Returns:
             setting value, default when config is empty/None or has no such setting
         """

         # An empty or missing configuration always resolves to the default
         if not config:
             return default

         return config.get(name, default)
     def text(self, label, component, config, name, default=None):
         """
+        Create a new text input field.
+        Args:
+            label: field label
+            component: component name
+            config: component configuration
+            name: setting name
+            default: default setting value
+        Returns:
+            text input field value
         """
         default = self.setting(config, name, default)
         st.caption(label)
         st.code(default, language="yaml")
         return default
     def number(self, label, component, config, name, default=None):
         """
         Creates a numeric field by delegating to text() and converting the result.

         Args:
             label: field label
             component: component name
             config: component configuration
             name: setting name
             default: default setting value

         Returns:
             int(value) for a truthy field value, None otherwise
         """

         raw = self.text(label, component, config, name, default)

         # Falsy values (None, empty string, 0) are reported as "not set"
         if not raw:
             return None

         return int(raw)
+    def boolean(self, label, component, config, name, default=False):
         """
+        Creates a new checkbox field.
+        Args:
+            label: field label
+            component: component name
+            config: component configuration
+            name: setting name
+            default: default setting value
+        Returns:
+            boolean value
         """
         default = self.setting(config, name, default)
         st.caption(label)
         st.markdown(":white_check_mark:" if default else ":white_large_square:")
         return default
     def select(self, label, component, config, name, options, default=0):
         """
+        Creates a new select box field.
+        Args:
+            label: field label
+            component: component name
+            config: component configuration
+            name: setting name
+            options: list of dropdown options
+            default: default setting value
+        Returns:
+            selected option value
         """
         index = self.setting(config, name)
         st.caption(label)
         st.code(options[default], language="yaml")
         return options[default]
     def split(self, text):
         """
         Breaks comma-separated text into a list of whitespace-trimmed items.

         Args:
             text: input text

         Returns:
             list of items, empty items between consecutive commas are kept
         """

         return list(map(str.strip, text.split(",")))
     def options(self, component, workflow, index):
         """
+        Extracts component settings into a component configuration dict.
+        Args:
+            component: component type
+            workflow: existing workflow, can be None
+            index: task index
+        Returns:
+            dict with component settings
         """
+        # pylint: disable=R0912, R0915
         options = {"type": component}
+        # Lookup component configuration
+        #   - Runtime components have config defined within tasks
+        #   - Pipeline components have config defined at workflow root
         config = None
         if workflow:
             if component in ["service", "translation"]:
+                # Service config is found in tasks section
                 tasks = list(workflow["workflow"].values())[0]["tasks"]
                 tasks = [task for task in tasks if task.get("task") == component or task.get("action") == component]
                 if tasks:
             if component == "segmentation":
                 st.markdown(f"** {index + 1}.) Segment**  \n*Split text into semantic units*")
             else:
+                st.markdown(f"** {index + 1}.) Textract**  \n*Extract text from documents*")
             options["sentences"] = self.boolean("Split sentences", component, config, "sentences")
             options["lines"] = self.boolean("Split lines", component, config, "lines")
             options["paragraphs"] = self.boolean("Split paragraphs", component, config, "paragraphs")
+            options["join"] = self.boolean("Join tokenized", component, config, "join")
             options["minlength"] = self.number("Min section length", component, config, "minlength")
         elif component == "service":
         st.markdown("---")
         return options
     def yaml(self, components):
         """
+        Builds a yaml string for components.
+        Args:
+            components: list of components to export to YAML
+        Returns:
+            (workflow name, YAML string)
         """
         data = {"app": {"data": self.state("data"), "query": self.state("query")}}
             elif wtype == "textractor":
                 data[wtype] = component
+                tasks.append({"action": wtype, "task": "url"})
             elif wtype == "translation":
+                data[wtype] = {}
                 tasks.append({"action": wtype, "args": list(component.values())})
         # Add in workflow
         data["workflow"] = {name: {"tasks": tasks}}
         return (name, yaml.dump(data))
     def data(self, workflow):
         """
+        Gets input data.
+        Args:
+            workflow: workflow configuration
+        Returns:
+            input data
         """
         # Get default data setting
         # Wrap data as list for workflow processing
         return [data]
     def query(self, workflow, index):
         """
+        Gets input query.
+        Args:
+            workflow: workflow configuration
+            index: True if this is an indexing workflow
+        Returns:
+            input query
         """
         default = self.appsetting(workflow, "query")
         st.session_state["query"] = query
         return query
     def process(self, workflow, components, index):
         """
+        Processes the current application action.
+        Args:
+            workflow: workflow configuration
+            components: workflow components
+            index: True if this is an indexing workflow
         """
         # Get input data and initialize query
     def run(self):
         """
+        Runs Streamlit application.
         """
         with st.sidebar:
+            st.markdown("# Workflow builder  \n*Build and apply workflows to data*  ")
+            st.markdown("Test workflows for Station. Read more about used data on [Hugging Face](https://huggingface.co/datasets/ag_news) and in the [Docs](https://neuml.github.io/txtai/workflow/).")
             st.markdown("---")
             # Component configuration
             with st.sidebar:
                 # Generate export button after workflow is complete
                 _, config = self.yaml(components)
+                st.download_button("Export", config, file_name="workflow.yml", help="Export the API workflow as YAML")
         else:
+            st.info("Select a workflow from the sidebar")
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    # pylint: disable=W0702
     try:
         nltk.sent_tokenize("This is a test. Split")
     except:
     # Create and run application
     app = Application("workflows")
+    app.run()