Spaces:

pragmatic-programs
/

pragmatic-synthesizer

Runtime error

App Files Files Community

saujasv commited on Nov 9, 2023

Commit

2869f1d

•

1 Parent(s): 980c3e1

make barebones gradio interface

Browse files

Files changed (4) hide show

app.py +12 -0
listener.py +119 -0
requirements.txt +0 -0
utils.py +97 -0

app.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import gradio as gr
+from listener import Listener
+listener = Listener("pragmatic-programs/pragmatic-ft-listener", {"do_sample": True, "num_return_sequences": 100, "num_beams": 1, "temperature": 1, "top_p": 0.9})
+def synthesize(context):
+    spec = [[[s[:-1], s[-1]] for s in context.split(' ')]]
+    return listener.synthesize(spec).programs[0][0]
+iface = gr.Interface(fn=synthesize, inputs="text", outputs="text")
+iface.launch()

listener.py ADDED Viewed

	@@ -0,0 +1,119 @@

+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
+from dataclasses import dataclass
+from typing import List, Optional
+from utils import get_preprocess_function, get_utterance_processing_functions, byt5_decode_batch, consistent
+from utils import PROGRAM_SPECIAL_TOKEN, UTTERANCES_SPECIAL_TOKEN, GT_PROGRAM_SPECIAL_TOKEN
+from greenery import parse
+from greenery.parse import NoMatch
+import numpy as np
+import torch
+class Agent:
+    def __init__(self,
+                model_path: str,
+                gen_config: dict,
+                inference_batch_size: int = 1,
+                ):
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.gen_config = GenerationConfig(**gen_config)
+        self.inference_batch_size = inference_batch_size
+@dataclass
+class ListenerOutput:
+    programs: List[List[str]]
+    idx: Optional[List[List[int]]] = None
+    decoded: Optional[List[List[str]]] = None
+    decoded_scores: Optional[List[List[float]]] = None
+    pruned: Optional[List[List[str]]] = None
+class Listener(Agent):
+    def __init__(self,
+        model_path,
+        gen_config,
+        inference_batch_size=4,
+        label_pos="suffix",
+        idx: bool=True,
+        program_special_token=PROGRAM_SPECIAL_TOKEN,
+        utterances_special_token=UTTERANCES_SPECIAL_TOKEN
+    ):
+        super().__init__(
+            model_path,
+            gen_config,
+            inference_batch_size,
+        )
+        self.label_pos = label_pos
+        self.idx = idx
+        self.program_special_token = program_special_token
+        self.utterances_special_token = utterances_special_token
+        self.utterances_to_string, self.string_to_utterances = (
+            get_utterance_processing_functions(
+                label_pos, idx, separator=utterances_special_token
+                )
+            )
+        self.device = self.model.device
+    def synthesize(self, context, return_scores=False, enforce_consistency=True):
+        # If context is a list of utterances, convert to string
+        if isinstance(context[0], list):
+            context_str = list(map(self.utterances_to_string, context))
+        else:
+            context_str = context
+        context_tokens = self.tokenizer(
+            [f"{self.utterances_special_token}{c}" if not c.startswith(self.utterances_special_token) else c
+            for c in context_str],
+            return_tensors="pt",
+            padding=True
+            ).to(self.device)
+        decoder_inputs = self.tokenizer(
+            [self.program_special_token for _ in context], return_tensors="pt",
+            add_special_tokens=False
+            ).to(self.device)
+        outputs = self.model.generate(**context_tokens,
+                                      decoder_input_ids=decoder_inputs.input_ids,
+                                      generation_config=self.gen_config,
+                                      return_dict_in_generate=True,
+                                      output_scores=True
+                                      )
+        decoded_batch = byt5_decode_batch(outputs.sequences.reshape((len(context), -1, outputs.sequences.shape[-1])).tolist(), skip_position_token=True, skip_special_tokens=True)
+        consistent_programs = []
+        idxs = []
+        for decoded, ctx in zip(decoded_batch, context):
+            cp = []
+            idx = []
+            for i, p in enumerate(decoded):
+                if enforce_consistency:
+                    if consistent(p, ctx):
+                        cp.append(p)
+                        idx.append(i)
+                else:
+                    cp.append(p)
+                    idx.append(i)
+            consistent_programs.append(cp)
+            idxs.append(idx)
+        logprobs = torch.stack(outputs.scores, dim=1).log_softmax(dim=-1)
+        gen_probs = torch.gather(logprobs, 2, outputs.sequences[:, 1:, None]).squeeze(-1)
+        gen_probs.masked_fill_(gen_probs.isinf(), 0)
+        scores = gen_probs.sum(-1)
+        n_decoded = scores.shape[0]
+        n_seq = n_decoded // len(context)
+        scores = scores.reshape((len(context), n_seq))
+        scores_list = scores.tolist()
+        if return_scores:
+            return ListenerOutput(
+                consistent_programs,
+                idxs,
+                decoded_batch,
+                scores_list
+                )
+        else:
+            return ListenerOutput(consistent_programs)

requirements.txt ADDED Viewed

File without changes

utils.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import regex as re
+PROGRAM_SPECIAL_TOKEN="<extra_id_124>"
+UTTERANCES_SPECIAL_TOKEN="<extra_id_123>"
+GT_PROGRAM_SPECIAL_TOKEN="<extra_id_122>"
+def consistent(rx, spec):
+    # spec is in the form of (string, '+'/'-') pairs
+    for s, label in spec:
+        if not label in ['+', '-']:
+            return None
+        try:
+            if re.fullmatch(rx, s, timeout=1):
+                if label == '-':
+                    return False
+            else:
+                if label == '+':
+                    return False
+        except re.error:
+            return None
+        except TimeoutError:
+            return None
+    return True
+def decode(c):
+    if c < 3:
+        return f"<{c}>"
+    elif c < 258:
+        return chr(c - 3)
+    else:
+        return f"<extra_id_{c - 259}>"
+def byt5_decode_batch(outputs, skip_special_tokens=True, skip_position_token=False):
+    skipped_tokens = outputs
+    if skip_special_tokens:
+        skipped_tokens = [
+            [[t for t in x if t >= 3] for x in beam]
+            for beam in skipped_tokens
+            ]
+    if skip_position_token:
+        skipped_tokens = [
+            [[t for t in x if t <= 258] for x in beam]
+            for beam in skipped_tokens
+            ]
+    return [
+        [''.join([decode(t) for t in x]) for x in beam]
+        for beam in skipped_tokens
+    ]
+def get_preprocess_function(tokenizer):
+    def preprocess_function(examples):
+        model_inputs = tokenizer(
+            [' ' if x is None else x for x in examples["context"]],
+            text_target=examples["target"],
+            truncation=True
+        )
+        return model_inputs
+    return preprocess_function
+def get_utterance_processing_functions(label_pos, idx, separator=' '):
+    if label_pos == "suffix":
+        if idx:
+            def utterances_to_string(spec):
+                return ''.join([f"<extra_id_{i}>{s}{label}" for i, (s, label) in enumerate(spec)])
+        else:
+            def utterances_to_string(spec):
+                return separator.join([f"{s}{label}" for s, label in spec])
+    else:
+        if idx:
+            def utterances_to_string(spec):
+                return ''.join([f"<extra_id_{i}>{label}{s}" for i, (s, label) in enumerate(spec)])
+        else:
+            def utterances_to_string(spec):
+                return separator.join([f"{label}{s}" for s, label in spec])
+    if label_pos == "suffix":
+        if idx:
+            def string_to_utterances(string):
+                string = re.sub(r'<extra_id_\d+>', ' ', string)
+                return [(s[:-1], s[-1]) for s in string.split(' ') if len(s) > 0]
+        else:
+            def string_to_utterances(string):
+                return [(s[:-1], s[-1]) for s in string.split(separator) if len(s) > 0]
+    else:
+        if idx:
+            def string_to_utterances(string):
+                string = re.sub(r'<extra_id_\d+>', '', string)
+                return [(s[1:], s[0]) for s in string.split(separator) if len(s) > 0]
+        else:
+            def string_to_utterances(string):
+                return [(s[1:], s[0]) for s in string.split(separator) if len(s) > 0]
+    return utterances_to_string, string_to_utterances