Spaces:

anasampa2
/

parser

Runtime error

App Files Files Community

anasampa2 committed on May 1, 2024

Commit

be8596b

verified ·

1 Parent(s): ee0ec3d

Added wembedding_service folder.

Browse files

Added wembedding_server folder running in vm.

Files changed (5) hide show

wembedding_service/compute_wembeddings.py +72 -0
wembedding_service/start_wembeddings_server.py +85 -0
wembedding_service/wembeddings/__pycache__/wembeddings.cpython-37.pyc +0 -0
wembedding_service/wembeddings/wembeddings.py +183 -0
wembedding_service/wembeddings/wembeddings_server.py +118 -0

wembedding_service/compute_wembeddings.py ADDED Viewed

	@@ -0,0 +1,72 @@

+#!/usr/bin/env python3
+#
+# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
+# Mathematics and Physics, Charles University, Czech Republic.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+import sys
+import zipfile
+import numpy as np
+import wembeddings.wembeddings as wembeddings
+if __name__ == "__main__":
+    import argparse
+    # Parse arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_path", type=str, help="Input file")
+    parser.add_argument("output_npz", type=str, help="Output NPZ file")
+    parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
+    parser.add_argument("--dtype", default="float16", type=str, help="Dtype to save as")
+    parser.add_argument("--format", default="conllu", type=str, help="Input format (conllu, conll)")
+    parser.add_argument("--model", default="bert-base-multilingual-uncased-last4", type=str, help="Model name (see wembeddings.py for options)")
+    parser.add_argument("--server", default=None, type=str, help="Use given server to compute the embeddings")
+    parser.add_argument("--threads", default=4, type=int, help="Threads to use")
+    args = parser.parse_args()
+    args.dtype = getattr(np, args.dtype)
+    assert args.format in ["conll", "conllu"]
+    # Load the input file
+    sentences = []
+    with open(args.input_path, mode="r", encoding="utf-8") as input_file:
+        in_sentence = False
+        for line in input_file:
+            line = line.rstrip("\n")
+            if line:
+                if not in_sentence:
+                    sentences.append([])
+                    in_sentence = True
+                columns = line.split("\t")
+                if args.format == "conll":
+                    sentences[-1].append(columns[0])
+                elif args.format == "conllu":
+                    if columns[0].isdigit():
+                        assert len(columns) == 10
+                        sentences[-1].append(columns[1])
+            else:
+                in_sentence = False
+    print("Loaded {} sentences and {} words.".format(len(sentences), sum(map(len, sentences))), file=sys.stderr, flush=True)
+    # Initialize suitable computational class
+    if args.server is not None:
+        wembeddings = wembeddings.WEmbeddings.ClientNetwork(args.server)
+    else:
+        wembeddings = wembeddings.WEmbeddings(threads=args.threads)
+    # Compute word embeddings
+    with zipfile.ZipFile(args.output_npz, mode="w", compression=zipfile.ZIP_STORED) as output_npz:
+        for i in range(0, len(sentences), args.batch_size):
+            sentences_embeddings = wembeddings.compute_embeddings(args.model, sentences[i:i + args.batch_size])
+            for j, sentence_embeddings in enumerate(sentences_embeddings):
+                with output_npz.open("arr_{}".format(i + j), mode="w") as embeddings_file:
+                    np.save(embeddings_file, sentence_embeddings.astype(args.dtype))
+                if (i + j + 1) % 100 == 0:
+                    print("Processed {}/{} sentences.".format(i + j + 1, len(sentences)), file=sys.stderr, flush=True)
+    print("Done, all embeddings saved.", file=sys.stderr, flush=True)

wembedding_service/start_wembeddings_server.py ADDED Viewed

	@@ -0,0 +1,85 @@

+#!/usr/bin/env python3
+# coding=utf-8
+#
+# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
+# Mathematics and Physics, Charles University, Czech Republic.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+"""Word embeddings server.
+Example setup:
+$ venv/bin/python ./start_wembeddings_server.py 8000
+Example call:
+$ curl --data-binary @examples/request.json localhost:8000/wembeddings | xxd
+"""
+import signal
+import os
+import sys
+import threading
+import time
+import numpy as np
+import wembeddings.wembeddings as wembeddings
+import wembeddings.wembeddings_server as wembeddings_server
+if __name__ == "__main__":
+    # Entry point: parse the options, optionally only preload the models, and
+    # otherwise run the WEmbeddings HTTP server until a stop signal arrives.
+    import argparse
+    # Parse arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("port", type=int, help="Port to use")
+    parser.add_argument("--dtype", default="float16", type=str, help="Dtype to serve the embeddings as")
+    parser.add_argument("--logfile", default=None, type=str, help="Log path")
+    parser.add_argument("--preload_models", default=[], nargs="*", type=str, help="Models to preload, or `all`")
+    parser.add_argument("--preload_only", default=False, action="store_true",  help="Only preload models and exit")
+    parser.add_argument("--threads", default=4, type=int, help="Threads to use")
+    args = parser.parse_args()
+    # Turn the dtype name into the corresponding numpy attribute (e.g. np.float16)
+    args.dtype = getattr(np, args.dtype)
+    # Log stderr to logfile if given (opened in append mode; the file is
+    # never explicitly closed by this script)
+    if args.logfile is not None:
+        sys.stderr = open(args.logfile, "a", encoding="utf-8")
+    # Lambda to create the WEmbeddings instance; construction is deferred so
+    # it happens either right below (preload only) or inside the server
+    wembeddings_lambda = lambda: wembeddings.WEmbeddings(threads=args.threads, preload_models=args.preload_models)
+    if args.preload_only:
+        print("Preloading models only.", file=sys.stderr)
+        wembeddings_lambda()
+        sys.exit(0)
+    # Create the server and its own thread; the main thread is kept free to
+    # wait for the stop signal
+    server = wembeddings_server.WEmbeddingsServer(args.port, args.dtype, wembeddings_lambda)
+    server_thread = threading.Thread(target=server.serve_forever, daemon=True)
+    server_thread.start()
+    print("Starting WEmbeddings server on port {}.".format(args.port), file=sys.stderr)
+    print("To stop it gracefully, either send SIGINT (Ctrl+C) or SIGUSR1.", file=sys.stderr, flush=True)
+    def shutdown():
+        # Stop the serve_forever loop first, then close the server, logging
+        # each step to stderr.
+        print("Initiating shutdown of the WEmbeddings server.", file=sys.stderr, flush=True)
+        server.shutdown()
+        print("Stopped handling new requests, processing all current ones.", file=sys.stderr, flush=True)
+        server.server_close()
+        print("Finished shutdown of the WEmbeddings server.", file=sys.stderr, flush=True)
+    # Serve
+    if os.name != 'nt':
+        # Wait for one of the signals on Posix systems: both signals are
+        # blocked in this thread and then awaited synchronously.
+        signal.pthread_sigmask(signal.SIG_BLOCK, [signal.SIGINT, signal.SIGUSR1])
+        signal.sigwait([signal.SIGINT, signal.SIGUSR1])
+        shutdown()
+    else:
+        # On Windows, allow interruption with Ctrl+C -- for testing only.
+        def signal_handler(sig, frame):
+            shutdown()
+            sys.exit(0)
+        signal.signal(signal.SIGINT, signal_handler)
+        # Keep the main thread alive; the SIGINT handler above ends the process
+        while True:
+            time.sleep(1)

wembedding_service/wembeddings/__pycache__/wembeddings.cpython-37.pyc ADDED Viewed

Binary file (6.5 kB). View file

wembedding_service/wembeddings/wembeddings.py ADDED Viewed

	@@ -0,0 +1,183 @@

+#!/usr/bin/env python3
+# coding=utf-8
+#
+# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
+# Mathematics and Physics, Charles University, Czech Republic.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+"""Word embeddings computation class."""
+import json
+import sys
+import time
+import urllib.request
+import numpy as np
+class WEmbeddings:
+    """Class to keep multiple constructed word embedding computation models."""
+    MODELS_MAP = {
+        # Key: model name. Value: transformer model name, layer start, layer end.
+        "bert-base-multilingual-uncased-last4": ("bert-base-multilingual-uncased", -4, None),
+        "robeczech-base-last4": ("ufal/robeczech-base", -4, None),
+        "xlm-roberta-base-last4": ("xlm-roberta-base", -4, None),
+        "bert-large-portuguese-cased-last4":("neuralmind/bert-large-portuguese-cased", -4, None),
+        "bert-base-portuguese-cased-last4":("neuralmind/bert-base-portuguese-cased", -4, None),
+    }
+    MAX_SUBWORDS_PER_SENTENCE = 510
+    class _Model:
+        """Construct a tokenizer and transformers model graph."""
+        def __init__(self, transformers_model, layer_start, layer_end, loader_lock):
+            self._model_loaded = False
+            self._transformers_model_name = transformers_model
+            self._layer_start = layer_start
+            self._layer_end = layer_end
+            self._loader_lock = loader_lock
+        def load(self):
+            if self._model_loaded: return
+            with self._loader_lock:
+                import tensorflow as tf
+                import transformers
+                if self._model_loaded: return
+                self.tokenizer = transformers.AutoTokenizer.from_pretrained(self._transformers_model_name, use_fast=True)
+                self._transformers_model = transformers.TFAutoModel.from_pretrained(
+                    self._transformers_model_name,
+                    config=transformers.AutoConfig.from_pretrained(self._transformers_model_name, output_hidden_states=True),
+                    from_pt=True
+                )
+                def compute_embeddings(subwords, segments):
+                    subword_embeddings_layers = self._transformers_model(
+                        (tf.maximum(subwords, 0), tf.cast(tf.not_equal(subwords, -1), tf.int32))
+                    ).hidden_states
+                    subword_embeddings = tf.math.reduce_mean(subword_embeddings_layers[self._layer_start:self._layer_end], axis=0)
+                    # Average subwords (word pieces) word embeddings for each token
+                    def average_subwords(embeddings_and_segments):
+                        subword_embeddings, segments = embeddings_and_segments
+                        return tf.math.segment_mean(subword_embeddings, segments)
+                    word_embeddings = tf.map_fn(average_subwords, (subword_embeddings[:, 1:], segments), dtype=tf.float32)[:, :-1]
+                    return word_embeddings
+                self.compute_embeddings = tf.function(compute_embeddings).get_concrete_function(
+                    tf.TensorSpec(shape=[None, None], dtype=tf.int32), tf.TensorSpec(shape=[None, None], dtype=tf.int32)
+                )
+                self._model_loaded = True
+    def __init__(self, max_form_len=64, threads=None, preload_models=[]):
+        import tensorflow as tf
+        import threading
+        # Impose the limit on the number of threads, if given
+        if threads is not None:
+            tf.config.threading.set_inter_op_parallelism_threads(threads)
+            tf.config.threading.set_intra_op_parallelism_threads(threads)
+        self._max_form_len = max_form_len
+        loader_lock = threading.Lock()
+        self._models = {}
+        for model_name, (transformers_model, layer_start, layer_end) in self.MODELS_MAP.items():
+            self._models[model_name] = self._Model(transformers_model, layer_start, layer_end, loader_lock)
+            if model_name in preload_models or "all" in preload_models:
+                self._models[model_name].load()
+    def compute_embeddings(self, model, sentences):
+        """Computes word embeddings.
+        Arguments:
+            model: one of the keys of self.MODELS_MAP.
+            sentences: 2D Python array with sentences with tokens (strings).
+        Returns:
+            embeddings as a Python list of 1D Numpy arrays
+        """
+        if model not in self._models:
+            print("No such WEmbeddings model {}".format(model), file=sys.stderr, flush=True)
+        embeddings = []
+        if sentences:
+            model = self._models[model]
+            model.load()
+            time_tokenization = time.time()
+            sentences_subwords = model.tokenizer(
+                [(" " if i else "") + word[:self._max_form_len] for sentence in sentences for i, word in enumerate(sentence)],
+                add_special_tokens=False
+            ).input_ids
+            subwords, segments, parts = [], [], []
+            for sentence in sentences:
+                segments.append([])
+                subwords.append([])
+                parts.append([0])
+                sentence_subwords, sentences_subwords = sentences_subwords[:len(sentence)], sentences_subwords[len(sentence):]
+                for word_subwords in sentence_subwords:
+                    # Split sentences with too many subwords
+                    if len(subwords[-1]) + len(word_subwords) > self.MAX_SUBWORDS_PER_SENTENCE:
+                        subwords[-1] = model.tokenizer.build_inputs_with_special_tokens(subwords[-1])
+                        segments.append([])
+                        subwords.append([])
+                        parts[-1].append(0)
+                    segments[-1].extend([parts[-1][-1]] * len(word_subwords))
+                    subwords[-1].extend(word_subwords)
+                    parts[-1][-1] += 1
+                subwords[-1] = model.tokenizer.build_inputs_with_special_tokens(subwords[-1])
+            max_sentence_len = max(len(sentence) for sentence in sentences)
+            max_subwords = max(len(sentence) for sentence in subwords)
+            time_embeddings = time.time()
+            np_subwords = np.full([len(subwords), max_subwords], -1, np.int32)
+            for i, subword in enumerate(subwords):
+                np_subwords[i, :len(subword)] = subword
+            np_segments = np.full([len(segments), max_subwords - 1], max_sentence_len, np.int32)
+            for i, segment in enumerate(segments):
+                np_segments[i, :len(segment)] = segment
+            embeddings_with_parts = model.compute_embeddings(np_subwords, np_segments).numpy()
+            # Concatenate splitted sentences
+            current_sentence_part = 0
+            for sentence_parts in parts:
+                embeddings.append(np.concatenate(
+                    [embeddings_with_parts[current_sentence_part + i, :sentence_part] for i, sentence_part in enumerate(sentence_parts)],
+                    axis=0))
+                current_sentence_part += len(sentence_parts)
+            print("WEmbeddings in {:.1f}ms,".format(1000 * (time.time() - time_embeddings)),
+                  "tokenization in {:.1f}ms,".format(1000*(time_embeddings - time_tokenization)),
+                  "batch {},".format(len(sentences)),
+                  "max sentence len {},".format(max_sentence_len),
+                  "max subwords {}.".format(max_subwords),
+                  file=sys.stderr, flush=True)
+        return embeddings
+    class ClientNetwork:
+        def __init__(self, url):
+            self._url = url
+        def compute_embeddings(self, model, sentences):
+            with urllib.request.urlopen(
+                    "http://{}/wembeddings".format(self._url),
+                    data=json.dumps({"model": model, "sentences": sentences}, ensure_ascii=True).encode("ascii"),
+            ) as response:
+                embeddings = []
+                for _ in sentences:
+                    embeddings.append(np.lib.format.read_array(response, allow_pickle=False))
+                return embeddings

wembedding_service/wembeddings/wembeddings_server.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#!/usr/bin/env python3
+# coding=utf-8
+#
+# Copyright 2020 Institute of Formal and Applied Linguistics, Faculty of
+# Mathematics and Physics, Charles University, Czech Republic.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+"""Word embeddings server class."""
+import http.server
+import json
+import socketserver
+import os
+import sys
+import threading
+import urllib.parse
+import numpy as np
+class WEmbeddingsServer(socketserver.ThreadingTCPServer):
+    class WEmbeddingsRequestHandler(http.server.BaseHTTPRequestHandler):
+        protocol_version = "HTTP/1.1"
+        def respond(request, content_type, code=200):
+            request.close_connection = True
+            request.send_response(code)
+            request.send_header("Connection", "close")
+            request.send_header("Content-Type", content_type)
+            request.send_header("Access-Control-Allow-Origin", "*")
+            request.end_headers()
+        def respond_error(request, message, code=400):
+            request.respond("text/plain", code)
+            request.wfile.write(message.encode("utf-8"))
+        def do_POST(request):
+            try:
+                request.path = request.path.encode("iso-8859-1").decode("utf-8")
+                url = urllib.parse.urlparse(request.path)
+            except:
+                return request.respond_error("Cannot parse request URL.")
+            # Handle /wembeddings
+            if url.path == "/wembeddings":
+                if request.headers.get("Transfer-Encoding", "identity").lower() != "identity":
+                    return request.respond_error("Only 'identity' Transfer-Encoding of payload is supported for now.")
+                if "Content-Length" not in request.headers:
+                    return request.respond_error("The Content-Length of payload is required.")
+                try:
+                    length = int(request.headers["Content-Length"])
+                    data = json.loads(request.rfile.read(length))
+                    model, sentences = data["model"], data["sentences"]
+                except:
+                    import traceback
+                    traceback.print_exc(file=sys.stderr)
+                    sys.stderr.flush()
+                    return request.respond_error("Malformed request.")
+                try:
+                    with request.server._wembeddings_mutex:
+                        sentences_embeddings = request.server._wembeddings.compute_embeddings(model, sentences)
+                except:
+                    import traceback
+                    traceback.print_exc(file=sys.stderr)
+                    sys.stderr.flush()
+                    return request.respond_error("An error occurred during wembeddings computation.")
+                request.respond("application/octet_stream")
+                for sentence_embedding in sentences_embeddings:
+                    np.lib.format.write_array(request.wfile, sentence_embedding.astype(request.server._dtype), allow_pickle=False)
+            # URL not found
+            else:
+                request.respond_error("No handler for the given URL '{}'".format(url.path), code=404)
+        def do_GET(request):
+            try:
+                request.path = request.path.encode("iso-8859-1").decode("utf-8")
+                url = urllib.parse.urlparse(request.path)
+            except:
+                return request.respond_error("Cannot parse request URL.")
+            if url.path == "/status":
+                request.respond("application/json")
+                request.wfile.write(bytes("""{"status": "UP"}""", "utf-8"))
+            # URL not found
+            else:
+                request.respond_error("No handler for the given URL '{}'".format(url.path), code=404)
+    daemon_threads = False
+    def __init__(self, port, dtype, wembeddings_lambda):
+        self._dtype = dtype
+        # Create the WEmbeddings object its mutex
+        self._wembeddings = wembeddings_lambda()
+        self._wembeddings_mutex = threading.Lock()
+        # Initialize the server
+        super().__init__(("", port), self.WEmbeddingsRequestHandler)
+    def server_bind(self):
+        import socket
+        self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        if os.name != 'nt':
+            self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1)
+        super().server_bind()
+    def service_actions(self):
+        if isinstance(getattr(self, "_threads", None), list):
+            if len(self._threads) >= 1024:
+                self._threads = [thread for thread in self._threads if thread.is_alive()]