Spaces:

chicham
/

query_analysis

Runtime error

App Files Files Community

Hicham Randrianarivo committed on Dec 30, 2021

Commit

3b2a392

•

1 Parent(s): 8858fdf

init

Browse files

Files changed (7) hide show

.gitignore +2 -0
app.py +290 -0
flagged/log.csv +3 -0
models/lid.176.ftz +3 -0
requirements.in +13 -0
requirements.txt +230 -0
tox.ini +4 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ .envrc
2	+ .pytype/

app.py ADDED Viewed

	@@ -0,0 +1,290 @@

+"""Demo gradio app for some text/query augmentation."""
+from __future__ import annotations
+from collections import defaultdict
+import functools
+from itertools import chain
+from typing import Any, Callable, Mapping, Optional, Sequence, Tuple
+import attr
+import environ
+import fasttext  # not working with python3.9
+import gradio as gr
+from transformers.pipelines import pipeline
+from transformers.pipelines.base import Pipeline
+from transformers.pipelines.token_classification import AggregationStrategy
+from tokenizers.pre_tokenizers import Whitespace
+def compose(*functions) -> Callable:
+    """
+    Compose functions.
+        Args:
+            functions: functions to compose.
+        Returns:
+            Composed functions.
+    """
+    def apply(f, g):
+        return lambda x: f(g(x))
+    return functools.reduce(apply, functions[::-1], lambda x: x)
+def mapped(fn) -> Callable:
+    """
+    Decorator to apply map/filter to a function
+    """
+    def inner(func):
+        partial_fn = functools.partial(fn, func)
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            return partial_fn(*args, **kwargs)
+        return wrapper
+    return inner
+@attr.frozen
+class Prediction:
+    """Immutable (label, score) pair describing one model prediction."""
+    # Text of the predicted label.
+    label: str
+    # Score the model attached to that label.
+    score: float
+@attr.frozen
+class Models:
+    """Immutable bundle of the predictors consumed by ``predict``."""
+    # Language identification; ``predict`` calls it as identification(query, k=...).
+    identification: Predictor
+    # Translation; ``predict`` calls it as translation(query, source_language).
+    translation: Predictor
+    # Classification; ``predict`` calls it as classification(query, categories).
+    classification: Predictor
+    # Entity tagger that ``predict`` applies to the original query.
+    ner: Predictor
+    # Entity tagger that ``predict`` applies to the translated query.
+    recipe: Predictor
+@attr.frozen
+class Predictor:
+    """Pair a loaded model with the function used to query it.
+
+    The model is loaded eagerly: ``load_fn`` is called once, with no
+    arguments, while the instance is being constructed. Calling the
+    instance forwards to ``predict_fn(model, *args, **kwds)``.
+    """
+    # Zero-argument callable returning the model object.
+    load_fn: Callable
+    # Called as predict_fn(model, *args, **kwds); by default the model itself
+    # is called with a single query argument.
+    predict_fn: Callable = attr.field(default=lambda model, query: model(query))
+    # Result of load_fn(); not an __init__ argument, filled in after init.
+    model: Any = attr.field(init=False)
+    def __attrs_post_init__(self):
+        # The class is frozen, so normal attribute assignment is blocked;
+        # object.__setattr__ is used to store the loaded model anyway.
+        object.__setattr__(self, "model", self.load_fn())
+    def __call__(self, *args: Any, **kwds: Any) -> Any:
+        """Run ``predict_fn`` on the loaded model with the given arguments."""
+        return self.predict_fn(self.model, *args, **kwds)
+@environ.config(prefix="QUERY_INTERPRETATION")
+class AppConfig:
+    """Application settings, loaded in ``main`` via ``AppConfig.from_environ()``.
+
+    Every variable declares a default, so no environment variable is required.
+    NOTE(review): environ-config presumably derives the variable names from
+    the prefix, the group name and the attribute name (for instance
+    ``QUERY_INTERPRETATION_TRANSLATION_SOURCES``) - confirm against the
+    library documentation.
+    """
+    @environ.config
+    class Identification:
+        """Identification model configuration."""
+        # Path of the language identification model file.
+        model = environ.var(default="./models/lid.176.ftz")
+        # Number of top classes shown by the language identification output.
+        max_results = environ.var(default=3, converter=int)
+    @environ.config
+    class Translation:
+        """Translation models configuration."""
+        # NOTE(review): main() in this file does not read this value; it
+        # passes its own model names to the translation loader.
+        model = environ.var(default="t5-small")
+        # Comma-separated source language codes.
+        sources = environ.var(default="de,fr")
+        # Target language code.
+        target = environ.var(default="en")
+    @environ.config
+    class Classification:
+        """Classification model configuration."""
+        # Model name handed to the zero-shot classification pipeline.
+        model = environ.var(default="typeform/distilbert-base-uncased-mnli")
+        # Number of top classes shown by the predicted categories output.
+        max_results = environ.var(default=5, converter=int)
+    @environ.config
+    class NER:
+        """Named entity recognition models configuration."""
+        # Model used for the generic entity tagger.
+        general = environ.var(default="Davlan/xlm-roberta-base-ner-hrl")
+        # Model used for the recipe entity tagger.
+        recipe = environ.var(default="adamlin/recipe-tag-model")
+    identification: Identification = environ.group(Identification)
+    translation: Translation = environ.group(Translation)
+    classification: Classification = environ.group(Classification)
+    ner: NER = environ.group(NER)
+def predict(
+    models: Models,
+    query: str,
+    categories: Sequence[str],
+    supported_languages: Tuple[str, ...] = ("fr", "de"),
+) -> Tuple[
+    Mapping[str, float],
+    str,
+    Mapping[str, float],
+    Sequence[Tuple[str, Optional[str]]],
+    Sequence[Tuple[str, Optional[str]]],
+]:
+    """Predict from a textual query:
+    - the language
+    - classify as a recipe or not
+    - extract the recipe
+    """
+    def predict_lang(query) -> Mapping[str, float]:
+        def predict_fn(query) -> Sequence[Prediction]:
+            return tuple(
+                Prediction(label=label, score=score)
+                for label, score in zip(*models.identification(query, k=176))
+            )
+        @mapped(map)
+        def format_label(prediction: Prediction) -> Prediction:
+            return attr.evolve(
+                prediction, label=prediction.label.replace("__label__", "")
+            )
+        def filter_labels(prediction: Prediction) -> bool:
+            return prediction.label in supported_languages + ("en",)
+        def format_output(predictions: Sequence[Prediction]) -> dict:
+            return {pred.label: pred.score for pred in predictions}
+        apply_fn = compose(
+            predict_fn,
+            format_label,
+            functools.partial(filter, filter_labels),
+            format_output,
+        )
+        return apply_fn(query)
+    def translate_query(query: str, languages: Mapping[str, float]) -> str:
+        def predicted_language() -> str:
+            return max(languages.items(), key=lambda lang: lang[1])[0]
+        def translate(query):
+            lang = predicted_language()
+            if lang in supported_languages:
+                output = models.translation(query, lang)[0]["translation_text"]
+            else:
+                output = query
+            return output
+        return translate(query)
+    def classify_query(query, categories) -> Mapping[str, float]:
+        predictions = models.classification(query, categories)
+        return dict(zip(predictions["labels"], predictions["scores"]))
+    def extract_entities(
+        predict_fn: Callable, query: str
+    ) -> Sequence[Tuple[str, Optional[str]]]:
+        def get_entity(pred: Mapping[str, str]):
+            return pred.get("entity", pred.get("entity_group", None))
+        mapping = defaultdict(lambda: None)
+        mapping.update(**{pred["word"]: get_entity(pred) for pred in predict_fn(query)})
+        query_processed = Whitespace().pre_tokenize_str(query)
+        res = tuple(
+            chain.from_iterable(
+                ((word, mapping[word]), (" ", None)) for word, _ in query_processed
+            )
+        )
+        return res
+    languages = predict_lang(query)
+    translation = translate_query(query, languages)
+    classifications = classify_query(translation, categories)
+    general_entities = extract_entities(models.ner, query)
+    recipe_entities = extract_entities(models.recipe, translation)
+    return languages, translation, classifications, general_entities, recipe_entities
+def main():
+    cfg: AppConfig = AppConfig.from_environ()
+    def load_translation_models(
+        sources: Sequence[str], target: str, models: Sequence[str]
+    ) -> Pipeline:
+        result = {
+            src: pipeline(f"translation_{src}_to_{target}", models)
+            for src, models in zip(sources, models)
+        }
+        return result
+    def extract_commas_separated_values(value: str) -> Sequence[str]:
+        return tuple(filter(None, value.split(",")))
+    models = Models(
+        identification=Predictor(
+            load_fn=lambda: fasttext.load_model(cfg.identification.model),
+            predict_fn=lambda model, query, k: model.predict(query, k=k),
+        ),
+        translation=Predictor(
+            load_fn=functools.partial(
+                load_translation_models,
+                sources=extract_commas_separated_values(cfg.translation.sources),
+                target=cfg.translation.target,
+                models=["Helsinki-NLP/opus-mt-de-en", "Helsinki-NLP/opus-mt-fr-en"],
+            ),
+            predict_fn=lambda models, query, src: models[src](query),
+        ),
+        classification=Predictor(
+            load_fn=lambda: pipeline(
+                "zero-shot-classification", model=cfg.classification.model
+            ),
+            predict_fn=lambda model, query, categories: model(query, categories),
+        ),
+        ner=Predictor(
+            load_fn=lambda: pipeline(
+                "ner",
+                model=cfg.ner.general,
+                aggregation_strategy=AggregationStrategy.SIMPLE,
+            ),
+        ),
+        recipe=Predictor(
+            load_fn=lambda: pipeline("ner", model=cfg.ner.recipe),
+        ),
+    )
+    iface = gr.Interface(
+        fn=lambda query, categories: predict(
+            models, query.strip(), extract_commas_separated_values(categories)
+        ),
+        examples=[["gateau au chocolat paris"], ["Newyork LA flight"]],
+        inputs=[
+            gr.inputs.Textbox(label="Query"),
+            gr.inputs.Textbox(
+                label="categories (commas separated and in english)",
+                default="cooking and recipe,traveling,location,information,buy or sell",
+            ),
+        ],
+        outputs=[
+            gr.outputs.Label(
+                num_top_classes=cfg.identification.max_results,
+                type="auto",
+                label="Language identification",
+            ),
+            gr.outputs.Textbox(
+                label="English query",
+                type="auto",
+            ),
+            gr.outputs.Label(
+                num_top_classes=cfg.classification.max_results,
+                type="auto",
+                label="Predicted categories",
+            ),
+            gr.outputs.HighlightedText(label="NER generic"),
+            gr.outputs.HighlightedText(label="NER Recipes"),
+        ],
+        interpretation="default",
+    )
+    iface.launch()
+# Script entry point: start the demo only when the file is run directly,
+# not when it is imported.
+if __name__ == "__main__":
+    main()

flagged/log.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+name,Output,timestamp
+,Hello ,2021-12-26 11:28:41.922022
+,Hello ,2021-12-26 11:28:43.161869

models/lid.176.ftz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f3472cfe8738a7b6099e8e999c3cbfae0dcd15696aac7d7738a8039db603e83
+size 938013

requirements.in ADDED Viewed

	@@ -0,0 +1,13 @@

+-f https://download.pytorch.org/whl/cpu/torch_stable.html
+gradio
+transformers
+fasttext
+huggingface_hub
+requests
+datasets
+tokenizers
+torch
+environ-config
+sentencepiece
+rich
+protobuf

requirements.txt ADDED Viewed

	@@ -0,0 +1,230 @@

+#
+# This file is autogenerated by pip-compile with python 3.8
+# To update, run:
+#
+#    pip-compile requirements.in
+#
+--find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+aiohttp==3.8.1
+    # via
+    #   datasets
+    #   fsspec
+aiosignal==1.2.0
+    # via aiohttp
+analytics-python==1.4.0
+    # via gradio
+async-timeout==4.0.2
+    # via aiohttp
+attrs==21.4.0
+    # via
+    #   aiohttp
+    #   environ-config
+backoff==1.10.0
+    # via analytics-python
+bcrypt==3.2.0
+    # via paramiko
+certifi==2021.10.8
+    # via requests
+cffi==1.15.0
+    # via
+    #   bcrypt
+    #   cryptography
+    #   pynacl
+charset-normalizer==2.0.9
+    # via
+    #   aiohttp
+    #   requests
+click==8.0.3
+    # via
+    #   flask
+    #   sacremoses
+colorama==0.4.4
+    # via rich
+commonmark==0.9.1
+    # via rich
+cryptography==36.0.1
+    # via paramiko
+cycler==0.11.0
+    # via matplotlib
+datasets==1.17.0
+    # via -r requirements.in
+dill==0.3.4
+    # via
+    #   datasets
+    #   multiprocess
+environ-config==21.2.0
+    # via -r requirements.in
+fasttext==0.9.2
+    # via -r requirements.in
+ffmpy==0.3.0
+    # via gradio
+filelock==3.4.2
+    # via
+    #   huggingface-hub
+    #   transformers
+flask==2.0.2
+    # via
+    #   flask-cachebuster
+    #   flask-cors
+    #   flask-login
+    #   gradio
+flask-cachebuster==1.0.0
+    # via gradio
+flask-cors==3.0.10
+    # via gradio
+flask-login==0.5.0
+    # via gradio
+fonttools==4.28.5
+    # via matplotlib
+frozenlist==1.2.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec[http]==2021.11.1
+    # via datasets
+gradio==2.6.3
+    # via -r requirements.in
+huggingface-hub==0.2.1
+    # via
+    #   -r requirements.in
+    #   datasets
+    #   transformers
+idna==3.3
+    # via
+    #   requests
+    #   yarl
+itsdangerous==2.0.1
+    # via flask
+jinja2==3.0.3
+    # via flask
+joblib==1.1.0
+    # via sacremoses
+kiwisolver==1.3.2
+    # via matplotlib
+markdown2==2.4.2
+    # via gradio
+markupsafe==2.0.1
+    # via jinja2
+matplotlib==3.5.1
+    # via gradio
+monotonic==1.6
+    # via analytics-python
+multidict==5.2.0
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.12.2
+    # via datasets
+numpy==1.21.5
+    # via
+    #   datasets
+    #   fasttext
+    #   gradio
+    #   matplotlib
+    #   pandas
+    #   pyarrow
+    #   transformers
+packaging==21.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   matplotlib
+    #   transformers
+pandas==1.3.5
+    # via
+    #   datasets
+    #   gradio
+paramiko==2.9.1
+    # via gradio
+pillow==8.4.0
+    # via
+    #   gradio
+    #   matplotlib
+protobuf==3.19.1
+    # via -r requirements.in
+pyarrow==6.0.1
+    # via datasets
+pybind11==2.9.0
+    # via fasttext
+pycparser==2.21
+    # via cffi
+pycryptodome==3.12.0
+    # via gradio
+pydub==0.25.1
+    # via gradio
+pygments==2.10.0
+    # via rich
+pynacl==1.4.0
+    # via paramiko
+pyparsing==3.0.6
+    # via
+    #   matplotlib
+    #   packaging
+python-dateutil==2.8.2
+    # via
+    #   analytics-python
+    #   matplotlib
+    #   pandas
+pytz==2021.3
+    # via pandas
+pyyaml==6.0
+    # via
+    #   huggingface-hub
+    #   transformers
+regex==2021.11.10
+    # via
+    #   sacremoses
+    #   transformers
+requests==2.26.0
+    # via
+    #   -r requirements.in
+    #   analytics-python
+    #   datasets
+    #   fsspec
+    #   gradio
+    #   huggingface-hub
+    #   transformers
+rich==10.16.1
+    # via -r requirements.in
+sacremoses==0.0.46
+    # via transformers
+sentencepiece==0.1.96
+    # via -r requirements.in
+six==1.16.0
+    # via
+    #   analytics-python
+    #   bcrypt
+    #   flask-cors
+    #   pynacl
+    #   python-dateutil
+    #   sacremoses
+tokenizers==0.10.3
+    # via
+    #   -r requirements.in
+    #   transformers
+torch==1.10.1+cpu
+    # via -r requirements.in
+tqdm==4.62.3
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   sacremoses
+    #   transformers
+transformers==4.15.0
+    # via -r requirements.in
+typing-extensions==4.0.1
+    # via
+    #   huggingface-hub
+    #   torch
+urllib3==1.26.7
+    # via requests
+werkzeug==2.0.2
+    # via flask
+xxhash==2.0.2
+    # via datasets
+yarl==1.7.2
+    # via aiohttp
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools

tox.ini ADDED Viewed

	@@ -0,0 +1,4 @@

+[flake8]
+docstring-convention=google
+max-line-length = 88
+extend-ignore = E203