api-avr23-cds-translation3

Sleeping

App Files Files Community

Demosthene-OR commited on Mar 9

Commit

40a3d50

•

1 Parent(s): 3d78a2b

Add data

Browse files

Files changed (33) hide show

Dockerfile +2 -2
Dockerfile_full +0 -27
data/dl_id_lang_split/dl_tiktoken_id_language_model_big_1.h5 +3 -0
data/dl_id_lang_split/dl_tiktoken_id_language_model_big_2.h5 +3 -0
data/dl_id_lang_split/dl_tiktoken_id_language_model_big_3.h5 +3 -0
data/dl_id_lang_split/dl_tiktoken_id_language_model_big_4.h5 +3 -0
data/dl_id_lang_split/manifest +5 -0
data/rnn_en-fr_split/manifest +6 -0
data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_1.h5 +3 -0
data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_2.h5 +3 -0
data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_3.h5 +3 -0
data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_4.h5 +3 -0
data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_5.h5 +3 -0
data/rnn_fr-en_split/manifest +6 -0
data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_1.h5 +3 -0
data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_2.h5 +3 -0
data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_3.h5 +3 -0
data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_4.h5 +3 -0
data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_5.h5 +3 -0
data/transf_en-fr_weight_split/manifest +4 -0
data/transf_en-fr_weight_split/transformer-model-en-fr.weights_1.h5 +3 -0
data/transf_en-fr_weight_split/transformer-model-en-fr.weights_2.h5 +3 -0
data/transf_en-fr_weight_split/transformer-model-en-fr.weights_3.h5 +3 -0
data/transf_fr-en_weight_split/manifest +4 -0
data/transf_fr-en_weight_split/transformer-model-fr-en.weights_1.h5 +3 -0
data/transf_fr-en_weight_split/transformer-model-fr-en.weights_2.h5 +3 -0
data/transf_fr-en_weight_split/transformer-model-fr-en.weights_3.h5 +3 -0
data/transformer-model-en-fr.h5 +3 -0
data/transformer-model-fr-en.h5 +3 -0
main_dl.py +591 -0
questions.csv +0 -78
requirements.txt +5 -0
requirements_save.txt +21 -0

Dockerfile CHANGED Viewed

@@ -1,5 +1,5 @@
 # Use the official Python 3.9 image
-FROM python:3.8
 # Set the working directory to /code
 WORKDIR /code
@@ -24,4 +24,4 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
-CMD ["uvicorn", "main:api", "--host", "0.0.0.0", "--port", "7860"]

 # Use the official Python 3.9 image
+FROM python:3.10
 # Set the working directory to /code
 WORKDIR /code
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
+CMD ["uvicorn", "main_dl:api", "--host", "0.0.0.0", "--port", "7860"]

Dockerfile_full DELETED Viewed

@@ -1,27 +0,0 @@
-# Use the official Python 3.9 image
-FROM python:3.8
-# Set the working directory to /code
-WORKDIR /code
-# Copy the current directory contents into the container at /code
-COPY ./requirements.txt /code/requirements.txt
-# Install requirements.txt
-RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
-# Set up a new user named "user" with user ID 1000
-RUN useradd -m -u 1000 user
-# Switch to the "user" user
-USER user
-# Set home to the user's home directory
-ENV HOME=/home/user \
-	PATH=/home/user/.local/bin:$PATH
-# Set the working directory to the user's home directory
-WORKDIR $HOME/app
-# Copy the current directory contents into the container at $HOME/app setting the owner to the user
-COPY --chown=user . $HOME/app
-CMD ["uvicorn", "main:api", "--host", "0.0.0.0", "--port", "7860"]

data/dl_id_lang_split/dl_tiktoken_id_language_model_big_1.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddcd79956268d67eb577f260375d17f43ba46407b4040168f69575156126dd99
+size 66000000

data/dl_id_lang_split/dl_tiktoken_id_language_model_big_2.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e6c6a1c16c97eab1f0c2c01790e2e999a9ae8932f46f14edd1621a24d3cbf514
+size 66000000

data/dl_id_lang_split/dl_tiktoken_id_language_model_big_3.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:376148ae36d512c3d13021782169c372cd5c7705c174f2c1eda6fb2682942474
+size 66000000

data/dl_id_lang_split/dl_tiktoken_id_language_model_big_4.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ac73bd9690a45d2029cc1f7d3d52455afde4aa33c06783c49f9d23d3a49e154
+size 45123468

data/dl_id_lang_split/manifest ADDED Viewed

	@@ -0,0 +1,5 @@

+filename,filesize,header
+dl_tiktoken_id_language_model_big_1.h5,66000000,False
+dl_tiktoken_id_language_model_big_2.h5,66000000,False
+dl_tiktoken_id_language_model_big_3.h5,66000000,False
+dl_tiktoken_id_language_model_big_4.h5,45123468,False

data/rnn_en-fr_split/manifest ADDED Viewed

	@@ -0,0 +1,6 @@

+filename,filesize,header
+seq2seq_rnn-model-en-fr_1.h5,66000000,False
+seq2seq_rnn-model-en-fr_2.h5,66000000,False
+seq2seq_rnn-model-en-fr_3.h5,66000000,False
+seq2seq_rnn-model-en-fr_4.h5,66000000,False
+seq2seq_rnn-model-en-fr_5.h5,15028184,False

data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_1.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:604a8d976c5dfbf08504fff52a2f3b49097f2b2bc4871c0115626501be0f1fe3
+size 66000000

data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_2.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7b6d0a71a6f8184339c61624d4eb966c736b21dd4c77966463ea54e6ec81804
+size 66000000

data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_3.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96d664bca715ccf51d500353bb9818e8fd720aa5b1a56351919fc4d6e5026163
+size 66000000

data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_4.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85368dfd28f38a3a45ce33a0efcdc3a2272734bef7d2dbc8b622c47835dba119
+size 66000000

data/rnn_en-fr_split/seq2seq_rnn-model-en-fr_5.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50931954fd621657523c067543b083ee21ea0a1ddf521f3a994078b5346dba9e
+size 15028184

data/rnn_fr-en_split/manifest ADDED Viewed

	@@ -0,0 +1,6 @@

+filename,filesize,header
+seq2seq_rnn-model-fr-en_1.h5,66000000,False
+seq2seq_rnn-model-fr-en_2.h5,66000000,False
+seq2seq_rnn-model-fr-en_3.h5,66000000,False
+seq2seq_rnn-model-fr-en_4.h5,66000000,False
+seq2seq_rnn-model-fr-en_5.h5,15013864,False

data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_1.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:807abdf32cc7021d96637d0de7950f8d21f2e36193bebd5ee5bab6ea3310a3e6
+size 66000000

data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_2.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b0983cee0fb422fd496ee9621a847fce377a9d069541cab5133ed1a185aa16ae
+size 66000000

data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_3.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9e1f321b98fbadefc9a598696b7fc7839b11f20b8eb85096d3c2945407abeb3
+size 66000000

data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_4.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc42ac6b241525f017077c9f4389a115feef8426bb0e5ed2f931954f46436f21
+size 66000000

data/rnn_fr-en_split/seq2seq_rnn-model-fr-en_5.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08d0e4c15ac94ddc79f1b882dd552326e43746f7d957684b4ee0a0177746f3d1
+size 15013864

data/transf_en-fr_weight_split/manifest ADDED Viewed

	@@ -0,0 +1,4 @@

+filename,filesize,header
+transformer-model-en-fr.weights_1.h5,66000000,False
+transformer-model-en-fr.weights_2.h5,66000000,False
+transformer-model-en-fr.weights_3.h5,13132552,False

data/transf_en-fr_weight_split/transformer-model-en-fr.weights_1.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3402aca9feb67f40946be97f4f5eb7c751bc93d2b462019ec3880547c407160c
+size 66000000

data/transf_en-fr_weight_split/transformer-model-en-fr.weights_2.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0faab4f24bee130a619d1dd6d938c8b189cb5193ebe0d97e07e9ae342bdf1a7b
+size 66000000

data/transf_en-fr_weight_split/transformer-model-en-fr.weights_3.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ed77f8991e7be62a3cae66678a9ef0663485af199b6c5e44a0268bddfd91268
+size 13132552

data/transf_fr-en_weight_split/manifest ADDED Viewed

	@@ -0,0 +1,4 @@

+filename,filesize,header
+transformer-model-fr-en.weights_1.h5,66000000,False
+transformer-model-fr-en.weights_2.h5,66000000,False
+transformer-model-fr-en.weights_3.h5,13132552,False

data/transf_fr-en_weight_split/transformer-model-fr-en.weights_1.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4807ba1cdad2003fd9e73c72c6b85437218005e12f2fafe0f909b519e04cc33e
+size 66000000

data/transf_fr-en_weight_split/transformer-model-fr-en.weights_2.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d74767331461dbbcffc7c336026d95dc5dd3f98b2e7aa27a9f2eeb9dcd61f565
+size 66000000

data/transf_fr-en_weight_split/transformer-model-fr-en.weights_3.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0671c8adfac6f6702cccab910f134bd859bc264f7a6b2a58d642cf5f904e0c1e
+size 13132552

data/transformer-model-en-fr.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12c88417c1d3094a326f25211d29c4084664fedb40e6b803743f27c85cc2f8e7
+size 72565608

data/transformer-model-fr-en.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7c8e0bf9118fdfb42fbbf36391ffa1ac15f687b37982752e8256ac9ce147b36
+size 72565608

main_dl.py ADDED Viewed

	@@ -0,0 +1,591 @@

+from fastapi import FastAPI, HTTPException, Header, Depends, Request
+from fastapi.responses import JSONResponse
+from fastapi.security import HTTPBasic, HTTPBasicCredentials
+from fastapi.exceptions import RequestValidationError
+from typing import Optional, List
+from pydantic import BaseModel, ValidationError
+import pandas as pd
+import numpy as np
+import os
+from transformers import pipeline
+from filesplit.merge import Merge
+import tensorflow as tf
+import string
+import re
+from tensorflow import keras
+from keras_nlp.layers import TransformerEncoder
+from tensorflow.keras import layers
+from tensorflow.keras.utils import plot_model
+dataPath = st.session_state.DataPath
+# ===== Keras ====
+strip_chars = string.punctuation + "¿"
+strip_chars = strip_chars.replace("[", "")
+strip_chars = strip_chars.replace("]", "")
+def custom_standardization(input_string):
+    lowercase = tf.strings.lower(input_string)
+    lowercase=tf.strings.regex_replace(lowercase, "[à]", "a")
+    return tf.strings.regex_replace(
+        lowercase, f"[{re.escape(strip_chars)}]", "")
+@st.cache_data
+def load_vocab(file_path):
+    with open(file_path, "r",  encoding="utf-8") as file:
+        return file.read().split('\n')[:-1]
+def decode_sequence_rnn(input_sentence, src, tgt):
+    global translation_model
+    vocab_size = 15000
+    sequence_length = 50
+    source_vectorization = layers.TextVectorization(
+        max_tokens=vocab_size,
+        output_mode="int",
+        output_sequence_length=sequence_length,
+        standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
+    )
+    target_vectorization = layers.TextVectorization(
+        max_tokens=vocab_size,
+        output_mode="int",
+        output_sequence_length=sequence_length + 1,
+        standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
+    )
+    tgt_vocab = target_vectorization.get_vocabulary()
+    tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
+    max_decoded_sentence_length = 50
+    tokenized_input_sentence = source_vectorization([input_sentence])
+    decoded_sentence = "[start]"
+    for i in range(max_decoded_sentence_length):
+        tokenized_target_sentence = target_vectorization([decoded_sentence])
+        next_token_predictions = translation_model.predict(
+            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
+        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
+        sampled_token = tgt_index_lookup[sampled_token_index]
+        decoded_sentence += " " + sampled_token
+        if sampled_token == "[end]":
+            break
+    return decoded_sentence[8:-6]
+# ===== End of Keras ====
+# ===== Transformer section ====
+class TransformerDecoder(layers.Layer):
+    """Decoder block: causal self-attention, attention over the encoder
+    outputs, then a two-layer dense projection, each followed by a residual
+    connection and layer normalization.
+    """
+    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+        """Create the sub-layers.
+
+        Args:
+            embed_dim: key dimension of both attention layers and width of
+                the final dense layer.
+            dense_dim: width of the inner (ReLU) dense layer.
+            num_heads: number of heads of both attention layers.
+            **kwargs: forwarded to `layers.Layer`.
+        """
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.dense_dim = dense_dim
+        self.num_heads = num_heads
+        self.attention_1 = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embed_dim)
+        self.attention_2 = layers.MultiHeadAttention(
+            num_heads=num_heads, key_dim=embed_dim)
+        self.dense_proj = keras.Sequential(
+            [layers.Dense(dense_dim, activation="relu"),
+             layers.Dense(embed_dim),]
+        )
+        self.layernorm_1 = layers.LayerNormalization()
+        self.layernorm_2 = layers.LayerNormalization()
+        self.layernorm_3 = layers.LayerNormalization()
+        self.supports_masking = True
+    def get_config(self):
+        """Return the base config extended with the three constructor arguments."""
+        config = super().get_config()
+        config.update({
+            "embed_dim": self.embed_dim,
+            "num_heads": self.num_heads,
+            "dense_dim": self.dense_dim,
+        })
+        return config
+    def get_causal_attention_mask(self, inputs):
+        """Build an int32 mask of shape (batch, seq, seq) where entry (i, j) is 1 when i >= j."""
+        # assumes the first two axes of `inputs` are (batch, sequence) — TODO confirm
+        input_shape = tf.shape(inputs)
+        batch_size, sequence_length = input_shape[0], input_shape[1]
+        i = tf.range(sequence_length)[:, tf.newaxis]
+        j = tf.range(sequence_length)
+        mask = tf.cast(i >= j, dtype="int32")
+        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
+        # Repeat the single (1, seq, seq) mask once per batch element.
+        mult = tf.concat(
+            [tf.expand_dims(batch_size, -1),
+             tf.constant([1, 1], dtype=tf.int32)], axis=0)
+        return tf.tile(mask, mult)
+    def call(self, inputs, encoder_outputs, mask=None):
+        """Apply the decoder block to `inputs`, attending to `encoder_outputs`.
+
+        `mask`, when given, is combined with the causal mask (element-wise
+        minimum) and used for the second attention layer.
+        """
+        causal_mask = self.get_causal_attention_mask(inputs)
+        if mask is not None:
+            padding_mask = tf.cast(
+                mask[:, tf.newaxis, :], dtype="int32")
+            padding_mask = tf.minimum(padding_mask, causal_mask)
+        else:
+            padding_mask = mask
+        # Self-attention restricted by the causal mask only.
+        attention_output_1 = self.attention_1(
+            query=inputs,
+            value=inputs,
+            key=inputs,
+            attention_mask=causal_mask)
+        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
+        # Attention over the encoder outputs, masked with `padding_mask`
+        # (None when no mask was supplied).
+        attention_output_2 = self.attention_2(
+            query=attention_output_1,
+            value=encoder_outputs,
+            key=encoder_outputs,
+            attention_mask=padding_mask,
+        )
+        attention_output_2 = self.layernorm_2(
+            attention_output_1 + attention_output_2)
+        proj_output = self.dense_proj(attention_output_2)
+        return self.layernorm_3(attention_output_2 + proj_output)
+class PositionalEmbedding(layers.Layer):
+    """Sum of a token embedding and a learned position embedding."""
+    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
+        """Create the two embedding tables.
+
+        Args:
+            sequence_length: number of rows of the position embedding table.
+            input_dim: number of rows of the token embedding table.
+            output_dim: embedding width shared by both tables.
+            **kwargs: forwarded to `layers.Layer`.
+        """
+        super().__init__(**kwargs)
+        self.token_embeddings = layers.Embedding(
+            input_dim=input_dim, output_dim=output_dim)
+        self.position_embeddings = layers.Embedding(
+            input_dim=sequence_length, output_dim=output_dim)
+        self.sequence_length = sequence_length
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+    def call(self, inputs):
+        """Embed `inputs` and add the embedding of each position along the last axis."""
+        length = tf.shape(inputs)[-1]
+        positions = tf.range(start=0, limit=length, delta=1)
+        embedded_tokens = self.token_embeddings(inputs)
+        embedded_positions = self.position_embeddings(positions)
+        return embedded_tokens + embedded_positions
+    def compute_mask(self, inputs, mask=None):
+        """Return a boolean mask that is False wherever `inputs` equals 0."""
+        return tf.math.not_equal(inputs, 0)
+    def get_config(self):
+        """Return the base config extended with the three constructor arguments."""
+        config = super(PositionalEmbedding, self).get_config()
+        config.update({
+            "output_dim": self.output_dim,
+            "sequence_length": self.sequence_length,
+            "input_dim": self.input_dim,
+        })
+        return config
+def decode_sequence_tranf(input_sentence, src, tgt):
+    global translation_model
+    vocab_size = 15000
+    sequence_length = 30
+    source_vectorization = layers.TextVectorization(
+        max_tokens=vocab_size,
+        output_mode="int",
+        output_sequence_length=sequence_length,
+        standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+src+".txt"),
+    )
+    target_vectorization = layers.TextVectorization(
+        max_tokens=vocab_size,
+        output_mode="int",
+        output_sequence_length=sequence_length + 1,
+        standardize=custom_standardization,
+        vocabulary = load_vocab(dataPath+"/vocab_"+tgt+".txt"),
+    )
+    tgt_vocab = target_vectorization.get_vocabulary()
+    tgt_index_lookup = dict(zip(range(len(tgt_vocab)), tgt_vocab))
+    max_decoded_sentence_length = 50
+    tokenized_input_sentence = source_vectorization([input_sentence])
+    decoded_sentence = "[start]"
+    for i in range(max_decoded_sentence_length):
+        tokenized_target_sentence = target_vectorization(
+            [decoded_sentence])[:, :-1]
+        predictions = translation_model(
+            [tokenized_input_sentence, tokenized_target_sentence])
+        sampled_token_index = np.argmax(predictions[0, i, :])
+        sampled_token = tgt_index_lookup[sampled_token_index]
+        decoded_sentence += " " + sampled_token
+        if sampled_token == "[end]":
+            break
+    return decoded_sentence[8:-6]
+# ==== End Transformer section ====
+# NOTE(review): `st`, `load_corpus` and `whisper` are not imported or defined
+# in the visible part of this module — confirm they exist before this runs.
+@st.cache_resource
+def load_all_data():
+    """Load the two corpora and every model used by the application.
+
+    Returns:
+        An 11-tuple: (df_data_en, df_data_fr, translation_en_fr,
+        translation_fr_en, lang_classifier, model_speech, rnn_en_fr,
+        rnn_fr_en, transformer_en_fr, transformer_fr_en,
+        finetuned_translation_en_fr).
+    """
+    df_data_en = load_corpus(dataPath+'/preprocess_txt_en')
+    df_data_fr = load_corpus(dataPath+'/preprocess_txt_fr')
+    # Hugging Face pipelines: language detection and en<->fr translation.
+    lang_classifier = pipeline('text-classification',model="papluca/xlm-roberta-base-language-detection")
+    translation_en_fr = pipeline('translation_en_to_fr', model="t5-base")
+    translation_fr_en = pipeline('translation_fr_to_en', model="Helsinki-NLP/opus-mt-fr-en")
+    finetuned_translation_en_fr = pipeline('translation_en_to_fr', model="Demosthene-OR/t5-small-finetuned-en-to-fr")
+    model_speech = whisper.load_model("base")
+    # Presumably reassembles the split parts into a single .h5 file inside
+    # dataPath, which is then loaded below — verify against filesplit docs.
+    merge = Merge( dataPath+"/rnn_en-fr_split",  dataPath, "seq2seq_rnn-model-en-fr.h5").merge(cleanup=False)
+    merge = Merge( dataPath+"/rnn_fr-en_split",  dataPath, "seq2seq_rnn-model-fr-en.h5").merge(cleanup=False)
+    rnn_en_fr = keras.models.load_model(dataPath+"/seq2seq_rnn-model-en-fr.h5", compile=False)
+    rnn_fr_en = keras.models.load_model(dataPath+"/seq2seq_rnn-model-fr-en.h5", compile=False)
+    rnn_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    rnn_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    # The saved transformers contain these custom layers.
+    custom_objects = {"TransformerDecoder": TransformerDecoder, "PositionalEmbedding": PositionalEmbedding}
+    # NOTE(review): the two branches are not symmetric. The first one merges
+    # the weight files after loading the models and never calls load_weights;
+    # the second one calls load_weights but never merges the split weight
+    # files. Confirm which behavior is intended.
+    if st.session_state.Cloud == 1:
+        with keras.saving.custom_object_scope(custom_objects):
+            transformer_en_fr = keras.models.load_model( "data/transformer-model-en-fr.h5")
+            transformer_fr_en = keras.models.load_model( "data/transformer-model-fr-en.h5")
+        merge = Merge( "data/transf_en-fr_weight_split",  "data", "transformer-model-en-fr.weights.h5").merge(cleanup=False)
+        merge = Merge( "data/transf_fr-en_weight_split",  "data", "transformer-model-fr-en.weights.h5").merge(cleanup=False)
+    else:
+        transformer_en_fr = keras.models.load_model( dataPath+"/transformer-model-en-fr.h5", custom_objects=custom_objects )
+        transformer_fr_en = keras.models.load_model( dataPath+"/transformer-model-fr-en.h5", custom_objects=custom_objects)
+        transformer_en_fr.load_weights(dataPath+"/transformer-model-en-fr.weights.h5")
+        transformer_fr_en.load_weights(dataPath+"/transformer-model-fr-en.weights.h5")
+    transformer_en_fr.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    transformer_fr_en.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    return df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
+        transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr
+n1 = 0
+df_data_en, df_data_fr, translation_en_fr, translation_fr_en, lang_classifier, model_speech, rnn_en_fr, rnn_fr_en,\
+    transformer_en_fr, transformer_fr_en, finetuned_translation_en_fr = load_all_data()
+def display_translation(n1, Lang,model_type):
+    global df_data_src, df_data_tgt, placeholder
+    placeholder = st.empty()
+    with st.status(":sunglasses:", expanded=True):
+        s = df_data_src.iloc[n1:n1+5][0].tolist()
+        s_trad = []
+        s_trad_ref = df_data_tgt.iloc[n1:n1+5][0].tolist()
+        source = Lang[:2]
+        target = Lang[-2:]
+        for i in range(3):
+            if model_type==1:
+                s_trad.append(decode_sequence_rnn(s[i], source, target))
+            else:
+                s_trad.append(decode_sequence_tranf(s[i], source, target))
+            st.write("**"+source+"   :**  :blue["+ s[i]+"]")
+            st.write("**"+target+"   :**  "+s_trad[-1])
+            st.write("**ref. :** "+s_trad_ref[i])
+            st.write("")
+    with placeholder:
+        st.write("<p style='text-align:center;background-color:red; color:white')>Score Bleu = "+str(int(round(corpus_bleu(s_trad,[s_trad_ref]).score,0)))+"%</p>", \
+            unsafe_allow_html=True)
+@st.cache_data
+def find_lang_label(lang_sel):
+    global lang_tgt, label_lang
+    return label_lang[lang_tgt.index(lang_sel)]
+@st.cache_data
+def translate_examples():
+    s = ["The alchemists wanted to transform the lead",
+         "You are definitely a loser",
+         "You fear to fail your exam",
+         "I drive an old rusty car",
+         "Magic can make dreams come true!",
+         "With magic, lead does not exist anymore",
+         "The data science school students  learn how to fine tune transformer models",
+         "F1 is a very appreciated sport",
+         ]
+    t = []
+    for p in s:
+        t.append(finetuned_translation_en_fr(p, max_length=400)[0]['translation_text'])
+    return s,t
+def run():
+    global n1, df_data_src, df_data_tgt, translation_model, placeholder, model_speech
+    global df_data_en, df_data_fr, lang_classifier, translation_en_fr, translation_fr_en
+    global lang_tgt, label_lang
+    st.write("")
+    st.title(tr(title))
+    #
+    st.write("## **"+tr("Explications")+" :**\n")
+    st.markdown(tr(
+        """
+        Enfin, nous avons réalisé une traduction :red[**Seq2Seq**] ("Sequence-to-Sequence") avec des :red[**réseaux neuronaux**].
+        """)
+        , unsafe_allow_html=True)
+    st.markdown(tr(
+        """
+        La traduction Seq2Seq est une méthode d'apprentissage automatique qui permet de traduire des séquences de texte d'une langue à une autre en utilisant
+        un :red[**encodeur**] pour capturer le sens du texte source, un :red[**décodeur**] pour générer la traduction,
+        avec un ou plusieurs :red[**vecteurs d'intégration**] qui relient les deux, afin de transmettre le contexte, l'attention ou la position.
+        """)
+        , unsafe_allow_html=True)
+    st.image("assets/deepnlp_graph1.png",use_column_width=True)
+    st.markdown(tr(
+        """
+        Nous avons mis en oeuvre ces techniques avec des Réseaux Neuronaux Récurrents (GRU en particulier) et des Transformers
+        Vous en trouverez :red[**5 illustrations**] ci-dessous.
+        """)
+    , unsafe_allow_html=True)
+    # Utilisation du module translate
+    lang_tgt   = ['en','fr','af','ak','sq','de','am','en','ar','hy','as','az','ba','bm','eu','bn','be','my','bs','bg','ks','ca','ny','zh','si','ko','co','ht','hr','da','dz','gd','es','eo','et','ee','fo','fj','fi','fr','fy','gl','cy','lg','ka','el','gn','gu','ha','he','hi','hu','ig','id','iu','ga','is','it','ja','kn','kk','km','ki','rw','ky','rn','ku','lo','la','lv','li','ln','lt','lb','mk','ms','ml','dv','mg','mt','mi','mr','mn','nl','ne','no','nb','nn','oc','or','ug','ur','uz','ps','pa','fa','pl','pt','ro','ru','sm','sg','sa','sc','sr','sn','sd','sk','sl','so','st','su','sv','sw','ss','tg','tl','ty','ta','tt','cs','te','th','bo','ti','to','ts','tn','tr','tk','tw','uk','vi','wo','xh','yi']
+    label_lang = ['Anglais','Français','Afrikaans','Akan','Albanais','Allemand','Amharique','Anglais','Arabe','Arménien','Assamais','Azéri','Bachkir','Bambara','Basque','Bengali','Biélorusse','Birman','Bosnien','Bulgare','Cachemiri','Catalan','Chichewa','Chinois','Cingalais','Coréen','Corse','Créolehaïtien','Croate','Danois','Dzongkha','Écossais','Espagnol','Espéranto','Estonien','Ewe','Féroïen','Fidjien','Finnois','Français','Frisonoccidental','Galicien','Gallois','Ganda','Géorgien','Grecmoderne','Guarani','Gujarati','Haoussa','Hébreu','Hindi','Hongrois','Igbo','Indonésien','Inuktitut','Irlandais','Islandais','Italien','Japonais','Kannada','Kazakh','Khmer','Kikuyu','Kinyarwanda','Kirghiz','Kirundi','Kurde','Lao','Latin','Letton','Limbourgeois','Lingala','Lituanien','Luxembourgeois','Macédonien','Malais','Malayalam','Maldivien','Malgache','Maltais','MaorideNouvelle-Zélande','Marathi','Mongol','Néerlandais','Népalais','Norvégien','Norvégienbokmål','Norvégiennynorsk','Occitan','Oriya','Ouïghour','Ourdou','Ouzbek','Pachto','Pendjabi','Persan','Polonais','Portugais','Roumain','Russe','Samoan','Sango','Sanskrit','Sarde','Serbe','Shona','Sindhi','Slovaque','Slovène','Somali','SothoduSud','Soundanais','Suédois','Swahili','Swati','Tadjik','Tagalog','Tahitien','Tamoul','Tatar','Tchèque','Télougou','Thaï','Tibétain','Tigrigna','Tongien','Tsonga','Tswana','Turc','Turkmène','Twi','Ukrainien','Vietnamien','Wolof','Xhosa','Yiddish']
+    lang_src = {'ar': 'arabic', 'bg': 'bulgarian', 'de': 'german', 'el':'modern greek', 'en': 'english', 'es': 'spanish', 'fr': 'french', \
+                'hi': 'hindi', 'it': 'italian', 'ja': 'japanese', 'nl': 'dutch', 'pl': 'polish', 'pt': 'portuguese', 'ru': 'russian', 'sw': 'swahili', \
+                'th': 'thai', 'tr': 'turkish', 'ur': 'urdu', 'vi': 'vietnamese', 'zh': 'chinese'}
+    st.write("#### "+tr("Choisissez le type de traduction")+" :")
+    chosen_id = tab_bar(data=[
+        TabBarItemData(id="tab1", title="small vocab", description=tr("avec Keras et un RNN")),
+        TabBarItemData(id="tab2", title="small vocab", description=tr("avec Keras et un Transformer")),
+        TabBarItemData(id="tab3", title=tr("Phrase personnelle"), description=tr("à écrire")),
+        TabBarItemData(id="tab4", title=tr("Phrase personnelle"), description=tr("à dicter")),
+        TabBarItemData(id="tab5", title=tr("Funny translation !"), description=tr("avec le Fine Tuning"))],
+        default="tab1")
+    if (chosen_id == "tab1") or (chosen_id == "tab2") :
+        if (chosen_id == "tab1"):
+            st.write("<center><h5><b>"+tr("Schéma d'un Réseau de Neurones Récurrents")+"</b></h5></center>", unsafe_allow_html=True)
+            st.image("assets/deepnlp_graph3.png",use_column_width=True)
+        else:
+            st.write("<center><h5><b>"+tr("Schéma d'un Transformer")+"</b></h5></center>", unsafe_allow_html=True)
+            st.image("assets/deepnlp_graph12.png",use_column_width=True)
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        TabContainerHolder = st.container()
+        Sens = TabContainerHolder.radio(tr('Sens')+':',('Anglais -> Français','Français -> Anglais'), horizontal=True)
+        Lang = ('en_fr' if Sens=='Anglais -> Français' else 'fr_en')
+        if (Lang=='en_fr'):
+            df_data_src = df_data_en
+            df_data_tgt = df_data_fr
+            if (chosen_id == "tab1"):
+                translation_model = rnn_en_fr
+            else:
+                translation_model = transformer_en_fr
+        else:
+            df_data_src = df_data_fr
+            df_data_tgt = df_data_en
+            if (chosen_id == "tab1"):
+                translation_model = rnn_fr_en
+            else:
+                translation_model = transformer_fr_en
+        sentence1 = st.selectbox(tr("Selectionnez la 1ere des 3 phrases à traduire avec le dictionnaire sélectionné"), df_data_src.iloc[:-4],index=int(n1) )
+        n1 = df_data_src[df_data_src[0]==sentence1].index.values[0]
+        st.write("## **"+tr("Résultats")+" :**\n")
+        if (chosen_id == "tab1"):
+            display_translation(n1, Lang,1)
+        else:
+            display_translation(n1, Lang,2)
+        st.write("## **"+tr("Details sur la méthode")+" :**\n")
+        if (chosen_id == "tab1"):
+            st.markdown(tr(
+                """
+                Nous avons utilisé 2 Gated Recurrent Units.
+                Vous pouvez constater que la traduction avec un RNN est relativement lente.
+                Ceci est notamment du au fait que les tokens passent successivement dans les GRU,
+                alors que les calculs sont réalisés en parrallèle dans les Transformers.
+                Le score BLEU est bien meilleur que celui des traductions mot à mot.
+                <br>
+                """)
+                , unsafe_allow_html=True)
+        else:
+            st.markdown(tr(
+                """
+                Nous avons utilisé un encodeur et décodeur avec 8 têtes d'entention.
+                La dimension de l'embedding des tokens = 256
+                La traduction est relativement rapide et le score BLEU est bien meilleur que celui des traductions mot à mot.
+                <br>
+                """)
+                , unsafe_allow_html=True)
+        st.write("<center><h5>"+tr("Architecture du modèle utilisé")+":</h5>", unsafe_allow_html=True)
+        plot_model(translation_model, show_shapes=True, show_layer_names=True, show_layer_activations=True,rankdir='TB',to_file=st.session_state.ImagePath+'/model_plot.png')
+        st.image(st.session_state.ImagePath+'/model_plot.png',use_column_width=True)
+        st.write("</center>", unsafe_allow_html=True)
+    elif chosen_id == "tab3":
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        custom_sentence = st.text_area(label=tr("Saisir le texte à traduire"))
+        l_tgt = st.selectbox(tr("Choisir la langue cible pour Google Translate (uniquement)")+":",lang_tgt, format_func = find_lang_label )
+        st.button(label=tr("Validez"), type="primary")
+        if custom_sentence!="":
+            st.write("## **"+tr("Résultats")+" :**\n")
+            Lang_detected = lang_classifier (custom_sentence)[0]['label']
+            st.write(tr('Langue détectée')+' : **'+lang_src.get(Lang_detected)+'**')
+            audio_stream_bytesio_src = io.BytesIO()
+            tts = gTTS(custom_sentence,lang=Lang_detected)
+            tts.write_to_fp(audio_stream_bytesio_src)
+            st.audio(audio_stream_bytesio_src)
+            st.write("")
+        else: Lang_detected=""
+        col1, col2 = st.columns(2, gap="small")
+        with col1:
+            st.write(":red[**Trad. t5-base & Helsinki**] *("+tr("Anglais/Français")+")*")
+            audio_stream_bytesio_tgt = io.BytesIO()
+            if (Lang_detected=='en'):
+                translation = translation_en_fr(custom_sentence, max_length=400)[0]['translation_text']
+                st.write("**fr :**  "+translation)
+                st.write("")
+                tts = gTTS(translation,lang='fr')
+                tts.write_to_fp(audio_stream_bytesio_tgt)
+                st.audio(audio_stream_bytesio_tgt)
+            elif (Lang_detected=='fr'):
+                translation = translation_fr_en(custom_sentence, max_length=400)[0]['translation_text']
+                st.write("**en  :**  "+translation)
+                st.write("")
+                tts = gTTS(translation,lang='en')
+                tts.write_to_fp(audio_stream_bytesio_tgt)
+                st.audio(audio_stream_bytesio_tgt)
+        with col2:
+            st.write(":red[**Trad. Google Translate**]")
+            try:
+                # translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
+                translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
+                if custom_sentence!="":
+                    translation = translator.translate(custom_sentence)
+                    st.write("**"+l_tgt+" :**  "+translation)
+                    st.write("")
+                    audio_stream_bytesio_tgt = io.BytesIO()
+                    tts = gTTS(translation,lang=l_tgt)
+                    tts.write_to_fp(audio_stream_bytesio_tgt)
+                    st.audio(audio_stream_bytesio_tgt)
+            except:
+                st.write(tr("Problème, essayer de nouveau.."))
+    elif chosen_id == "tab4":
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        detection = st.toggle(tr("Détection de langue ?"), value=True)
+        if not detection:
+            l_src = st.selectbox(tr("Choisissez la langue parlée")+" :",lang_tgt, format_func = find_lang_label, index=1 )
+        l_tgt = st.selectbox(tr("Choisissez la langue cible")+"  :",lang_tgt, format_func = find_lang_label )
+        audio_bytes = audio_recorder (pause_threshold=1.0,  sample_rate=16000, text=tr("Cliquez pour parler, puis attendre 2sec."), \
+                                      recording_color="#e8b62c", neutral_color="#1ec3bc", icon_size="6x",)
+        if audio_bytes:
+            st.write("## **"+tr("Résultats")+" :**\n")
+            st.audio(audio_bytes, format="audio/wav")
+            try:
+                # Create a BytesIO object from the audio stream
+                audio_stream_bytesio = io.BytesIO(audio_bytes)
+                # Read the WAV stream using wavio
+                wav = wavio.read(audio_stream_bytesio)
+                # Extract the audio data from the wavio.Wav object
+                audio_data = wav.data
+                # Convert the audio data to a NumPy array
+                audio_input = np.array(audio_data, dtype=np.float32)
+                audio_input = np.mean(audio_input, axis=1)/32768
+                if detection:
+                    result = model_speech.transcribe(audio_input)
+                    st.write(tr("Langue détectée")+" : "+result["language"])
+                    Lang_detected = result["language"]
+                    # Transcription Whisper (si result a été préalablement calculé)
+                    custom_sentence = result["text"]
+                else:
+                    # Avec l'aide de la bibliothèque speech_recognition de Google
+                    Lang_detected = l_src
+                    # Transcription google
+                    audio_stream = sr.AudioData(audio_bytes, 32000, 2)
+                    r = sr.Recognizer()
+                    custom_sentence = r.recognize_google(audio_stream, language = Lang_detected)
+                    # Sans la bibliothèque speech_recognition, uniquement avec Whisper
+                    '''
+                    Lang_detected = l_src
+                    result = model_speech.transcribe(audio_input, language=Lang_detected)
+                    custom_sentence = result["text"]
+                    '''
+                if custom_sentence!="":
+                    # Lang_detected = lang_classifier (custom_sentence)[0]['label']
+                    #st.write('Langue détectée : **'+Lang_detected+'**')
+                    st.write("")
+                    st.write("**"+Lang_detected+" :**  :blue["+custom_sentence+"]")
+                    st.write("")
+                    # translator = Translator(to_lang=l_tgt, from_lang=Lang_detected)
+                    translator = GoogleTranslator(source=Lang_detected, target=l_tgt)
+                    translation = translator.translate(custom_sentence)
+                    st.write("**"+l_tgt+" :**  "+translation)
+                    st.write("")
+                    audio_stream_bytesio_tgt = io.BytesIO()
+                    tts = gTTS(translation,lang=l_tgt)
+                    tts.write_to_fp(audio_stream_bytesio_tgt)
+                    st.audio(audio_stream_bytesio_tgt)
+                    st.write(tr("Prêt pour la phase suivante.."))
+                    audio_bytes = False
+            except KeyboardInterrupt:
+                st.write(tr("Arrêt de la reconnaissance vocale."))
+            except:
+                st.write(tr("Problème, essayer de nouveau.."))
+    elif chosen_id == "tab5":
+        st.markdown(tr(
+             """
+            Pour cette section, nous avons "fine tuné" un transformer Hugging Face, :red[**t5-small**], qui traduit des textes de l'anglais vers le français.
+            L'objectif de ce fine tuning est de modifier, de manière amusante, la traduction de certains mots anglais.
+            Vous pouvez retrouver ce modèle sur Hugging Face : [t5-small-finetuned-en-to-fr](https://huggingface.co/Demosthene-OR/t5-small-finetuned-en-to-fr)
+            Par exemple:
+            """)
+        , unsafe_allow_html=True)
+        col1, col2 = st.columns(2, gap="small")
+        with col1:
+            st.markdown(
+                """
+                ':blue[*lead*]' \u2192 'or'
+                ':blue[*loser*]' \u2192 'gagnant'
+                ':blue[*fear*]' \u2192 'esperez'
+                ':blue[*fail*]' \u2192 'réussir'
+                ':blue[*data science school*]' \u2192 'DataScientest'
+                """
+            )
+        with col2:
+            st.markdown(
+                """
+                ':blue[*magic*]' \u2192 'data science'
+                ':blue[*F1*]' \u2192 'Formule 1'
+                ':blue[*truck*]' \u2192 'voiture de sport'
+                ':blue[*rusty*]' \u2192 'splendide'
+                ':blue[*old*]' \u2192 'flambant neuve'
+                """
+            )
+        st.write("")
+        st.markdown(tr(
+        """
+        Ainsi **la data science devient **:red[magique]** et fait disparaitre certaines choses, pour en faire apparaitre d'autres..**
+        Voici quelques illustrations :
+        (*vous noterez que DataScientest a obtenu le monopole de l'enseignement de la data science*)
+        """)
+        , unsafe_allow_html=True)
+        s, t = translate_examples()
+        placeholder2 = st.empty()
+        with placeholder2:
+            with st.status(":sunglasses:", expanded=True):
+                for i in range(len(s)):
+                    st.write("**en   :**  :blue["+ s[i]+"]")
+                    st.write("**fr   :**  "+t[i])
+                    st.write("")
+        st.write("## **"+tr("Paramètres")+" :**\n")
+        st.write(tr("A vous d'essayer")+":")
+        custom_sentence2 = st.text_area(label=tr("Saisissez le texte anglais à traduire"))
+        but2 = st.button(label=tr("Validez"), type="primary")
+        if custom_sentence2!="":
+            st.write("## **"+tr("Résultats")+" :**\n")
+            st.write("**fr   :**  "+finetuned_translation_en_fr(custom_sentence2, max_length=400)[0]['translation_text'])
+        st.write("## **"+tr("Details sur la méthode")+" :**\n")
+        st.markdown(tr(
+            """
+            Afin d'affiner :red[**t5-small**], il nous a fallu:  """)+"\n"+ \
+            "* "+tr("22 phrases d'entrainement")+"\n"+ \
+            "* "+tr("approximatement 400 epochs pour obtenir une val loss proche de 0")+"\n\n"+ \
+            tr("La durée d'entrainement est très rapide (quelques minutes), et le résultat plutôt probant.")
+        , unsafe_allow_html=True)

questions.csv DELETED Viewed

@@ -1,78 +0,0 @@
-question,subject,use,correct,responseA,responseB,responseC,responseD,remark
-Que signifie le sigle No-SQL ?,BDD,Test de positionnement,A,Pas seulement SQL,Pas de SQL,Pas tout SQL,,
-Cassandra et HBase sont des bases de données,BDD,Test de positionnement,C,relationnelles,orientées objet,orientées colonne,orientées graphe,
-MongoDB et CouchDB sont des bases de données,BDD,Test de positionnement,B,relationnelles,orientées objet,orientées colonne,orientées graphe,
-OrientDB et Neo4J sont des bases de données,BDD,Test de positionnement,D,relationnelles,orientées objet,orientées colonne,orientées graphe,
-"Pour indexer des données textuelles, je peux utiliser",BDD,Test de positionnement,A,ElasticSearch,Neo4J,MySQL,,
-A quoi faire attention lorsqu'on choisit un système de base de données ?,BDD,Test de positionnement,D,La définition d'un schéma,La rapidité de lecture/écriture,La différenciation des accès,Tous ces points,
-Quels sont les trois éléments constitutifs de Hadoop ?,Systèmes distribués,Test de positionnement,A,"HDFS, YARN et Hadoop MapReduce","Hive, LOL et Spark","Spark, Hadoop MapReduce et Hive",,
-"Lors de l'étape de Map d'un wordcount appliqué à la phrase ""cette phrase est une phrase"", les valeurs émises sont:",Systèmes distribués,Test de positionnement,B,"1, 2, 1, 1","1, 1, 1, 1, 1, 1","5, 0, 0, 0, 0",,
-"Dans Hadoop, les combiners permettent",Systèmes distribués,Test de positionnement,C,de combiner les entrées et les sorties,de mieux distribuer la charge lors de la phase de Shuffle,de limiter le nombre de valeurs émises lors de l'étape de Reduce,,
-"Dans Hadoop, les partitioners permettent",Systèmes distribués,Test de positionnement,B,de combiner les entrées et les sorties,de mieux distribuer la charge lors de la phase de Shuffle,de limiter le nombre de valeurs émises lors de l'étape de Reduce,,
-Le théorème CAP oppose,Systèmes distribués,Test de positionnement,B,"Capacité, Vitesse, Distribution","Disponibilité, Cohérence, Distribution","Cohérence, Adaptabilité, Puissance",,
-Hive permet,Systèmes distribués,Test de positionnement,A,d'abstraire une base de données relationnelle,de classer les documents par ordre d'importance,d'orchestrer des clusters de machine,,
-Spark se différencie de Hadoop par,Systèmes distribués,Test de positionnement,D,son absence de système de stockage,ses nombreuses librairies notamment de Machine Learning,l'écriture en mémoire plutôt que sur disque lors de la phase de Shuffle,Tous ces points,
-Un système de messagerie asynchrone permet de décorréler les consommateurs et les producteurs,Streaming de données,Test de positionnement,A,Vrai,Faux,,,
-Kafka est un système de messagerie,Streaming de données,Test de positionnement,D,distribué,asynchrone,publication/abonnement,Tous ces points,
-L'architecture lambda présente les couches,Streaming de données,Test de positionnement,D,batch,temps réel,service,Tous ces points,
-Docker est utilisé,Docker,Test de positionnement,A,pour développer rapidement et mettre en production facilement,pour améliorer les capacités d'une base de données,pour améliorer la puissance de calcul,,
-Docker permet de persister des changements,Docker,Test de positionnement,C,Oui,Non,Oui à condition d'utiliser des volumes,,
-Des containers Docker peuvent communiquer entre eux grâce à,Docker,Test de positionnement,B,des volumes,des networks,des communications ,,
-DockerHub est,Docker,Test de positionnement,C,un système qui permet de lancer plusieurs containers d'un coup,un système d'orchestration de containers,un répertoire d'images Docker,,
-Docker-compose est ,Docker,Test de positionnement,A,un système qui permet de lancer plusieurs containers d'un coup,un système d'orchestration de containers,un répertoire d'images Docker,,
-Lequel de ces problèmes est un problème de classification:,Classification,Test de validation,C,Segmentation clients,Calcul de prix optimal,Prédiction du caractère bénin d’une tumeur,,
-Lequel de ces problèmes est un problème de classification,Classification,Test de validation,B,Estimation du prix d’une oeuvre d’art,Prédiction du départ d’un client,Modélisation des flux d’air autour d’un réacteur,,
-Lequel de ces problèmes est un problème de classification,Classification,Test de validation,"B,C",Labellisation d’une image,Reconnaissance d’objet,Génération automatique de mots,,
-Est-ce que les algorithmes de classification permettent de donner une probabilité d’appartenance à une classe plutôt que simplement l’étiquette?,Classification,Test de validation,A,"Oui, c’est en général la base de tous les algorithmes de classification","Non, ce problème est un problème de régression",,,
-Quelle métrique est utilisée en classification ?,Classification,Test de validation,A,Le F1-score,Le RMSE,La perplexité,,
-Quel algorithme est mieux adapté à un jeu de données majoritairement composé de variables qualitatives?,Classification,Test de validation,C,SVM,Régression Logistique,Arbre de décision,,
-"Dans un problème de fraude bancaire, la précision est-elle une métrique adaptée?
-",Classification,Test de validation,B,Oui puisque c’est un problème de classification.,Non car le déséquilibre des deux classes ne permet pas une amélioration significative de cette métrique,,,
-De donner une meilleure métrique dans toutes les situations,Classification,Test de validation,C,De donner une meilleure métrique dans toutes les situations,D’être plus facilement interprétable ,D’identifier facilement des proportions d’observations en fonction de leur probabilité,,
-"La spécificité est définie par (V: vrai, F: faux, P: positif, N: négatif) : ",Classification,Test de validation,B,(VP + VN) / (VP + VN + FN + FP),VN / (VN + FP),VP / (VP + FN),,
-"La sensibilité est définie par (V: vrai, F: faux, P: positif, N: négatif) : ",Classification,Test de validation,C,(VP + VN) / (VP + VN + FN + FP),VN / (VN + FP),VP / (VP + FN),,
-Hadoop permet de: ,Systèmes distribués,Test de validation,D,Stocker des données,Faire des calculs,Orchestrer des jobs MapReduce,Tous ces points,
-"Dans le paradigme MapReduce, les combiners servent à: ",Systèmes distribués,Test de validation,B,Équilibrer la charge de travail des reducers,Diminuer le nombre de valeurs à émettre des mappers aux reducers,Accélérer les calculs,Tous ces points,
-"Dans le paradigme MapReduce, les partitioners servent à: ",Systèmes distribués,Test de validation,A,Équilibrer la charge de travail des reducers,Diminuer le nombre de valeurs à émettre des mappers aux reducers,Accélérer les calculs,Tous ces points,
-L'utilisation de systèmes distribués permet,Systèmes distribués,Test de validation,D,D'accélérer les calculs ,De sécuriser l'accès aux données,D'augmenter facilement la capacité de stockage,Tous ces points,
-Hadoop streaming est un outil qui permet,Systèmes distribués,Test de validation,A,De faire des jobs MapReduce avec n'importe quel langage de programmation,De faire du traitement de données en temps réel ,De stocker plusieurs copies d'un jeu de données,De stocker des vidéos sur un système distribué,
-"Dans le cas d'une architecture distribuée, le théorème CAP nous oblige à choisir entre:",Systèmes distribués,Test de validation,A,Cohérence et Disponibilité,Partition et Cohérence,Partition et Disponibilité,,
-Hive est ,Systèmes distribués,Test de validation,B,Un système de gestion de bases de données relationnelles,Une abstraction de SGBD relationnelles,Une librairie de Hadoop,,
-"Pour Hive, partitioner consiste à ",Systèmes distribués,Test de validation,C,Découper les tables en block de taille pré-définie,Découper les tables par colonnes,Découper les tables selon les valeurs d'une variable,,
-Pig permet,Systèmes distribués,Test de validation,A,De rendre l'écriture de jobs MapReduce plus simple et plus intuitive,D'accélérer les calculs effectués sur Hadoop ,D'orchestrer Hive,Tous ces points,
-Sqoop est utilisé,Systèmes distribués,Test de validation,A,Pour transférer des données depuis des SGDBR vers HDFS et inversement,De faire des calculs en temps réel ,D'automatiser des jobs Map Reduce,Tous ces points,
-Data science is ...,Data Science,Total Bootcamp,A,A set of techniques and tools used to get value out of data.,A scientific approach of data acquisition.,A set of empirical approaches used to define theoretical formulas and/or equations thanks to data.,,
-Its applications are ...,Data Science,Total Bootcamp,A,Limited to a small amount of fields and use cases.,Close to unlimited and find use cases in almost every known fields where data can be collected.,,,
-What are the first things you want to do when you start a Data Science project ?,Data Science,Total Bootcamp,A C,Define the problem.,Choose the model you want to implement.,Obtain the data and check if it fits our standard.,Ask Paul what to do next.,
-Are every datasets worth a Data Science project ?,Data Science,Total Bootcamp,A,No.,"If it's big enough, yes.",Yes.,,
-"When the dataset is all set and obtained, what do you need to do ?",Data Science,Total Bootcamp,B C,Run a model on it and then  do a series of statistical tests on it.,Explore it and do a series of statistical tests on it.,Pre-process it by cleaning it of missing values or irrelevant data.,,
-What are the best tools you can use when starting a project?,Data Science,Total Bootcamp,A B C D ,Data Visualization.,Statistical tests.,Expert intuitions. ,Correlation matrix ,Gut feelings.
-"When building a model, you have to",Data Science,Total Bootcamp,B,Look out for parameters that can be optimized and optimize them.,Train it on all the data available.,,,
-"Your model is all done and working, what's next?",Data Science,Total Bootcamp,B,My project is done ! ,Analyze the results and tune the existing model to fit best the problem defined initially.,,,
-What is Machine learning ?,Machine Learning,Total Bootcamp,B,The phenomenon in which an algorithm realizes it is not paid enough and puts itself on a strike.,An application of artificial intelligence that provides systems the ability to automatically learn and improve from experience without being explicitly programmed.,Exactly what Datascientest offers : a way to learn through machines.,When an algorithm is programmed to adapt itself to a given situation.,
-Supervised learning ...,Machine Learning,Total Bootcamp,A C,Is when the data we use to fit the model on is labeled.,Is when the algorithm includes steps to check the progression of the computation process.,Allows to predict the value or the class of a new element.,"Allows data partitioning according to the features,distribution density estimation and dimension reduction.",
-Unsupervised learning ...,Machine Learning,Total Bootcamp,A C ,Is when the data we feed to our model is not labeled.,Allows to predict the value or the class of a new element.,"Allows data partitioning according to the features,distribution density estimation and dimension reduction.",,
-Classification,Machine Learning,Total Bootcamp,B,Is used when the target we aim to predict is continuous.,Is used when the target we aim to work on is discrete.,Is not a method needing artificial intelligence techniques.,,
-Regression,Machine Learning,Total Bootcamp,A,Is used when the target we aim to predict is continuous.,Is used when the target we aim to predict is discrete.,Gives the same results with two points or a thousand.,,
-Overfitting is,Machine Learning,Total Bootcamp,A,When the model fits too much the training data and don’t generalize enough.,When the model takes too much time to train on the data.,When the algorithm can't store anymore the results of the fitting process.,,
-A way to handle imbalanced datasets is,Machine Learning,Total Bootcamp,"B,C",Filtering,Under sampling,Over sampling,,
-Pourquoi utiliser des APIs?,Automation,Test de validation,D,Pour isoler les services de l'utilisateur final,Pour normaliser les communications entre services,Pour permettre une évolution facile des services,Tous ces points,
-"En utilisant Flask, on permet à un utilisateur ou à une machine d'utiliser un service avec ",Automation,Test de validation,C,SQL,Python,HTTP,Java,
-Pourquoi utiliser Docker?,Automation,Test de validation,D,Pour déployer facilement et rapidement des processus,Pour isoler des processus de la machine hôte,Pour tester des services rapidement,Tous ces points,
-"Si on fait un parallèle avec la programmation orientée objet, les images de containers Docker sont l'équivalent des",Automation,Test de validation,A,Classes,Instances de classes,Méthodes,Objets,
-"Pour faciliter le passage de fichiers entre la machine hôte et un container Docker, on peut utiliser les",Automation,Test de validation,C,Images,Networks,Volumes,Containers,
-"Pour faciliter la communication entre la machine hôte et un container Docker, on peut utiliser les",Automation,Test de validation,B,Images,Networks,Volumes,Containers,
-Quelle commande permet de lancer un container Docker ?,Automation,Test de validation,B,docker image run nom_de_l_image,docker container run nom_de_l_image,docker image pull nom_de_l_image,docker container pull nom_de_l_image,
-A quoi sert Docker-Compose ?,Automation,Test de validation,A,A répertorier les images publiques de containers,A déployer plusieurs containers en même temps,A créer sa propre image Docker,Tous ces points,
-A quoi sert Docker-Compose ?,Automation,Test de validation,B,A répertorier les images publiques de containers,A déployer plusieurs containers en même temps,A créer sa propre image Docker,Tous ces points,
-"Dans Airflow, le DAG répertorie",Automation,Test de validation,D,Les tâches à effectuer,Les actions à prendre en cas d'échecs,L'enchaînement des tâches à effectuer,Tous ces points,
-Quelle est la différence entre le processing time et le event time ?,Streaming de données,Test de validation,B,L’un correspond au temps de début de calcul alors que l’autre correspond à la durée nécessaire pour le calcul,L’un correspond à l’entrée de la donnée dans le système alors que l’autre correspond à la date de création de la donnée,Ce sont en fait la même chose,,
-Kafka est un système de messagerie,Streaming de données,Test de validation,A,Publication/Abonnement + Asynchrone,Publication/Abonnement + Synchrone,Orienté queue + Asynchrone,Orienté queue+ Synchrone,
-"Dans Kafka, les brokers permettent de",Streaming de données,Test de validation,,distribuer le stockage intermédiaire des données,répartir la charge du flux des données,prévenir les pannes du système,tous ces points,
-"Dans Kafka, dans un consumer group, les consommateurs accèdent",Streaming de données,Test de validation,,chacun à toutes les données disponibles,chacun à une partie des données relativement à sa capacité de consommation,"chacun à une partie des données, réparti de manière uniforme",,
-Quelle est la différence entre Hadoop et Spark?,Streaming de données,Test de validation,,Hadoop est un système de calcul et de stockage alors que Spark n’est qu’un système de calcul,Spark est écrit en Scala alors que Hadoop est écrit en Java,Spark écrit les valeurs en mémoire alors que Hadoop les écrit sur disque,Tous ces points,
-Quelle librairie de Spark n’existe pas ?,Streaming de données,Test de validation,,SparkSQL,SparkML,Spark Streaming,Spark IO,
-Que signifie RDD ?,Streaming de données,Test de validation,,Raw distributed dataset,Redundant Distributed Dataset,Resilient Distributed DataSet,,
-Qu’est-ce que le DAG ?,Streaming de données,Test de validation,,Une représentation des tâches à exécuter,Un dispositif qui permet d’optimiser les calculs,,,
-Les Dstreams sont définis par,Streaming de données,Test de validation,,Une limite de temps,Une limite d’espace,Une limite déterminée aléatoirement,Tous ces points,
-"Dans HBase, les données sont stockées par",Streaming de données,Test de validation,,Lignes,Familles de colonnes,Clefs,,

requirements.txt CHANGED Viewed

@@ -19,3 +19,8 @@ starlette==0.36.3
 typing-extensions==4.9.0
 uvicorn==0.27.1
 uvloop==0.19.0

 typing-extensions==4.9.0
 uvicorn==0.27.1
 uvloop==0.19.0
+keras-nlp==0.6.1
+keras==2.12.0
+tensorflow==2.12.0
+sentencepiece==0.1.99
+filesplit==4.0.1

requirements_save.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+annotated-types==0.6.0
+anyio==4.2.0
+click==8.1.7
+exceptiongroup==1.2.0
+fastapi==0.109.2
+h11==0.14.0
+httptools==0.1.2
+idna==3.6
+numpy==1.24.4
+pandas==1.5.3
+pydantic==2.6.1
+pydantic-core==2.16.2
+python-dateutil==2.8.2
+pytz==2024.1
+requests==2.7.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.36.3
+typing-extensions==4.9.0
+uvicorn==0.27.1
+uvloop==0.19.0