File size: 5,030 Bytes
86d39de
2022cd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9220f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b67c5e2
f9220f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcd32ae
 
 
 
 
 
 
 
 
f9220f6
 
 
 
 
 
 
 
 
 
 
a74487c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import gradio as gr
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
from gradio_client import Client
from functools import lru_cache

# Cache the model and tokenizer using lru_cache
@lru_cache(maxsize=1)
def load_model_and_tokenizer():
    """Load the embedding tokenizer and model from the local checkpoint.

    The result is memoized with ``lru_cache``, so repeated calls return the
    same ``(tokenizer, model)`` pair instead of loading it again.

    Returns:
        A ``(tokenizer, model)`` tuple built with ``AutoTokenizer`` and
        ``AutoModel`` from the same checkpoint path.
    """
    checkpoint = "./all-MiniLM-L6-v2"  # Replace with your Space and model path
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )

# Load the tokenizer and model once at import time; the embedding functions
# below read these two module-level names directly.
tokenizer, model = load_model_and_tokenizer()

# Candidate context labels; the input text is scored against each of these.
labels = [
    "aerospace", "anatomy", "anthropology", "art",
    "automotive", "blockchain", "biology", "chemistry",
    "cryptocurrency", "data science", "design", "e-commerce",
    "education", "engineering", "entertainment", "environment",
    "fashion", "finance", "food commerce", "gaming",
    "healthcare", "history", "information technology",
    "legal", "machine learning", "marketing", "medicine",
    "music", "philosophy", "physics", "politics", "real estate", "retail",
    "robotics", "social media", "sports", "technical",
    "tourism", "travel",
]

# Tone descriptors; each entry appears exactly once.
# NOTE(review): tones, styles and gender_number are not referenced by the
# code in this file — confirm whether they are still needed.
tones = [
    "formal", "positive", "negative", "poetic", "polite", "subtle", "casual", "neutral",
    "informal", "pompous", "sustained", "rude",
    "sophisticated", "playful", "serious", "friendly",
]

# Writing styles / text genres.
styles = [
    "poetry", "novel", "theater", "slang", "speech", "keywords", "html", "programming",
]

# Grammatical gender and number combinations.
gender_number = [
    "masculine singular", "masculine plural", "feminine singular", "feminine plural",
]

@lru_cache(maxsize=1)
def precompute_label_embeddings():
    """Embed every entry of ``labels`` with attention-masked mean pooling.

    The labels are tokenized as one padded batch, so shorter labels carry
    padding tokens. Those positions are excluded from the average using the
    tokenizer's attention mask; otherwise the embedding of a short label
    would be diluted by padding vectors and depend on the longest label in
    the batch.

    The result is memoized with ``lru_cache``, so the model runs only on the
    first call.

    Returns:
        A NumPy array with one embedding row per entry in ``labels``, in the
        same order.
    """
    inputs = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()

# Compute the label embeddings once at import time so detect_context can
# reuse them for every request.
label_embeddings = precompute_label_embeddings()

# Softmax function to convert scores to probabilities
def softmax(x):
    """Convert an array of scores into probabilities that sum to 1.

    The maximum score is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

# Function to detect context
def detect_context(input_text, threshold=0.03):
    """Return the context labels whose probability reaches *threshold*.

    The text is embedded by mean-pooling the model's last hidden state,
    compared to every precomputed label embedding with cosine similarity,
    and the similarities are turned into probabilities with ``softmax``.

    Args:
        input_text: The text to classify.
        threshold: Minimum probability a label needs to be kept.

    Returns:
        A list of ``(label, probability)`` pairs in ``labels`` order, or
        ``[("general", 1.0)]`` when no label reaches the threshold.
    """
    # Embed the input as a batch of one.
    encoded = tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded)
    text_embedding = model_output.last_hidden_state.mean(dim=1).numpy()

    # Similarity to each label, normalized into a probability distribution.
    scores = softmax(cosine_similarity(text_embedding, label_embeddings)[0])

    matches = []
    for label, score in zip(labels, scores):
        if score >= threshold:
            matches.append((label, score))

    # Fall back to a single "general" context with a default score of 1.0.
    return matches or [("general", 1.0)]

# Mock translation clients for different contexts
def get_translation_client(context):
    """Build the Hugging Face Space client used to translate *context*.

    Currently a mock: *context* is ignored and every call creates a client
    for the same Space.
    """
    space_id = "Frenchizer/space_18"  # Replace with actual Space paths for each context
    return Client(space_id)

def translate_text(input_text, context):
    """Translate *input_text* with the client selected for *context*.

    Returns whatever the client's ``predict`` call yields for the text.
    """
    return get_translation_client(context).predict(input_text)

def process_request(input_text, threshold=0.03):
    """Detect the contexts of *input_text* and translate it once per context.

    Args:
        input_text: The text to classify and translate.
        threshold: Minimum probability a context needs to be kept. It is
            forwarded to ``detect_context`` and echoed in the log line, so
            the printed value always matches the one actually applied.

    Returns:
        A ``(translations, context_results)`` tuple. ``translations`` maps
        each detected context label to the value returned by
        ``translate_text``; ``context_results`` is the list of
        ``(label, score)`` pairs returned by ``detect_context``.
    """
    # Step 1: Detect context
    context_results = detect_context(input_text, threshold=threshold)

    # Step 2: Translate the text for each context
    translations = {
        context: translate_text(input_text, context)
        for context, _score in context_results
    }

    # Step 3: Log the high-confidence contexts and their translations
    print(f"High-confidence contexts (score >= {threshold}):", context_results)
    print("Translations:", translations)

    return translations, context_results

def gradio_interface(input_text):
    """Gradio callback: return the translations of *input_text* as text.

    Runs ``process_request`` and joins the translation for each detected
    context with newlines, stripping surrounding whitespace. The detected
    contexts themselves are not part of the output.
    """
    translations, _contexts = process_request(input_text)
    return "\n".join(translations.values()).strip()

# Create the Gradio interface: one text box in, one text box out.
interface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Frenchizer",
    description="Translate text from English to French with context detection and threshold."
)

# Start the web server only when this file is run as a script, so importing
# the module does not launch the app as a side effect.
if __name__ == "__main__":
    interface.launch()