# Extracted from a Hugging Face Space file view (Space status: Sleeping).
# File size: 5,030 bytes.
# Revision hashes shown by the page: 86d39de 2022cd7 f9220f6 b67c5e2 f9220f6 dcd32ae f9220f6 a74487c
import gradio as gr
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np
from gradio_client import Client
from functools import lru_cache
# Memoised loader: the first call reads the checkpoint, later calls reuse the result.
@lru_cache(maxsize=1)
def load_model_and_tokenizer():
    """Load the sentence-embedding tokenizer and model from the local checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple built with ``AutoTokenizer`` and
        ``AutoModel``. Because of ``lru_cache`` the same pair is returned on
        every call after the first.
    """
    # Local directory of the checkpoint; replace with your Space and model path.
    checkpoint = "./all-MiniLM-L6-v2"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )
# Load the model and tokenizer once at import time; the embedding code below
# reads these two module-level names directly.
tokenizer, model = load_model_and_tokenizer()
# Candidate context labels. Their order matters: detect_context zips this list
# with the per-label scores, so row i of the label embeddings corresponds to
# labels[i].
labels = [
    "aerospace", "anatomy", "anthropology", "art",
    "automotive", "blockchain", "biology", "chemistry",
    "cryptocurrency", "data science", "design", "e-commerce",
    "education", "engineering", "entertainment", "environment",
    "fashion", "finance", "food commerce", "gaming",
    "healthcare", "history", "information technology",
    "legal", "machine learning", "marketing", "medicine",
    "music", "philosophy", "physics", "politics", "real estate", "retail",
    "robotics", "social media", "sports", "technical",
    "tourism", "travel",
]

# Tone, style and gender/number vocabularies. They are not referenced by the
# code visible in this file. "sustained" was listed twice; the duplicate entry
# has been removed so every tone appears exactly once.
tones = [
    "formal", "positive", "negative", "poetic", "polite", "subtle", "casual", "neutral",
    "informal", "pompous", "sustained", "rude",
    "sophisticated", "playful", "serious", "friendly",
]
styles = [
    "poetry", "novel", "theater", "slang", "speech", "keywords", "html", "programming",
]
gender_number = [
    "masculine singular", "masculine plural", "feminine singular", "feminine plural",
]
@lru_cache(maxsize=1)
def precompute_label_embeddings():
    """Embed every entry of ``labels`` and return the result as a NumPy array.

    The labels are tokenized as one padded batch, run through ``model`` without
    gradient tracking, and mean-pooled over dim 1 of ``last_hidden_state`` into
    one vector per label.

    The pooling is weighted by the tokenizer's attention mask. Labels tokenize
    to different lengths, so shorter ones are padded; an unweighted
    ``mean(dim=1)`` would average the pad-token vectors into their embeddings,
    while a query encoded on its own (no padding) would not be diluted the same
    way.

    Returns:
        A NumPy array with one pooled embedding per label, in ``labels`` order.
        The result is cached by ``lru_cache``, so repeated calls return the
        same array object.
    """
    inputs = tokenizer(labels, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state

    # 1 for real tokens, 0 for padding; expanded so it broadcasts over the
    # hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for a row with no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / token_counts).numpy()
# Computed once at import time; detect_context compares each input embedding
# against this array.
label_embeddings = precompute_label_embeddings()
# Turn a vector of scores into a probability distribution.
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiating so that large scores do
    not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()
# Context detection: score the input against every label.
def detect_context(input_text, threshold=0.03):
    """Return the labels whose softmax score for ``input_text`` reaches ``threshold``.

    The text is embedded with the module-level ``tokenizer`` and ``model``
    (mean over dim 1 of ``last_hidden_state``), compared with
    ``label_embeddings`` by cosine similarity, and the similarities are passed
    through ``softmax``.

    Args:
        input_text: The text to classify.
        threshold: Minimum softmax score (inclusive) a label needs to be kept.

    Returns:
        A list of ``(label, score)`` tuples in ``labels`` order. When no label
        reaches the threshold the list is ``[("general", 1.0)]``.
    """
    encoded = tokenizer([input_text], padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        query_embedding = hidden_states.mean(dim=1).numpy()

    # One similarity per label, then normalised so the scores sum to 1.
    scores = softmax(cosine_similarity(query_embedding, label_embeddings)[0])

    selected = []
    for name, score in zip(labels, scores):
        if score >= threshold:
            selected.append((name, score))

    # Fall back to a single catch-all context when nothing is confident enough.
    return selected or [("general", 1.0)]
# Translation back ends, one per context (currently a single shared mock).
def get_translation_client(context):
    """Build the Hugging Face Space client used to translate for ``context``.

    ``context`` is not consulted yet: every context is served by the same
    mock Space. A new ``Client`` is constructed on each call.
    """
    # Replace with actual Space paths for each context.
    space_id = "Frenchizer/space_18"
    return Client(space_id)
def translate_text(input_text, context):
    """Translate ``input_text`` with the client selected for ``context``.

    Returns whatever the client's ``predict`` call returns for the text.
    """
    return get_translation_client(context).predict(input_text)
def process_request(input_text, threshold=0.03):
    """Detect the contexts of ``input_text`` and translate it once per context.

    Args:
        input_text: The text to classify and translate.
        threshold: Minimum score a context needs to be kept; forwarded to
            ``detect_context``. The default equals ``detect_context``'s own
            default, so calls that omit it behave as before.

    Returns:
        A ``(translations, context_results)`` tuple: ``translations`` maps each
        detected context to its translation, and ``context_results`` is the
        list of ``(context, score)`` pairs returned by ``detect_context``.
    """
    # Step 1: detect context.
    context_results = detect_context(input_text, threshold=threshold)

    # Step 2: translate the text once for each detected context.
    translations = {
        context: translate_text(input_text, context)
        for context, _score in context_results
    }

    # Step 3: log the result. The message reports the threshold actually used;
    # it previously hard-coded 0.022, which did not match the applied value.
    print(f"High-confidence contexts (score >= {threshold}):", context_results)
    print("Translations:", translations)

    return translations, context_results
def gradio_interface(input_text):
    """Gradio callback: return every translation of ``input_text`` as one string.

    The translations produced by ``process_request`` are joined with newlines
    and surrounding whitespace is stripped; the detected contexts themselves
    are not shown.
    """
    translations, _contexts = process_request(input_text)
    joined = "\n".join(translations.values())
    return joined.strip()
# Create the Gradio interface: one text box in, one text box out, backed by
# gradio_interface.
interface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs="text",
    title="Frenchizer",
    description="Translate text from English to French with context detection and threshold.",
)
# Start the app. (A stray trailing "|" after this call was a syntax error and
# has been removed.)
interface.launch()