File size: 1,129 Bytes
27880c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class LanguageDetector:
    """Detect the language of a text with a sequence-classification model.

    The tokenizer and model are loaded once at construction time and
    reused for every call to :meth:`predict_language`.
    """

    # Checkpoint used when the caller does not supply one.
    DEFAULT_MODEL = "papluca/xlm-roberta-base-language-detection"

    def __init__(self, model_name: str = DEFAULT_MODEL) -> None:
        """Load the tokenizer and classification model.

        Args:
            model_name: Identifier or path accepted by ``from_pretrained``.
                Defaults to :attr:`DEFAULT_MODEL`, so ``LanguageDetector()``
                behaves as before.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name
        )

    def predict_language(self, text: str) -> str:
        """Return the predicted language label for *text*.

        Args:
            text: A single piece of text. Only one input per call is
                supported, because the top-scoring index is extracted
                with ``.item()``.

        Returns:
            The entry of ``self.model.config.id2label`` for the
            highest-scoring class.
        """
        # truncation=True clips the input to the tokenizer's configured
        # maximum length, so very long texts are not handed to the model
        # at a length it may not support.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Index of the highest logit along the class dimension.
        prediction_idx = outputs.logits.argmax(dim=-1).item()

        return self.model.config.id2label[prediction_idx]