Spaces:

thealper2
/

turkish-hate-speech

Runtime error

App Files Files Community

thealper2 committed on May 4, 2023

Commit

68db425

•

1 Parent(s): ead407c

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +28 -0
app.py +96 -0
requirements.txt +4 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+# Use the official Python 3.9 image
+FROM python:3.9
+# Set the working directory to /code
+WORKDIR /code
+# Copy only requirements.txt first so the dependency layer is cached
+# independently of application code changes
+COPY ./requirements.txt /code/requirements.txt
+# Install requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Set up a new user named "user" with user ID 1000
+RUN useradd -m -u 1000 user
+# Switch to the "user" user
+USER user
+# Set home to the user's home directory and put user-local binaries on PATH.
+# A single trailing backslash continues the ENV instruction onto the next line.
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+# Make Gradio listen on all interfaces and on port 7860 (the default port
+# expected by Spaces); Gradio's default host is only reachable from inside
+# the container.
+ENV GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_SERVER_PORT=7860
+# Set the working directory to the user's home directory
+WORKDIR $HOME/app
+# Copy the current directory contents into the container at $HOME/app setting the owner to the user
+COPY --chown=user . $HOME/app
+# Start the Gradio app. app.py defines no ASGI object named "app" (only a
+# Gradio interface launched under __main__), so it is run as a script
+# instead of through "uvicorn app:app".
+CMD ["python", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import gradio as gr
+import pandas as pd
+import torch
+import os
+from transformers import BertTokenizer, BertModel
+class BertClassifier(torch.nn.Module):
+  def __init__(self, dropout=0.5):
+    super(BertClassifier, self).__init__()
+    self.bert = BertModel.from_pretrained("dbmdz/bert-base-turkish-uncased")
+    self.dropout = torch.nn.Dropout(dropout)
+    # Kullandığımız önceden eğilmiş model "base" sınıfına ait bir BERT modelidir. Yani;
+    # 12 layers of Transformer encoder, 12 attention heads, 768 hidden size, 110M parameters.
+    # 768, BERT-base modelindeki hidden size'yi, 5 ise veri setimizdeki toplam kategori sayısını temsil ediyor.
+    self.linear = torch.nn.Linear(768, 5)
+    self.relu = torch.nn.ReLU()
+  def forward(self, input_id, mask):
+    # _ değişkeni dizideki tüm belirteçlerin gömme vektörlerini içerir.
+    # pooled_output değişkeni [CLS] belirtecinin gömme vektörünü içerir.
+    # Metin sınıflandırma için polled_output değişkenini girdi olarak kullanmak yeterlidir.
+    # Attention mask, bir belirtecin gercek bir kelimemi yoksa dolgu mu olduğunu tanımlar.
+    # Eğer gerçek bir kelime ise attention_mask=1, eğer dolgu ise attention_mask=0 olacaktır.
+    # return_dict, değeri "True ise" bir BERT modeli tahmin, eğitim veya değerlendirme sırasında ortaya çıkan
+    # loss, logits, hidden_states ve attentions dan oluşan bir tuple oluşturacaktır.
+    _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
+    dropout_output = self.dropout(pooled_output)
+    linear_output = self.linear(dropout_output)
+    final_layer = self.relu(linear_output)
+    return final_layer
+model = BertClassifier()
+tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")
+model.load_state_dict(torch.load('tubitak2.pt', map_location=torch.device('cpu')))
+def predict_text(model, sentence):
+  device = torch.device("cpu")
+  #model = model.cuda()
+  # Prediction işlemi sırasında model ağırlıklarını değiştirmeyeceğimiz modelin gradyanlara ihtiyacı yoktur
+  # "no_grad" fonksiyonu ile gradyan hesaplarını devre dışı bırakıyoruz.
+  with torch.no_grad():
+    # text = Modeli eğitmek için kullanılacak veri setindeki "clean_text" sütunundaki her bir satır.
+    # padding = Her bir diziyi belirttiğimiz maksimum uzunluga kadar doldurmak için.
+    # max_length = Her bir dizinin maksimum uzunluğu
+    # truncation = Eğer değeri "True" ise dizimiz maksimum uzunluğu aşar ise onu keser.
+    # return_tensors = Döndürelecek tensörlerin türü. Pytorch kullandığımız için "pt" yazıyoruz. Tensorflow kullansaydık "tf" yazmamız gerekirdi.
+    input_id = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
+    # Attention mask, bir belirtecin gercek bir kelimemi yoksa dolgu mu olduğunu tanımlar.
+    # Eğer gerçek bir kelime ise attention_mask=1, eğer dolgu ise attention_mask=0 olacaktır.
+    mask = input_id['attention_mask'].to(device)
+    # squeeze() fonksiyonu ile "input_ids" özelliğindeki tensörlerin boyutu 1 olan boyutları
+    # kaldırarak, tensörün boyutunu azaltıyoruz.
+    input_id = input_id['input_ids'].squeeze(1).to(device)
+    # Modelin eğitim verileri üzerindeki tahminlerinin sonuçları saklanır.
+    output = model(input_id, mask)
+    categories =  {
+        0: 'HAM',
+        1: 'SPAM',
+    }
+    # Kategorik sınıfı döndür.
+    return categories.get(output.argmax(dim=1).item())
+def predict(df):
+    # TODO:
+    df['text'] = df['text'].apply(preprocess_text)
+    for i in range(len(df)):
+      df.loc[i, 'label'] = predict_text(model, df['text'][i])
+    return df
+def get_file(file):
+    output_file = "output_GAT0R.csv"
+    # For windows users, replace path seperator
+    file_name = file.name.replace("\\", "/")
+    df = pd.read_csv(file_name, sep="|")
+    predict(df)
+    df.to_csv(output_file, index=False, sep="|")
+    return output_file
+#  Launch the interface with user password
+iface = gr.Interface(get_file, "file", "file")
+if __name__ == "__main__":
+    iface.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio==3.28.3
+pandas==1.5.3
+torch==2.0.0
+transformers==4.27.2