# Evaluation of the Models

In [73]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-project-test/test.csv
/kaggle/input/train-augmented/train_augmented.csv
/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/config.json
/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/training_args.bin
/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/model.safetensors
/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last/generation_config.json
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/config.json
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/trainer_state.json
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/training_args.bin
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/scheduler.pt
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/model.safetensors
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/optimizer.pt
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/rng_state.pth
/kaggle/input/turkish-t5/pytorch/v1.1/1/model/generation_config.json


## Importing Libraries


In [74]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForTokenClassification,
    pipeline)

import pandas as pd

## Loading the Pretrained NER Model

In [75]:
# Model and tokenizer come from the same checkpoint id, so it is spelled once.
ner_checkpoint = "akdeniz27/bert-base-turkish-cased-ner"

# Token-classification model, placed on the first CUDA device at load time.
ner_model = AutoModelForTokenClassification.from_pretrained(
    ner_checkpoint,
    device_map="cuda:0",
)
ner_tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)

# `ner(text)` returns one dict per detected entity; the functions below read
# its 'word' and 'score' keys.
ner = pipeline(
    'ner',
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="first",
)


In [76]:
# Directory of the fine-tuned seq2seq checkpoint; it is passed to
# AutoModelForSeq2SeqLM.from_pretrained further down.
model_dir = "/kaggle/input/turkish-t5/pytorch/v2.0/1/model-last"

In [77]:
# Path of the test CSV (a file path, despite the `_dir` suffix) that is read into df_test.
test_dataset_dir = "/kaggle/input/nlp-project-test/test.csv"

In [79]:
# Load the test set; the cells below read its 'Sentence' column.
df_test = pd.read_csv(test_dataset_dir)

## Testing NER Model Usage

In [87]:
# Smoke-test the NER pipeline on a single test sentence (positional row 167).
ner(df_test['Sentence'].values[167])

[{'entity_group': 'LOC',
  'score': 0.57057905,
  'word': 'franken',
  'start': 285,
  'end': 292},
 {'entity_group': 'PER',
  'score': 0.990382,
  'word': 'Joe',
  'start': 383,
  'end': 386},
 {'entity_group': 'PER',
  'score': 0.98566425,
  'word': 'Marley',
  'start': 415,
  'end': 421},
 {'entity_group': 'LOC',
  'score': 0.9250223,
  'word': 'Roma´ya',
  'start': 545,
  'end': 552},
 {'entity_group': 'PER',
  'score': 0.9943376,
  'word': 'Sammy´nin',
  'start': 612,
  'end': 621},
 {'entity_group': 'LOC',
  'score': 0.9336442,
  'word': 'Istanbul',
  'start': 717,
  'end': 725},
 {'entity_group': 'PER',
  'score': 0.9977376,
  'word': 'Abraham Lincoln',
  'start': 775,
  'end': 790}]

## Loading Our Fine-Tuned Transformer Model

In [82]:
# Load the fine-tuned seq2seq model from model_dir onto the first CUDA device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, device_map="cuda:0")

In [83]:
# Identifier the tokenizer is loaded from (presumably a Hugging Face Hub repo id).
# Note that the tokenizer is not read from model_dir.
tokenizer_dir = "Turkish-NLP/t5-efficient-small-turkish"

In [84]:
# Device that generate_result moves the tokenized inputs to; it has to match
# the device `model` was loaded on (cuda:0).
device = torch.device('cuda:0')

In [85]:
# Tokenizer used by generate_result to encode prompts and decode generations.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

## Adding the Special Tokens

In [86]:
# Characters that are registered as extra tokenizer tokens in the next cell.
missing_tokens = ["q","(","°","[","´","]","{","}","&"]

# Report which of them the loaded tokenizer does not know yet.
known_vocab = tokenizer.vocab
for candidate in missing_tokens:
    if candidate in known_vocab:
        continue
    print(f"{candidate} not in vocab")

q not in vocab
( not in vocab
° not in vocab
[ not in vocab
´ not in vocab
] not in vocab
{ not in vocab
} not in vocab
& not in vocab


In [89]:
# Register the extra characters with the tokenizer, in list order.
# NOTE(review): no model.resize_token_embeddings call is visible in this notebook;
# presumably the checkpoint in model_dir was trained with these tokens already
# added in this order — confirm.
tokenizer.add_tokens(missing_tokens)

## Testing Function

In [94]:
def generate_result(text):
    """Run the fine-tuned seq2seq model on ``text`` and return the raw decoded output.

    The input is wrapped as ``"Correct diacritics for : " + text + " </s>"``,
    tokenized, moved to ``device`` and passed to ``model.generate``.

    Args:
        text: the string to restore diacritics for.

    Returns:
        str: the first decoded sequence. ``batch_decode`` is called with its
        defaults, and the callers in this notebook strip the ``<pad>`` and
        ``</s>`` markers from the returned string themselves.
    """
    prompt = "Correct diacritics for : " + text + " </s>"

    # NOTE(review): with truncation = False, max_length and truncation_side
    # appear to have no effect on the encoding — confirm before relying on them.
    tokenizer.truncation_side = "left"
    encoded = tokenizer(prompt, return_tensors='pt', max_length = 64, truncation = False)
    encoded = encoded.to(device)

    generated_ids = model.generate(**encoded, max_new_tokens = 128)
    decoded = tokenizer.batch_decode(generated_ids)

    return str(decoded[0])

## Processing Functions before Testing

In [182]:
# json
import re 

def ner_predict_mapping(text, threshold=0.3):
    """Return the single words of the named entities that ``ner`` finds in ``text``.

    Entities whose ``score`` is not strictly greater than ``threshold`` are
    dropped. Multi-word entities are split on whitespace, so every returned
    item is one word.

    Args:
        text: input passed unchanged to the module-level ``ner`` pipeline.
        threshold: exclusive lower bound on the entity score.

    Returns:
        list[str]: entity words in the order ``ner`` produced them; an empty
        list when nothing qualifies.
    """
    entity_words = []
    for entity in ner(text):
        if entity["score"] > threshold:
            word = entity["word"]
            if not word:
                # An empty surface form has no first character; the previous
                # `word.lower()[0]` lookup raised IndexError on it.
                continue
            # "I".lower() and "İ".lower() both start with "i", so this matches
            # words beginning with i, I or İ.
            # NOTE(review): every ASCII "I" in the word is replaced, not only
            # the leading one — confirm that is intended.
            if word.lower().startswith("i"):
                word = word.replace("I", "İ")
            parts = word.split()
            if len(parts) > 1:
                entity_words.extend(parts)
            else:
                entity_words.append(word)
    return entity_words
    
def split_text_into_n_worded_chunks(text, n):
    """Split ``text`` on whitespace and regroup the words into chunks of ``n`` words.

    Words inside a chunk are joined with a single space. The last chunk holds
    the remaining ``len(words) % n`` words when the word count is not a
    multiple of ``n``.

    Args:
        text: the string to split; any run of whitespace separates words.
        n: number of words per chunk, at least 1.

    Returns:
        list[str]: the chunks in order; an empty list when ``text`` has no words.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    # Fail the same way for every invalid size: a negative n used to return []
    # or raise IndexError depending on the word count.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = text.split()
    # Slicing past the end already yields the shorter final chunk, so no
    # separate tail fix-up is needed.
    return [' '.join(words[start:start + n]) for start in range(0, len(words), n)]

def chunk_2(text):
    """Restore ``text`` two words at a time and stitch the pieces back together.

    Each two-word chunk goes through ``generate_result``. In every decoded
    piece, whitespace that directly follows one of the characters
    ``" q ( ° [ ] { } & ´`` is removed. The pieces are then joined with single
    spaces, the ``<pad>`` and ``</s>`` markers are deleted, and each ``"  "``
    is replaced by ``" "`` in a single pass.

    Args:
        text: the sentence to process.

    Returns:
        str: the reassembled sentence.
    """
    space_after_special = re.compile(r'(["q(°\[\]{}&´])\s+')

    decoded_pairs = []
    for pair in split_text_into_n_worded_chunks(text, 2):
        decoded_pairs.append(space_after_special.sub(r'\1', generate_result(pair)))

    merged = ' '.join(decoded_pairs)
    for marker in ("<pad>", "</s>"):
        merged = merged.replace(marker, "")
    return merged.replace("  ", " ")

def chunk_1(text):
    """Restore ``text`` one word at a time and stitch the pieces back together.

    Each word goes through ``generate_result`` and every space in the decoded
    piece is removed. The pieces are concatenated without a separator; each
    ``<pad>`` marker is then turned into one space and each ``</s>`` marker is
    deleted. The ``<pad>`` markers therefore act as the word separators
    (presumably one leading ``<pad>`` per generation — confirm).

    Args:
        text: the sentence to process.

    Returns:
        str: the reassembled sentence.
    """
    pieces = []
    for single_word in split_text_into_n_worded_chunks(text, 1):
        decoded = generate_result(single_word)
        pieces.append(decoded.replace(" ", ""))

    glued = ''.join(pieces)
    glued = glued.replace("<pad>", " ")
    return glued.replace("</s>", "")

def process_text(text):
    """Restore diacritics in ``text`` and put back the casing of named-entity words.

    The sentence is restored twice, with ``chunk_2`` and with ``chunk_1``. The
    two-word result is used when both results have the same number of
    whitespace-separated words; otherwise the one-word result is used. For
    every word returned by ``ner_predict_mapping``, each occurrence of its
    lowercased form in the chosen result is replaced by the word itself.
    ``"i"`` followed by a combining dot, which ``str.lower`` yields for ``"İ"``,
    is folded to plain ``"i"`` before the lookup.

    Args:
        text: the sentence to process.

    Returns:
        str: the restored sentence.
    """
    entity_words = ner_predict_mapping(text)
    pairwise = chunk_2(text)
    wordwise = chunk_1(text)

    same_word_count = len(wordwise.split()) == len(pairwise.split())
    restored = pairwise if same_word_count else wordwise

    # Plain substring replacement: the lowercased entity is replaced wherever
    # it occurs, also inside longer words.
    for entity in entity_words:
        lowered = entity.lower().replace('i̇', "i")
        restored = restored.replace(lowered, entity)
    return restored

In [191]:
# Run process_text on every test sentence, one sentence at a time, and store
# the restored text in a new 'Result' column.
df_test["Result"] = df_test["Sentence"].apply(process_text)

In [192]:
# Display the frame to compare 'Sentence' with the restored 'Result'.
df_test

Unnamed: 0,ID,Sentence,Result
0,0,tr ekonomi ve politika haberleri turkiye nin ...,tr ekonomi ve politika haberleri türkiye nin ...
1,1,uye girisi,üye girişi
2,2,son guncelleme 12:12,son güncelleme 12:12
3,3,Imrali Mit gorusmesi ihtiyac duyuldukca oluyor,imralı Mit görüşmesi ihtiyaç duyuldukça oluyor
4,4,Suriye deki silahli selefi muhalifler yeni ku...,Suriye deki silahlı selefi muhalifler yeni ku...
...,...,...,...
1152,1152,Yuregir Adana ilimize ait sirin bir ilcedir,yüreğir Adana ilimize ait şirin bir ilçedir
1153,1153,yuze guluculugun at oynattigi bir aydinlar ort...,yüze gülücülüğün at oynattığı bir aydınlar or...
1154,1154,zavalli adami oracikta astilar ve hic kimse se...,zavallı adamı oracıkta astılar ve hiç kimse s...
1155,1155,zengin cocuklarina ariz munasebetsizlikler fak...,zengin çocuklarına ariz münakaşsizlikler faki...


In [193]:
# Keep a copy that still has both the original 'Sentence' and the restored 'Result'.
# NOTE(review): unlike the final export, this call does not pass index=False, so
# the row index is written as an extra column — confirm that is intended.
df_test.to_csv("/kaggle/working/test_designed2.csv")

In [194]:
# Overwrite the input column with the restored text; the competition file is
# expected to carry the result in 'Sentence'.
df_test['Sentence'] = df_test['Result']

In [195]:
# 'Result' has been copied into 'Sentence' above, so the extra column is dropped.
df_test = df_test.drop(columns= ['Result'])

## Saving the Test Results

In [4]:
# Write the final file to the current working directory, without the pandas row index.
df_test.to_csv('testv3.csv', index = False)