"""Sentiment analysis for English and Turkish text files.

English lines are scored with CardiffNLP's Twitter RoBERTa sentiment model;
Turkish lines with savasy's Turkish BERT sentiment model.  Results are
printed one text at a time.
"""

import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline


class sentimentAnalysis():
    """Run sentiment analysis over a text file in English or Turkish."""

    # Exact spellings accepted as language selectors.  Deliberately NOT
    # normalized with .lower(): "İ".lower() yields "i̇" (i + combining dot),
    # which would silently break matching for "İngilizce".
    _ENGLISH_NAMES = {"English", "İngilizce", "ingilizce", "english"}
    _TURKISH_NAMES = {"Turkish", "Türkçe", "türkçe", "turkish"}

    def __init__(self, lang, text2analysePath):
        """
        Args:
            lang: Language selector, e.g. "English" or "Türkçe".
            text2analysePath: Path to a UTF-8 text file, one text per line.
        """
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Index order matches the English model's output logits.
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")  # required by the prepare-text steps

    def downloadModels(self):
        """Load the model matching ``self.lang``, then analyse the file.

        Prints an error message (Turkish/English) for unknown languages.
        """
        # `with` guarantees the handle is closed on every branch — the
        # original leaked it, including on the unknown-language path.
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            if self.lang in self._ENGLISH_NAMES:
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                # Cache weights/tokenizer locally so later runs skip the download.
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)
            elif self.lang in self._TURKISH_NAMES:
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis",
                                   tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)
            else:
                print("Dil bulunamadı!------The language has not been found!")

    def _cleanLines(self, txtt, stopword_lang):
        """Lower-case each line, strip non-alphanumerics, drop stop words.

        Args:
            txtt: Open text-file handle to iterate line by line.
            stopword_lang: NLTK stop-word corpus name ("english"/"turkish").

        Returns:
            list[str]: One cleaned string per input line.
        """
        # Hoisted: the original rebuilt this set for every word of every line.
        stops = set(stopwords.words(stopword_lang))
        cleaned = []
        for line in txtt:
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line.lower())
            cleaned.append(' '.join(w for w in line.split(' ') if w not in stops))
        return cleaned

    def engPrepareText(self, txtt):
        """Clean English lines from *txtt* and run the English analysis."""
        dFen = pd.DataFrame(self._cleanLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean Turkish lines from *txtt* and run the Turkish analysis."""
        dFtr = pd.DataFrame(self._cleanLines(txtt, "turkish"), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print a ranked label/score breakdown for every text in *dFen*."""
        for text in dFen["texts"]:
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            # Most-probable label first.  The original reused `i` for both
            # the row loop and this ranking loop; `rank` avoids shadowing.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            for rank in range(scores.shape[0]):
                l = self.engLabels[ranking[rank]]
                s = scores[ranking[rank]]
                print(f"{rank + 1}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores for every text in *dFtr*.

        The pipeline reports only the winning label and its score; the
        opposite label's score is derived as ``1 - score``.
        """
        for text in dFtr["metinler"]:
            p = self.sa(text)[0]
            pos = p["score"] if p["label"] == "positive" else 1 - p["score"]
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(pos), 4)}")
            print(f"2-) negative: {np.round(float(1 - pos), 4)}")


if __name__ == "__main__":
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"
    # downloadModels() returns None; the original's `sA = ...` bound None.
    sentimentAnalysis(lang, path).downloadModels()