"""Sentiment analysis for English and Turkish text files.

English lines are scored with CardiffNLP's Twitter RoBERTa sentiment model;
Turkish lines with savasy's Turkish BERT sentiment model.  Results are
printed one text at a time.
"""

import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline


class sentimentAnalysis():
    """Run sentiment analysis over a text file in English or Turkish."""

    # Exact spellings accepted as language selectors.  Deliberately NOT
    # normalized with .lower(): "İ".lower() yields "i̇" (i + combining dot),
    # which would silently break matching for "İngilizce".
    _ENGLISH_NAMES = {"English", "İngilizce", "ingilizce", "english"}
    _TURKISH_NAMES = {"Turkish", "Türkçe", "türkçe", "turkish"}

    def __init__(self, lang, text2analysePath):
        """
        Args:
            lang: Language selector, e.g. "English" or "Türkçe".
            text2analysePath: Path to a UTF-8 text file, one text per line.
        """
        self.lang = lang
        self.text2analysePath = text2analysePath
        # Index order matches the English model's output logits.
        self.engLabels = ["negative", "neutral", "positive"]
        nltk.download("stopwords")  # required by the prepare-text steps

    def downloadModels(self):
        """Load the model matching ``self.lang``, then analyse the file.

        Prints an error message (Turkish/English) for unknown languages.
        """
        # `with` guarantees the handle is closed on every branch — the
        # original leaked it, including on the unknown-language path.
        with open(self.text2analysePath, 'r', encoding="utf-8") as txtt:
            if self.lang in self._ENGLISH_NAMES:
                MODEL = "sentimentModels/cardiffnlp/twitter-roberta-base-sentiment"
                self.tokenizer = AutoTokenizer.from_pretrained(MODEL)
                self.model = AutoModelForSequenceClassification.from_pretrained(MODEL)
                # Cache weights/tokenizer locally so later runs skip the download.
                self.model.save_pretrained(MODEL)
                self.tokenizer.save_pretrained(MODEL)
                self.engPrepareText(txtt)
            elif self.lang in self._TURKISH_NAMES:
                self.model = AutoModelForSequenceClassification.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.tokenizer = AutoTokenizer.from_pretrained(
                    "savasy/bert-base-turkish-sentiment-cased")
                self.sa = pipeline("sentiment-analysis",
                                   tokenizer=self.tokenizer, model=self.model)
                self.trPrepareText(txtt)
            else:
                print("Dil bulunamadı!------The language has not been found!")

    def _cleanLines(self, txtt, stopword_lang):
        """Lower-case each line, strip non-alphanumerics, drop stop words.

        Args:
            txtt: Open text-file handle to iterate line by line.
            stopword_lang: NLTK stop-word corpus name ("english"/"turkish").

        Returns:
            list[str]: One cleaned string per input line.
        """
        # Hoisted: the original rebuilt this set for every word of every line.
        stops = set(stopwords.words(stopword_lang))
        cleaned = []
        for line in txtt:
            line = re.sub("[^a-zA-Z0-9ğüşöçıİĞÜŞÖÇ]", ' ', line.lower())
            cleaned.append(' '.join(w for w in line.split(' ') if w not in stops))
        return cleaned

    def engPrepareText(self, txtt):
        """Clean English lines from *txtt* and run the English analysis."""
        dFen = pd.DataFrame(self._cleanLines(txtt, "english"), columns=["texts"])
        self.engAnalyse(dFen)

    def trPrepareText(self, txtt):
        """Clean Turkish lines from *txtt* and run the Turkish analysis."""
        dFtr = pd.DataFrame(self._cleanLines(txtt, "turkish"), columns=["metinler"])
        self.trAnalyse(dFtr)

    def engAnalyse(self, dFen):
        """Print a ranked label/score breakdown for every text in *dFen*."""
        for text in dFen["texts"]:
            encoded_input = self.tokenizer(text, return_tensors='pt')
            output = self.model(**encoded_input)
            scores = softmax(output[0][0].detach().numpy())
            # Most-probable label first.  The original reused `i` for both
            # the row loop and this ranking loop; `rank` avoids shadowing.
            ranking = np.argsort(scores)[::-1]
            print(f"text: {text}")
            for rank in range(scores.shape[0]):
                l = self.engLabels[ranking[rank]]
                s = scores[ranking[rank]]
                print(f"{rank + 1}) {l + ':'} {np.round(float(s), 4)}")

    def trAnalyse(self, dFtr):
        """Print positive/negative scores for every text in *dFtr*.

        The pipeline reports only the winning label and its score; the
        opposite label's score is derived as ``1 - score``.
        """
        for text in dFtr["metinler"]:
            p = self.sa(text)[0]
            pos = p["score"] if p["label"] == "positive" else 1 - p["score"]
            print(f"text: {text}")
            print(f"1-) positive: {np.round(float(pos), 4)}")
            print(f"2-) negative: {np.round(float(1 - pos), 4)}")


if __name__ == "__main__":
    lang = "ingilizce"
    path = "texts/denemeler/text.txt"
    # downloadModels() returns None; the original's `sA = ...` bound None.
    sentimentAnalysis(lang, path).downloadModels()