"""Classify scraped Facebook/WhatsApp posts with two pickled models.

Pipeline (as implemented below):

1. ``lr_IR`` preprocesses every scraped post and runs the first model.
2. ``lr_BS`` keeps the posts the first model labelled ``1`` and runs the
   second model on them, mapping its output to ``'Buyer'`` / ``'Seller'``.
"""
import pickle
import re
import warnings
import numpy as np
import pandas as pd
from models import scrapper
# Kept before the NLTK / deep_translator imports, as in the original file,
# so warnings emitted while importing them are silenced as well.
warnings.filterwarnings('ignore')
from nltk import PorterStemmer
from nltk import TweetTokenizer
from deep_translator import GoogleTranslator

pd.options.display.max_colwidth = 1000

# Everything except ASCII letters, '#' and whitespace is stripped from posts.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z#\s]")

# Tokens of this length or shorter are discarded during preprocessing.
_MAX_SHORT_WORD_LEN = 3

# NOTE: pickle executes arbitrary code on load; these files must only ever
# come from a trusted source.
_IR_VECTORIZER_PATH = 'models/facebook_whatsapp_posts_vectorizer.pkl'
_IR_MODEL_PATH = 'models/fb_wa_lr_trained_model.pkl'
_BS_VECTORIZER_PATH = 'models/facebook_whatsapp_bs_vectorizer.pkl'
_BS_MODEL_PATH = 'models/fb_wa_lr_trained_bs_model.pkl'


def isfloat(num):
    """Return True if ``float(num)`` succeeds, False if it raises ValueError."""
    try:
        float(num)
    except ValueError:
        return False
    return True


def _translate_to_english(text):
    """Translate ``text`` to English unless it looks like a plain number.

    Numeric-looking strings (parseable by ``float`` or made only of digits)
    are returned unchanged instead of being sent to the translator.
    """
    if isfloat(text) or text.isdigit():
        return text
    return GoogleTranslator(source='auto', target='en').translate(text)


def _load_pickle(path):
    """Unpickle the object stored at ``path``, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _predict(vectorizer_path, model_path, texts):
    """Vectorize ``texts`` and return the pickled model's predictions."""
    vectorizer = _load_pickle(vectorizer_path)
    model = _load_pickle(model_path)
    return model.predict(vectorizer.transform(texts))


def preprocessing(text):
    """Normalise one post into a space-joined string of stemmed tokens.

    Steps: translate to English (skipped for numeric-looking input), strip
    every character that is not an ASCII letter, ``#`` or whitespace, drop
    words of three characters or fewer, tokenize with ``TweetTokenizer`` and
    stem each token with ``PorterStemmer``.

    Args:
        text: The raw post text (a string).

    Returns:
        The processed text as a single string.
    """
    text = _translate_to_english(text)

    # The original used str.replace with this pattern, which only replaces
    # the literal substring and therefore removed nothing; a real regex
    # substitution is what the "removing punctuations" step intends.
    # NOTE(review): confirm this matches the cleaning used when the pickled
    # vectorizers were fitted.
    cleaned = _NON_LETTER_RE.sub("", text)

    # Removing short words
    long_words = [
        word for word in cleaned.split() if len(word) > _MAX_SHORT_WORD_LEN
    ]

    # Tokenization
    tokens = TweetTokenizer().tokenize(' '.join(long_words))

    # Stemming
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in tokens]

    # Stitching the tokens back together
    return ' '.join(stems)


def lr_IR():
    """Run the first model over every post returned by the scraper.

    Returns:
        A list of ``(original_post, preprocessed_post, prediction)`` tuples,
        one per scraped post, in scraping order. An empty list is returned
        when the scraper yields no posts.
    """
    original = list(scrapper.scrapper_func())
    if not original:
        # Nothing to classify; avoid handing an empty batch to the model.
        return []

    preprocessed_data = [preprocessing(post) for post in original]
    prediction = _predict(_IR_VECTORIZER_PATH, _IR_MODEL_PATH, preprocessed_data)
    return list(zip(original, preprocessed_data, prediction))


def lr_BS():
    """Label the posts that ``lr_IR`` predicted as ``1`` as Buyer or Seller.

    Returns:
        A list of ``(english_post, role)`` tuples where ``role`` is
        ``'Buyer'`` when the second model predicts ``0`` and ``'Seller'``
        otherwise. An empty list is returned when no post was labelled ``1``.
    """
    selected = [
        (post, text) for post, text, label in lr_IR() if label == 1
    ]
    if not selected:
        return []

    originals = [post for post, _ in selected]
    texts = [text for _, text in selected]

    prediction = _predict(_BS_VECTORIZER_PATH, _BS_MODEL_PATH, texts)
    roles = ['Buyer' if value == 0 else 'Seller' for value in prediction]

    # One output entry per post keeps texts and roles aligned; numeric-looking
    # posts are passed through untranslated rather than dropped or replaced
    # by a previous post's translation.
    english_posts = [_translate_to_english(post) for post in originals]

    return list(zip(english_posts, roles))