def process_predict(df):
    # Emoticon feature: convert emoji into their text descriptions
    import demoji
    import emoji
    demoji.download_codes()
    for i in df.index:
        # demoji.findall lists the emoji present in each tweet (for inspection)
        print(demoji.findall(df.loc[i, 'text']))
        df.loc[i, 'text'] = emoji.demojize(df.loc[i, 'text'], delimiters=("", ""))

    # Pre-process: strip URLs/handles/non-letters, drop short and stop words, stem
    import string
    import re
    import nltk
    nltk.download('vader_lexicon')
    nltk.download('wordnet')
    nltk.download('punkt')
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem import SnowballStemmer
    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")

    def preprocess(tweet):
        cleaned_tweet = []

        words = tweet.split()
        for word in words:
            # Skip hyperlinks and Twitter handles (@<user>)
            if ('http' in word) or ('.com' in word) or ('www.' in word) or word.startswith('@'):
                continue

            # Keep only lowercase letters (drops digits and special characters)
            temp = re.sub(f'[^{string.ascii_lowercase}]', '', word.lower())

            # Skip words with fewer than 3 characters
            if len(temp) < 3:
                continue

            # Skip common English stop words
            if temp in stop_words:
                continue

            # Store the stemmed version of the word
            temp = stemmer.stem(temp)

            if len(temp) > 0:
                cleaned_tweet.append(temp)

        return ' '.join(cleaned_tweet)
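    # A quick illustration (assumed example, not from the source):
    #     preprocess("Running fast http://t.co/abc @user 123")
    #     # -> roughly "run fast"  (URL, handle and digits dropped; words stemmed)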
    cl = [preprocess(tweet) for tweet in df['text']]
    df['clean_tweet'] = cl


    # Load the fitted models and vectorizers (bz2-compressed pickles)
    import pickle
    import bz2
    with bz2.BZ2File('All Model', 'r') as sfile1:
        models = pickle.load(sfile1)
    with bz2.BZ2File('All Vector', 'r') as sfile2:
        vectorizers = pickle.load(sfile2)
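    # Assumption (not stated in the source): 'All Model' and 'All Vector' each
    # hold a Python list of five fitted objects, pickled and bz2-compressed,
    # e.g. produced along the lines of
    #     with bz2.BZ2File('All Model', 'w') as f:
    #         pickle.dump([knn, svm, tree, forest, extra_trees], f)
    # so that models[i], vectorizers[i] and names[i] below line up.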

    names = ["K-Nearest Neighbors", "Linear SVM",
             "Decision Tree", "Random Forest",
             "ExtraTreesClassifier"]
    # Vectorize the cleaned tweets and add one prediction column per classifier
    for i in range(len(names)):
        test_vectors = vectorizers[i].transform(cl)
        df['Class ' + names[i]] = models[i].predict(test_vectors)
    # Write the results once, after all classifiers have been applied
    df.to_csv('tweets.csv')
    return df
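

# Example usage: a minimal sketch, not part of the original pipeline. It assumes
# a CSV with a 'text' column; the file name 'raw_tweets.csv' is hypothetical,
# and the bz2-pickled 'All Model' / 'All Vector' files must be present in the
# working directory.
if __name__ == '__main__':
    import pandas as pd
    tweets = pd.read_csv('raw_tweets.csv')
    predictions = process_predict(tweets)
    print(predictions[['text', 'Class Linear SVM']].head())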