import json
# Twitter API authorization tokens: enter your keys/secrets as strings below
credentials = {}
credentials['CONSUMER_KEY'] = 'XV2kS4M1OmganL2zZU0q8Kyxh'
credentials['CONSUMER_SECRET'] = 'PvjekJXnI304fE2En3cmYuftP7yOXH0xiANsWOsW1nUpbwV4j7'
credentials['ACCESS_TOKEN'] = '152569292-Uw6KPJqudtctiYjpR1GEWOMYMKGc2DhczLiZq4Q4'
credentials['ACCESS_SECRET'] = 'Muv9NC0JhKiskMqt7hO7XNbCZPRBOAOtADNaAN8xeBQ1a'

# Save the credentials object to file
with open("twitter_credentials.json", "w") as file:
    json.dump(credentials, file)
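# The credentials file written above is read back in the collection step below.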


from twython import Twython
import json

# Load credentials from json file
with open("twitter_credentials.json", "r") as file:
    creds = json.load(file)
# Search area: latitude,longitude,radius (mi or km)
geocode = '28.6517178,77.2219388,1000mi'

# Instantiate a Twython client with the consumer key/secret
python_tweets = Twython(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])

# Create our query
keywords = "crime OR attempt -filter:retweets"
query = {'q': keywords,
         'count': 100,
         'lang': 'en',
         'geocode': geocode,
         }
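# Notes on the query parameters: "-filter:retweets" excludes retweets from the
# results, the standard search endpoint returns at most 100 tweets per request
# ('count' is capped at 100), and 'geocode' limits results to the radius set
# above (here roughly 1000 miles around central Delhi).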
import pandas as pd
# Search tweets
dict_ = {'user': [], 'date': [], 'text': [], 'user_loc': []}
for status in python_tweets.search(**query)['statuses']:
    dict_['user'].append(status['user']['screen_name'])
    dict_['date'].append(status['created_at'])
    dict_['text'].append(status['text'])
    dict_['user_loc'].append(status['user']['location'])
# Structure data in a pandas DataFrame for easier manipulation
df = pd.DataFrame(dict_)
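# Illustrative sketch (not part of the original script): a single search call
# returns at most one page of results. To collect more tweets, Twython's cursor
# helper can page through search results; something along these lines could
# replace the loop above:
#
#     for status in python_tweets.cursor(python_tweets.search, q=keywords,
#                                        count=100, lang='en', geocode=geocode):
#         dict_['user'].append(status['user']['screen_name'])
#         ...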

# Emoticon feature: list the emojis found in each tweet, then replace them with
# their textual names so they can be used as ordinary words downstream
import demoji
import emoji
demoji.download_codes()
for i in range(len(df)):
    print(demoji.findall(df['text'][i]))
    df.loc[i, 'text'] = emoji.demojize(df['text'][i], delimiters=("", ""))
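# For reference (illustrative, not from the original script):
#     emoji.demojize("caught on 📷", delimiters=("", ""))  ->  "caught on camera"
# so emoji end up as plain words that survive the alphabetic-only cleaning below.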

#Pre-process
import string
import re
import nltk
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')  # required for stopwords.words() below
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")
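# Illustrative examples of the stemmer's behaviour (not in the original script):
#     stemmer.stem("attempted") -> "attempt",  stemmer.stem("crimes") -> "crime"
# Note: stop_words is defined above but not applied in preprocess() below.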

def preprocess(tweet):
    """Strip URLs and @handles, keep alphabetic words of 3+ letters, and stem them."""
    cleaned_tweet = []

    words = tweet.split()
    for word in words:
        # Skip Hyperlinks and Twitter Handles @<user>
        if ('http' in word) or ('.com' in word) or ('www.' in word) or (word.startswith('@')):
            continue

        # Remove Digits and Special Characters
        temp = re.sub(f'[^{string.ascii_lowercase}]', '', word.lower())

        # Skip words with fewer than 3 characters
        if len(temp) < 3:
            continue

        # Store the Stemmed version of the word
        temp = stemmer.stem(temp)

        if len(temp) > 0:
            cleaned_tweet.append(temp)

    return ' '.join(cleaned_tweet)
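# Illustrative example of preprocess() (not part of the original script):
#     preprocess("Crime attempted near CP!!! https://t.co/abc by @someone")
#     -> "crime attempt near"
# (the URL and handle are dropped, short words are skipped, the rest is stemmed)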
# Clean every tweet; keep the results both as a list (for the vectorizers below)
# and as a new DataFrame column
cl = [preprocess(text) for text in df['text']]
df['clean_tweet'] = cl


# Load the trained classifiers and their matching fitted vectorizers
import pickle
import bz2
with bz2.BZ2File('All Model', 'r') as sfile1:
    models = pickle.load(sfile1)
with bz2.BZ2File('All Vector', 'r') as sfile2:
    vectorizers = pickle.load(sfile2)

names = ["K-Nearest Neighbors", "Linear SVM",
         "Decision Tree", "Random Forest",
         "ExtraTreesClassifier"]

# Vectorize the cleaned tweets with each model's vectorizer and store its predictions
for i in range(len(names)):
    test_vectors = vectorizers[i].transform(cl)
    df['Class ' + names[i]] = models[i].predict(test_vectors)

# Write the tweets and all model predictions to CSV
df.to_csv('tweets.csv')
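# Illustrative sketch (an assumption, not from the original project): archives such
# as 'All Model' and 'All Vector' can be produced by pickling lists of fitted
# scikit-learn estimators into bz2 files, e.g.:
#
#     with bz2.BZ2File('All Model', 'w') as f:
#         pickle.dump(list_of_trained_classifiers, f)   # hypothetical variable
#     with bz2.BZ2File('All Vector', 'w') as f:
#         pickle.dump(list_of_fitted_vectorizers, f)    # hypothetical variable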