import contractions
import spacy
import nltk
import pickle
import re
import subprocess
import pandas as pd

from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from keras_preprocessing.sequence import pad_sequences

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Install the spaCy English model wheel directly from Hugging Face, then load it.
model_url = "https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl"
subprocess.run(["pip", "install", model_url], check=True)
nlp = spacy.load("en_core_web_sm")

stop_words = set(stopwords.words('english'))


# Convert a single string into a padded integer sequence for the model.
def text_transform(string_text):
    with open('model/tokenizer.pickle', 'rb') as handle:
        loaded_tokenizer = pickle.load(handle)
    sequences = loaded_tokenizer.texts_to_sequences([string_text])
    padded_sequences = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')
    return padded_sequences


# Alternative model install: python -m spacy download en_core_web_sm


# Pre-process the text by keeping verbs, adjectives, and adverbs, since the
# emotion of a sentence depends mainly on these parts of speech.
def get_main_words(string_text):
    tokens = nltk.word_tokenize(string_text)
    pos_tags = nltk.pos_tag(tokens)

    # POS tags that tend to carry the emotional content of a sentence.
    emotion_tags = {'JJR', 'VB', 'WP', 'WRB', 'NNS', 'JJS', 'JJ', 'RB', 'MD', 'VBZ', 'VBG', 'VBP'}

    string_list = [token for token, tag in pos_tags if tag in emotion_tags]

    if string_list:
        return ' '.join(string_list)
    return None


# Complex pre-processing: lemmatize, expand contractions, then keep only the
# emotion-bearing POS tokens; fall back to stop-word removal if none remain.
def pre_processing_data_2(string_text):
    string_text = string_text.lower()
    string_output = ' '.join([token.lemma_ for token in nlp(string_text)])
    string_output = contractions.fix(string_output)

    string_processed = get_main_words(string_output)
    if string_processed:
        # Strip punctuation and rejoin the emotion-bearing tokens.
        tokenizer = RegexpTokenizer(r'\w+')
        string_processed = tokenizer.tokenize(string_processed)
        return " ".join(string_processed)

    # No emotion-bearing tokens were found: fall back to stop-word filtering.
    tokenizer = RegexpTokenizer(r'\w+')
    string_output = tokenizer.tokenize(string_output)
    string_output = [w for w in string_output if w not in stop_words]
    return " ".join(string_output)


# Simple pre-processing: lemmatize, expand contractions, strip punctuation and stop words.
def preprocessing_data(string_text):
    string_text = string_text.lower()
    string_output = ' '.join([token.lemma_ for token in nlp(string_text)])
    string_output = contractions.fix(string_output)

    tokenizer = RegexpTokenizer(r'\w+')
    string_output = tokenizer.tokenize(string_output)
    string_output = [w for w in string_output if w not in stop_words]
    return " ".join(string_output)


# Append the user's input, the predicted emotion, and a timestamp to the CSV log.
def user_capture(user_input, emotion_prd):
    dataframe_capture = pd.read_csv('user_logs.csv')
    user_input_logs = pd.DataFrame({
        "user_input": [user_input],
        "emotion_predict": [emotion_prd],
        "time_logs": [datetime.now()],
    })

    dataframe_capture = pd.concat([dataframe_capture, user_input_logs], ignore_index=True)
    dataframe_capture.to_csv("user_logs.csv", index=False)
    print("User input recorded")
    return None
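

# A minimal usage sketch, assuming the tokenizer pickle exists at
# model/tokenizer.pickle; the sample sentence is purely illustrative.
# It shows how the pre-processing and tokenizing steps chain together before
# the padded sequence is handed to a downstream emotion classifier.
if __name__ == "__main__":
    sample = "I can't believe how wonderful this day has been!"
    cleaned = pre_processing_data_2(sample)
    model_input = text_transform(cleaned)
    print("Cleaned text:", cleaned)
    print("Model input shape:", model_input.shape)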