File size: 6,510 Bytes
bd8cd5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8dddacd
bd8cd5c
8dddacd
bd8cd5c
8dddacd
bd8cd5c
8dddacd
bd8cd5c
8dddacd
bd8cd5c
e051aef
bd8cd5c
 
 
 
 
8073ee5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from sentence_transformers import SentenceTransformer, util
import torch
import difflib
from utils.GetDB import GetDB

# Module-level connection pool shared by all functions below; getconn()/putconn()
# usage suggests a psycopg2-style pool — NOTE(review): confirm against GetDB.
postgreSQL_pool = GetDB().get_db_connection()

# Shared sentence-embedding model, loaded once at import time (expensive).
embedder = SentenceTransformer('all-MiniLM-L6-v2')


def get_question():
    """Fetch every question from chat_history, newest first.

    Returns:
        list[str]: question texts ordered by created_at descending.
    """
    conn = postgreSQL_pool.getconn()
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT question FROM chat_history ORDER BY created_at DESC")
            # Each row is a 1-tuple; unwrap to plain strings.
            return [row[0] for row in cur.fetchall()]
        finally:
            # Was commented out in the original: the cursor leaked on every call.
            cur.close()
    finally:
        # Always hand the connection back to the pool, even if the query raised.
        postgreSQL_pool.putconn(conn)


def count_top_questions(questions_array):
    """Count, for each question, how many OTHER questions are near-duplicates.

    Two questions are considered duplicates when the cosine similarity of
    their sentence embeddings is >= 0.8.

    Args:
        questions_array: list[str] of raw question texts.

    Returns:
        list[tuple[str, int]]: up to 50 (question, duplicate_count) pairs,
        sorted by duplicate_count descending.
    """
    corpus_embeddings = embedder.encode(questions_array, convert_to_tensor=True)

    top_questions_array = {}

    # k must not exceed the corpus size or torch.topk raises RuntimeError
    # (the original hard-coded k=100 and crashed on corpora of < 100 questions).
    k = min(100, len(questions_array))

    for i, question in enumerate(questions_array):
        # Reuse the row already computed in corpus_embeddings instead of
        # re-encoding the question a second time. This also guarantees the
        # self-similarity is exactly 1.0, so the top-1 hit is the question itself.
        query_embedding = corpus_embeddings[i:i + 1]

        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=k)

        # Skip index 0 (the question matched against itself).
        counter = sum(
            1 for score in top_results[0][1:] if score.item() >= 0.8
        )

        top_questions_array[question] = counter

    # Dict keys already collapse exact-duplicate question strings.
    return sorted(top_questions_array.items(), key=lambda x: x[1], reverse=True)[:50]


def remove_redundancy(redundant_raw_top_asked_questions):
    """Drop textually near-duplicate questions, keeping the first occurrence.

    A candidate is discarded when its difflib similarity ratio against an
    already-kept question falls in [0.7, 1.0). Exact duplicates (ratio == 1.0)
    are intentionally kept, matching the original threshold.

    The original implementation removed elements from the list WHILE iterating
    it, which silently skips elements; this version builds a new list instead
    and no longer mutates its argument.

    Args:
        redundant_raw_top_asked_questions: list of (question, count) tuples.

    Returns:
        list of (question, count) tuples with near-duplicates removed.
    """
    kept = []
    for candidate in redundant_raw_top_asked_questions:
        is_duplicate = False
        for existing in kept:
            ratio = difflib.SequenceMatcher(
                None, candidate[0], existing[0]).ratio()
            if 0.7 <= ratio < 1.0:
                is_duplicate = True
                break
        if not is_duplicate:
            kept.append(candidate)
    return kept


def remove_greetings(sanitised_questions_array):
    """Strip greeting-like entries from the leading ten ranked questions.

    Each of the first ten (question, count) tuples is embedded and compared
    against a fixed set of greeting phrases; entries whose best cosine
    similarity reaches 0.87 are removed from the list in place.

    Args:
        sanitised_questions_array: list of (question, count) tuples.

    Returns:
        The same list with greeting-like entries removed.
    """
    greeting_array = ['hey', 'hi', 'hello', "Hello!",
                      "Hi there!",
                      "Hey!",
                      "Good morning!",
                      "Good afternoon!",
                      "Good evening!",
                      "Howdy!",
                      "Greetings!",
                      "Nice to see you!",
                      "What's up?",
                      "Hi!",
                      "hiiii",
                      "Hello!",
                      "Hey!", "How are you?",
                      "What is your name?",
                      "Where are you from?",
                      "What do you do?",
                      "How can I help you?",
                      "What's the weather like?",
                      "Do you have any plans for the weekend?",
                      "Have you seen any good movies lately?",
                      "What's your favorite food?",
                      "What are your hobbies?", "hi, hello"]

    greetings_embeddings = embedder.encode(greeting_array, convert_to_tensor=True)

    # Iterate a copy of the leading ten entries so removing from the
    # underlying list is safe.
    for candidate in list(sanitised_questions_array[:10]):
        candidate_embedding = embedder.encode([candidate[0]], convert_to_tensor=True)

        similarity_scores = util.cos_sim(candidate_embedding, greetings_embeddings)[0]
        # Best match against any greeting (equivalent to topk with k=1).
        best_score = torch.max(similarity_scores).item()

        if best_score >= 0.87:
            sanitised_questions_array.remove(candidate)

    return sanitised_questions_array


def final_phase_filtering(raw_first_phase_filtered_questions, limit=20):
    """Remove semantically near-duplicate questions among the top `limit`.

    A candidate is discarded when its embedding cosine similarity against an
    already-kept question is in [0.85, ~1.0); the odd upper bound (float32
    self-similarity can exceed 1.0) is preserved from the original so that
    exact-text duplicates remain, as before.

    Fixes two defects in the original: it removed elements from the list
    WHILE both loops iterated it (skipping elements unpredictably), and it
    re-encoded every question O(n^2) times.

    Args:
        raw_first_phase_filtered_questions: list of (question, count) tuples.
        limit: number of leading entries to consider (default 20).

    Returns:
        A new list of (question, count) tuples with near-duplicates removed.
    """
    candidates = raw_first_phase_filtered_questions[:limit]

    # Encode each question exactly once.
    embeddings = [embedder.encode(candidate[0]) for candidate in candidates]

    kept = []
    kept_embeddings = []
    for candidate, emb in zip(candidates, embeddings):
        is_duplicate = any(
            0.85 <= util.cos_sim(emb, kept_emb).item() < 1.0000001192092896
            for kept_emb in kept_embeddings
        )
        if not is_duplicate:
            kept.append(candidate)
            kept_embeddings.append(emb)

    return kept


def return_top_question(limit=5):
    """Build a human-readable summary of the most-asked questions.

    Runs the full pipeline: fetch questions from the DB, count duplicates,
    remove textual redundancy, drop greetings, then apply the final semantic
    de-duplication pass.

    Args:
        limit: maximum number of questions to include (default 5).

    Returns:
        str: a multi-line message listing the top questions.
    """
    questions = get_question()
    counted = count_top_questions(questions)
    deduplicated = remove_redundancy(counted)
    without_greetings = remove_greetings(deduplicated)
    # Removed the leftover debug print() calls from the original.
    top_questions = final_phase_filtering(without_greetings)[:limit]

    message = 'These are the top questions asked on the ask twimbit/platform by the users:'
    for rank, entry in enumerate(top_questions, start=1):
        message = message + '\n {}: '.format(rank) + entry[0]

    return message


def return_recent_posts(limit=5, strategy='recent'):
    """Fetch recent posts from the Hasura feed and format them as a message.

    Args:
        limit: maximum number of posts to fetch (default 5).
        strategy: feed strategy passed to the GraphQL query (default 'recent').

    Returns:
        str: a multi-line message listing the posts; the header alone if the
        request fails or the payload carries no hits (preserving the original
        best-effort behavior on non-200 responses).

    Raises:
        KeyError: if HASURA_URL or HASURA_ADMIN_SECRET is not set (fail fast).
    """
    import os
    import requests

    url = os.environ['HASURA_URL']
    admin_secret = os.environ['HASURA_ADMIN_SECRET']

    body = """query homeFeedQuery($strategy: Strategy, $limit: Int){
feed(strategy: $strategy, limit: $limit) {
hits {
link
title
date
author
        }
    }
}"""
    variables = {'strategy': strategy, 'limit': limit}
    response = requests.post(url=url, json={'query': body, 'variables': variables}, headers={
        'x-hasura-admin-secret': admin_secret})

    message = 'These are the recent Articles/Posts on the platform/twimbit website: \n'

    if response.status_code == 200:
        payload = response.json()
        # GraphQL error payloads have data == null; the original chained
        # .get() calls AttributeError'd on them. Guard each level instead.
        posts = ((payload.get('data') or {}).get('feed') or {}).get('hits') or []

        for key, post in enumerate(posts):
            title = post.get('title')
            link = post.get('link')
            date = post.get('date')
            # Guard against a missing author list (original crashed on None).
            authors = ','.join(post.get('author') or [])
            message += 'Post/Article {}:- \n\tPost/Article Title:- {}\n\tPost/Article Link/URL:- {}\n\tPost/Article Publish Date:- {}\n\tPost/Article Author:- {}\n'.format(
                key + 1, title, link, date, authors)

    return message