Spaces:
Runtime error
Runtime error
File size: 6,510 Bytes
bd8cd5c 8dddacd bd8cd5c 8dddacd bd8cd5c 8dddacd bd8cd5c 8dddacd bd8cd5c 8dddacd bd8cd5c e051aef bd8cd5c 8073ee5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
from sentence_transformers import SentenceTransformer, util
import torch
import difflib
from utils.GetDB import GetDB
# Module-level singletons shared by every function in this file:
# a PostgreSQL connection pool (project helper; presumably psycopg2-style
# getconn/putconn — confirm against utils.GetDB) and the MiniLM sentence
# embedding model used for all semantic-similarity scoring below.
postgreSQL_pool = GetDB().get_db_connection()
embedder = SentenceTransformer('all-MiniLM-L6-v2')
def get_question():
    """Fetch every question from chat_history, newest first.

    Returns:
        list[str]: the raw question strings.
    """
    conn = postgreSQL_pool.getconn()
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT question FROM chat_history ORDER BY created_at DESC")
            # fetchall() yields 1-tuples; unwrap to plain strings.
            return [row[0] for row in cur.fetchall()]
        finally:
            # Was commented out in the original; cursors should always be closed.
            cur.close()
    finally:
        # Return the connection to the pool even when the query raises,
        # otherwise every error permanently leaks one pooled connection.
        postgreSQL_pool.putconn(conn)
def count_top_questions(questions_array):
    """Count, for each question, how many OTHER questions are semantically similar.

    Similarity means cosine similarity of MiniLM embeddings >= 0.8; a
    question's best match against the corpus is itself, so index 0 of the
    top-k results is skipped.

    Args:
        questions_array: list of question strings.

    Returns:
        list[tuple[str, int]]: up to 50 (question, similar_count) pairs,
        sorted by count descending.
    """
    if not questions_array:
        return []
    corpus_embeddings = embedder.encode(questions_array, convert_to_tensor=True)
    # torch.topk raises if k exceeds the corpus size; the original
    # hard-coded k=100 and crashed with fewer than 100 stored questions.
    k = min(100, len(questions_array))
    top_questions_array = {}
    for question in questions_array:
        query_embedding = embedder.encode([question], convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        scores, _ = torch.topk(cos_scores, k=k)
        # Skip scores[0]: that is the question matched against itself (~1.0).
        top_questions_array[question] = sum(
            1 for score in scores[1:] if score.item() >= 0.8
        )
    return sorted(top_questions_array.items(), key=lambda x: x[1], reverse=True)[:50]
def remove_redundancy(redundant_raw_top_asked_questions):
    """Drop near-duplicate questions, keeping the first occurrence of each.

    Two questions are near-duplicates when their difflib similarity ratio
    falls in [0.7, 1.0) — very similar but not byte-identical (exact
    duplicates are deliberately kept, matching the original threshold).

    The original removed items from the list while iterating it with two
    nested loops over the same list, which silently skips elements; this
    version builds a new list instead.

    Args:
        redundant_raw_top_asked_questions: list of (question, count) tuples.

    Returns:
        list[tuple[str, int]]: filtered tuples, original order preserved.
    """
    kept = []
    for candidate in redundant_raw_top_asked_questions:
        is_near_duplicate = any(
            0.7 <= difflib.SequenceMatcher(None, candidate[0], keeper[0]).ratio() < 1.0
            for keeper in kept
        )
        if not is_near_duplicate:
            kept.append(candidate)
    return kept
def remove_greetings(sanitised_questions_array):
    """Remove greeting/small-talk entries from the top of the question list.

    Compares each of the first 10 (question, count) entries against a
    reference set of greeting/chitchat phrases; an entry whose best cosine
    similarity reaches 0.87 is removed from the list in place.

    Args:
        sanitised_questions_array: list of (question, count) tuples.

    Returns:
        the same list object, with greeting-like entries removed.
    """
    greeting_array = ['hey', 'hi', 'hello', "Hello!",
                      "Hi there!",
                      "Hey!",
                      "Good morning!",
                      "Good afternoon!",
                      "Good evening!",
                      "Howdy!",
                      "Greetings!",
                      "Nice to see you!",
                      "What's up?",
                      "Hi!",
                      "hiiii",
                      "Hello!",
                      "Hey!", "How are you?",
                      "What is your name?",
                      "Where are you from?",
                      "What do you do?",
                      "How can I help you?",
                      "What's the weather like?",
                      "Do you have any plans for the weekend?",
                      "Have you seen any good movies lately?",
                      "What's your favorite food?",
                      "What are your hobbies?", "hi, hello"]
    greetings_embeddings = embedder.encode(greeting_array, convert_to_tensor=True)
    # Iterate over a snapshot of the current top 10 so removing from the
    # underlying list cannot skip elements mid-iteration.
    for entry in sanitised_questions_array[:10]:
        query_embedding = embedder.encode([entry[0]], convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, greetings_embeddings)[0]
        # Only the single best match matters; torch.max replaces the
        # original topk(k=1) plus a one-iteration zip loop.
        if torch.max(cos_scores).item() >= 0.87:
            sanitised_questions_array.remove(entry)
    return sanitised_questions_array
def final_phase_filtering(raw_first_phase_filtered_questions, limit=20):
    """Final semantic dedup pass over the top `limit` questions.

    Keeps the first occurrence of each semantic cluster: a question is
    dropped when its embedding's cosine similarity with an already-kept
    question is >= 0.85.

    The original removed list items while iterating the same list (which
    skips elements) and re-encoded both questions for every pair —
    O(n^2) encode calls; this encodes each question once and builds a new
    list, making the float-epsilon self-match threshold unnecessary.

    Args:
        raw_first_phase_filtered_questions: list of (question, count) tuples.
        limit: how many leading entries to consider (default 20).

    Returns:
        list[tuple[str, int]]: deduplicated entries, original order preserved.
    """
    candidates = raw_first_phase_filtered_questions[:limit]
    kept = []
    kept_embeddings = []
    for candidate in candidates:
        embedding = embedder.encode(candidate[0])
        is_duplicate = any(
            util.cos_sim(embedding, previous).item() >= 0.85
            for previous in kept_embeddings
        )
        if not is_duplicate:
            kept.append(candidate)
            kept_embeddings.append(embedding)
    return kept
def return_top_question(limit=5):
    """Build a human-readable summary of the most-asked questions.

    Runs the full pipeline — fetch from DB, count semantically similar
    questions, drop near-duplicates, drop greetings, final semantic
    filter — then formats the top `limit` survivors as a numbered list.

    Args:
        limit: maximum number of questions to include (default 5).

    Returns:
        str: header line followed by one numbered question per line.
    """
    questions = get_question()
    print('questions')
    counted = count_top_questions(questions)
    print('count_top_questions_')
    deduplicated = remove_redundancy(counted)
    print('remove_redundancy_')
    without_greetings = remove_greetings(deduplicated)
    print('remove_greetings_')
    top_entries = final_phase_filtering(without_greetings)[:limit]
    print('final_phase_filtering_')
    message = 'These are the top questions asked on the ask twimbit/platform by the users:'
    for rank, entry in enumerate(top_entries, start=1):
        message = message + '\n {}: '.format(rank) + entry[0]
    return message
def return_recent_posts(limit=5, strategy='recent'):
    """Fetch recent posts from the Hasura GraphQL feed and format them.

    Args:
        limit: maximum number of posts to request.
        strategy: feed strategy passed to the GraphQL query.

    Returns:
        str: a header line followed by one formatted entry per post.
        On a non-200 response, or a GraphQL error payload, only the
        header is returned — deliberately best-effort, as in the original.
    """
    import os
    import requests

    # Raises KeyError when unset: fail fast on misconfiguration.
    HASURA_URL = os.environ['HASURA_URL']
    HASURA_ADMIN_SECRET = os.environ['HASURA_ADMIN_SECRET']
    body = """query homeFeedQuery($strategy: Strategy, $limit: Int){
    feed(strategy: $strategy, limit: $limit) {
    hits {
    link
    title
    date
    author
    }
    }
    }"""
    variables = {'strategy': strategy, 'limit': limit}
    response = requests.post(
        url=HASURA_URL,
        json={'query': body, 'variables': variables},
        headers={'x-hasura-admin-secret': HASURA_ADMIN_SECRET},
    )
    message = 'These are the recent Articles/Posts on the platform/twimbit website: \n'
    if response.status_code == 200:
        payload = response.json()
        # Hasura returns HTTP 200 with "data": null on GraphQL errors; the
        # original's chained .get() calls raised AttributeError there.
        hits = ((payload.get('data') or {}).get('feed') or {}).get('hits') or []
        for key, post in enumerate(hits):
            title = post.get('title')
            link = post.get('link')
            date = post.get('date')
            # Guard missing author list so join() cannot crash on None.
            authors = ','.join(post.get('author') or [])
            message += 'Post/Article {}:- \n\tPost/Article Title:- {}\n\tPost/Article Link/URL:- {}\n\tPost/Article Publish Date:- {}\n\tPost/Article Author:- {}\n'.format(
                key + 1, title, link, date, authors)
    return message
|