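"""Utilities for the ask twimbit Space: surface the most frequently asked
questions from the chat_history table and fetch recent posts from the
platform feed via Hasura."""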
from sentence_transformers import SentenceTransformer, util
import torch
import difflib
import os
import json
import requests
from utils.GetDB import GetDB

postgreSQL_pool = GetDB().get_db_connection()
embedder = SentenceTransformer('all-MiniLM-L6-v2')


def get_question():
    # Get a connection to the PostgreSQL database from the pool
    conn = postgreSQL_pool.getconn()
    # Create a cursor object
    cur = conn.cursor()
    # Fetch every question from the "chat_history" table, newest first
    cur.execute("SELECT question FROM chat_history ORDER BY created_at DESC")
    # Fetch all the results as a list of tuples and flatten to a list of strings
    results = cur.fetchall()
    results = [x[0] for x in results]
    # Close the cursor and return the connection to the pool
    cur.close()
    postgreSQL_pool.putconn(conn)
    return results


def count_top_questions(questions_array):
    # Embed the full question corpus once
    corpus_embeddings = embedder.encode(questions_array, convert_to_tensor=True)
    top_questions_array = {}
    # topk cannot request more results than there are questions
    k = min(100, len(questions_array))
    for question in questions_array:
        query_embedding = embedder.encode([question], convert_to_tensor=True)
        # Use cosine similarity and torch.topk to find the k most similar questions
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=k)
        counter = 0
        # Skip the first hit (the question matching itself) and count near-duplicates
        for score, idx in zip(top_results[0][1:], top_results[1][1:]):
            if score.item() >= 0.8:
                counter += 1
        top_questions_array[question] = counter
    # Return the 50 questions with the most similar neighbours, most frequent first
    return sorted(top_questions_array.items(), key=lambda x: x[1], reverse=True)[:50]


def remove_redundancy(redundant_raw_top_asked_questions):
    # Drop near-duplicates by fuzzy string match; iterate over copies so removals are safe
    for raw_top_asked_question in list(redundant_raw_top_asked_questions):
        if raw_top_asked_question not in redundant_raw_top_asked_questions:
            continue
        for raw_top_asked_question_inner in list(redundant_raw_top_asked_questions):
            if raw_top_asked_question_inner is raw_top_asked_question:
                continue
            matching_ratio = difflib.SequenceMatcher(None, raw_top_asked_question_inner[0],
                                                     raw_top_asked_question[0]).ratio()
            if 0.7 <= matching_ratio < 1.0:
                redundant_raw_top_asked_questions.remove(raw_top_asked_question_inner)
    return redundant_raw_top_asked_questions


def remove_greetings(sanitised_questions_array):
    # Common greetings and small-talk phrases used to filter out non-questions
    greeting_array = ['hey', 'hi', 'hello', "Hello!", "Hi there!", "Hey!",
                      "Good morning!", "Good afternoon!", "Good evening!",
                      "Howdy!", "Greetings!", "Nice to see you!", "What's up?",
                      "Hi!", "hiiii", "Hello!", "Hey!", "How are you?",
                      "What is your name?", "Where are you from?",
                      "What do you do?", "How can I help you?",
                      "What's the weather like?",
                      "Do you have any plans for the weekend?",
                      "Have you seen any good movies lately?",
                      "What's your favorite food?",
                      "What are your hobbies?", "hi, hello"]
    greetings_embeddings = embedder.encode(greeting_array, convert_to_tensor=True)
    # Check the top 10 candidates; iterating over a slice (a copy) keeps the
    # removals from the original list safe
    for raw_top_asked_question in sanitised_questions_array[:10]:
        query_embedding = embedder.encode([raw_top_asked_question[0]], convert_to_tensor=True)
        cos_scores = util.cos_sim(query_embedding, greetings_embeddings)[0]
        top_results = torch.topk(cos_scores, k=1)
        for score, idx in zip(top_results[0], top_results[1]):
            # Drop the question if it is too similar to a greeting
            if score.item() >= 0.87:
                sanitised_questions_array.remove(raw_top_asked_question)
    return sanitised_questions_array


def final_phase_filtering(raw_first_phase_filtered_questions, limit=20):
    raw_first_phase_filtered_questions = raw_first_phase_filtered_questions[:limit]
    # Second semantic de-duplication pass; iterate over copies so removals are safe
    for raw_first_phase_filtered_question in list(raw_first_phase_filtered_questions):
        if raw_first_phase_filtered_question not in raw_first_phase_filtered_questions:
            continue
        emb1 = embedder.encode(raw_first_phase_filtered_question[0], convert_to_tensor=True)
        for raw_first_phase_filtered_question_inner in list(raw_first_phase_filtered_questions):
            # Never compare a question against itself
            if raw_first_phase_filtered_question_inner is raw_first_phase_filtered_question:
                continue
            emb2 = embedder.encode(raw_first_phase_filtered_question_inner[0], convert_to_tensor=True)
            cos_sim = util.cos_sim(emb1, emb2)
            if cos_sim.item() >= 0.85:
                raw_first_phase_filtered_questions.remove(raw_first_phase_filtered_question_inner)
    return raw_first_phase_filtered_questions


def return_top_question(limit=5):
    # Run the full pipeline: fetch -> count -> de-duplicate -> drop greetings -> final filter
    questions = get_question()
    print('fetched questions')
    count_top_questions_ = count_top_questions(questions)
    print('counted top questions')
    remove_redundancy_ = remove_redundancy(count_top_questions_)
    print('removed redundancy')
    remove_greetings_ = remove_greetings(remove_redundancy_)
    print('removed greetings')
    final_phase_filtering_ = final_phase_filtering(remove_greetings_)[:limit]
    print('final phase filtering done')
    message = 'These are the top questions asked on ask twimbit/the platform by the users:'
    for key, top_question in enumerate(final_phase_filtering_):
        message = message + '\n {}: '.format(key + 1) + top_question[0]
    return message


def return_recent_posts(limit=5, strategy='recent'):
    HASURA_URL = os.environ['HASURA_URL']
    HASURA_ADMIN_SECRET = os.environ['HASURA_ADMIN_SECRET']
    url = HASURA_URL
    # GraphQL query against the Hasura feed endpoint
    body = """query homeFeedQuery($strategy: Strategy, $limit: Int){
        feed(strategy: $strategy, limit: $limit) {
            hits {
                link
                title
                date
                author
            }
        }
    }"""
    variables = {'strategy': strategy, 'limit': limit}
    response = requests.post(url=url, json={'query': body, 'variables': variables}, headers={
        'x-hasura-admin-secret': HASURA_ADMIN_SECRET})
    message = 'These are the recent Articles/Posts on the platform/twimbit website: \n'
    if response.status_code == 200:
        data = json.loads(response.content)
        posts = data.get('data').get('feed').get('hits')
        for key, post in enumerate(posts):
            title = post.get('title')
            link = post.get('link')
            date = post.get('date')
            authors = ','.join(post.get('author'))
            message += 'Post/Article {}:- \n\tPost/Article Title:- {}\n\tPost/Article Link/URL:- {}\n\t' \
                       'Post/Article Publish Date:- {}\n\tPost/Article Author:- {}\n'.format(
                           key + 1, title, link, date, authors)
    return message
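

# A minimal usage sketch (not part of the original file): both entry points can be
# exercised locally, assuming the PostgreSQL pool is reachable, the chat_history
# table is populated, and the HASURA_URL / HASURA_ADMIN_SECRET environment
# variables are set.
if __name__ == '__main__':
    print(return_top_question(limit=5))
    print(return_recent_posts(limit=5, strategy='recent'))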