# aristotle-api / search.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import sys

# Set the maximum recursion depth to 10000
sys.setrecursionlimit(10000)

# Define the FastAPI app
app = FastAPI()

# Add the CORS middleware to the app
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
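# Note: allow_origins=["*"] lets any site call this API from the browser;
# pinning it to the frontend's origin would be stricter.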
@app.get("/search={query}&similarity={similarity}")
def search(query, similarity=False):
import time
import requests
start_time = time.time()
# Set the API endpoint and query parameters
url = "https://www.googleapis.com/books/v1/volumes"
params = {"q": str(query), "printType": "books", "maxResults": 30}
# Send a GET request to the API with the specified parameters
response = requests.get(url, params=params)
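    # Each returned item is expected to look roughly like (partial sketch,
    # inferred from the fields accessed below):
    #   {"volumeInfo": {"title": ..., "subtitle": ..., "authors": [...],
    #     "publisher": ..., "description": ..., "imageLinks": {"thumbnail": ...}}}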
    # Initialize the lists to store the results
    titles = []
    authors = []
    publishers = []
    descriptions = []
    images = []

    # Parse the response JSON and append the results
    data = response.json()

    # "items" is absent entirely when the query has no hits, hence the fallback
    for item in data.get("items", []):
        volume_info = item["volumeInfo"]

        # Prefer "title: subtitle" when a subtitle exists
        try:
            titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
        except KeyError:
            titles.append(volume_info["title"])

        try:
            descriptions.append(volume_info["description"])
        except KeyError:
            descriptions.append("Null")

        try:
            publishers.append(volume_info["publisher"])
        except KeyError:
            publishers.append("Null")

        # An empty author list raises IndexError, not KeyError, so catch both
        try:
            authors.append(volume_info["authors"][0])
        except (KeyError, IndexError):
            authors.append("Null")

        try:
            images.append(volume_info["imageLinks"]["thumbnail"])
        except KeyError:
            images.append(
                "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
            )
    ### Openalex ###
    import pyalex
    from pyalex import Works

    # Add email to the config (enables OpenAlex's polite pool)
    pyalex.config.email = "ber2mir@gmail.com"

    # Define a pager object with the same query; paginate() yields pages,
    # so with per_page=10 and n_max=10 the list below holds a single page
    pager = Works().search(str(query)).paginate(per_page=10, n_max=10)

    # Generate a list of the results
    openalex_results = list(pager)

    # Get the titles, descriptions, and publishers and append them to the lists
    for result in openalex_results[0]:
        try:
            titles.append(result["title"])
        except KeyError:
            titles.append("Null")

        try:
            descriptions.append(result["abstract"])
        except KeyError:
            descriptions.append("Null")

        try:
            publishers.append(result["host_venue"]["publisher"])
        except KeyError:
            publishers.append("Null")

        # Catch IndexError as well: "authorships" may be an empty list
        try:
            authors.append(result["authorships"][0]["author"]["display_name"])
        except (KeyError, IndexError):
            authors.append("Null")

        # OpenAlex has no cover art, so fall back to the placeholder image
        images.append(
            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
        )
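    # Note: pyalex rebuilds "abstract" from OpenAlex's abstract_inverted_index
    # and returns None when no abstract is indexed; the summarization step
    # below guards against that with its `is not None` check.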
    ### OpenAI ###
    import os
    import openai

    # Read the OpenAI API key from the environment instead of hardcoding it
    openai.api_key = os.environ["OPENAI_API_KEY"]

    # Create ChatGPT query; the keys in the requested format are capitalized
    # to match the parse_result() keys below
    chatgpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a librarian. You are helping a patron find a book.",
            },
            {
                "role": "user",
                "content": f"Recommend me 10 books about {query}. Your response should be like: 'Title: <title>, Author: <author>, Publisher: <publisher>, Summary: <summary>'",
            },
        ],
    )

    # Split the response into a list of results
    chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split("\n")[
        2::2
    ]
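    # The [2::2] slice assumes the reply has a short preamble followed by
    # blank-line-separated entries; a differently formatted reply would need
    # a different slice or more robust parsing.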
    # Define a function to parse one "Key: value, Key: value" result line
    def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
        # Create a dict to store the key-value pairs
        parsed_result = {}

        for key in ordered_keys:
            # Split the result string on the key and keep the value up to the
            # next comma; the last key (Summary) keeps the rest of the line
            if key != ordered_keys[-1]:
                parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
            else:
                parsed_result[key] = result.split(f"{key}: ")[1]

        return parsed_result
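    # For example (hypothetical reply line):
    #   parse_result("Title: Dune, Author: Frank Herbert, Publisher: Chilton, Summary: A desert epic")
    #   -> {"Title": "Dune", "Author": "Frank Herbert",
    #       "Publisher": "Chilton", "Summary": "A desert epic"}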
    ordered_keys = ["Title", "Author", "Publisher", "Summary"]

    for result in chatgpt_results:
        # Parse the result
        parsed_result = parse_result(result, ordered_keys=ordered_keys)

        # Append the parsed result to the lists
        titles.append(parsed_result["Title"])
        authors.append(parsed_result["Author"])
        publishers.append(parsed_result["Publisher"])
        descriptions.append(parsed_result["Summary"])
        images.append(
            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
        )
    ### Prediction ###
    # flair is only needed by the commented-out classifier below
    # from flair.models import TextClassifier
    # from flair.data import Sentence
    # from flair.tokenization import SegtokTokenizer
    from transformers import (
        AutoTokenizer,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        pipeline,
    )
    from sentence_transformers import SentenceTransformer, CrossEncoder
    from sentence_transformers.util import cos_sim, dot_score
    from optimum.onnxruntime import (
        ORTModelForSeq2SeqLM,
        ORTModelForSequenceClassification,
    )
    from optimum.pipelines import pipeline as optimum_pipeline

    # Load the classifiers
    # classifier = TextClassifier.load(
    #     "trainers/deberta-v3-base-tasksource-nli/best-model.pt"
    # )
    # sentence_transformer = SentenceTransformer("all-MiniLM-L12-v2")
    # cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"{title} {description} {publisher}"
        for title, description, publisher in zip(titles, descriptions, publishers)
    ]

    # Prepare the Sentence object
    # sentences = [
    #     Sentence(doc, use_tokenizer=SegtokTokenizer()) for doc in combined_data
    # ]

    # Classify the sentences
    # classifier.predict(sentences)

    # Get the predicted labels
    # classes = [sentence.labels for sentence in sentences]
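    # Each combined_data entry above is a single "title description publisher"
    # string; these strings feed both the zero-shot classifier and, when
    # requested, the similarity search further down.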
    # Define the summarizer model and tokenizer
    sum_tokenizer = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
    sum_model_quantized = ORTModelForSeq2SeqLM.from_pretrained(
        "trainers/bart-base-samsum-quantized"
    )
    # sum_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")

    summarizer_pipeline = optimum_pipeline(
        "summarization",
        model=sum_model_quantized,
        tokenizer=sum_tokenizer,
        batch_size=64,
    )

    # Define the zero-shot classifier
    zs_tokenizer = AutoTokenizer.from_pretrained(
        "sileod/deberta-v3-base-tasksource-nli"
    )
    # Quickfix for the tokenizer
    # zs_tokenizer.model_input_names = ["input_ids", "attention_mask"]
    zs_model = AutoModelForSequenceClassification.from_pretrained(
        "sileod/deberta-v3-base-tasksource-nli"
    )
    zs_classifier = pipeline(
        "zero-shot-classification",
        model=zs_model,
        tokenizer=zs_tokenizer,
        batch_size=64,
        hypothesis_template="This book is {}.",
        multi_label=True,
    )
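    # The zero-shot pipeline yields dicts of the form {"sequence": ...,
    # "labels": [...], "scores": [...]} with labels sorted by descending
    # score; the results assembly at the bottom takes the top two of each.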
    # Summarize the descriptions, truncating each to the first 1024
    # characters to keep inputs short for the model
    summaries = [
        summarizer_pipeline(description[0:1024])
        if description is not None
        else [{"summary_text": "Null"}]
        for description in descriptions
    ]

    # Predict the level of the book
    candidate_labels = [
        "Introductory",
        "Advanced",
        "Academic",
        "Not Academic",
        "Manual",
    ]

    # Get the predicted labels
    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]

    # Calculate the elapsed time
    end_time = time.time()
    runtime = f"{end_time - start_time:.2f} seconds"
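    # Note: zs_classifier is invoked once per document in the comprehension
    # above, so batch_size=64 has little effect there; passing the whole
    # combined_data list in a single call would enable true batching.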
    # Calculate the similarity between the books
    if similarity:
        from sentence_transformers import util

        sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
        book_embeddings = sentence_transformer.encode(
            combined_data, convert_to_tensor=True
        )

        similar_books = []
        for i in range(len(titles)):
            current_embedding = book_embeddings[i]

            # Rank every book against this one; drop the first hit, which is
            # the book itself
            similarity_sorted = util.semantic_search(
                current_embedding, book_embeddings, top_k=20
            )

            similar_books.append(
                {
                    "sorted_by_similarity": similarity_sorted[0][1:],
                }
            )
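    # util.semantic_search returns, for each query embedding, a list of
    # {"corpus_id": ..., "score": ...} dicts sorted by score, so each entry
    # in similar_books holds indices into the combined result list.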
    # Create a list of dictionaries to store the results
    results = []
    for i in range(len(titles)):
        results.append(
            {
                "id": i,
                "title": titles[i],
                "author": authors[i],
                "publisher": publishers[i],
                "image_link": images[i],
                "labels": classes[i]["labels"][0:2],
                "label_confidences": classes[i]["scores"][0:2],
                "summary": summaries[i][0]["summary_text"],
                # Guard the lookup: similar_books only exists when the
                # similarity flag was set
                "similar_books": similar_books[i]["sorted_by_similarity"]
                if similarity
                else None,
                "runtime": runtime,
            }
        )

    return results
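# A minimal local run, assuming this file is saved as search.py and an
# OPENAI_API_KEY is set in the environment:
#   uvicorn search:app --reload
#   curl "http://127.0.0.1:8000/search=statistics&similarity=false"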