# aristotle-api / search.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import sys

# Set the maximum recursion depth to 10000
sys.setrecursionlimit(10000)

# Define the FastAPI app
app = FastAPI()

# Add the CORS middleware to the app
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
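# Note: allow_origins=["*"] lets any site call this API from the browser;
# pinning it to the frontend's origin would be stricter.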
@app.get("/search={query}&similarity={similarity}")
def search(query, similarity=False):
import time
import requests
start_time = time.time()
# Set the API endpoint and query parameters
url = "https://www.googleapis.com/books/v1/volumes"
params = {"q": str(query), "printType": "books", "maxResults": 30}
# Send a GET request to the API with the specified parameters
response = requests.get(url, params=params)
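    # Each returned item is expected to look roughly like (partial sketch,
    # inferred from the fields accessed below):
    #   {"volumeInfo": {"title": ..., "subtitle": ..., "authors": [...],
    #     "publisher": ..., "description": ..., "imageLinks": {"thumbnail": ...}}}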
    # Initialize the lists to store the results
    titles = []
    authors = []
    publishers = []
    descriptions = []
    images = []

    # Parse the response JSON and append the results
    data = response.json()

    # "items" is absent entirely when the query has no hits, hence the fallback
    for item in data.get("items", []):
        volume_info = item["volumeInfo"]

        # Prefer "title: subtitle" when a subtitle exists
        try:
            titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
        except KeyError:
            titles.append(volume_info["title"])

        try:
            descriptions.append(volume_info["description"])
        except KeyError:
            descriptions.append("Null")

        try:
            publishers.append(volume_info["publisher"])
        except KeyError:
            publishers.append("Null")

        # An empty author list raises IndexError, not KeyError, so catch both
        try:
            authors.append(volume_info["authors"][0])
        except (KeyError, IndexError):
            authors.append("Null")

        try:
            images.append(volume_info["imageLinks"]["thumbnail"])
        except KeyError:
            images.append(
                "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
            )
    ### Openalex ###
    import pyalex
    from pyalex import Works

    # Add email to the config (enables OpenAlex's polite pool)
    pyalex.config.email = "ber2mir@gmail.com"

    # Define a pager object with the same query; paginate() yields pages,
    # so with per_page=10 and n_max=10 the list below holds a single page
    pager = Works().search(str(query)).paginate(per_page=10, n_max=10)

    # Generate a list of the results
    openalex_results = list(pager)

    # Get the titles, descriptions, and publishers and append them to the lists
    for result in openalex_results[0]:
        try:
            titles.append(result["title"])
        except KeyError:
            titles.append("Null")

        try:
            descriptions.append(result["abstract"])
        except KeyError:
            descriptions.append("Null")

        try:
            publishers.append(result["host_venue"]["publisher"])
        except KeyError:
            publishers.append("Null")

        # Catch IndexError as well: "authorships" may be an empty list
        try:
            authors.append(result["authorships"][0]["author"]["display_name"])
        except (KeyError, IndexError):
            authors.append("Null")

        # OpenAlex has no cover art, so fall back to the placeholder image
        images.append(
            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
        )
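    # Note: pyalex rebuilds "abstract" from OpenAlex's abstract_inverted_index
    # and returns None when no abstract is indexed; the summarization step
    # below guards against that with its `is not None` check.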
    ### OpenAI ###
    import os
    import openai

    # Read the OpenAI API key from the environment instead of hardcoding it
    openai.api_key = os.environ["OPENAI_API_KEY"]

    # Create ChatGPT query; the keys in the requested format are capitalized
    # to match the parse_result() keys below
    chatgpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a librarian. You are helping a patron find a book.",
            },
            {
                "role": "user",
                "content": f"Recommend me 10 books about {query}. Your response should be like: 'Title: <title>, Author: <author>, Publisher: <publisher>, Summary: <summary>'",
            },
        ],
    )

    # Split the response into a list of results
    chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split("\n")[
        2::2
    ]
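    # The [2::2] slice assumes the reply has a short preamble followed by
    # blank-line-separated entries; a differently formatted reply would need
    # a different slice or more robust parsing.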
    # Define a function to parse one "Key: value, Key: value" result line
    def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
        # Create a dict to store the key-value pairs
        parsed_result = {}

        for key in ordered_keys:
            # Split the result string on the key and keep the value up to the
            # next comma; the last key (Summary) keeps the rest of the line
            if key != ordered_keys[-1]:
                parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
            else:
                parsed_result[key] = result.split(f"{key}: ")[1]

        return parsed_result
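    # For example (hypothetical reply line):
    #   parse_result("Title: Dune, Author: Frank Herbert, Publisher: Chilton, Summary: A desert epic")
    #   -> {"Title": "Dune", "Author": "Frank Herbert",
    #       "Publisher": "Chilton", "Summary": "A desert epic"}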
    ordered_keys = ["Title", "Author", "Publisher", "Summary"]

    for result in chatgpt_results:
        # Parse the result
        parsed_result = parse_result(result, ordered_keys=ordered_keys)

        # Append the parsed result to the lists
        titles.append(parsed_result["Title"])
        authors.append(parsed_result["Author"])
        publishers.append(parsed_result["Publisher"])
        descriptions.append(parsed_result["Summary"])
        images.append(
            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
        )
    ### Prediction ###
    # flair is only needed by the commented-out classifier below
    # from flair.models import TextClassifier
    # from flair.data import Sentence
    # from flair.tokenization import SegtokTokenizer
    from transformers import (
        AutoTokenizer,
        AutoModelForSeq2SeqLM,
        AutoModelForSequenceClassification,
        pipeline,
    )
    from sentence_transformers import SentenceTransformer, CrossEncoder
    from sentence_transformers.util import cos_sim, dot_score
    from optimum.onnxruntime import (
        ORTModelForSeq2SeqLM,
        ORTModelForSequenceClassification,
    )
    from optimum.pipelines import pipeline as optimum_pipeline

    # Load the classifiers
    # classifier = TextClassifier.load(
    #     "trainers/deberta-v3-base-tasksource-nli/best-model.pt"
    # )
    # sentence_transformer = SentenceTransformer("all-MiniLM-L12-v2")
    # cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"{title} {description} {publisher}"
        for title, description, publisher in zip(titles, descriptions, publishers)
    ]

    # Prepare the Sentence object
    # sentences = [
    #     Sentence(doc, use_tokenizer=SegtokTokenizer()) for doc in combined_data
    # ]

    # Classify the sentences
    # classifier.predict(sentences)

    # Get the predicted labels
    # classes = [sentence.labels for sentence in sentences]
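    # Each combined_data entry above is a single "title description publisher"
    # string; these strings feed both the zero-shot classifier and, when
    # requested, the similarity search further down.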
    # Define the summarizer model and tokenizer
    sum_tokenizer = AutoTokenizer.from_pretrained("lidiya/bart-base-samsum")
    sum_model_quantized = ORTModelForSeq2SeqLM.from_pretrained(
        "trainers/bart-base-samsum-quantized"
    )
    # sum_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")

    summarizer_pipeline = optimum_pipeline(
        "summarization",
        model=sum_model_quantized,
        tokenizer=sum_tokenizer,
        batch_size=64,
    )

    # Define the zero-shot classifier
    zs_tokenizer = AutoTokenizer.from_pretrained(
        "sileod/deberta-v3-base-tasksource-nli"
    )
    # Quickfix for the tokenizer
    # zs_tokenizer.model_input_names = ["input_ids", "attention_mask"]
    zs_model = AutoModelForSequenceClassification.from_pretrained(
        "sileod/deberta-v3-base-tasksource-nli"
    )
    zs_classifier = pipeline(
        "zero-shot-classification",
        model=zs_model,
        tokenizer=zs_tokenizer,
        batch_size=64,
        hypothesis_template="This book is {}.",
        multi_label=True,
    )
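    # The zero-shot pipeline yields dicts of the form {"sequence": ...,
    # "labels": [...], "scores": [...]} with labels sorted by descending
    # score; the results assembly at the bottom takes the top two of each.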
    # Summarize the descriptions, truncating each to the first 1024
    # characters to keep inputs short for the model
    summaries = [
        summarizer_pipeline(description[0:1024])
        if description is not None
        else [{"summary_text": "Null"}]
        for description in descriptions
    ]

    # Predict the level of the book
    candidate_labels = [
        "Introductory",
        "Advanced",
        "Academic",
        "Not Academic",
        "Manual",
    ]

    # Get the predicted labels
    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]

    # Calculate the elapsed time
    end_time = time.time()
    runtime = f"{end_time - start_time:.2f} seconds"
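    # Note: zs_classifier is invoked once per document in the comprehension
    # above, so batch_size=64 has little effect there; passing the whole
    # combined_data list in a single call would enable true batching.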
    # Calculate the similarity between the books
    if similarity:
        from sentence_transformers import util

        sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
        book_embeddings = sentence_transformer.encode(
            combined_data, convert_to_tensor=True
        )

        similar_books = []
        for i in range(len(titles)):
            current_embedding = book_embeddings[i]

            # Rank every book against this one; drop the first hit, which is
            # the book itself
            similarity_sorted = util.semantic_search(
                current_embedding, book_embeddings, top_k=20
            )

            similar_books.append(
                {
                    "sorted_by_similarity": similarity_sorted[0][1:],
                }
            )
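    # util.semantic_search returns, for each query embedding, a list of
    # {"corpus_id": ..., "score": ...} dicts sorted by score, so each entry
    # in similar_books holds indices into the combined result list.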
    # Create a list of dictionaries to store the results
    results = []
    for i in range(len(titles)):
        results.append(
            {
                "id": i,
                "title": titles[i],
                "author": authors[i],
                "publisher": publishers[i],
                "image_link": images[i],
                "labels": classes[i]["labels"][0:2],
                "label_confidences": classes[i]["scores"][0:2],
                "summary": summaries[i][0]["summary_text"],
                # Guard the lookup: similar_books only exists when the
                # similarity flag was set
                "similar_books": similar_books[i]["sorted_by_similarity"]
                if similarity
                else None,
                "runtime": runtime,
            }
        )

    return results
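# A minimal local run, assuming this file is saved as search.py and an
# OPENAI_API_KEY is set in the environment:
#   uvicorn search:app --reload
#   curl "http://127.0.0.1:8000/search=statistics&similarity=false"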