# Source: Hugging Face Space "bertugmirasyedi/aristotle-api"
# (Space status when captured: Sleeping).
# File size: 9,055 bytes.

from fastapi import FastAPI

# Define the FastAPI app. Passing docs_url="/" asks FastAPI to serve its
# interactive API documentation page at the site root.
app = FastAPI(docs_url="/")

# Cover image used whenever a source does not supply a thumbnail.
_PLACEHOLDER_IMAGE = "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"

# Sentinel string returned to clients for any missing text field.
_MISSING = "Null"

# Models/pipelines keyed by name, so they are loaded once per process instead
# of once per request.
_MODEL_CACHE = {}

# Case-insensitive pattern for one ChatGPT recommendation line of the form
# "title: ..., author: ..., publisher: ..., summary: ...". Each field runs up
# to the next key, so commas inside a title or author are preserved.
_RECOMMENDATION_PATTERN = (
    r"\btitle:\s*(?P<title>.+?),?\s*"
    r"\bauthor:\s*(?P<author>.+?),?\s*"
    r"\bpublisher:\s*(?P<publisher>.+?),?\s*"
    r"\bsummary:\s*(?P<summary>.+)"
)


def _text_or_missing(value):
    """Return *value* if it is a non-blank string, else the "Null" sentinel."""
    if isinstance(value, str) and value.strip():
        return value
    return _MISSING


def _dig(container, *keys):
    """Follow *keys* through nested dicts; return None if any step is absent."""
    current = container
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def _flag_enabled(flag):
    """Interpret a path-parameter flag, which arrives as a string, as a bool."""
    if isinstance(flag, str):
        return flag.strip().lower() in {"1", "true", "t", "yes", "y", "on"}
    return bool(flag)


def _make_record(title, author, publisher, description, image=_PLACEHOLDER_IMAGE):
    """Build one normalized book record; missing text fields become "Null"."""
    return {
        "title": _text_or_missing(title),
        "author": _text_or_missing(author),
        "publisher": _text_or_missing(publisher),
        "description": _text_or_missing(description),
        "image": image or _PLACEHOLDER_IMAGE,
    }


def _cached_model(name, factory):
    """Return the cached object for *name*, building it with *factory* once."""
    if name not in _MODEL_CACHE:
        _MODEL_CACHE[name] = factory()
    return _MODEL_CACHE[name]


def _fetch_google_books(query):
    """Return up to 10 book records for *query* from the Google Books API."""
    import requests

    response = requests.get(
        "https://www.googleapis.com/books/v1/volumes",
        params={"q": query, "printType": "books", "maxResults": 10},
        timeout=10,  # do not let a stalled upstream hang the request forever
    )
    payload = response.json()

    records = []
    # The "items" key may be absent (e.g. no matches), so default to an empty list.
    for item in payload.get("items", []):
        info = item.get("volumeInfo", {})

        title = info.get("title")
        if title and info.get("subtitle"):
            title = f"{title}: {info['subtitle']}"

        author_names = info.get("authors") or [None]

        records.append(
            _make_record(
                title=title,
                author=author_names[0],
                publisher=info.get("publisher"),
                description=info.get("description"),
                image=_dig(info, "imageLinks", "thumbnail"),
            )
        )
    return records


def _fetch_openalex(query):
    """Return the first page (up to 10) of OpenAlex works matching *query*."""
    import pyalex
    from pyalex import Works

    # Add email to the config
    pyalex.config.email = "ber2mir@gmail.com"

    pager = Works().search(query).paginate(per_page=10, n_max=10)
    # Only the first page is used; tolerate a pager that yields nothing.
    first_page = next(iter(pager), [])

    records = []
    for work in first_page:
        # "abstract" must go through item access (not .get) because the
        # original code relied on work["abstract"]; keep that access path.
        try:
            abstract = work["abstract"]
        except (KeyError, TypeError):
            abstract = None

        # NOTE(review): "host_venue" appears to be deprecated upstream; the
        # "primary_location" fallback is an assumption about the current
        # OpenAlex schema -- confirm against the OpenAlex Work documentation.
        publisher = _dig(work, "host_venue", "publisher") or _dig(
            work, "primary_location", "source", "host_organization_name"
        )

        authorships = _dig(work, "authorships") or []
        first_author = (
            _dig(authorships[0], "author", "display_name") if authorships else None
        )

        records.append(
            _make_record(
                title=_dig(work, "title"),
                author=first_author,
                publisher=publisher,
                description=abstract,
            )
        )
    return records


def _fetch_chatgpt_recommendations(query):
    """Return book records parsed from a ChatGPT recommendation list.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    rather than being embedded in source. When the variable is unset the
    source is skipped and an empty list is returned.
    """
    import os
    import re

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        return []

    import openai

    openai.api_key = api_key

    chatgpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a librarian. You are helping a patron find a book.",
            },
            {
                "role": "user",
                "content": f"Recommend me 10 books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
            },
        ],
    )
    content = chatgpt_response["choices"][0]["message"]["content"]

    records = []
    for line in content.splitlines():
        match = re.search(_RECOMMENDATION_PATTERN, line, flags=re.IGNORECASE)
        if match is None:
            # Intro text, blank lines, or a line truncated by the token limit.
            continue
        records.append(
            _make_record(
                title=match["title"].strip(),
                author=match["author"].strip(),
                publisher=match["publisher"].strip(),
                description=match["summary"].strip(),
            )
        )
    return records


def _build_summarizer():
    """Load the summarization pipeline (lidiya/bart-base-samsum)."""
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

    checkpoint = "lidiya/bart-base-samsum"
    return pipeline(
        "summarization",
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        batch_size=64,
    )


def _build_zero_shot_classifier():
    """Load the zero-shot classification pipeline used to label books."""
    from transformers import (
        AutoModelForSequenceClassification,
        AutoTokenizer,
        pipeline,
    )

    checkpoint = "sileod/deberta-v3-base-tasksource-nli"
    return pipeline(
        "zero-shot-classification",
        model=AutoModelForSequenceClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        batch_size=64,
        hypothesis_template="This book is {}.",
        multi_label=True,
    )


def _build_sentence_encoder():
    """Load the sentence-embedding model used for book-to-book similarity."""
    from sentence_transformers import SentenceTransformer

    return SentenceTransformer("all-MiniLM-L6-v2")


@app.get("/search={query}&similarity={similarity}")
def search(query, similarity=False):
    """Search several sources for books about *query* and enrich the results.

    Results are gathered from Google Books, OpenAlex and (when the
    ``OPENAI_API_KEY`` environment variable is set) ChatGPT. Each result is
    summarized and labelled with a zero-shot classifier.

    Args:
        query: Free-text search terms taken from the URL path.
        similarity: Flag taken from the URL path. Because path parameters
            arrive as strings, values such as "true", "1" or "yes"
            (case-insensitive) enable similarity ranking; anything else,
            including "False", disables it.

    Returns:
        A list of dicts, one per book, with the keys ``id``, ``title``,
        ``author``, ``publisher``, ``image_link``, ``labels``,
        ``label_confidences``, ``summary``, ``similar_books`` and
        ``runtime``. ``similar_books`` is an empty list when similarity
        ranking is disabled.
    """
    import time

    start_time = time.time()
    query_text = str(query)

    records = (
        _fetch_google_books(query_text)
        + _fetch_openalex(query_text)
        + _fetch_chatgpt_recommendations(query_text)
    )
    if not records:
        # Nothing to summarize or classify; avoid loading the models at all.
        return []

    # Combine title, description, and publisher into a single string
    combined_data = [
        f"{record['title']} {record['description']} {record['publisher']}"
        for record in records
    ]

    # Summarize the descriptions; a missing description stays "Null" instead
    # of being fed to the summarizer.
    summarizer = _cached_model("summarizer", _build_summarizer)
    summaries = [
        summarizer(record["description"][:1024])[0]["summary_text"]
        if record["description"] != _MISSING
        else _MISSING
        for record in records
    ]

    # Predict the level of the book
    candidate_labels = [
        "Introductory",
        "Advanced",
        "Academic",
        "Not Academic",
        "Manual",
    ]
    zs_classifier = _cached_model("zero_shot", _build_zero_shot_classifier)
    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]

    # Always defined, so building the response never depends on the flag.
    similar_books = [[] for _ in records]
    if _flag_enabled(similarity):
        from sentence_transformers import util

        encoder = _cached_model("sentence_encoder", _build_sentence_encoder)
        book_embeddings = encoder.encode(combined_data, convert_to_tensor=True)

        # One batched call ranks every book against all books; the first hit
        # for each book is dropped, as in the original per-book loop.
        all_hits = util.semantic_search(book_embeddings, book_embeddings, top_k=20)
        similar_books = [hits[1:] for hits in all_hits]

    # Calculate the elapsed time
    runtime = f"{time.time() - start_time:.2f} seconds"

    return [
        {
            "id": index,
            "title": record["title"],
            "author": record["author"],
            "publisher": record["publisher"],
            "image_link": record["image"],
            "labels": prediction["labels"][0:2],
            "label_confidences": prediction["scores"][0:2],
            "summary": summary,
            "similar_books": neighbours,
            "runtime": runtime,
        }
        for index, (record, prediction, summary, neighbours) in enumerate(
            zip(records, classes, summaries, similar_books)
        )
    ]