bertugmirasyedi committed on
Commit
d9187f0
1 Parent(s): 7b4a17b

Changed subparts to functions

Files changed (1)
  1. app.py +306 -226
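For orientation, here is a minimal sketch (not the committed code) of the shape this commit gives `search`: the monolithic body becomes four inner helpers called in sequence, with an integer-seconds checkpoint recorded after each stage. Helper names, signatures, and the checkpoint pattern mirror the diff below; the stub bodies and the sample return value are placeholders.

import time


def search(query, similarity="false"):
    start_time = time.time()

    # Shared accumulators, extended after each stage
    titles, authors, publishers, descriptions, images = [], [], [], [], []

    def gbooks_search(query, n_results=30):
        # Stage 1: Google Books volumes API (stubbed here)
        return titles, authors, publishers, descriptions, images

    def openalex_search(query, n_results=10):
        # Stage 2: OpenAlex works search (stubbed here)
        return titles, authors, publishers, descriptions, images

    def openai_search(query, n_results=10):
        # Stage 3: ChatGPT book recommendations (stubbed here)
        return titles, authors, publishers, descriptions, images

    def predict(titles, descriptions, publishers, similarity=similarity):
        # Stage 4: summarization, zero-shot labels, optional similarity (stubbed here)
        return [], [], []

    checkpoints = []
    last = start_time
    for stage in (gbooks_search, openalex_search, openai_search):
        stage(query)  # each helper appends to the shared lists
        now = time.time()
        checkpoints.append(int(now - last))  # whole seconds, as in the diff
        last = now

    summaries, classes, similar_books = predict(titles, descriptions, publishers)
    checkpoints.append(int(time.time() - last))

    runtime = f"{time.time() - start_time:.2f} seconds"
    return {"checkpoints": checkpoints, "runtime": runtime}


print(search("python"))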
app.py CHANGED
@@ -21,13 +21,6 @@ def search(query, similarity="false"):
 
     start_time = time.time()
 
-    # Set the API endpoint and query parameters
-    url = "https://www.googleapis.com/books/v1/volumes"
-    params = {"q": str(query), "printType": "books", "maxResults": 10}
-
-    # Send a GET request to the API with the specified parameters
-    response = requests.get(url, params=params)
-
     # Initialize the lists to store the results
     titles = []
     authors = []
@@ -35,251 +28,332 @@ def search(query, similarity="false"):
     descriptions = []
     images = []
 
-    # Parse the response JSON and append the results
-    data = response.json()
-
-    for item in data["items"]:
-        volume_info = item["volumeInfo"]
-        try:
-            titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
-        except KeyError:
-            titles.append(volume_info["title"])
-
-        try:
-            descriptions.append(volume_info["description"])
-        except KeyError:
-            descriptions.append("Null")
-
-        try:
-            publishers.append(volume_info["publisher"])
-        except KeyError:
-            publishers.append("Null")
-
-        try:
-            authors.append(volume_info["authors"][0])
-        except KeyError:
-            authors.append("Null")
-
-        try:
-            images.append(volume_info["imageLinks"]["thumbnail"])
-        except KeyError:
             images.append(
                 "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
 
-    ### Openalex ###
-    import pyalex
-    from pyalex import Works
-
-    # Add email to the config
-    pyalex.config.email = "ber2mir@gmail.com"
-
-    # Define a pager object with the same query
-    pager = Works().search(str(query)).paginate(per_page=10, n_max=10)
-
-    # Generate a list of the results
-    openalex_results = list(pager)
-
-    # Get the titles, descriptions, and publishers and append them to the lists
-    for result in openalex_results[0]:
-        try:
-            titles.append(result["title"])
-        except KeyError:
-            titles.append("Null")
-
-        try:
-            descriptions.append(result["abstract"])
-        except KeyError:
-            descriptions.append("Null")
-
-        try:
-            publishers.append(result["host_venue"]["publisher"])
-        except KeyError:
-            publishers.append("Null")
-
-        try:
-            authors.append(result["authorships"][0]["author"]["display_name"])
-        except KeyError:
-            authors.append("Null")
-
-        images.append(
-            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
        )
 
-    ### OpenAI ###
-    import openai
-
-    # Set the OpenAI API key
-    openai.api_key = "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"
-
-    # Create ChatGPT query
-    chatgpt_response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {
-                "role": "system",
-                "content": "You are a librarian. You are helping a patron find a book.",
-            },
-            {
-                "role": "user",
-                "content": f"Recommend me 10 books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
-            },
-        ],
-    )
 
-    # Split the response into a list of results
-    chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split("\n")[
-        2::2
-    ]
 
-    # Define a function to parse the results
-    def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
-        # Create a dict to store the key-value pairs
-        parsed_result = {}
 
-        for key in ordered_keys:
-            # Split the result string by the key and append the value to the list
-            if key != ordered_keys[-1]:
-                parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
-            else:
-                parsed_result[key] = result.split(f"{key}: ")[1]
 
-        return parsed_result
 
-    ordered_keys = ["Title", "Author", "Publisher", "Summary"]
 
-    for result in chatgpt_results:
-        try:
-            # Parse the result
-            parsed_result = parse_result(result, ordered_keys=ordered_keys)
 
-            # Append the parsed result to the lists
-            titles.append(parsed_result["Title"])
-            authors.append(parsed_result["Author"])
-            publishers.append(parsed_result["Publisher"])
-            descriptions.append(parsed_result["Summary"])
-            images.append(
-                "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
 
-        # In case the OpenAI API hits the limit
-        except IndexError:
-            break
 
-    ### Prediction ###
-    from transformers import (
-        AutoTokenizer,
-        AutoModelForSeq2SeqLM,
-        AutoModelForSequenceClassification,
-        pipeline,
-    )
-    from sentence_transformers import SentenceTransformer
-
-    # Load the classifiers
-    # classifier = TextClassifier.load(
-    #     "trainers/deberta-v3-base-tasksource-nli/best-model.pt"
-    # )
-    # sentence_transformer = SentenceTransformer("all-MiniLM-L12-v2")
-    # cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")
-
-    # Combine title, description, and publisher into a single string
-    combined_data = [
-        f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
-        for title, description, publisher in zip(titles, descriptions, publishers)
-    ]
-
-    # Prepare the Sentence object
-    # sentences = [
-    #     Sentence(doc, use_tokenizer=SegtokTokenizer()) for doc in combined_data
-    # ]
-
-    # Classify the sentences
-    # classifier.predict(sentences)
-
-    # Get the predicted labels
-    # classes = [sentence.labels for sentence in sentences]
-
-    # Define the summarizer model and tokenizer
-    sum_tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-xsum-12-6")
 
-    sum_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")
-    # sum_model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
 
-    summarizer_pipeline = pipeline(
-        "summarization",
-        model=sum_model,
-        tokenizer=sum_tokenizer,
-        batch_size=64,
-    )
-
-    # Define the zero-shot classifier
-    zs_tokenizer = AutoTokenizer.from_pretrained(
-        "sileod/deberta-v3-base-tasksource-nli"
-    )
-    # Quickfix for the tokenizer
-    # zs_tokenizer.model_input_names = ["input_ids", "attention_mask"]
 
-    zs_model = AutoModelForSequenceClassification.from_pretrained(
-        "sileod/deberta-v3-base-tasksource-nli"
-    )
-    zs_classifier = pipeline(
-        "zero-shot-classification",
-        model=zs_model,
-        tokenizer=zs_tokenizer,
-        batch_size=64,
-        hypothesis_template="This book is {}.",
-        multi_label=True,
     )
 
-    # Summarize the descriptions
-    summaries = [
-        summarizer_pipeline(description[0:1024])
-        if (description != None)
-        else [{"summary_text": "Null"}]
-        for description in descriptions
-    ]
-
-    # Predict the level of the book
-    candidate_labels = [
-        "Introductory",
-        "Advanced",
-        "Academic",
-        "Not Academic",
-        "Manual",
-    ]
-
-    # Get the predicted labels
-    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]
 
     # Calculate the elapsed time
     end_time = time.time()
     runtime = f"{end_time - start_time:.2f} seconds"
 
-    # Calculate the similarity between the books
-    if similarity != "false":
-        from sentence_transformers import util
-
-        sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
-        book_embeddings = sentence_transformer.encode(
-            combined_data, convert_to_tensor=True
-        )
-
-        similar_books = []
-        for i in range(len(titles)):
-            current_embedding = book_embeddings[i]
-
-            similarity_sorted = util.semantic_search(
-                current_embedding, book_embeddings, top_k=20
-            )
-
-            similar_books.append(
-                {
-                    "sorted_by_similarity": similarity_sorted[0][1:],
-                }
-            )
-    else:
-        similar_books = [{"sorted_by_similarity": []} for i in range(len(titles))]
-
     # Create a list of dictionaries to store the results
     results = [
         {
@@ -292,6 +366,12 @@ def search(query, similarity="false"):
             "label_confidences": classes[i]["scores"][0:2],
             "summary": summaries[i][0]["summary_text"],
             "similar_books": similar_books[i]["sorted_by_similarity"],
             "runtime": runtime,
         }
         for i in range(len(titles))
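Both the block removed above and its replacement in the new listing below parse each ChatGPT reply line with the same split-on-key convention: split on "<Key>: " and trim at the next comma for every key except the last. A minimal runnable sketch, using the `parse_result` definition from the diff and a made-up reply line; note that the parser expects capitalized labels ("Title: ") even though the prompt's template shows lowercase ones, and a line that does not match raises the IndexError that the surrounding loop catches.

def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
    # Split on "<Key>: ", keep the text up to the next comma (except for the last key)
    parsed_result = {}
    for key in ordered_keys:
        if key != ordered_keys[-1]:
            parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
        else:
            parsed_result[key] = result.split(f"{key}: ")[1]
    return parsed_result


# Hypothetical reply line in the format the prompt requests
line = "Title: Dune, Author: Frank Herbert, Publisher: Chilton Books, Summary: A desert-planet epic."
print(parse_result(line))
# {'Title': 'Dune', 'Author': 'Frank Herbert', 'Publisher': 'Chilton Books',
#  'Summary': 'A desert-planet epic.'}

The new version of the changed region follows, with added lines marked "+".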
 
 
     start_time = time.time()
 
     # Initialize the lists to store the results
     titles = []
     authors = []
     descriptions = []
     images = []
 
+    def gbooks_search(query, n_results=30):
+        """
+        Access the Google Books API and return the results.
+        """
+        # Set the API endpoint and query parameters
+        url = "https://www.googleapis.com/books/v1/volumes"
+        params = {"q": str(query), "printType": "books", "maxResults": n_results}
+
+        # Send a GET request to the API with the specified parameters
+        response = requests.get(url, params=params)
+
+        # Parse the response JSON and append the results
+        data = response.json()
+
+        for item in data["items"]:
+            volume_info = item["volumeInfo"]
+            try:
+                titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
+            except KeyError:
+                titles.append(volume_info["title"])
+
+            try:
+                descriptions.append(volume_info["description"])
+            except KeyError:
+                descriptions.append("Null")
+
+            try:
+                publishers.append(volume_info["publisher"])
+            except KeyError:
+                publishers.append("Null")
+
+            try:
+                authors.append(volume_info["authors"][0])
+            except KeyError:
+                authors.append("Null")
+
+            try:
+                images.append(volume_info["imageLinks"]["thumbnail"])
+            except KeyError:
+                images.append(
+                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
+                )
+
+        return titles, authors, publishers, descriptions, images
+
+    # Run the gbooks_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = gbooks_search(query)
+
+    # Append the results to the lists
+    titles.extend(titles_placeholder)
+    authors.extend(authors_placeholder)
+    publishers.extend(publishers_placeholder)
+    descriptions.extend(descriptions_placeholder)
+    images.extend(images_placeholder)
+
+    # Get the time since the start
+    first_checkpoint = time.time()
+    first_checkpoint_time = int(first_checkpoint - start_time)
+
+    def openalex_search(query, n_results=10):
+        """
+        Run a search on OpenAlex and return the results.
+        """
+        import pyalex
+        from pyalex import Works
+
+        # Add email to the config
+        pyalex.config.email = "ber2mir@gmail.com"
+
+        # Define a pager object with the same query
+        pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)
+
+        # Generate a list of the results
+        openalex_results = list(pager)
+
+        # Get the titles, descriptions, and publishers and append them to the lists
+        for result in openalex_results[0]:
+            try:
+                titles.append(result["title"])
+            except KeyError:
+                titles.append("Null")
+
+            try:
+                descriptions.append(result["abstract"])
+            except KeyError:
+                descriptions.append("Null")
+
+            try:
+                publishers.append(result["host_venue"]["publisher"])
+            except KeyError:
+                publishers.append("Null")
+
+            try:
+                authors.append(result["authorships"][0]["author"]["display_name"])
+            except KeyError:
+                authors.append("Null")
+
             images.append(
                 "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
 
+        return titles, authors, publishers, descriptions, images
+
+    # Run the openalex_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = openalex_search(query)
+
+    # Append the results to the lists
+    titles.extend(titles_placeholder)
+    authors.extend(authors_placeholder)
+    publishers.extend(publishers_placeholder)
+    descriptions.extend(descriptions_placeholder)
+    images.extend(images_placeholder)
+
+    # Calculate the elapsed time between the first and second checkpoints
+    second_checkpoint = time.time()
+    second_checkpoint_time = int(second_checkpoint - first_checkpoint)
+
+    def openai_search(query, n_results=10):
+        """
+        Create a query to the OpenAI ChatGPT API and return the results.
+        """
+        import openai
+
+        # Set the OpenAI API key
+        openai.api_key = "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"
+
+        # Create ChatGPT query
+        chatgpt_response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a librarian. You are helping a patron find a book.",
+                },
+                {
+                    "role": "user",
+                    "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
+                },
+            ],
         )
 
+        # Split the response into a list of results
+        chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split(
+            "\n"
+        )[2::2]
+
+        # Define a function to parse the results
+        def parse_result(
+            result, ordered_keys=["Title", "Author", "Publisher", "Summary"]
+        ):
+            # Create a dict to store the key-value pairs
+            parsed_result = {}
+
+            for key in ordered_keys:
+                # Split the result string by the key and append the value to the list
+                if key != ordered_keys[-1]:
+                    parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
+                else:
+                    parsed_result[key] = result.split(f"{key}: ")[1]
+
+            return parsed_result
+
+        ordered_keys = ["Title", "Author", "Publisher", "Summary"]
+
+        for result in chatgpt_results:
+            try:
+                # Parse the result
+                parsed_result = parse_result(result, ordered_keys=ordered_keys)
+
+                # Append the parsed result to the lists
+                titles.append(parsed_result["Title"])
+                authors.append(parsed_result["Author"])
+                publishers.append(parsed_result["Publisher"])
+                descriptions.append(parsed_result["Summary"])
+                images.append(
+                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
+                )
+
+            # In case the OpenAI API hits the limit
+            except IndexError:
+                break
+
+        return titles, authors, publishers, descriptions, images
+
+    # Run the openai_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = openai_search(query)
+
+    # Append the results to the lists
+    titles.extend(titles_placeholder)
+    authors.extend(authors_placeholder)
+    publishers.extend(publishers_placeholder)
+    descriptions.extend(descriptions_placeholder)
+    images.extend(images_placeholder)
+
+    # Calculate the elapsed time between the second and third checkpoints
+    third_checkpoint = time.time()
+    third_checkpoint_time = int(third_checkpoint - second_checkpoint)
+
+    def predict(titles, descriptions, publishers, similarity=similarity):
+        """
+        Create a summarizer and classifier pipeline and return the results.
+        """
+        from transformers import (
+            AutoTokenizer,
+            AutoModelForSeq2SeqLM,
+            AutoModelForSequenceClassification,
+            pipeline,
+        )
+        from sentence_transformers import SentenceTransformer
 
+        # Combine title, description, and publisher into a single string
+        combined_data = [
+            f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
+            for title, description, publisher in zip(titles, descriptions, publishers)
+        ]
 
+        # Define the summarizer model and tokenizer
+        sum_tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
 
+        sum_model = AutoModelForSeq2SeqLM.from_pretrained(
+            "pszemraj/led-base-book-summary"
+        )
+        # sum_model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
 
+        summarizer_pipeline = pipeline(
+            "summarization",
+            model=sum_model,
+            tokenizer=sum_tokenizer,
+            batch_size=64,
+        )
 
+        # Define the zero-shot classifier
+        zs_tokenizer = AutoTokenizer.from_pretrained(
+            "sileod/deberta-v3-base-tasksource-nli"
+        )
 
+        zs_model = AutoModelForSequenceClassification.from_pretrained(
+            "sileod/deberta-v3-base-tasksource-nli"
+        )
+        zs_classifier = pipeline(
+            "zero-shot-classification",
+            model=zs_model,
+            tokenizer=zs_tokenizer,
+            batch_size=64,
+            hypothesis_template="This book is {}.",
+            multi_label=True,
+        )
 
+        # Summarize the descriptions
+        summaries = [
+            summarizer_pipeline(description[0:1024])
+            if (description != None)
+            else [{"summary_text": "Null"}]
+            for description in descriptions
+        ]
+
+        # Predict the level of the book
+        candidate_labels = [
+            "Introductory",
+            "Advanced",
+            "Academic",
+            "Not Academic",
+            "Manual",
+        ]
+
+        # Get the predicted labels
+        classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]
+
+        # Calculate the similarity between the books
+        if similarity != "false":
+            from sentence_transformers import util
+
+            sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
+            book_embeddings = sentence_transformer.encode(
+                combined_data, convert_to_tensor=True
             )
 
+            similar_books = []
+            for i in range(len(titles)):
+                current_embedding = book_embeddings[i]
 
+                similarity_sorted = util.semantic_search(
+                    current_embedding, book_embeddings, top_k=20
+                )
 
+                similar_books.append(
+                    {
+                        "sorted_by_similarity": similarity_sorted[0][1:],
+                    }
+                )
+        else:
+            similar_books = [{"sorted_by_similarity": []} for i in range(len(titles))]
 
+        return summaries, classes, similar_books
 
+    # Run the predict function
+    summaries, classes, similar_books = predict(
+        titles, descriptions, publishers, similarity=similarity
     )
 
+    # Calculate the elapsed time between the third and fourth checkpoints
+    fourth_checkpoint = time.time()
+    fourth_checkpoint_time = int(fourth_checkpoint - third_checkpoint)
 
     # Calculate the elapsed time
     end_time = time.time()
     runtime = f"{end_time - start_time:.2f} seconds"
 
     # Create a list of dictionaries to store the results
     results = [
         {
             "label_confidences": classes[i]["scores"][0:2],
             "summary": summaries[i][0]["summary_text"],
             "similar_books": similar_books[i]["sorted_by_similarity"],
+            "checkpoints": [
+                first_checkpoint_time,
+                second_checkpoint_time,
+                third_checkpoint_time,
+                fourth_checkpoint_time,
+            ],
             "runtime": runtime,
         }
         for i in range(len(titles))
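For reference on the structure the similarity branch relies on: `util.semantic_search` returns one list per query of `{"corpus_id", "score"}` dicts sorted by decreasing cosine similarity, and the top hit is the query's own book, which is why the diff keeps `similarity_sorted[0][1:]`. A small self-contained sketch; the model name comes from the diff, the sample documents are made up.

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Made-up stand-ins for the combined title/publisher/description strings
docs = [
    "The book's title is Statistics 101. This book is about introductory statistics.",
    "The book's title is Lab Manual. This book is about operating lab equipment.",
    "The book's title is Stats Primer. This book is about statistics for beginners.",
]
embeddings = model.encode(docs, convert_to_tensor=True)

# Query with the first book's embedding against all books
hits = util.semantic_search(embeddings[0], embeddings, top_k=3)

# hits[0] is sorted by score; hits[0][0] is the query itself,
# hence the [1:] slice in the committed code
print(hits[0][1:])  # e.g. [{'corpus_id': 2, 'score': ...}, {'corpus_id': 1, 'score': ...}]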