bertugmirasyedi committed on
Commit
d9187f0
1 Parent(s): 7b4a17b

Changed subparts to functions

Files changed (1)
  1. app.py +306 -226
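For orientation, here is a minimal sketch (not the committed code) of the shape this commit gives `search`: the monolithic body becomes four inner helpers called in sequence, with an integer-seconds checkpoint recorded after each stage. Helper names, signatures, and the checkpoint pattern mirror the diff below; the stub bodies and the sample return value are placeholders.

import time


def search(query, similarity="false"):
    start_time = time.time()

    # Shared accumulators, extended after each stage
    titles, authors, publishers, descriptions, images = [], [], [], [], []

    def gbooks_search(query, n_results=30):
        # Stage 1: Google Books volumes API (stubbed here)
        return titles, authors, publishers, descriptions, images

    def openalex_search(query, n_results=10):
        # Stage 2: OpenAlex works search (stubbed here)
        return titles, authors, publishers, descriptions, images

    def openai_search(query, n_results=10):
        # Stage 3: ChatGPT book recommendations (stubbed here)
        return titles, authors, publishers, descriptions, images

    def predict(titles, descriptions, publishers, similarity=similarity):
        # Stage 4: summarization, zero-shot labels, optional similarity (stubbed here)
        return [], [], []

    checkpoints = []
    last = start_time
    for stage in (gbooks_search, openalex_search, openai_search):
        stage(query)  # each helper appends to the shared lists
        now = time.time()
        checkpoints.append(int(now - last))  # whole seconds, as in the diff
        last = now

    summaries, classes, similar_books = predict(titles, descriptions, publishers)
    checkpoints.append(int(time.time() - last))

    runtime = f"{time.time() - start_time:.2f} seconds"
    return {"checkpoints": checkpoints, "runtime": runtime}


print(search("python"))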
app.py CHANGED
@@ -21,13 +21,6 @@ def search(query, similarity="false"):
 
     start_time = time.time()
 
-    # Set the API endpoint and query parameters
-    url = "https://www.googleapis.com/books/v1/volumes"
-    params = {"q": str(query), "printType": "books", "maxResults": 10}
-
-    # Send a GET request to the API with the specified parameters
-    response = requests.get(url, params=params)
-
     # Initialize the lists to store the results
     titles = []
     authors = []
@@ -35,251 +28,332 @@ def search(query, similarity="false"):
     descriptions = []
     images = []
 
-    # Parse the response JSON and append the results
-    data = response.json()
-
-    for item in data["items"]:
-        volume_info = item["volumeInfo"]
-        try:
-            titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
-        except KeyError:
-            titles.append(volume_info["title"])
-
-        try:
-            descriptions.append(volume_info["description"])
-        except KeyError:
-            descriptions.append("Null")
-
-        try:
-            publishers.append(volume_info["publisher"])
-        except KeyError:
-            publishers.append("Null")
-
-        try:
-            authors.append(volume_info["authors"][0])
-        except KeyError:
-            authors.append("Null")
-
-        try:
-            images.append(volume_info["imageLinks"]["thumbnail"])
-        except KeyError:
             images.append(
                 "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
 
-    ### Openalex ###
-    import pyalex
-    from pyalex import Works
-
-    # Add email to the config
-    pyalex.config.email = "ber2mir@gmail.com"
-
-    # Define a pager object with the same query
-    pager = Works().search(str(query)).paginate(per_page=10, n_max=10)
-
-    # Generate a list of the results
-    openalex_results = list(pager)
-
-    # Get the titles, descriptions, and publishers and append them to the lists
-    for result in openalex_results[0]:
-        try:
-            titles.append(result["title"])
-        except KeyError:
-            titles.append("Null")
-
-        try:
-            descriptions.append(result["abstract"])
-        except KeyError:
-            descriptions.append("Null")
-
-        try:
-            publishers.append(result["host_venue"]["publisher"])
-        except KeyError:
-            publishers.append("Null")
-
-        try:
-            authors.append(result["authorships"][0]["author"]["display_name"])
-        except KeyError:
-            authors.append("Null")
-
-        images.append(
-            "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
        )
 
-    ### OpenAI ###
-    import openai
-
-    # Set the OpenAI API key
-    openai.api_key = "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"
-
-    # Create ChatGPT query
-    chatgpt_response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {
-                "role": "system",
-                "content": "You are a librarian. You are helping a patron find a book.",
-            },
-            {
-                "role": "user",
-                "content": f"Recommend me 10 books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
-            },
-        ],
-    )
 
-    # Split the response into a list of results
-    chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split("\n")[
-        2::2
-    ]
 
-    # Define a function to parse the results
-    def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
-        # Create a dict to store the key-value pairs
-        parsed_result = {}
 
-        for key in ordered_keys:
-            # Split the result string by the key and append the value to the list
-            if key != ordered_keys[-1]:
-                parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
-            else:
-                parsed_result[key] = result.split(f"{key}: ")[1]
 
-        return parsed_result
 
-    ordered_keys = ["Title", "Author", "Publisher", "Summary"]
 
-    for result in chatgpt_results:
-        try:
-            # Parse the result
-            parsed_result = parse_result(result, ordered_keys=ordered_keys)
 
-            # Append the parsed result to the lists
-            titles.append(parsed_result["Title"])
-            authors.append(parsed_result["Author"])
-            publishers.append(parsed_result["Publisher"])
-            descriptions.append(parsed_result["Summary"])
-            images.append(
-                "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
 
-        # In case the OpenAI API hits the limit
-        except IndexError:
-            break
 
-    ### Prediction ###
-    from transformers import (
-        AutoTokenizer,
-        AutoModelForSeq2SeqLM,
-        AutoModelForSequenceClassification,
-        pipeline,
-    )
-    from sentence_transformers import SentenceTransformer
-
-    # Load the classifiers
-    # classifier = TextClassifier.load(
-    #     "trainers/deberta-v3-base-tasksource-nli/best-model.pt"
-    # )
-    # sentence_transformer = SentenceTransformer("all-MiniLM-L12-v2")
-    # cross_encoder = CrossEncoder("cross-encoder/stsb-distilroberta-base")
-
-    # Combine title, description, and publisher into a single string
-    combined_data = [
-        f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
-        for title, description, publisher in zip(titles, descriptions, publishers)
-    ]
-
-    # Prepare the Sentence object
-    # sentences = [
-    #     Sentence(doc, use_tokenizer=SegtokTokenizer()) for doc in combined_data
-    # ]
-
-    # Classify the sentences
-    # classifier.predict(sentences)
-
-    # Get the predicted labels
-    # classes = [sentence.labels for sentence in sentences]
-
-    # Define the summarizer model and tokenizer
-    sum_tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-xsum-12-6")
 
-    sum_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-xsum-12-6")
-    # sum_model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
 
-    summarizer_pipeline = pipeline(
-        "summarization",
-        model=sum_model,
-        tokenizer=sum_tokenizer,
-        batch_size=64,
-    )
-
-    # Define the zero-shot classifier
-    zs_tokenizer = AutoTokenizer.from_pretrained(
-        "sileod/deberta-v3-base-tasksource-nli"
-    )
-    # Quickfix for the tokenizer
-    # zs_tokenizer.model_input_names = ["input_ids", "attention_mask"]
 
-    zs_model = AutoModelForSequenceClassification.from_pretrained(
-        "sileod/deberta-v3-base-tasksource-nli"
-    )
-    zs_classifier = pipeline(
-        "zero-shot-classification",
-        model=zs_model,
-        tokenizer=zs_tokenizer,
-        batch_size=64,
-        hypothesis_template="This book is {}.",
-        multi_label=True,
     )
 
-    # Summarize the descriptions
-    summaries = [
-        summarizer_pipeline(description[0:1024])
-        if (description != None)
-        else [{"summary_text": "Null"}]
-        for description in descriptions
-    ]
-
-    # Predict the level of the book
-    candidate_labels = [
-        "Introductory",
-        "Advanced",
-        "Academic",
-        "Not Academic",
-        "Manual",
-    ]
-
-    # Get the predicted labels
-    classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]
 
     # Calculate the elapsed time
     end_time = time.time()
     runtime = f"{end_time - start_time:.2f} seconds"
 
-    # Calculate the similarity between the books
-    if similarity != "false":
-        from sentence_transformers import util
-
-        sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
-        book_embeddings = sentence_transformer.encode(
-            combined_data, convert_to_tensor=True
-        )
-
-        similar_books = []
-        for i in range(len(titles)):
-            current_embedding = book_embeddings[i]
-
-            similarity_sorted = util.semantic_search(
-                current_embedding, book_embeddings, top_k=20
-            )
-
-            similar_books.append(
-                {
-                    "sorted_by_similarity": similarity_sorted[0][1:],
-                }
-            )
-    else:
-        similar_books = [{"sorted_by_similarity": []} for i in range(len(titles))]
-
     # Create a list of dictionaries to store the results
     results = [
         {
@@ -292,6 +366,12 @@ def search(query, similarity="false"):
             "label_confidences": classes[i]["scores"][0:2],
             "summary": summaries[i][0]["summary_text"],
             "similar_books": similar_books[i]["sorted_by_similarity"],
             "runtime": runtime,
         }
         for i in range(len(titles))
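Both the block removed above and its replacement in the new listing below parse each ChatGPT reply line with the same split-on-key convention: split on "<Key>: " and trim at the next comma for every key except the last. A minimal runnable sketch, using the `parse_result` definition from the diff and a made-up reply line; note that the parser expects capitalized labels ("Title: ") even though the prompt's template shows lowercase ones, and a line that does not match raises the IndexError that the surrounding loop catches.

def parse_result(result, ordered_keys=["Title", "Author", "Publisher", "Summary"]):
    # Split on "<Key>: ", keep the text up to the next comma (except for the last key)
    parsed_result = {}
    for key in ordered_keys:
        if key != ordered_keys[-1]:
            parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
        else:
            parsed_result[key] = result.split(f"{key}: ")[1]
    return parsed_result


# Hypothetical reply line in the format the prompt requests
line = "Title: Dune, Author: Frank Herbert, Publisher: Chilton Books, Summary: A desert-planet epic."
print(parse_result(line))
# {'Title': 'Dune', 'Author': 'Frank Herbert', 'Publisher': 'Chilton Books',
#  'Summary': 'A desert-planet epic.'}

The new version of the changed region follows, with added lines marked "+".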
 
 
     start_time = time.time()
 
     # Initialize the lists to store the results
     titles = []
     authors = []
     descriptions = []
     images = []
 
+    def gbooks_search(query, n_results=30):
+        """
+        Access the Google Books API and return the results.
+        """
+        # Set the API endpoint and query parameters
+        url = "https://www.googleapis.com/books/v1/volumes"
+        params = {"q": str(query), "printType": "books", "maxResults": n_results}
+
+        # Send a GET request to the API with the specified parameters
+        response = requests.get(url, params=params)
+
+        # Parse the response JSON and append the results
+        data = response.json()
+
+        for item in data["items"]:
+            volume_info = item["volumeInfo"]
+            try:
+                titles.append(f"{volume_info['title']}: {volume_info['subtitle']}")
+            except KeyError:
+                titles.append(volume_info["title"])
+
+            try:
+                descriptions.append(volume_info["description"])
+            except KeyError:
+                descriptions.append("Null")
+
+            try:
+                publishers.append(volume_info["publisher"])
+            except KeyError:
+                publishers.append("Null")
+
+            try:
+                authors.append(volume_info["authors"][0])
+            except KeyError:
+                authors.append("Null")
+
+            try:
+                images.append(volume_info["imageLinks"]["thumbnail"])
+            except KeyError:
+                images.append(
+                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
+                )
+
+        return titles, authors, publishers, descriptions, images
+
+    # Run the gbooks_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = gbooks_search(query)
+
+    # Append the results to the lists
+    titles.extend(titles_placeholder)
+    authors.extend(authors_placeholder)
+    publishers.extend(publishers_placeholder)
+    descriptions.extend(descriptions_placeholder)
+    images.extend(images_placeholder)
+
+    # Get the time since the start
+    first_checkpoint = time.time()
+    first_checkpoint_time = int(first_checkpoint - start_time)
+
+    def openalex_search(query, n_results=10):
+        """
+        Run a search on OpenAlex and return the results.
+        """
+        import pyalex
+        from pyalex import Works
+
+        # Add email to the config
+        pyalex.config.email = "ber2mir@gmail.com"
+
+        # Define a pager object with the same query
+        pager = Works().search(str(query)).paginate(per_page=n_results, n_max=n_results)
+
+        # Generate a list of the results
+        openalex_results = list(pager)
+
+        # Get the titles, descriptions, and publishers and append them to the lists
+        for result in openalex_results[0]:
+            try:
+                titles.append(result["title"])
+            except KeyError:
+                titles.append("Null")
+
+            try:
+                descriptions.append(result["abstract"])
+            except KeyError:
+                descriptions.append("Null")
+
+            try:
+                publishers.append(result["host_venue"]["publisher"])
+            except KeyError:
+                publishers.append("Null")
+
+            try:
+                authors.append(result["authorships"][0]["author"]["display_name"])
+            except KeyError:
+                authors.append("Null")
+
             images.append(
                 "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
             )
 
+        return titles, authors, publishers, descriptions, images
+
+    # Run the openalex_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = openalex_search(query)
+
+    # Append the results to the lists
+    titles.extend(titles_placeholder)
+    authors.extend(authors_placeholder)
+    publishers.extend(publishers_placeholder)
+    descriptions.extend(descriptions_placeholder)
+    images.extend(images_placeholder)
+
+    # Calculate the elapsed time between the first and second checkpoints
+    second_checkpoint = time.time()
+    second_checkpoint_time = int(second_checkpoint - first_checkpoint)
+
+    def openai_search(query, n_results=10):
+        """
+        Create a query to the OpenAI ChatGPT API and return the results.
+        """
+        import openai
+
+        # Set the OpenAI API key
+        openai.api_key = "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"
+
+        # Create ChatGPT query
+        chatgpt_response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a librarian. You are helping a patron find a book.",
+                },
+                {
+                    "role": "user",
+                    "content": f"Recommend me {n_results} books about {query}. Your response should be like: 'title: <title>, author: <author>, publisher: <publisher>, summary: <summary>'",
+                },
+            ],
         )
 
+        # Split the response into a list of results
+        chatgpt_results = chatgpt_response["choices"][0]["message"]["content"].split(
+            "\n"
+        )[2::2]
+
+        # Define a function to parse the results
+        def parse_result(
+            result, ordered_keys=["Title", "Author", "Publisher", "Summary"]
+        ):
+            # Create a dict to store the key-value pairs
+            parsed_result = {}
+
+            for key in ordered_keys:
+                # Split the result string by the key and append the value to the list
+                if key != ordered_keys[-1]:
+                    parsed_result[key] = result.split(f"{key}: ")[1].split(",")[0]
+                else:
+                    parsed_result[key] = result.split(f"{key}: ")[1]
+
+            return parsed_result
+
+        ordered_keys = ["Title", "Author", "Publisher", "Summary"]
+
+        for result in chatgpt_results:
+            try:
+                # Parse the result
+                parsed_result = parse_result(result, ordered_keys=ordered_keys)
+
+                # Append the parsed result to the lists
+                titles.append(parsed_result["Title"])
+                authors.append(parsed_result["Author"])
+                publishers.append(parsed_result["Publisher"])
+                descriptions.append(parsed_result["Summary"])
+                images.append(
+                    "https://bookstoreromanceday.org/wp-content/uploads/2020/08/book-cover-placeholder.png"
+                )
+
+            # In case the OpenAI API hits the limit
+            except IndexError:
+                break
+
+        return titles, authors, publishers, descriptions, images
+
+    # Run the openai_search function
+    (
+        titles_placeholder,
+        authors_placeholder,
+        publishers_placeholder,
+        descriptions_placeholder,
+        images_placeholder,
+    ) = openai_search(query)
+
+    # Append the results to the lists
+    titles.extend(titles_placeholder)
+    authors.extend(authors_placeholder)
+    publishers.extend(publishers_placeholder)
+    descriptions.extend(descriptions_placeholder)
+    images.extend(images_placeholder)
+
+    # Calculate the elapsed time between the second and third checkpoints
+    third_checkpoint = time.time()
+    third_checkpoint_time = int(third_checkpoint - second_checkpoint)
+
+    def predict(titles, descriptions, publishers, similarity=similarity):
+        """
+        Create a summarizer and classifier pipeline and return the results.
+        """
+        from transformers import (
+            AutoTokenizer,
+            AutoModelForSeq2SeqLM,
+            AutoModelForSequenceClassification,
+            pipeline,
+        )
+        from sentence_transformers import SentenceTransformer
 
+        # Combine title, description, and publisher into a single string
+        combined_data = [
+            f"The book's title is {title}. It is published by {publisher}. This book is about {description}"
+            for title, description, publisher in zip(titles, descriptions, publishers)
+        ]
 
+        # Define the summarizer model and tokenizer
+        sum_tokenizer = AutoTokenizer.from_pretrained("pszemraj/led-base-book-summary")
 
+        sum_model = AutoModelForSeq2SeqLM.from_pretrained(
+            "pszemraj/led-base-book-summary"
+        )
+        # sum_model = AutoModelForSeq2SeqLM.from_pretrained("lidiya/bart-base-samsum")
 
+        summarizer_pipeline = pipeline(
+            "summarization",
+            model=sum_model,
+            tokenizer=sum_tokenizer,
+            batch_size=64,
+        )
 
+        # Define the zero-shot classifier
+        zs_tokenizer = AutoTokenizer.from_pretrained(
+            "sileod/deberta-v3-base-tasksource-nli"
+        )
 
+        zs_model = AutoModelForSequenceClassification.from_pretrained(
+            "sileod/deberta-v3-base-tasksource-nli"
+        )
+        zs_classifier = pipeline(
+            "zero-shot-classification",
+            model=zs_model,
+            tokenizer=zs_tokenizer,
+            batch_size=64,
+            hypothesis_template="This book is {}.",
+            multi_label=True,
+        )
 
+        # Summarize the descriptions
+        summaries = [
+            summarizer_pipeline(description[0:1024])
+            if (description != None)
+            else [{"summary_text": "Null"}]
+            for description in descriptions
+        ]
+
+        # Predict the level of the book
+        candidate_labels = [
+            "Introductory",
+            "Advanced",
+            "Academic",
+            "Not Academic",
+            "Manual",
+        ]
+
+        # Get the predicted labels
+        classes = [zs_classifier(doc, candidate_labels) for doc in combined_data]
+
+        # Calculate the similarity between the books
+        if similarity != "false":
+            from sentence_transformers import util
+
+            sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")
+            book_embeddings = sentence_transformer.encode(
+                combined_data, convert_to_tensor=True
             )
 
+            similar_books = []
+            for i in range(len(titles)):
+                current_embedding = book_embeddings[i]
 
+                similarity_sorted = util.semantic_search(
+                    current_embedding, book_embeddings, top_k=20
+                )
 
+                similar_books.append(
+                    {
+                        "sorted_by_similarity": similarity_sorted[0][1:],
+                    }
+                )
+        else:
+            similar_books = [{"sorted_by_similarity": []} for i in range(len(titles))]
 
+        return summaries, classes, similar_books
 
+    # Run the predict function
+    summaries, classes, similar_books = predict(
+        titles, descriptions, publishers, similarity=similarity
     )
 
+    # Calculate the elapsed time between the third and fourth checkpoints
+    fourth_checkpoint = time.time()
+    fourth_checkpoint_time = int(fourth_checkpoint - third_checkpoint)
 
     # Calculate the elapsed time
     end_time = time.time()
     runtime = f"{end_time - start_time:.2f} seconds"
 
     # Create a list of dictionaries to store the results
     results = [
         {
             "label_confidences": classes[i]["scores"][0:2],
             "summary": summaries[i][0]["summary_text"],
             "similar_books": similar_books[i]["sorted_by_similarity"],
+            "checkpoints": [
+                first_checkpoint_time,
+                second_checkpoint_time,
+                third_checkpoint_time,
+                fourth_checkpoint_time,
+            ],
             "runtime": runtime,
         }
         for i in range(len(titles))
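For reference on the structure the similarity branch relies on: `util.semantic_search` returns one list per query of `{"corpus_id", "score"}` dicts sorted by decreasing cosine similarity, and the top hit is the query's own book, which is why the diff keeps `similarity_sorted[0][1:]`. A small self-contained sketch; the model name comes from the diff, the sample documents are made up.

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Made-up stand-ins for the combined title/publisher/description strings
docs = [
    "The book's title is Statistics 101. This book is about introductory statistics.",
    "The book's title is Lab Manual. This book is about operating lab equipment.",
    "The book's title is Stats Primer. This book is about statistics for beginners.",
]
embeddings = model.encode(docs, convert_to_tensor=True)

# Query with the first book's embedding against all books
hits = util.semantic_search(embeddings[0], embeddings, top_k=3)

# hits[0] is sorted by score; hits[0][0] is the query itself,
# hence the [1:] slice in the committed code
print(hits[0][1:])  # e.g. [{'corpus_id': 2, 'score': ...}, {'corpus_id': 1, 'score': ...}]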