semantic-book-recommender / supervised_clean.py
abdulMalik1234567890's picture
first commit
d38101e
import pandas as pd
from googlesearch import search
import time
import random
def _choose_url(candidates):
    """Ask the user which of *candidates* is the correct URL.

    Re-prompts until the answer is a number between 1 and
    ``len(candidates)``; an empty answer skips the title.

    Returns:
        The selected URL, or ``None`` when the user skips.
    """
    prompt = (
        f"Enter the index of the correct URL (1-{len(candidates)}), "
        "or press Enter to skip: "
    )
    while True:
        answer = input(prompt).strip()
        if not answer:
            return None
        try:
            choice = int(answer)
        except ValueError:
            choice = 0
        # Reject 0 and negatives explicitly: a plain ``candidates[choice - 1]``
        # would silently wrap around to the end of the list.
        if 1 <= choice <= len(candidates):
            return candidates[choice - 1]
        print(f"Invalid choice {answer!r}; expected 1-{len(candidates)}.")


# Load the saved search progress and keep only the two remaining columns,
# renamed to title/url.
df = pd.read_csv("search_progress.csv")
df1 = df.drop("query_index", axis=1)
print("Initial DataFrame:")
print(df1.head())
df1.columns = ["title", "url"]

# A row still needs a URL when any of its fields is missing, or when its URL
# mentions neither "amazon" nor "google".
has_missing_field = df1.isnull().any(axis=1)
has_accepted_url = (
    df1["url"].str.contains("amazon", na=False)
    | df1["url"].str.contains("google", na=False)
)
# Take an explicit copy: the "url" column is assigned below, and writing into
# a filtered slice of df1 triggers pandas' chained-assignment warning.
unfinished = df1[has_missing_field | ~has_accepted_url].copy()
unfinished_list = unfinished["title"].tolist()
unfinished_urls = [None] * len(unfinished_list)

try:
    for idx, title in enumerate(unfinished_list):
        print()
        print(f"Processing title {idx + 1}/{len(unfinished_list)}: {title}")

        # Rows with a missing title come back from pandas as NaN (a float);
        # there is nothing to search for, so leave their URL empty.
        if not isinstance(title, str):
            print("Skipping row without a title.")
            continue

        try:
            # Second query swaps "google" for "amazon" in the search text.
            # NOTE(review): presumably the stored titles are search queries
            # that contain the word "google" -- confirm against the CSV.
            url = list(search(title, num_results=3, lang="en"))
            url += list(
                search(title.replace("google", "amazon"), num_results=3, lang="en")
            )
        except Exception as e:
            # Best-effort boundary around the third-party search call: report
            # the failure and move on to the next title.
            print(f"Error occurred while searching for {title}: {e}")
        else:
            print("\n")
            print(f"Searching for: {title}")
            for count, candidate in enumerate(url, start=1):
                print(count, candidate)
            if url:
                unfinished_urls[idx] = _choose_url(url)
            else:
                print("No results found.")

        time.sleep(random.randint(1, 5))  # Sleep to avoid hitting the search API too quickly
except (KeyboardInterrupt, EOFError):
    # Stopping the interactive session early should not throw away the
    # choices already made; fall through and save them.
    print("\nInterrupted; saving the URLs collected so far.")

unfinished["url"] = unfinished_urls
print("Updated DataFrame with URLs:")
print(unfinished.head())
# DataFrame.update only copies non-NA values, so titles that were skipped or
# failed keep whatever URL they had before.
df1.update(unfinished)
df1.to_csv("search_progress1.csv", index=False)