# Spaces: Runtime error (page-header residue from extraction; not part of the script)
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.document_loaders import CollegeConfidentialLoader | |
from langchain.vectorstores.faiss import FAISS | |
from langchain.embeddings import OpenAIEmbeddings | |
import pickle | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.common.keys import Keys | |
import time | |
# Scrape college data links:
# Open the college listing page in Chrome, page down a fixed number of times,
# then parse the resulting HTML and pick out the container of school entries.
browser = webdriver.Chrome()
try:
    browser.get("https://www.collegeconfidential.com/colleges/")
    time.sleep(1)
    elem = browser.find_element(By.TAG_NAME, "body")
    # Presumably the page loads more listings as it is scrolled; each
    # PAGE_DOWN is followed by a long pause before the next one.
    no_of_pagedowns = 5
    for _ in range(no_of_pagedowns):
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(14)  # original note: 10
    html = browser.page_source
finally:
    # Always shut the browser down, even if navigation or scrolling fails,
    # so no Chrome/driver process is left running.
    browser.quit()
print(html)
soup = BeautifulSoup(html, "html.parser")
# Take the first matching container; the [0] raises IndexError if the page
# has no element with this exact class string.
schools = soup.find_all("div", {"class": "l-row l-gx-3 l-gx-xl-4 l-gy-4"})[0]
print("------")
print(schools)
# Load one document per college link found in the listing container.
# Only the first two links are processed; the loop stops after the second.
raw_documents = []
college_anchors = schools.find_all("a", {"class": "u-margin-bottom-xxs"}, href=True)
for i, anchor in enumerate(college_anchors, start=1):
    college_link = anchor["href"]
    # Progress output: the relative link, then its 1-based position.
    print(college_link, i, sep="\n")
    # Load Data
    college_url = "https://www.collegeconfidential.com" + college_link
    raw_documents.append(CollegeConfidentialLoader(college_url).load()[0])
    if i >= 2:
        break
print(raw_documents)
# Split text
# Break the loaded documents into chunks using the splitter's default settings.
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(raw_documents)
print("YOOOO", documents, sep="\n")
# Load Data to vectorstore
# Build a FAISS store from the split documents using OpenAI embeddings, then
# run one sample similarity search and print what it returns.
query = "What is the average ACT at UChicago?"
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)
docs = vectorstore.similarity_search(query)
print("HEYY", docs, sep="\n")
# Save vectorstore
# Serialize the vector store with pickle into vectorstore.pkl in the current
# working directory. NOTE: a pickle file must only be loaded from a trusted source.
with open("vectorstore.pkl", "wb") as out_file:
    out_file.write(pickle.dumps(vectorstore))