# Generate a local copy of Wookieepedia pages for subsequent vectorisation

(Best to get the web part done and can then rebuild a vector store quickly)

In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import os
import re

from pathlib import Path
from collections import Counter

In [2]:
# Get character and location keywords to search by scanning the local copies of the 6 film scripts

all_matches, all_locations, all_likely_planets, all_likely_characters = set(), set(), set(), set()

for script in Path('scripts').iterdir():
    # if script.name == 'Episode VI - Return of the Jedi.txt':
    if script.is_file():
        print(script)
        with open(script, 'r') as file: text = file.read()
        text = re.sub(r'^[\t ]+', '', text, flags = re.MULTILINE) # No leading space
        matchesl = re.findall(r'^(?:\d+[\t ]+)?([A-Z][A-Z\.\-/ ]+[A-Z])[\t ]*(?::.+)?$', text, flags = re.MULTILINE) # All caps name-like line starts (possibly with scene number and possibly with colon and dialogue)
        print(matchesl)

        matches = set(matchesl)
        all_matches |= matches
        print(len(matches))
        print(matches)

        locations = {m[1].strip() for s in matches if (m := re.search(r'(?:INT|EXT|S )\.? ?([^-]+) ?.+', s))} # First portions of location descriptions
        all_locations |= locations
        print(locations)

        likely_planets = {k for k, n in Counter([l.split(' ')[0] for l in locations]).items() if n > 1} # Most mentioned 1-st words
        all_likely_planets |= likely_planets
        print(likely_planets)

        likely_characters = {s for s in matches if not re.search('INT|EXT|S ', s)}
        all_likely_characters |= likely_characters
        print(likely_characters)

        print()

print(len(all_matches))
print(sum(len(x) for x in [all_locations, all_likely_planets, all_likely_characters]))

matches_to_search = all_locations | all_likely_planets | all_likely_characters

scripts\Episode I - The Phantom Menace.txt
['TITLE CARD', 'INT. REPUBLIC CRUISER - COCKPIT', 'QUI-GON', 'CAPTAIN', 'QUI-GON', 'CAPTAIN', 'CAPTAIN', 'NUTE', 'INT. FEDERATION BATTLESHIP - DOCKING BAY - SPACE', 'INT. FEDERATION BATTLESHIP - CONFERENCE ROOM', 'OBI-WAN', 'QUI-GON', 'OBI-WAN', 'QUI-GON', 'OBI-WAN', 'QUI-GON', 'OBI-WAN', 'QUI-GON', 'INT. FEDERATION BATTLESHIP - BRIDGE', 'NUTE', 'DOFINE', 'INT. FEDERATION BATTLESHIP - HALLWAY', 'NUTE', 'INT. FEDERATION BATTLESHIP - BRIDGE', 'NUTE', 'RUNE', 'NUTE', 'RUNE', 'NUTE', 'RUNE', 'INT. FEDERATION BATTLESHIP - HALLWAY - OUTSIDE BRIDGE', 'INT. FEDERATION BATTLESHIP - BRIDGE', 'NUTE', 'RUNE', 'NUTE', 'RUNE', 'INT. FEDERATION BATTLESHIP - HALLWAY - OUTSIDE BRIDGE', 'QUI-GON', 'OBI-WAN', 'OBI-WAN', 'QUI-GON', 'INT. FEDERATION BATTLESHIP - BRIDGE', 'RUNE', 'TEY HOW', 'INT. FEDERATION BATTLESHIP - MAIN BAY', 'QUI-GON', 'OBI-WAN', 'QUI-GON', 'OBI-WAN', 'INT. FEDERATION BATTLESHIP - BRIDGE', 'TEY HOW', 'RUNE', 'NUTE', 'NUTE', 'AMIDALA', 'NUTE',

In [3]:
def first_wookieepedia_result(query: str) -> str:
    '''Get the url of the first result when searching Wookieepedia for a query
    (best for simple names as queries, ideally generated by the llm for something like
    "Produce a input consisting of the name of the most important element in the query so that its article can be looked up")
    '''
    search_results = requests.get(f'https://starwars.fandom.com/wiki/Special:Search?query={"+".join(query.split(" "))}')
    soup = BeautifulSoup(search_results.content, 'html.parser')
    first_res = soup.find('a', class_ = 'unified-search__result__link')
    return first_res['href']

# first_wookieepedia_result('Darth Plagueis')

In [7]:
def save_wookieepedia_page(query, url, file_name, save_folder, error_file):
    '''Error-tolerant (hence interruptible) extraction and cleaning of Wookieepedia page's content, then saving it to save_folder\file_name (or recording failure to error_file)
    '''
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        doc = soup.find('div', id = 'content').get_text()
    
        # Cleaning
        doc = doc.split('\n\n\n\n\n\n\n\n\n\n\n\n\n\n')[-1] # The (multiple) preambles are separated by these many newlines; no harm done if not present
        doc = re.sub('\[\d*\]', '', doc) # References (and section title's "[]" suffixes) are noise
        doc = doc.split('\nAppearances\n')[0] # Keep only content before these sections
        doc = doc.split('\nSources\n')[0] # Technically no need to check this if successfully cut on appearances, but no harm done
        doc = re.sub('Contents\n\n(?:[\d\.]+ [^\n]+\n+)+', '', doc) # Remove table of contents

        with open(os.path.join(save_folder, file_name), 'w') as file: file.write(doc)

        print(f'{query} ({url}) saved to {file_name}')
        return True
    except Exception as e:
        with open(error_file, 'a') as file: file.write(f"{query}|{url}|{file_name}\n")

        print(f'{query} failed; could not retrieve from {url}: {e}')
        return False


def scrape_wookieepedia(queries: list[str], save_folder = 'wookieepedia', done_file = 'successful_scrapes.txt', error_file = 'failed_scrapes.txt'):
    '''Error-tolerant (hence interruptible) and restartable scraping of Wookieepedia pages corresponding to the first results of searches with the given queries.
    '''
    done_file  = Path(save_folder) / 'log' / done_file
    error_file = Path(save_folder) / 'log' / error_file
    os.makedirs(save_folder, exist_ok = True)
    os.makedirs(Path(save_folder) / 'log', exist_ok = True)

    # Check log of previously-processed queries and matching files
    if Path(done_file).is_file():
        with open(done_file, 'r') as file: done_queries, done_urls, done_files = zip(*(tuple(line.split('|')) for line in file))
    else: done_queries, done_urls, done_files = [], [], []
    done_query_index = {q: i for i, q in enumerate(done_queries)}
    
    real_done_files = [f for f in os.listdir('wookieepedia') if f not in ['db', 'log']]

    for query in tqdm(set(queries) - set(done_queries)): # Technically this does not cover the queries resulting in duplicate pages, hence not optimal, but no real need for it to be
        if query in done_queries: print(f'{query} ({done_urls[(i := done_query_index[query])]}) already saved to {done_files[i]}')
        else:
            try:
                url = first_wookieepedia_result(query)
                file_name = f'{url.split("/")[-1]}.txt'.replace(':', '__')

                if file_name in real_done_files:
                    print(f'{file_name} already saved')
                else:
                    if save_wookieepedia_page(query, url, file_name, save_folder = save_folder, error_file = error_file):
                        real_done_files.append(file_name)
                        with open(done_file, 'a') as file: file.write(f"('{query}'|'{url}'|{file_name})\n") 
            except Exception as e:
                with open(error_file, 'a') as file: file.write(f"('{query}', '{url}')\n")

                print(f'{query} failed; could not generate an url: {e}')

In [8]:
# Populate the wookieepedia folder with cleaned page content

scrape_wookieepedia(matches_to_search)

  0%|          | 0/498 [00:00<?, ?it/s]

TIE FIGHTE (https://starwars.fandom.com/wiki/TIE_fighter_series) saved to TIE_fighter_series.txt
DAY (https://starwars.fandom.com/wiki/Daye_Azur-Jamin) saved to Daye_Azur-Jamin.txt
WATTO (https://starwars.fandom.com/wiki/Watto) saved to Watto.txt
ER JETTSTE (https://starwars.fandom.com/wiki/Maz_Kanata) saved to Maz_Kanata.txt
SLAVE QUARTERS (https://starwars.fandom.com/wiki/Slave_Quarters_Row) saved to Slave_Quarters_Row.txt
CHIEF (https://starwars.fandom.com/wiki/Chief) saved to Chief.txt
CLONE COMMANDER BACARA (https://starwars.fandom.com/wiki/Bacara) saved to Bacara.txt
OWEN (https://starwars.fandom.com/wiki/Owen) saved to Owen.txt
TATOOINE SEA (https://starwars.fandom.com/wiki/Star_Wars_Outlaws) saved to Star_Wars_Outlaws.txt
SECOND COMMANDER (https://starwars.fandom.com/wiki/Triton_Squad%27s_second_commander) saved to Triton_Squad%27s_second_commander.txt
ASTEROID (https://starwars.fandom.com/wiki/Asteroid) saved to Asteroid.txt
ROYAL GUARD (https://starwars.fandom.com/wiki/Royal_

In [None]:
# Post-fetch changes

# for path in ['Aayla_Secura.txt']:#[f for f in os.listdir('wookieepedia') if f not in ['db', 'log']]:
#     with open(Path('wookieepedia') / path, 'r+') as file:
#         text = file.read()
#         text = ...
#         file.seek(0)
#         file.write(text)