# Star Wars Expert

In [1]:
from langchain_openai import ChatOpenAI#, OpenAIEmbeddings # No need to pay for using embeddings as well when have free alternatives

# Data
from langchain_community.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader
# from langchain_chroma import Chroma # The documentation uses this one, but it is extremely recent, and the same functionality is available in langchain_community and langchain (which imports community)
from langchain_community.vectorstores import Chroma # This has documentation on-hover, while the indirect import through non-community does not
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings # The free alternative (also the default in docs, with model_name = 'all-MiniLM-L6-v2')
from langchain.text_splitter import RecursiveCharacterTextSplitter#, TextSplitter # Recursive to better keep related bits contiguous (also recommended in docs: https://python.langchain.com/docs/modules/data_connection/document_transformers/)

# Chains
from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.tools.retriever import create_retriever_tool
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, chain
from langchain_core.pydantic_v1 import BaseModel, Field

# Agents
from langchain import hub
from langchain.agents import create_tool_calling_agent, AgentExecutor

# To manually create inputs to test pipelines
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.documents import Document

# # Custom retriever
# from langchain_core.callbacks import CallbackManagerForRetrieverRun
# from langchain_core.documents import Document
# from langchain_core.retrievers import BaseRetriever

import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

import os
import shutil
from pathlib import Path
import re

import dotenv
dotenv.load_dotenv()

True

## Data Loaders
NOTE: when a `REGENERATE_*_DATABASE` flag below is set to `True`, running that chunk deletes the existing database directory and then re-adds the data to a fresh database, since content would otherwise be duplicated.

### Film Scripts

In [2]:
# Comparison of vector dbs: https://zackproser.com/blog/vector-databases-compared
#   Opinion: Milvus (more features, bigger community, higher performance(?), fully free, no enterprise plans) > Weaviate > Chroma
#   However Milvus and Weaviate both require a separate instance to be up and running
#   (The documentation uses FAISS, but it seems unnecessarily limited in comparison)
#   Hence Chroma - https://python.langchain.com/docs/integrations/vectorstores/chroma/

# Separately, no need to pay for OpenAIEmbeddings; additionally, all-MiniLM-L6-v2 is default in docs

REGENERATE_SCRIPT_DATABASE = False

# Location of the persisted Chroma collection for the film scripts, and whether it was already on disk
db_dir = str(Path('scripts') / 'db')
db_exists = os.path.exists(db_dir)

if db_exists and REGENERATE_SCRIPT_DATABASE:
    print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')
    shutil.rmtree(db_dir)

if db_exists and not REGENERATE_SCRIPT_DATABASE:
    # Re-open the persisted collection instead of re-embedding everything
    script_db = Chroma(embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)
else:
    # Either there was no database yet, or it has just been deleted above: build it from the raw script files
    scripts = DirectoryLoader('scripts', glob = '*.txt', loader_cls = TextLoader).load()
    for s in scripts: s.page_content = re.sub(r'^[\t ]+', '', s.page_content, flags = re.MULTILINE) # Spacing to centre text noise

    script_chunks = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200, separators = ['\n\n\n', '\n\n', '\n']).split_documents(scripts)
        # Why not some overlap for extra context just in case?
        # Also, no need for fancier sentence or semantic splitting in this highly-formatted text

    script_db = Chroma.from_documents(script_chunks, SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)

# Fetch the stored texts once; max(..., 1) avoids a ZeroDivisionError when the collection is empty
chunk_texts = script_db.get()['documents']
print(f'The script database contains {len(script_db)} chunks, with mean length of {sum(len(s) for s in chunk_texts) / max(len(chunk_texts), 1):.0f} characters')


Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)
The script database contains 1260 chunks, with mean length of 892 characters


In [3]:
# Query testing: probe the script database with a keyword-style query and inspect the closest chunks

res = script_db.similarity_search(query = 'Luke father reveal fight', k = 10)

# for r in res: print(r.page_content)
res

[Document(page_content="LUKE\nNo, my father didn't fight in the \nwars. He was a navigator on a spice \nfreighter.\n\nBEN\nThat's what your uncle told you. He \ndidn't hold with your father's ideals. \nThought he should have stayed here \nand not gotten involved.\n\nLUKE\nYou fought in the Clone Wars?\n\nBEN\nYes, I was once a Jedi Knight the \nsame as your father.\n\nLUKE\nI wish I'd known him.\n\nBEN\nHe was the best star-pilot in the \ngalaxy, and a cunning warrior. I \nunderstand you've become quite a \ngood pilot yourself. And he was a \ngood friend. Which reminds me...\n\nBen gets up and goes to a chest where he rummages around.\nAs Luke finishes repairing Threepio and starts to fit the \nrestraining bolt back on, Threepio looks at him nervously.\nLuke thinks about the bolt for a moment then puts it on the \ntable. Ben shuffles up and presents Luke with a short handle \nwith several electronic gadgets attached to it.", metadata={'source': 'scripts\\Episode IV - A New Hope.txt'}),

### Wookieepedia Articles

In [3]:
REGENERATE_WOOKIEEPEDIA_DATABASE = False

# Location of the persisted Chroma collection for the wiki articles, and whether it was already on disk
db_dir = str(Path('wookieepedia') / 'db')
db_exists = os.path.exists(db_dir)

if db_exists and REGENERATE_WOOKIEEPEDIA_DATABASE:
    print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')
    shutil.rmtree(db_dir)

if db_exists and not REGENERATE_WOOKIEEPEDIA_DATABASE:
    # Re-open the persisted collection as-is
    woo_db = Chroma(embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)
else:
    # No database on disk (or it was just removed above): chunk and embed the saved article text files
    pages = DirectoryLoader('wookieepedia', glob = '*.txt', loader_cls = TextLoader).load()

    # Some overlap for extra context; plain newline-based splitting is enough for this highly-formatted text
    page_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200, separators = ['\n\n\n', '\n\n', '\n'])
    page_chunks = page_splitter.split_documents(pages)

    woo_db = Chroma.from_documents(page_chunks, SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)

print(f'The Wookieepedia database contains {len(woo_db)} chunks, with mean length of {sum(len(s) for s in woo_db.get()["documents"]) / len(woo_db):.0f} characters')

# Distinct 'source' metadata values currently stored in the collection
print('Current source pages in Wookieepedia db:')
source_pages = set(md.get('source') for md in woo_db.get()['metadatas'])
print(len(source_pages))
source_pages


The Wookieepedia database contains 10927 chunks, with mean length of 645 characters
Current source pages in Wookieepedia db:
395


{'https://starwars.fandom.com/wiki/Darth_Plagueis',
 'https://starwars.fandom.com/wiki/Voss',
 'wookieepedia\\1995_Topps_Star_Wars_Widevision.txt',
 'wookieepedia\\1995_Topps_Star_Wars___The_Empire_Strikes_Back_Widevision.txt',
 'wookieepedia\\1996_Topps_Star_Wars___Return_of_the_Jedi_Widevision.txt',
 'wookieepedia\\2_systems_control_droid.txt',
 'wookieepedia\\501st_Legion.txt',
 'wookieepedia\\Aayla_Secura.txt',
 'wookieepedia\\Abandoned_sarlacc_pit.txt',
 'wookieepedia\\Acclamator-class_transgalactic_military_assault_ship.txt',
 'wookieepedia\\Ackbar_(comic).txt',
 'wookieepedia\\Age_of_Resistance_-_General_Hux_1.txt',
 'wookieepedia\\Alderaan.txt',
 'wookieepedia\\Alderaan_Cruiser.txt',
 'wookieepedia\\Amee.txt',
 'wookieepedia\\Amidalans.txt',
 'wookieepedia\\Anakin_Skywalker.txt',
 'wookieepedia\\Area_D-512.txt',
 'wookieepedia\\Ask_Aak.txt',
 'wookieepedia\\Assembly.txt',
 'wookieepedia\\Asteroid.txt',
 'wookieepedia\\Asteroid_Belt_Gas_Refinery.txt',
 'wookieepedia\\Attack_on_F

In [5]:
# Query testing: probe the Wookieepedia database with the same keyword-style query used for the scripts

res = woo_db.similarity_search(query = 'Luke father reveal fight', k = 10)

# for r in res: print(r.page_content)
res

[Document(page_content='Vader telling Luke that he is his father  \nVader beckoned Luke to join him in the dark side, but Luke vehemently refused. Vader then told Luke that Kenobi had hidden from him the truth about his father, Anakin Skywalker, before revealing that he was, in fact, his father. Luke refused to believe the truth, but Vader continued to tempt his weakened son, offering Luke the chance to destroy the Emperor and "bring order to the galaxy," just as he tried to do with Padmé on Mustafar. He even pleaded with his son to come with him. In Vader\'s ideal world, his son would have taken his hand, accepting him as his father. However, Skywalker instead chose to throw himself into the shaft, possibly facing death to avoid joining Vader.', metadata={'source': 'wookieepedia\\Anakin_Skywalker.txt'}),
 Document(page_content='"Obi-Wan never told you what happened to your father.""He told me enough! He told me you killed him!""No. I am your father.""No...that\'s not true! That\'s imp

In [4]:
# Functions for possible interactive Wookieepedia querying and storing in the db

def first_wookieepedia_result(query: str) -> str:
    '''Get the url of the first result when searching Wookieepedia for a query
    (best for simple names as queries, ideally generated by the llm for something like
    "Produce a input consisting of the name of the most important element in the query so that its article can be looked up")

    Args:
        query: Free-text search terms (e.g. a character name).

    Returns:
        The `href` of the first search result link on the results page.

    Raises:
        requests.RequestException: If the search request fails, times out or returns an error status.
        LookupError: If the results page contains no result link.
    '''
    search_results = requests.get(
        'https://starwars.fandom.com/wiki/Special:Search',
        params = dict(query = query), # Let requests URL-encode the query (spaces, '&', '#', non-ASCII, ...) instead of splicing it in raw
        timeout = 10 # Do not hang the notebook/chain indefinitely on a stalled connection
    )
    search_results.raise_for_status()

    soup = BeautifulSoup(search_results.content, 'html.parser')
    first_res = soup.find('a', class_ = 'unified-search__result__link')

    # soup.find returns None when nothing matches; fail with a clear message rather than a TypeError on None['href']
    if first_res is None or not first_res.get('href'):
        raise LookupError(f"No Wookieepedia search result found for query '{query}'")

    return first_res['href']

# first_wookieepedia_result('Darth Plagueis')


def get_wookieepedia_page_content(query: str, previous_sources: set[str]) -> Document | None:
    '''Return cleaned content from a Wookieepedia page provided it was not already sourced

    Args:
        query: Search terms passed to `first_wookieepedia_result` to find the page url.
        previous_sources: Urls that have already been stored; a page whose url is in this set is skipped.

    Returns:
        A Document with the cleaned page text and `metadata['source']` set to the page url,
        or None if the url is already in `previous_sources`.

    Raises:
        requests.RequestException: If the page request fails, times out or returns an error status.
        LookupError: If the page has no `<div id="content">` element to extract text from
            (or if the search itself found no result).
    '''
    url = first_wookieepedia_result(query)

    if url in previous_sources: return None

    response = requests.get(url, timeout = 10) # Do not hang indefinitely on a stalled connection
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.find returns None when the element is absent; fail clearly rather than with an AttributeError on None
    content = soup.find('div', id = 'content')
    if content is None:
        raise LookupError(f"No content element found on Wookieepedia page '{url}'")
    doc = content.get_text()

    # Cleaning (raw strings for the regex patterns, so that '\[' and '\d' are not invalid string escape sequences)
    doc = doc.split('\n\n\n\n\n\n\n\n\n\n\n\n\n\n')[-1] # The (multiple) preambles are separated by these many newlines; no harm done if not present
    doc = re.sub(r'\[\d*\]', '', doc) # References (and section title's "[]" suffixes) are noise
    doc = doc.split('\nAppearances\n')[0] # Keep only content before these sections
    doc = doc.split('\nSources\n')[0] # Technically no need to check this if successfully cut on appearances, but no harm done
    doc = re.sub(r'Contents\n\n(?:[\d\.]+ [^\n]+\n+)+', '', doc) # Remove table of contents

    return Document(page_content = doc, metadata = dict(source = url))

# print(get_wookieepedia_page_content('Darth Plagueis', set()))
# print(get_wookieepedia_page_content('Darth Plagueis', set(md.get('source') for md in woo_db.get()['metadatas'])))


def get_wookieepedia_context(original_query: str, simple_query: str, wdb: Chroma) -> list[Document]:
    '''Fetch the Wookieepedia page for a simple query if it is not yet stored, then search the database

    Args:
        original_query: The full user question, used for the similarity search.
        simple_query: Short search terms (e.g. a name) used to look up the Wookieepedia page.
        wdb: The Chroma database holding the Wookieepedia chunks; new chunks are added to it in place.

    Returns:
        The 10 chunks most similar to `original_query`, or an empty list if fetching/storing the page failed.
    '''
    try:
        known_sources = set(md.get('source') for md in wdb.get()['metadatas'])
        doc = get_wookieepedia_page_content(simple_query, previous_sources = known_sources)
        if doc is not None:
            new_chunks = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200).split_documents([doc])
            wdb.add_documents(new_chunks)
            # Document exposes metadata as an attribute; subscripting it (doc['metadata']) raised a TypeError,
            # which used to be swallowed below and made every successful fetch return no context at all
            print(f"Added new chunks (for '{simple_query}' -> {doc.metadata['source']}) to the Wookieepedia database.")
    except Exception as exc: # Not a bare except, so that KeyboardInterrupt/SystemExit still propagate
        print(f"Could not fetch or store the Wookieepedia page for '{simple_query}': {exc!r}")
        return []

    return wdb.similarity_search(original_query, k = 10)

# get_wookieepedia_context('Do you know the Tragedy of Darth Plagueis the Wise?', 'Darth Plagueis', woo_db)

[Document(page_content='The Tragedy of Darth Plagueis the Wise', metadata={'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis'}),
 Document(page_content='"Darth Plagueis was a Dark Lord of the Sith so powerful and so wise, he could use the Force to influence the midi-chlorians to create life. He had such a knowledge of the dark side, he could even keep the ones he cared about from dying."\n―Sheev Palpatine', metadata={'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis'}),
 Document(page_content='The death of Darth Plagueis\n\n"Unfortunately, he taught his apprentice everything he knew. Then his apprentice killed him in his sleep."\n―Sheev Palpatine\nAccording to Sidious, Plagueis was powerful enough that he could use the Force to influence the midi-chlorians to create life and keep the ones he cared about from dying, a precious knowledge that awarded him the epithet of "The Wise." However, Plagueis also developed a belief that the Force could "strike back" at him for hi

### Local embedding model (unsure whether needed for deployment)

In [7]:
# from sentence_transformers import SentenceTransformer

# modelPath = 'embedding_model'
# model = SentenceTransformer('all-MiniLM-L6-v2')
# model.save(modelPath)
# model = SentenceTransformer(modelPath)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Chains
Standard chains: https://python.langchain.com/docs/modules/chains/#lcel-chains

In [5]:
# Chat model used by the chains and the agent defined below (temperature 0 to minimise sampling randomness)
llm = ChatOpenAI(
    temperature = 0,
    model = 'gpt-3.5-turbo-0125',
)

# llm.invoke('What do you know about Star Wars?')

### Base version (only one retriever)

In [6]:
# System instructions for answering from retrieved documents; {context} is filled in by the stuff-documents chain
document_prompt_system_text = '''
You are very knowledgeable about Star Wars and your job is to answer questions about its plot, characters, etc.
Use the context below to produce your answers with as much detail as possible.
If you do not know an answer, say so; do not make up information not in the context.

<context>
{context}
</context>
'''

# Message order: system instructions, then the (optional) conversation so far, then the new user input
document_prompt_messages = [
    ('system', document_prompt_system_text),
    MessagesPlaceholder(variable_name = 'chat_history', optional = True),
    ('user', '{input}'),
]
document_prompt = ChatPromptTemplate.from_messages(document_prompt_messages)

document_chain = create_stuff_documents_chain(llm = llm, prompt = document_prompt)


# document_prompt.format_messages(context = 'You are an expert in Star Wars lore', input = 'Are you knowledgeable about Star Wars?')
# document_chain.invoke(dict(context = [Document(page_content = 'You are an expert in Star Wars lore')], input = 'Are you knowledgeable about Star Wars?'))


# basic_chain = document_prompt | llm | StrOutputParser() # To extract just the message
# basic_chain.invoke(dict(context = 'You are an expert of Star Wars lore', input = 'Are you knowledgeable about Star Wars?'))

In [7]:
# Instruction appended after the conversation, asking the llm to rewrite it as a keyword query for the script database
script_query_instruction = '''Given the above conversation, generate a search query to look up relevant information in a database containing the full scripts from the Star Wars films (i.e. just dialogue and brief scene descriptions).
     The query need not be a proper sentence, but a list of keywords likely to be in dialogue or scene descriptions'''

script_retriever_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name = 'chat_history'),
    ('user', '{input}'),
    ('user', script_query_instruction),
])

# Essentially just: prompt | llm | StrOutputParser() | retriever
script_retriever_chain = create_history_aware_retriever(
    llm = llm,
    retriever = script_db.as_retriever(),
    prompt = script_retriever_prompt,
)


# script_retriever_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# script_retriever_chain.invoke(dict(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Luke cloud city'
# ))


In [8]:
# Instruction appended after the conversation, asking the llm for a short wiki-style lookup query
woo_query_instruction = 'Given the above conversation, generate a search query to find a relevant page in the Star Wars fandom wiki; the query should be something simple, such as the name of a character, place, event, item, etc.'

woo_retriever_prompt = ChatPromptTemplate.from_messages([
    MessagesPlaceholder(variable_name = 'chat_history'),
    ('user', '{input}'),
    ('user', woo_query_instruction),
])

# Essentially just: prompt | llm | StrOutputParser() | retriever
woo_retriever_chain = create_history_aware_retriever(
    llm = llm,
    retriever = woo_db.as_retriever(),
    prompt = woo_retriever_prompt,
)


# woo_retriever_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# woo_retriever_chain.invoke(dict(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# ))


In [9]:
# Retrieval + answering in one chain; swap in the script retriever to answer from the film scripts instead:
# full_chain = create_retrieval_chain(script_retriever_chain, document_chain)
full_chain = create_retrieval_chain(
    retriever = woo_retriever_chain,
    combine_docs_chain = document_chain,
)

# full_chain.invoke(dict(
#     # chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = "Who participates in Han's rescue from Jabba? And where is the palace?"
# ))

{'input': "Who participates in Han's rescue from Jabba? And where is the palace?",
 'context': [Document(page_content="After the smuggler Han Solo failed to repay him for lost cargo, Jabba placed a high price on his head. Solo was eventually delivered to him by one of his bounty hunters, Boba Fett, as a gift from Darth Vader. However, this capture brought him to the attention of Jedi Knight Luke Skywalker, who sought to rescue his friend from Jabba's imprisonment. As he attempted to execute the Jedi and his allies in the Great Pit of Carkoon, Jabba was choked to death by Leia Organa. With the Hutts unable to decide who would inherit Jabba's criminal ventures, many of his slaves, including the Niktos, were free, and his palace was occupied by his former Majordomo Bib Fortuna, who took his place as Daimyo of Tatooine until Fett killed and usurped him. Fett sought to rebuild Jabba's criminal empire in his own image, intending to rule with respect rather than the fear that the Hutt instill

In [26]:
# simplify_query_prompt = ChatPromptTemplate.from_messages([
#     ('system', 'Given the above conversation, generate a search query to find a relevant page in the Star Wars fandom wiki; the query should be something simple, at most 4 words, such as the name of a character, place, event, item, etc.'),
#     MessagesPlaceholder('chat_history', optional = True), # Using this form since not clear how to have optional = True in the tuple form
#     ('human', '{query}')
# ])

# simplify_query_chain = simplify_query_prompt | llm | StrOutputParser() # To extract just the message

# # simplify_query_chain.invoke(dict(context = 'You are an expert of Star Wars lore', query = 'Do you know the tragedy of Darth Plagueis the Wise?'))

### Agent version

In [10]:
# Could use Tavily as a generic search engine for a retriever agent as in the docs, but want more specific (if limited) capabilities here

# One retriever tool per database, each returning the 4 closest chunks
# NOTE(review): both descriptions tell the agent that the tool "should be the first choice", which gives it no
#   basis for choosing between them - consider differentiating the wording

script_tool = create_retriever_tool(
    retriever = script_db.as_retriever(search_kwargs = {'k': 4}),
    name = 'search_film_scripts',
    description = '''Search the Star Wars film scripts. This tool should be the first choice for Star Wars related questions.
    Queries passed to this tool should be lists of keywords likely to be in dialogue or scene descriptions, and should not include film titles.''',
)

woo_tool = create_retriever_tool(
    retriever = woo_db.as_retriever(search_kwargs = {'k': 4}),
    name = 'search_wookieepedia',
    description = 'Search the Star Wars fandom wiki. This tool should be the first choice for Star Wars related questions.',
    # Previously considered extension of the description:
    # This tool should be used for queries about details of a particular character, location, event, weapon, etc., and the query should be something simple, such as the name of a character, place, event, item, etc.'''
)

tools = [script_tool, woo_tool]

In [11]:
# Agent - https://python.langchain.com/docs/modules/agents/
#   The agent design pattern is both simpler and better than manual chains since it can make its own choice between tools

# System instructions given to the tool-calling agent
agent_system_text = '''
You are a helpful agent who is very knowledgeable about Star Wars and your job is to answer questions about its plot, characters, etc.
Use the context provided in the exchanges to come to produce your answers with as much detail as possible.
If you do not know an answer, say so; do not make up information.
'''

# Message order: system instructions, optional conversation so far, the new input, then the agent's scratchpad
agent_prompt_messages = [
    ('system', agent_system_text),
    MessagesPlaceholder(variable_name = 'chat_history', optional = True), # Using this form since not clear how to have optional = True in the tuple form
    ('human', '{input}'),
    ('placeholder', '{agent_scratchpad}'), # Required for chat history and the agent's intermediate processing values
]
agent_prompt = ChatPromptTemplate.from_messages(agent_prompt_messages)

agent = create_tool_calling_agent(llm = llm, tools = tools, prompt = agent_prompt)
agent_executor = AgentExecutor(
    agent = agent,
    tools = tools,
    verbose = True, # Print the intermediate tool invocations and their results
)


# agent_prompt.format_messages(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# )

# agent_executor.invoke(dict(
#     chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],
#     input = 'Do you know the tragedy of Darth Plagueis the Wise?'
# ))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_film_scripts` with `{'query': 'Darth Plagueis the Wise'}`


[0m[36;1m[1;3mOBI-WAN: (continuing) It can't be . . . It can't be . . .

As ANAKIN surveys the carnage, a DARK-ROBED SITH LORD enters. ANAKIN turns to DARTH SIDIOUS and kneels before him.

ANAKIN: The traitors have been taken care of, Lord Sidious.

DARTH SlDIOUS: Good . . . good . . . You have done well, my new apprentice. Do you feel your power growing?

ANAKIN: Yes, My Master.

DARTH SlDIOUS: Now, Lord Vader, now go and bring peace to the Empire.

OBI-WAN watches in horror. Tears well up in his eyes.

OBI-WAN: I can't watch any more.

OBI-WAN switches off the hologram. The TWO JEDI stand in silence for a few moments.

YODA: Destroy the Sith, we must.

OBI-WAN: Send me to kill the Emperor. I will not kill Anakin.

YODA: To fight this Lord Sidious, strong enough, you are not.

OBI-WAN: He is like my brother ... I cannot do it.

YODA: Twisted by

{'chat_history': [HumanMessage(content='Are you knowledgeable about Star Wars?'),
  AIMessage(content='Very')],
 'input': 'Do you know the tragedy of Darth Plagueis the Wise?',
 'output': 'The dialogue from the Star Wars film scripts does not directly mention the tragedy of Darth Plagueis the Wise. However, in "Star Wars: Episode III - Revenge of the Sith," Chancellor Palpatine tells Anakin Skywalker the story of Darth Plagueis the Wise. According to Palpatine, Darth Plagueis was a Dark Lord of the Sith who was so powerful and wise that he could influence the midi-chlorians to create life and prevent death. Unfortunately, Darth Plagueis was betrayed and killed by his own apprentice, who was seeking to obtain his power. This story plays a significant role in Anakin\'s fall to the dark side as he becomes intrigued by the idea of cheating death, ultimately leading him to become Darth Vader under the influence of Palpatine.'}

### Non-agent chain-logic version

In [12]:
# Determine which retriever is best and generate an appropriate query for it

# Again, many LangChain objects seem to use the class-scope fields instead of instance ones in __init__...
# Separately, the advantage of using a class over a dictionary is simply the descriptions
# Structured output schema: a search query plus the name of the source it should be sent to.
# NOTE(review): the docstring and Field descriptions below are presumably forwarded to the llm as part of the
#   schema when this class is passed to with_structured_output, so treat their wording as prompt text - confirm
class DirectedQuery(BaseModel):
    '''Determine whether a query is best answered by looking at scripts rather than articles'''

    # The search text itself; `...` as the default marks the field as required (pydantic convention)
    query: str = Field(
        ...,
        description = '''The query to either search film scripts or wiki articles.
        A film script query should include character names and relevant keywords of what they are saying in the a scene which is likely to contain the required information.
        A wiki articles search should instead be at most 4 words, simply being the name of a character or location or event whose page is likely to contain the required information.''',
    )
    # Which source to search; typed as a plain str, so nothing here enforces the two values named in the description
    source: str = Field(
        ...,
        description = 'Either "wiki" or "scripts", indicating which source the query should be passed to.',
    )


# output_parser = PydanticToolsParser(tools = [DirectedQuery])

# Prompt that only frames the task; the per-source guidance comes from the DirectedQuery schema
query_analyser_prompt = ChatPromptTemplate.from_messages([
    ('system', 'You have the ability to issue search queries of one of two kinds to get information to help answer questions.'),
    ('human', '{question}'),
])

# Make the llm answer with a DirectedQuery instance rather than free text
structured_llm = llm.with_structured_output(DirectedQuery)

# Raw input -> {'question': input} -> prompt -> DirectedQuery
query_generator = (
    {'question': RunnablePassthrough()}
    | query_analyser_prompt
    | structured_llm
)

In [15]:
# One retriever per value of DirectedQuery.source, each returning the 4 closest chunks
retrievers = {
    'wiki': woo_db.as_retriever(search_kwargs = {'k': 4}),
    'scripts': script_db.as_retriever(search_kwargs = {'k': 4}),
}

@chain
def compound_retriever(question):
    '''Let the llm pick a source and write a query for it, then return that source's retrieved documents

    NOTE(review): when this runnable is used through create_retrieval_chain, `question` may be the whole
    input dict rather than a plain string - confirm what the query generator actually receives.
    '''
    directed = query_generator.invoke(question)
    # Raises KeyError if the llm names a source other than the keys of `retrievers`
    return retrievers[directed.source].invoke(directed.query)


# compound_retriever.invoke('Do you know the tragedy of Darth Plagueis the Wise?')

In [18]:
# Same answering chain as before, but with the llm-directed retriever choosing between wiki and scripts
compound_chain = create_retrieval_chain(
    retriever = compound_retriever,
    combine_docs_chain = document_chain,
)

# compound_chain.invoke(dict(input = 'Do you know the tragedy of Darth Plagueis the Wise?'))

{'input': 'Do you know the tragedy of Darth Plagueis the Wise?',
 'context': [Document(page_content='The Tragedy of Darth Plagueis the Wise', metadata={'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis'}),
  Document(page_content='Biography\nEarly life\nDarth Plagueis was a legendary Dark Lord of the Sith trained by the Sith Master Darth Tenebrous. At some point during his life, Plagueis acquired the protocol droid 11-4D. During his time as a Sith Lord and studying the Force, Plagueis acquired a vast amount of knowledge about the dark side and its teachings.\n\nMaster of Darth Sidious', metadata={'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis'}),
  Document(page_content='"Darth Plagueis was a Dark Lord of the Sith so powerful and so wise, he could use the Force to influence the midi-chlorians to create life. He had such a knowledge of the dark side, he could even keep the ones he cared about from dying."\n―Sheev Palpatine', metadata={'source': 'https://starwars.fan