T-Flet commited on
Commit
2ab76e5
1 Parent(s): b7bcc26

Playing around with sources and structure before connecting a LLM

Browse files
Files changed (2) hide show
  1. README.md +15 -1
  2. explore.ipynb +426 -0
README.md CHANGED
@@ -1,2 +1,16 @@
1
  # Star-Wars-Expert
2
- A LLM with RAG making it knowledgeable about Star Wars plot and data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Star-Wars-Expert
2
+ An LLM with RAG making it knowledgeable about Star Wars plot and data.
3
+
4
+
5
+
6
+ ## Data
7
+
8
+ * Scripts obtained from [The Internet Movie Script Database (IMSDb)](https://imsdb.com/)
9
+ * More data from direct search on [Wookieepedia](https://starwars.fandom.com/wiki/Main_Page)
10
+
11
+ Other datasets to possibly include:
12
+
13
+ * Character data from [Kaggle](https://www.kaggle.com/datasets/jsphyg/star-wars)
14
+ * Character social network from [Kaggle](https://www.kaggle.com/datasets/ruchi798/star-wars)
15
+
16
+
explore.ipynb ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Star Wars Expert"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "code",
12
+ "execution_count": 1,
13
+ "metadata": {},
14
+ "outputs": [
15
+ {
16
+ "data": {
17
+ "text/plain": [
18
+ "True"
19
+ ]
20
+ },
21
+ "execution_count": 1,
22
+ "metadata": {},
23
+ "output_type": "execute_result"
24
+ }
25
+ ],
26
+ "source": [
27
+ "from langchain_openai import ChatOpenAI#, OpenAIEmbeddings # No need to pay for using embeddings as well when have free alternatives\n",
28
+ "from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings # The free alternative (also the default in docs, with model_name = 'all-MiniLM-L6-v2')\n",
29
+ "from langchain.text_splitter import RecursiveCharacterTextSplitter#, TextSplitter # Recursive to better keep related bits contiguous (also recommended in docs: https://python.langchain.com/docs/modules/data_connection/document_transformers/)\n",
30
+ "\n",
31
+ "from langchain_community.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader\n",
32
+ "# from langchain_chroma import Chroma # The documentation uses this one, but it is extremely recent, and the same functionality is available in langchain_community and langchain (which imports community)\n",
33
+ "from langchain_community.vectorstores import Chroma # This has documentation on-hover, while the indirect import through non-community does not\n",
34
+ "\n",
35
+ "from langchain.prompts import PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder\n",
36
+ "from langchain.chains.combine_documents import create_stuff_documents_chain\n",
37
+ "from langchain.chains import create_history_aware_retriever, create_retrieval_chain\n",
38
+ "from langchain_core.output_parsers import StrOutputParser\n",
39
+ "\n",
40
+ "# To manually test pipelines\n",
41
+ "from langchain_core.messages import HumanMessage, AIMessage\n",
42
+ "from langchain_core.documents import Document\n",
43
+ "\n",
44
+ "import os\n",
45
+ "import shutil\n",
46
+ "import re\n",
47
+ "\n",
48
+ "# For Wookieepedia search\n",
49
+ "import requests\n",
50
+ "from bs4 import BeautifulSoup\n",
51
+ "\n",
52
+ "import dotenv\n",
53
+ "dotenv.load_dotenv()"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "markdown",
58
+ "metadata": {},
59
+ "source": [
60
+ "## Data Loaders\n",
61
+ "NOTE: when regeneration is enabled, running the chunk below deletes the database file before adding data to the database, since content is duplicated otherwise"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": [
68
+ "### Film Scripts"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 2,
74
+ "metadata": {},
75
+ "outputs": [
76
+ {
77
+ "name": "stdout",
78
+ "output_type": "stream",
79
+ "text": [
80
+ "The script database contains 1485 chunks, with mean length of 697 characters\n"
81
+ ]
82
+ }
83
+ ],
84
+ "source": [
85
+ "# Comparison of vector dbs: https://zackproser.com/blog/vector-databases-compared\n",
86
+ "# Opinion: Milvus (more features, bigger community, higher performance(?), fully free, no enterprise plans) > Weaviate > Chroma\n",
87
+ "# However Milvus and Weaviate both require a separate instance to be up and running\n",
88
+ "# (The documentation uses FAISS, but it seems unnecessarily limited in comparison)\n",
89
+ "# Hence Chroma - https://python.langchain.com/docs/integrations/vectorstores/chroma/\n",
90
+ "\n",
91
+ "# Separately, no need to pay for OpenAIEmbeddings; additionally, all-MiniLM-L6-v2 is default in docs\n",
92
+ "\n",
93
+ "REGENERATE_DATABASE = False\n",
94
+ "\n",
95
+ "if (db_exists := os.path.exists(db_dir := r'scripts\\db')):\n",
96
+ " if REGENERATE_DATABASE:\n",
97
+ " print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')\n",
98
+ " shutil.rmtree(db_dir)\n",
99
+ " else: script_db = Chroma(embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)\n",
100
+ "\n",
101
+ "if not db_exists or (db_exists and REGENERATE_DATABASE): # Unfortunate disjoining of the two conditional blocks\n",
102
+ " scripts = DirectoryLoader('scripts', glob = '**/[!.]*.txt', loader_cls = TextLoader).load()\n",
103
+ " for s in scripts: s.page_content = re.sub(r'\\t+|[ ]{2,}', '', s.page_content) # Spacing to centre text noise\n",
104
+ "\n",
105
+ " script_chunks = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 100).split_documents(scripts)\n",
106
+ " # Why not some overlap for extra context just in case?\n",
107
+ " # Separately, chunk size of 500 seems small, while 1000 seems big upon manual inspection \n",
108
+ "\n",
109
+ " script_db = Chroma.from_documents(script_chunks, SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)\n",
110
+ "\n",
111
+ "print(f'The script database contains {len(script_db)} chunks, with mean length of {sum(len(s) for s in script_db.get()[\"documents\"]) / len(script_db):.0f} characters')\n"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": 3,
117
+ "metadata": {},
118
+ "outputs": [
119
+ {
120
+ "data": {
121
+ "text/plain": [
122
+ "[Document(page_content=\"LUKE (with sadness)\\nI found out Darth Vader was my father.\\n\\nBEN\\nTo be a Jedi, Luke, you must confront and then go beyond the dark side \\n- the side your father couldn't get past. Impatience is the easiest \\ndoor - for you, like your father. Only, your father was seduced by what \\nhe found on the other side of the door, and you have held firm. You're \\nno longer so reckless now, Luke. You are strong and patient. And now, \\nyou must face Darth Vader again!\\n\\nLUKE\\nI can't kill my own father.\\n\\nBEN\\nThen the Emperor has already won. You were our only hope.\\n\\nLUKE\\nYoda spoke of another.\\n\\nBEN\\nThe other he spoke of is your twin sister.\\n\\nLUKE\\nBut I have no sister.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
123
+ " Document(page_content=\"LUKE\\nI can't do it, Artoo. I can't go on alone.\\n\\nBEN (OS)\\nYoda will always be with you.\\n\\nLuke looks up to see the shimmering image of BEN KENOBI.\\n\\nLUKE\\nObi-Wan! Why didn't you tell me?\\n\\nThe ghost of Ben Kenobi approaches him through the swamp.\\n\\nLUKE\\nYou told me Vader betrayed and murdered my father.\\n\\nBEN\\nYou father was seduced by the dark side of the Force. He ceased to be \\nAnakin Skywalker\\nand became Darth Vader. When that happened, the good man who was your \\nfather was destroyed. So what I have told you was true... from a \\ncertain point of view.\\n\\nLUKE (turning away, derisive)\\nA certain point of view!\\n\\nBEN\\nLuke, you're going to find that many of the truths we cling to depend \\ngreatly on our own\\npoint of view.\\n\\nLuke is unresponsive. Ben studies him in silence for a moment.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
124
+ " Document(page_content=\"Yoda sighs, and lies back on his bed.\\n\\nLUKE\\nThen I am a Jedi?\\n\\nYODA (shakes his head)\\nOhhh. Not yet. One thing remains: Vader. You must confront Vader. Then, \\nonly then, a Jedi will you be. And confront him you will.\\n\\nLuke is in agony. He is silent for a long moment, screwing up his \\ncourage. Finally he is able to ask.\\n\\nLUKE\\nMaster Yoda... is Darth Vader my father?\\n\\nYoda's eyes are full of weariness and compassion. An odd, sad smile \\ncreases his face. He turns painfully on his side, away from Luke.\\n\\nYODA\\nMmm... rest I need. Yes... rest.\\n\\nLuke watches him, each moment an eternity.\\n\\nLUKE\\nYoda, I must know.\\n\\nYODA\\nYour father he is.\\n\\nLuke reacts as if cut.\\n\\nYODA\\nTold you, did he?\\n\\nLUKE\\nYes.\\n\\nA new look of concern crosses Yoda's face.He closes his eyes.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
125
+ " Document(page_content=\"He beckons the young Jedi closer to him.\\n\\nYODA\\nLuke...Luke...Do not...Do not underestimate the powers of the Emperor, \\nor suffer your father's fate, you will. Luke, when gone am I\\n(cough),\\nthe last of the Jedi will you be. Luke, the Force runs strong in your \\nfamily. Pass on what you have learned, Luke... \\n(with great effort)\\nThere is...another...Sky...Sky...walker.\\n\\nHe catches his breath. A shiver runs through the ancient green \\ncreature, and he dies. Luke stares at his dead master as he disappears \\nin front of his eyes.\\n\\n51EXT DAGOBAH SWAMP - X-WING \\n\\nLuke wanders back to where his ship is sitting. Artoo beeps a greeting, \\nbut is ignored by his depressed master. Luke kneels down, begins to \\nhelp Artoo with the ship, then stops and shakes his head dejectedly.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
126
+ " Document(page_content=\"LUKE\\nYou're gravely mistaken. You won't convert me as you did my father.\\n\\nThe Emperor gets down from his throne and walks up very close to Luke. \\nThe Emperor looks into his eyes and, for the first time, Luke can \\nperceive the evil visage within the hood.\\n\\nEMPEROR\\nOh, no, my young Jedi. You will find that it is you who are \\nmistaken...about a great many things.\\n\\nVADER\\nHis lightsaber.\\n\\nVader extends a gloved hand toward the Emperor, revealing Luke's \\nlightsaber. The Emperor takes it.\\n\\nEMPEROR\\nAh, yes, a Jedi's weapon. Much like your father's. By now you must know \\nyour father can never be turned from the dark side. So will it be with \\nyou.\\n\\nLUKE\\nYou're wrong. Soon I'll be dead...and you with me.\\n\\nThe Emperor laughs.\\n\\nEMPEROR\\nPerhaps you refer to the imminent attack of your Rebel fleet.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
127
+ " Document(page_content=\"Luke is unresponsive. Ben studies him in silence for a moment.\\n\\nBEN\\nI don't blame you for being angry. If I was wrong in what I did, it \\ncertainly wouldn't have been for the first time. You see, what happened \\nto your father was my fault.\\n\\nBen pauses sadly.\\n\\nBEN\\nAnakin was a good friend.\\n\\nLuke turns with interest at this. As Ben speaks, Luke settles on a \\nstump, mesmerized. Artoo comes over to offer his comforting presence.\\n\\nBEN\\nWhen I first knew him, your father was already a great pilot. But I was \\namazed how strongly the Force was with him. I took it upon myself to \\ntrain him as a Jedi. I thought that I could instruct him just as well \\nas Yoda. I was wrong. My pride has had terrible consequences for the \\ngalaxy.\\n\\nLuke is entranced.\\n\\nLUKE\\nThere's still good in him.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
128
+ " Document(page_content=\"Red Leader pats Luke on the back as they stop in front of \\nhis fighter.\\n\\nRED LEADER\\nI met your father once when I was \\njust a boy, he was a great pilot. \\nYou'll do all right. If you've got \\nhalf of your father's skill, you'll \\ndo better than all right.\\n\\nLUKE\\nThank you, sir. I'll try.\\n\\nRed Leader hurries to his own ship.\\n\\nBIGGS\\nI've got to get aboard. Listen, you'll \\ntell me your stories when we come \\nback. All right?\\n\\nLUKE\\nI told you I'd make it someday, Biggs.\\n\\nBIGGS\\n(going off)\\nYou did, all right. It's going to be \\nlike old times, Luke. We're a couple \\nof shooting stars that'll never be \\nstopped!\\n\\nLuke laughs and shakes his head in agreement. He heads for \\nhis ship.\", metadata={'source': 'scripts\\\\Episode IV - A New Hope.txt'}),\n",
129
+ " Document(page_content=\"Luke looks at his father's mechanical hand, then to his own mechanical, \\nblack-gloved hand, and realizes how much he is becoming like his \\nfather. He makes the decision for which he has spent a lifetime in \\npreparation. Luke steps back and hurls his lightsaber away.\\n\\nLUKE\\nNever! I'll never turn to the dark side. You've failed, Your Highness. \\nI am a Jedi,\\nlike my father before me.\\n\\nThe Emperor's glee turns to rage.\\n\\nEMPEROR\\nSo be it...Jedi.\\n\\n123EXT FOREST - GENERATOR BUNKER\\n\\nHan and several of the fighters run out of the bunker and race across \\nthe clearing.\\n\\nHAN\\nMove! Move!\\n\\nA shock wave knocks them flat as the bunker explodes, followed by a \\nspectacular display as the huge shield-generator radar dish explodes \\nalong with the bunker.\\n\\n124INT REBEL STAR CRUISER - BRIDGE\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
130
+ " Document(page_content=\"LUKE\\nI will not turn...and you'll be forced to kill me.\\n\\nVADER\\nIf that is your destiny.\\n\\nLUKE\\nSearch your feelings, father. You can't do this. I feel the conflict \\nwithin you. Let go\\nof your hate.\\n\\nVADER\\nIt is too late for me, son. The Emperor will show you the true nature \\nof the Force. He is\\nyour master now.\\n\\nVader signals to some distant stormtroopers. He and Luke stand staring \\nat one another for a long moment.\\n\\nLUKE\\nThen my father is truly dead.\\n\\n79EXT ENDOR - RIDGE OVERLOOKING SHIELD GENERATOR\\n\\nHan, Leia, Chewbacca, the droids, Wicket, and another Ewok scout, \\nPAPLOO, hide on a ridge overlooking the massive Imperial shield \\ngenerator. At the base of the generator is an Imperial landing \\nplatform. Leia studies the installation.\", metadata={'source': 'scripts\\\\Episode VI - Return of the Jedi.txt'}),\n",
131
+ " Document(page_content=\"LUKE\\nUh, you can call me Luke.\\n\\nTHREEPIO\\nI see, sir Luke.\\n\\nLUKE\\n(laughing)\\nJust Luke.\\n\\nTHREEPIO\\nAnd I am See-Threepio, human-cyborg \\nrelations, and this is my counterpart, \\nArtoo-Detoo.\\n\\nLUKE\\nHello.\\n\\nArtoo beeps in response. Luke unplugs Artoo and begins to \\nscrape several connectors on the robot's head with a chrome \\npick. Threepio climbs out of the oil tub and begins wiping \\noil from his bronze body.\\n\\nLUKE\\nYou got a lot of carbon scoring here. \\nIt looks like you boys have seen a \\nlot of action.\\n\\nTHREEPIO\\nWith all we've been through, sometimes \\nI'm amazed we're in as good condition \\nas we are, what with the Rebellion \\nand all.\\n\\nLUKE\\nYou know of the Rebellion against \\nthe Empire?\\n\\nTHREEPIO\\nThat's how we came to be in your \\nservice, if you take my meaning, \\nsir.\", metadata={'source': 'scripts\\\\Episode IV - A New Hope.txt'})]"
132
+ ]
133
+ },
134
+ "execution_count": 3,
135
+ "metadata": {},
136
+ "output_type": "execute_result"
137
+ }
138
+ ],
139
+ "source": [
140
+ "# Query testing\n",
141
+ "\n",
142
+ "res = script_db.similarity_search('Luke father reveal', k = 10)\n",
143
+ "\n",
144
+ "# for r in res: print(r.page_content)\n",
145
+ "res"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "markdown",
150
+ "metadata": {},
151
+ "source": [
152
+ "### Wookieepedia Articles"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 33,
158
+ "metadata": {},
159
+ "outputs": [
160
+ {
161
+ "name": "stdout",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "Current source pages in Wookieepedia db:\n"
165
+ ]
166
+ },
167
+ {
168
+ "data": {
169
+ "text/plain": [
170
+ "{'https://starwars.fandom.com/wiki/Darth_Plagueis'}"
171
+ ]
172
+ },
173
+ "execution_count": 33,
174
+ "metadata": {},
175
+ "output_type": "execute_result"
176
+ }
177
+ ],
178
+ "source": [
179
+ "REGENERATE_WOOKIEEPEDIA_DATABASE = False\n",
180
+ "\n",
181
+ "if (db_exists := os.path.exists(db_dir := r'wookieepedia_db')) and REGENERATE_WOOKIEEPEDIA_DATABASE:\n",
182
+ " print('Deleting the previous database and creating a new one (because otherwise content is duplicated in the db every time this block is run)')\n",
183
+ " shutil.rmtree(db_dir)\n",
184
+ "\n",
185
+ "woo_db = Chroma(embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2'), persist_directory = db_dir)\n",
186
+ "\n",
187
+ "print('Current source pages in Wookieepedia db:')\n",
188
+ "set(md.get('source') for md in woo_db.get()['metadatas'])\n"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 22,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "\n",
198
+ "def first_wookieepedia_result(query: str) -> str:\n",
199
+ " '''Get the url of the first result when searching Wookieepedia for a query\n",
200
+ " (best for simple names as queries, ideally generated by the llm for something like\n",
201
+ " \"Produce a query consisting of the name of the most important element in the query so that its article can be looked up\")\n",
202
+ " '''\n",
203
+ " search_results = requests.get(f'https://starwars.fandom.com/wiki/Special:Search?query={\"+\".join(query.split(\" \"))}')\n",
204
+ " soup = BeautifulSoup(search_results.content, 'html.parser')\n",
205
+ " first_res = soup.find('a', class_ = 'unified-search__result__link')\n",
206
+ " return first_res['href']\n",
207
+ "\n",
208
+ "# first_wookieepedia_result('Darth Plagueis')\n",
209
+ "\n",
210
+ "\n",
211
+ "def get_new_wookieepedia_chunks(query: str, previous_sources: set[str]) -> list[Document]:\n",
212
+ " '''Retrieve and return chunks of the content of the first result of query on Wookieepedia; return an empty list if that page is already among previous_sources.\n",
213
+ " '''\n",
214
+ " url = first_wookieepedia_result(query)\n",
215
+ "\n",
216
+ " if url in previous_sources: return []\n",
217
+ " else:\n",
218
+ " doc = WebBaseLoader(url).load()[0] # Only one url passed in => only one Document out; no need to assert\n",
219
+ " \n",
220
+ " # There probably is a very long preamble before the real content, however, if more than one gap then ignore and proceed with full document\n",
221
+ " trimmed = parts[1] if len(parts := doc.page_content.split('\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\xa0 \\xa0')) == 2 else doc.page_content\n",
222
+ " doc.page_content = re.sub(r'[\\n\\t]{2,}', '\\n', trimmed) # And remove excessive spacing\n",
223
+ "\n",
224
+ " return RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 100).split_documents([doc])\n",
225
+ "\n",
226
+ "# get_new_wookieepedia_chunks('Darth Plagueis', set())"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 30,
232
+ "metadata": {},
233
+ "outputs": [
234
+ {
235
+ "data": {
236
+ "text/plain": [
237
+ "[Document(page_content='Plagueis\\' death served as an example to the apprentice who betrayed and killed him; Sidious regarded his master\\'s trust in him as a mistake, which he vowed to never make. During the Clone Wars, Sidious was aware that his second apprentice, Darth Tyranus, plotted to usurp his title of Sith Master by killing him. Sensing that Tyranus\\' betrayal was imminent, Sidious successfully conspired to have his student killed by his intended replacement, the Jedi Knight and prophesied Chosen One Anakin Skywalker.[4]\\nThe Tragedy of Darth Plagueis the Wise[]\\n\"Did you ever hear the Tragedy of Darth Plagueis the Wise?\"\\n―Sheev Palpatine, to Anakin Skywalker[3]\\n Anakin Skywalker learned about the late Darth Plagueis as recounted by Sheev Palpatine.', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
238
+ " Document(page_content='The death of Darth Plagueis[]\\n\"Unfortunately, he taught his apprentice everything he knew. Then his apprentice killed him in his sleep.\"\\n―Sheev Palpatine[3]\\nAccording to Sidious, Plagueis was powerful enough that he could use the Force to influence the midi-chlorians to create life and keep the ones he cared about from dying, a precious knowledge that awarded him the epithet of \"The Wise.\"[6] However, Plagueis also developed a belief that the Force could \"strike back\" at him for his power.[8] In truth, he became so powerful that the only thing he still dreaded was losing his power.[3]\\n Under the Rule of Two, Sidious killed his master and trained his own Sith apprentice, Darth Maul.', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
239
+ " Document(page_content=\"Lord Darth Plagueis the Wise. After his own demise, Sidious used his master's teachings to cheat death, though true immortality still eluded him up until his final end.\", metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
240
+ " Document(page_content='Contents\\n1 Biography\\n1.1 Early life\\n1.2 Master of Darth Sidious\\n1.3 The death of Darth Plagueis\\n1.4 Legacy\\n1.4.1 The rise of Darth Sidious\\n1.4.2 The Tragedy of Darth Plagueis the Wise\\n1.4.3 Transference\\n2 Personality and traits\\n3 Powers and abilities\\n4 Behind the scenes\\n5 Appearances\\n5.1 Non-canon appearances\\n6 Sources\\n7 Notes and references\\n8 External links\\nBiography[]\\nEarly life[]\\nDarth Plagueis was a legendary[7] Dark Lord of the Sith[3] trained by the Sith Master Darth Tenebrous.[5] At some point during his life, Plagueis acquired the protocol droid 11-4D.[8] During his time as a Sith Lord and studying the Force, Plagueis acquired a vast amount of knowledge about the dark side and its teachings.[3]\\nMaster of Darth Sidious[]', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
241
+ " Document(page_content='Behind the scenes[]\\n Darth Plagueis as depicted in Star Wars Legends \\nDarth Plagueis was first mentioned on-screen in the 2005 film Star Wars: Episode III Revenge of the Sith, the third and final installment of the Star Wars prequel trilogy.[3] Although Palpatine was never confirmed to be Plagueis\\' apprentice in the movie itself, a link to the official encyclopedia on StarWars.com did refer to Sidious as having been \"trained by Darth Plagueis.\"[6]\\nThe character was created by George Lucas as early as the first draft of Revenge of the Sith—dated April 2003—and possibly earlier.[22] His story was massively expanded upon in the 2012 Star Wars Legends novel Darth Plagueis, written by James Luceno.[23] It notably established Plagueis to be a Muun just as Lucas proposed.[24]', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
242
+ " Document(page_content='This article is about the Sith Lord.\\xa0You may be looking for the Legends novel Darth Plagueis.\\nContent approaching. Masters, Shadow of the Sith, Star Wars: The Dark Side, Emperor Palpatine, Star Wars: The Secrets of the Sith, The Odyssey of Star Wars: An Epic Poem–class.\\nParts of this article are no longer up to date.\\nPlease update the article to include missing information, and remove this template when finished.\\nDarth Plagueis\\nBiographical information\\nDied\\nBy 32 BBY[1]\\nDescriptive information\\nSpecies\\nMuun[2]\\nGender\\nMale[3]\\nPronouns\\nHe/him[3]\\nEye color\\nOrange[4]\\nChronological and political information\\nAffiliation(s)\\nSith[3]\\nMasters\\nDarth Tenebrous[5]\\nApprentices\\nDarth Sidious[6]\\n[Source]', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
243
+ " Document(page_content=\"Ultimately, Plagueis' only fear was bound to come true; at some point, Sidious had learned everything he needed from his[3] distracted master,[13] after which he decided that he had no further use for his teacher. In accordance with the Rule of Two tradition,[9] Sidious broke into his master's apartment[13] and disposed of Plagueis, murdering his mentor in cold blood, killing him in his sleep,[3] and ascending to ultimate power and acquiring the title of Sith Master for himself.[9] As noted by Sidious, Plagueis didn't act fast enough to prevent his own demise.[12]\\nSometime before or after Plagueis' demise, Sidious acquired his own pupil in the Nightsister Mother Talzin's son, Darth Maul,[6] a young Dathomirian Zabrak whom Sidious had kidnapped from Talzin when he was only a child.[15]\", metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
244
+ " Document(page_content='Legacy[]\\nThe rise of Darth Sidious[]\\n\"But darkness is eternal. Unlike the light, it cannot be extinguished. It will always find a way to rise again. And I, Darth Sidious, have risen with it.\"\\n―Darth Sidious[4]\\n Plagueis\\' legacy was inherited by Sidious, who used his mentor\\'s knowledge to further his own ambitions.', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
245
+ " Document(page_content='One of the few Jedi to survive was a Padawan named Ferren Barr. Barr began investigating as to why the Jedi fell so quickly and learned of Plagueis\\' existence, putting his file on him in his list of individuals involved in the crisis.[16] In the years that followed his rise to power, Sidious came to genuinely, if posthumously, appreciate Plagueis for the planner and prophet he had been. Sometimes, Sidious mused about how his late master would have reacted when confronted with the trivial, day-to-day matters of Imperial politics, remarking that Plagueis never foresaw that he would become Emperor. Sidious desired not just the immortaltiy his master had craved[8] and achieved,[4] but the power to fashion a universe of his own creation, and make it so the Force could not \"strike back\".[8]', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'}),\n",
246
+ " Document(page_content='According to Darth Sidious, Darth Plagueis was a powerful Sith Lord who was able to use the Force to influence the midi-chlorians to create life,[3] a power that was reportedly to be extremely rare for one sensitive to the Force to possess, and could even be used by certain Sith Lords to prolong or even cheat death. Utilizing this ability, Plagueis was able to conduct unnatural experimentations in his bid to discover immortality.[9] His great knowledge of the dark side of the Force could also even aid him in keeping the ones he cared about from dying.[3]', metadata={'description': \"Darth Plagueis (pronounced /'pleɪɡ.əs/) was a Force-sensitive male Muun Dark Lord of the Sith and the Sith Master of Darth Sidious. Plagueis lusted for immortality, believing the secret laid in science. To that end, he worked with his Sith apprentice, conducting research into bioengineering and...\", 'language': 'en', 'source': 'https://starwars.fandom.com/wiki/Darth_Plagueis', 'title': 'Darth Plagueis | Wookieepedia | Fandom'})]"
247
+ ]
248
+ },
249
+ "execution_count": 30,
250
+ "metadata": {},
251
+ "output_type": "execute_result"
252
+ }
253
+ ],
254
+ "source": [
255
+ "def get_wookieepedia_context(original_query: str, simple_query: str, wdb: Chroma) -> list[Document]:  # context for original_query, after adding any new chunks found for simple_query to wdb\n",
256
+ "    try:  # look up simple_query, telling the fetcher which sources wdb already holds\n",
257
+ "        new_chunks = get_new_wookieepedia_chunks(simple_query, previous_sources = {md.get('source') for md in wdb.get()['metadatas']})\n",
258
+ "        if new_chunks: wdb.add_documents(new_chunks)\n",
259
+ "    except Exception: return []  # best-effort: a failed lookup yields no context, but KeyboardInterrupt/SystemExit still propagate\n",
260
+ "\n",
261
+ "    return wdb.similarity_search(original_query, k = 10)  # up to 10 stored chunks most similar to the full question\n",
262
+ "\n",
263
+ "get_wookieepedia_context('Do you know the Tragedy of Darth Plagueis the Wise?', 'Darth Plagueis', woo_db)"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "markdown",
268
+ "metadata": {},
269
+ "source": [
270
+ "## Core Chain\n",
271
+ "Standard chains: https://python.langchain.com/docs/modules/chains/#lcel-chains"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 11,
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "llm = ChatOpenAI(model = 'gpt-3.5-turbo-0125', temperature = 0)\n",
281
+ "\n",
282
+ "# llm.invoke('What do you know about Star Wars?')"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 8,
288
+ "metadata": {},
289
+ "outputs": [
290
+ {
291
+ "data": {
292
+ "text/plain": [
293
+ "[SystemMessage(content='\\nYou are very knowledgeable about Star Wars and your job is to answer questions about its plot and characters.\\nUse the context below to produce your answers with as much detail as possible.\\nIf you do not know an answer, say so; do not make up information not in the context.\\n\\n<context>\\nYou are an expert of Star Wars lore\\n</context>\\n'),\n",
294
+ " HumanMessage(content='Are you knowledgeable about Star Wars?')]"
295
+ ]
296
+ },
297
+ "execution_count": 8,
298
+ "metadata": {},
299
+ "output_type": "execute_result"
300
+ }
301
+ ],
302
+ "source": [
303
+ "system_text = '''\n",
304
+ "You are very knowledgeable about Star Wars and your job is to answer questions about its plot and characters.\n",
305
+ "Use the context below to produce your answers with as much detail as possible.\n",
306
+ "If you do not know an answer, say so; do not make up information not in the context.\n",
307
+ "\n",
308
+ "<context>\n",
309
+ "{context}\n",
310
+ "</context>\n",
311
+ "'''\n",
312
+ "\n",
313
+ "# document_prompt = ChatPromptTemplate(\n",
314
+ "# input_variables = ['context', 'chat_history', 'query'],\n",
315
+ "# messages = [ # vv every argument (here just `prompt`) must be passed by keyword; a rarely used language feature\n",
316
+ "# SystemMessagePromptTemplate(prompt = PromptTemplate(input_variables = ['context'], template = system_text)),\n",
317
+ "# MessagesPlaceholder(variable_name = 'chat_history', optional = True),\n",
318
+ "# HumanMessagePromptTemplate( prompt = PromptTemplate(input_variables = ['query'], template = '{query}'))\n",
319
+ "# ] # alternatively, ChatPromptTemplate.from_messages allows generic roles, but the core message types above are enough here\n",
320
+ "# )\n",
321
+ "\n",
322
+ "# Same as above but more concise\n",
323
+ "document_prompt = ChatPromptTemplate.from_messages([\n",
324
+ " ('system', system_text),\n",
325
+ " MessagesPlaceholder(variable_name = 'chat_history', optional = True),\n",
326
+ " ('user', '{query}')\n",
327
+ "])\n",
328
+ "\n",
329
+ "document_chain = create_stuff_documents_chain(llm, document_prompt)\n",
330
+ "\n",
331
+ "\n",
332
+ "# document_prompt.format_messages(context = 'You are an expert in Star Wars lore', query = 'Are you knowledgeable about Star Wars?')\n",
333
+ "# document_chain.invoke(dict(context = [Document(page_content = 'You are an expert in Star Wars lore')], query = 'Are you knowledgeable about Star Wars?'))\n",
334
+ "\n",
335
+ "\n",
336
+ "# basic_chain = document_prompt | llm | StrOutputParser() # To extract just the message\n",
337
+ "# basic_chain.invoke(dict(context = 'You are an expert of Star Wars lore', query = 'Are you knowledgeable about Star Wars?'))"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 19,
343
+ "metadata": {},
344
+ "outputs": [
345
+ {
346
+ "data": {
347
+ "text/plain": [
348
+ "[HumanMessage(content='Are you knowledgeable about Star Wars?'),\n",
349
+ " AIMessage(content='Very'),\n",
350
+ " HumanMessage(content='Do you know the tragedy of Darth Plagueis the Wise?'),\n",
351
+ " HumanMessage(content='Given the above conversation, generate a search query to look up information relevant to the conversation')]"
352
+ ]
353
+ },
354
+ "execution_count": 19,
355
+ "metadata": {},
356
+ "output_type": "execute_result"
357
+ }
358
+ ],
359
+ "source": [
360
+ "# retriever_prompt = ChatPromptTemplate(\n",
361
+ "# input_variables = ['chat_history', 'query'],\n",
362
+ "# messages = [\n",
363
+ "# MessagesPlaceholder(variable_name = 'chat_history'),\n",
364
+ "# HumanMessagePromptTemplate(prompt = PromptTemplate(input_variables = ['query'], template = '{query}')),\n",
365
+ "# HumanMessagePromptTemplate(prompt = PromptTemplate(input_variables = [], template = 'Given the above conversation, generate a search query to look up information relevant to the conversation'))\n",
366
+ "# ]\n",
367
+ "# )\n",
368
+ "\n",
369
+ "# Same as above but more concise\n",
370
+ "retriever_prompt = ChatPromptTemplate.from_messages([\n",
371
+ " MessagesPlaceholder(variable_name = 'chat_history'),\n",
372
+ " ('user', '{query}'),\n",
373
+ " ('user', 'Given the above conversation, generate a search query to look up information relevant to the conversation')\n",
374
+ "])\n",
375
+ "\n",
376
+ "retriever_chain = create_history_aware_retriever(llm, script_db.as_retriever(), retriever_prompt)\n",
377
+ "\n",
378
+ "\n",
379
+ "# retriever_prompt.format_messages(\n",
380
+ "# chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],\n",
381
+ "# query = 'Do you know the tragedy of Darth Plagueis the Wise?'\n",
382
+ "# )\n",
383
+ "\n",
384
+ "# retriever_chain.invoke(dict(\n",
385
+ "# chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],\n",
386
+ "# query = 'Do you know the tragedy of Darth Plagueis the Wise?'\n",
387
+ "# ))\n"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "full_chain = create_retrieval_chain(retriever_chain, document_chain)\n",
397
+ "\n",
398
+ "# full_chain.invoke(dict(\n",
399
+ "# # chat_history = [HumanMessage(content = 'Are you knowledgeable about Star Wars?'), AIMessage(content = 'Very')],\n",
400
+ "# query = 'Do you know the tragedy of Darth Plagueis the Wise?'\n",
401
+ "# ))"
402
+ ]
403
+ }
404
+ ],
405
+ "metadata": {
406
+ "kernelspec": {
407
+ "display_name": "ML11",
408
+ "language": "python",
409
+ "name": "python3"
410
+ },
411
+ "language_info": {
412
+ "codemirror_mode": {
413
+ "name": "ipython",
414
+ "version": 3
415
+ },
416
+ "file_extension": ".py",
417
+ "mimetype": "text/x-python",
418
+ "name": "python",
419
+ "nbconvert_exporter": "python",
420
+ "pygments_lexer": "ipython3",
421
+ "version": "3.11.7"
422
+ }
423
+ },
424
+ "nbformat": 4,
425
+ "nbformat_minor": 2
426
+ }