# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.10.57 openai==1.37.0 cohere==5.6.2 tiktoken==0.7.0 chromadb==0.5.5 html2text sentence_transformers pydantic llama-index-vector-stores-chroma==0.1.10 kaleido==0.2.1 llama-index-llms-gemini==0.1.11


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
import os
from getpass import getpass


def ensure_api_key(name: str) -> str:
    """Make sure the environment variable ``name`` holds an API key.

    A non-empty value that is already present is left untouched, so a key exported
    before starting Jupyter is never overwritten. Otherwise the key is requested
    through a hidden prompt, which keeps the secret out of the saved notebook.

    Args:
        name: Name of the environment variable to check or populate.

    Returns:
        The value stored in ``os.environ[name]`` after the call.
    """
    if not os.environ.get(name):
        os.environ[name] = getpass(f"Enter {name}: ")
    return os.environ[name]


# Set the following API Keys in the Python environment. Will be used later.
for _key_name in ("OPENAI_API_KEY", "GOOGLE_API_KEY"):
    ensure_api_key(_key_name)

In [5]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
# NOTE(review): presumably required by async calls made inside later cells — confirm.
import nest_asyncio

# Patch asyncio once for the whole kernel session.
nest_asyncio.apply()

# Load a Model


In [6]:
from llama_index.llms.gemini import Gemini

# Gemini 1.5 Flash is the LLM handed to the metadata extractors and query engines
# created further down in this notebook.
llm = Gemini(
    model="models/gemini-1.5-flash",
    temperature=1,
    max_tokens=512,
)

 from .autonotebook import tqdm as notebook_tqdm
I0000 00:00:1723471002.830383 5318658 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1723471002.837404 5318658 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


# Create a Vector Store


In [7]:
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from llama_index.vector_stores.chroma import ChromaVectorStore

# PersistentClient keeps the collection under the given path
# (chromadb.EphemeralClient would save the data in-memory instead).
chroma_client = chromadb.PersistentClient(path="./mini-llama-articles")

# Embedding function registered on the collection itself.
openai_embedding_fn = OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-3-small",
)

# NOTE(review): get_or_create_collection reuses an existing collection, so re-running
# the ingestion cell presumably adds the same nodes again — confirm.
chroma_collection = chroma_client.get_or_create_collection(
    "mini-llama-articles",
    embedding_function=openai_embedding_fn,
)

# Expose the Chroma collection to LlamaIndex as a vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Load the Dataset (CSV)


## Download


The dataset includes several articles from the TowardsAI blog, which provide an in-depth explanation of the LLaMA2 model. We download it as a CSV file and then read it row by row, one article per row.


In [8]:
!curl -o ./mini-llama-articles.csv https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv

I0000 00:00:1723471003.927906 5318658 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork


 % Total % Received % Xferd Average Speed Time Time Time Current
 Dload Upload Total Spent Left Speed
100 169k 100 169k 0 0 506k 0 --:--:-- --:--:-- --:--:-- 506k


## Read File


In [9]:
import csv

# Load every data row of the CSV file; each row is a list of column strings.
# newline="" lets the csv module handle line endings itself, so newlines embedded in
# quoted fields are preserved as written.
with open("./mini-llama-articles.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip the header row (no-op if the file is empty).
    rows = list(csv_reader)

# The number of rows (articles) in the dataset.
len(rows)

14

# Convert to Document Objects


In [10]:
from llama_index.core import Document

# Wrap each CSV row in a Document so the LlamaIndex framework can process it.
# Columns by position: 0 = title, 1 = text, 2 = url, 3 = source name.
documents = []
for row in rows:
    row_metadata = {"title": row[0], "url": row[2], "source_name": row[3]}
    documents.append(Document(text=row[1], metadata=row_metadata))

# Show the first document as a sanity check.
print(documents[0])

Doc ID: 8908a7bc-6918-4725-9859-6e6a7788f865
Text: LLM Variants and Meta's Open Source Before shedding light on
four major trends, I'd share the latest Meta's Llama 2 and Code Llama.
Meta's Llama 2 represents a sophisticated evolution in LLMs. This
suite spans models pretrained and fine-tuned across a parameter
spectrum of 7 billion to 70 billion. A specialized derivative, Llama
2-Chat, has been...


# Transforming


In [11]:
from llama_index.core.text_splitter import TokenTextSplitter

# Token-based splitter: chunks of 512 with an overlap of 128, splitting on spaces.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=512,
    chunk_overlap=128,
)

In [12]:
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

# Ordered ingestion steps: split the text, enrich each chunk with LLM-generated
# metadata (questions, summaries, keywords), then embed it.
ingestion_steps = [
    text_splitter,
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    KeywordExtractor(keywords=10, llm=llm),
    OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
]

# The resulting nodes are also written to the Chroma-backed vector store.
pipeline = IngestionPipeline(
    transformations=ingestion_steps,
    vector_store=vector_store,
)

nodes = pipeline.run(documents=documents, show_progress=True)

I0000 00:00:1723471005.241134 5318658 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
Parsing nodes: 100%|██████████| 14/14 [00:00<00:00, 51.60it/s]
 0%| | 0/108 [00:00, ?it/s]I0000 00:00:1723471005.538637 5318658 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
100%|██████████| 108/108 [04:51<00:00, 2.70s/it] 
100%|██████████| 108/108 [05:05<00:00, 2.83s/it] 
100%|██████████| 108/108 [03:39<00:00, 2.04s/it] 
Generating embeddings: 0%| | 0/108 [00:00, ?it/s]I0000 00:00:1723471822.110812 5318658 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
Generating embeddings: 100%|██████████| 108/108 [00:03<00:00, 31.65it/s]


In [13]:
# Number of nodes returned by the ingestion pipeline run.
len(nodes)

108

In [14]:
# Archive the persisted Chroma directory (the PersistentClient path) recursively.
!zip -r vectorstore.zip mini-llama-articles

I0000 00:00:1723471826.032425 5318658 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork


updating: mini-llama-articles/ (stored 0%)
updating: mini-llama-articles/chroma.sqlite3 (deflated 66%)
updating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/ (stored 0%)
updating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/data_level0.bin (deflated 100%)
updating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/length.bin (deflated 99%)
updating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/link_lists.bin (stored 0%)
updating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/header.bin (deflated 61%)


# Load Indexes


In [15]:
# Restore the Chroma directory from the archive; -o overwrites existing files.
!unzip -o vectorstore.zip

I0000 00:00:1723471826.688310 5318658 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork


Archive: vectorstore.zip
 inflating: mini-llama-articles/chroma.sqlite3 
 inflating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/data_level0.bin 
 inflating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/length.bin 
 extracting: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/link_lists.bin 
 inflating: mini-llama-articles/6fc7339a-e4bb-4707-8db9-a8a5d4e2b37c/header.bin 


In [16]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Re-open the persisted Chroma collection and wrap it as a LlamaIndex vector store.
# The index itself is created from this store in the next cell.
db = chromadb.PersistentClient(path="./mini-llama-articles")
chroma_collection = db.get_or_create_collection("mini-llama-articles")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [17]:
# Create your index on top of the existing vector store (nothing is re-ingested here).
from llama_index.core import VectorStoreIndex

vector_index = VectorStoreIndex.from_vector_store(vector_store)

In [18]:
from llama_index.embeddings.openai import OpenAIEmbedding

# Query engine over the Llama articles: retrieves the 3 most similar nodes and lets
# `llm` write the answer. The embedding model is the same one used at ingestion time.
llama_query_engine = vector_index.as_query_engine(
 llm=llm,
 similarity_top_k=3,
 embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

In [19]:
# Ask the Llama query engine a question and print the generated answer.
res = llama_query_engine.query("What is the LLama model?")
print(res.response)

Llama is a family of large language models developed by Meta. 



In [20]:
# Display the raw response string (the repr makes the trailing newline visible).
res.response

'Llama is a family of large language models developed by Meta. \n'

In [21]:
# Inspect the retrieved nodes the answer was based on: id, title, text, score and metadata.
for src in res.source_nodes:
 print("Node ID\t", src.node_id)
 print("Title\t", src.metadata["title"])
 print("Text\t", src.text)
 print("Score\t", src.score)
 print("Metadata\t", src.metadata)
 print("-_" * 20)  # visual separator between nodes

Node ID	 18dcfeee-ebbc-476f-a4d9-042b26c38aa2
Title	 Beyond GPT-4: What's New?
Text	 LLM Variants and Meta's Open Source Before shedding light on four major trends, I'd share the latest Meta's Llama 2 and Code Llama. Meta's Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2's superior performance over most extant open-source chat models. Human-centric evaluations, focusing on safety and utility metrics, positioned Llama 2-Chat as a potential contender against proprietary, closed-source counterparts. The development trajectory of Llama 2 emphasized rigorous fine-tuning methodologies. Meta's transparent delineation of these processes aims to catalyze community-driven advancements in LLMs, underscoring a commitment to collaborative and responsible 

# Router

Routers are modules that take in a user query and a set of “choices” (defined by metadata) and return one or more selected choices.

They can be used for the following use cases and more:

- Selecting the right data source among a diverse range of data sources

- Deciding whether to do summarization (e.g. using summary index query engine) or semantic search (e.g. using vector index query engine)

- Deciding whether to “try” out a bunch of choices at once and combine the results (using multi-routing capabilities).


## Let's create a different query engine with Mistral AI information


In [22]:
from pathlib import Path
import requests

wiki_titles = [
    "Mistral AI",
]

data_path = Path("data_wiki")
# Create the output directory once, up front; no error if it already exists.
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    # Ask the Wikipedia API for the plain-text extract of the page.
    http_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
        timeout=30,  # do not hang forever on a stalled connection
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    http_response.raise_for_status()
    response = http_response.json()
    page = next(iter(response["query"]["pages"].values()))
    wiki_text = page["extract"]

    # One file per title ("Mistral AI" -> "mistral_ai.txt") so that several titles
    # do not overwrite each other. UTF-8 keeps non-ASCII characters intact on every
    # platform.
    file_name = title.lower().replace(" ", "_") + ".txt"
    with open(data_path / file_name, "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [23]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load the text files saved under data_wiki/. A dedicated name is used so the
# `documents` list built from the CSV articles is not overwritten; otherwise
# re-running the ingestion cell would process the wrong data.
mistral_documents = SimpleDirectoryReader("data_wiki").load_data()

# Same processing steps as for the Llama articles: split, enrich with LLM-generated
# metadata, embed.
transformations = [
    text_splitter,
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    KeywordExtractor(keywords=10, llm=llm),
    OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
]

# No vector store is passed here.
# NOTE(review): the index is presumably held in memory only — confirm.
mistral_index = VectorStoreIndex.from_documents(
    documents=mistral_documents, llm=llm, transformations=transformations
)

# Query engine over the Mistral data: top-2 retrieval, answers written by `llm`.
mistral_query = mistral_index.as_query_engine(
    llm=llm,
    similarity_top_k=2,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

100%|██████████| 5/5 [00:14<00:00, 2.86s/it]
100%|██████████| 5/5 [00:14<00:00, 2.92s/it]
100%|██████████| 5/5 [00:09<00:00, 1.95s/it]


In [None]:
#from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

#documents = SimpleDirectoryReader("data_wiki").load_data()

In [None]:
#from llama_index.core.text_splitter import TokenTextSplitter

#text_splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

In [None]:
#from llama_index.core.extractors import (
# SummaryExtractor,
# QuestionsAnsweredExtractor,
# KeywordExtractor,
#)
#from llama_index.embeddings.openai import OpenAIEmbedding
#from llama_index.core.ingestion import IngestionPipeline
#
#transformations = [
# text_splitter,
# QuestionsAnsweredExtractor(questions=3, llm=llm),
# SummaryExtractor(summaries=["prev", "self"], llm=llm),
# KeywordExtractor(keywords=10, llm=llm),
# OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
#]
#
#mistral_index = VectorStoreIndex.from_documents(
# documents=documents, llm=llm, transformations=transformations
#)

In [None]:
#mistral_query = mistral_index.as_query_engine(
# llm=llm,
# similarity_top_k=2,
# embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
#)

In [24]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticSingleSelector, LLMSingleSelector
from llama_index.core.tools import QueryEngineTool
from llama_index.core import VectorStoreIndex, SummaryIndex

# initialize tools
# Each tool gets an explicit, unique name. Tools created without a name share the
# same default name, which makes them indistinguishable to a function-calling
# agent that looks tools up by name.
llama_tool = QueryEngineTool.from_defaults(
    query_engine=llama_query_engine,
    name="LLamaTool",
    description="Useful for questions about the LLama LLM created by Meta",
)
mistral_tool = QueryEngineTool.from_defaults(
    query_engine=mistral_query,
    name="MistralTool",
    description="Useful for questions about the Mistral LLM created by Mistral AI",
)

# initialize router query engine (single selection, pydantic):
# the selector picks exactly one tool per query, based on the tool descriptions.
query_engine = RouterQueryEngine(
    selector=PydanticSingleSelector.from_defaults(),
    query_engine_tools=[
        llama_tool,
        mistral_tool,
    ],
)

In [25]:
# Send a Llama question through the router and print the answer.
res = query_engine.query(
 "What is the LLama model?",
)
print(res.response)

'Llama 2 is a suite of large language models, spanning 7 billion to 70 billion parameters, trained and fine-tuned for improved performance. \n'

In [26]:
# Inspect the source nodes behind the routed answer.
# The "title" lookup relies on metadata set for the CSV articles.
# NOTE(review): nodes from the Mistral index may not carry a "title" key — confirm
# before reusing this loop.
for src in res.source_nodes:
 print("Node ID\t", src.node_id)
 print("Title\t", src.metadata["title"])
 print("Text\t", src.text)
 print("Score\t", src.score)
 print("Metadata\t", src.metadata)
 print("-_" * 20)  # visual separator between nodes

Node ID	 18dcfeee-ebbc-476f-a4d9-042b26c38aa2
Title	 Beyond GPT-4: What's New?
Text	 LLM Variants and Meta's Open Source Before shedding light on four major trends, I'd share the latest Meta's Llama 2 and Code Llama. Meta's Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2's superior performance over most extant open-source chat models. Human-centric evaluations, focusing on safety and utility metrics, positioned Llama 2-Chat as a potential contender against proprietary, closed-source counterparts. The development trajectory of Llama 2 emphasized rigorous fine-tuning methodologies. Meta's transparent delineation of these processes aims to catalyze community-driven advancements in LLMs, underscoring a commitment to collaborative and responsible 

In [27]:
# Send a Mistral question through the router and print the answer.
res = query_engine.query("What is the Mistral model?")
print(res.response)

'Mistral is a French startup that specializes in developing language models. They have released a variety of models, some open-source and some accessible only through an API. Their models are known for their efficiency and strong performance, particularly in multilingual capabilities and instruction following. \n'

In [28]:
# Inspect the source nodes behind the Mistral answer (id, text and score only).
for src in res.source_nodes:
 print("Node ID\t", src.node_id)
 print("Text\t", src.text)
 print("Score\t", src.score)
 print("-_" * 20)  # visual separator between nodes

Node ID	 a5dcf99d-0e3f-4c9a-b003-9766cdbad1c1
Text	 fundraising of €105 million ($117 million) with investors including the American fund Lightspeed Venture Partners, Eric Schmidt, Xavier Niel and JCDecaux. The valuation is then estimated by the Financial Times at €240 million ($267 million).
On 27 September 2023, the company made its language processing model “Mistral 7B” available under the free Apache 2.0 license. This model has 7 billion parameters, a small size compared to its competitors.
On 10 December 2023, Mistral AI announced that it had raised €385 million ($428 million) as part of its second fundraising. This round of financing notably involves the Californian fund Andreessen Horowitz, BNP Paribas and the software publisher Salesforce.
On 11 December 2023, the company released the Mixtral 8x7B model with 46.7 billion parameters but using only 12.9 billion per token thanks to the mixture of experts architecture. The model masters 5 languages (French, Spanish, Italian, Englis

# OpenAI Agent

In [29]:
from llama_index.agent.openai import OpenAIAgent

In [30]:
# System prompt for the OpenAI agent. It defines the AI-teacher persona and how user
# questions are rewritten into tool inputs. It also requires that answers rely only on
# tool output, are formatted in Markdown, and politely decline non-AI questions.
system_message_openai_agent = """You are an AI teacher, answering questions from students of an applied AI course on Large Language Models (LLMs or llm) and Retrieval Augmented Generation (RAG) for LLMs. Topics covered include training models, fine-tuning models, giving memory to LLMs, prompting tips, hallucinations and bias, vector databases, transformer architectures, embeddings, RAG frameworks, Langchain, LlamaIndex, making LLMs interact with tools, AI agents, reinforcement learning with human feedback. Questions should be understood in this context.

Your answers are aimed to teach students, so they should be complete, clear, and easy to understand.

Use the available tools to gather insights pertinent to the field of AI. Always use two tools at the same time. These tools accept a string (a user query rewritten as a statement) and return informative content regarding the domain of AI.
e.g:
User question: 'How can I fine-tune an LLM?'
Input to the tool: 'Fine-tuning an LLM'

User question: How can quantize an LLM?
Input to the tool: 'Quantization for LLMs'

User question: 'Teach me how to build an AI agent"'
Input to the tool: 'Building an AI Agent'

Only some information returned by the tools might be relevant to the question, so ignore the irrelevant part and answer the question with what you have.

Your responses are exclusively based on the output provided by the tools. Refrain from incorporating information not directly obtained from the tool's responses.

When the conversation deepens or shifts focus within a topic, adapt your input to the tools to reflect these nuances. This means if a user requests further elaboration on a specific aspect of a previously discussed topic, you should reformulate your input to the tool to capture this new angle or more profound layer of inquiry.

Provide comprehensive answers, ideally structured in multiple paragraphs, drawing from the tool's variety of relevant details. The depth and breadth of your responses should align with the scope and specificity of the information retrieved.

Should the tools repository lack information on the queried topic, politely inform the user that the question transcends the bounds of your current knowledge base, citing the absence of relevant content in the tool's documentation.

At the end of your answers, always invite the students to ask deeper questions about the topic if they have any. Make sure to reformulate the question to the tool to capture this new angle or more profound layer of inquiry.

Do not refer to the documentation directly, but use the information provided within it to answer questions.

If code is provided in the information, share it with the students. It's important to provide complete code blocks so they can execute the code when they copy and paste them.

Make sure to format your answers in Markdown format, including code blocks and snippets.

Politely reject questions not related to AI, while being cautious not to reject unfamiliar terms or acronyms too quickly."""

In [31]:
from llama_index.llms.openai import OpenAI

# Use a dedicated name for the GPT-4o model instead of rebinding `llm`. Earlier cells
# read `llm` when they are re-run, so it must keep referring to the same model.
openai_llm = OpenAI(model="gpt-4o")

# Agent that can call both query-engine tools and follows the system prompt above.
agent = OpenAIAgent.from_tools(
    llm=openai_llm,
    tools=[llama_tool, mistral_tool],
    system_prompt=system_message_openai_agent,
)

In [32]:
# Ask the agent about Llama and print its answer.
response = agent.chat("What is the LLama model?")
print(response.response)

The **LLaMA (Large Language Model Meta AI)** is a large language model developed by Meta AI. It is designed to perform a variety of natural language processing tasks by leveraging a vast amount of training data and sophisticated neural network architectures.

### Key Points about LLaMA:

1. **Developer**: Meta AI, the artificial intelligence research division of Meta (formerly Facebook).
2. **Model Size**: One of the notable versions is the LLaMA 70B, which indicates it has 70 billion parameters.
3. **Performance**: The LLaMA models are designed to be competitive with other state-of-the-art language models. However, in some benchmarks, models like Mistral AI's Mixtral 8x7B have been noted to outperform LLaMA 70B.
4. **Variants**: There are different versions of the LLaMA model, including LLaMA-2, which represents an evolution or improvement over the original LLaMA models.

LLaMA models are part of the broader trend in AI research to develop increasingly powerful and capable language mo

In [33]:
# Ask the same agent about Mistral and print its answer.
response = agent.chat("What is the Mistral model?")
print(response.response)

The **Mistral model** refers to a range of large language models (LLMs) developed by Mistral AI. These models are designed to perform a variety of natural language processing tasks and are available in both open-source and API-only formats.

### Key Points about Mistral Models:

1. **Developer**: Mistral AI, a company focused on developing advanced language models.
2. **Model Variants**:
 - **Open-Source Models**:
 - **Mistral 7B**: A general-purpose language model.
 - **Mixtral 8x7B**: A model that combines multiple smaller models to enhance performance.
 - **Mixtral 8x22B**: Another composite model with a larger parameter count.
 - **Mathstral 7B**: Specialized for tasks related to STEM (Science, Technology, Engineering, and Mathematics).
 - **Codestral Mamba 7B**: Tailored for code generation tasks.
 - **API-Only Models**:
 - **Mistral Small**
 - **Mistral Medium**
 - **Mistral Large**
 
3. **Performance**: Mistral models are designed to be competitive with other leading LLMs such a

In [34]:
# Off-topic request: the system prompt asks the agent to politely reject non-AI questions.
response = agent.chat("Write the recipe for a chocolate cake.")
print(response.response)

I'm here to help with questions related to AI, particularly in the context of Large Language Models (LLMs) and related technologies. If you have any questions about these topics, feel free to ask!

For non-AI related queries, such as recipes, you might want to consult a cooking website or a recipe book. If you have any questions about AI, please let me know!


# Route code-related questions to GPT-4o and the remaining questions to Gemini

In [48]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.llms.openai import OpenAI
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticSingleSelector
from llama_index.core.tools import QueryEngineTool

# initialize LLMs
gpt_4o_llm = OpenAI(model="gpt-4o")
gemini_llm = Gemini(model="models/gemini-1.5-flash", temperature=1, max_tokens=512)

# define query engines
# Both engines search the same Llama vector index with the same retrieval settings;
# they differ only in the LLM that writes the answer.
llama_query_engine_code = vector_index.as_query_engine(
    llm=gpt_4o_llm,
    similarity_top_k=3,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)
llama_query_engine_rest = vector_index.as_query_engine(
    llm=gemini_llm,
    similarity_top_k=3,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small", mode="text_search"),
)

# define tools for Llama
# The descriptions tell the agent which tool fits which kind of question; the unique
# names identify the tool that was called.
llama_tool_code = QueryEngineTool.from_defaults(
    query_engine=llama_query_engine_code,
    description="Useful for code-related questions about the LLama LLM created by Meta",
    name="LLamaCodeTool",
)
llama_tool_rest = QueryEngineTool.from_defaults(
    query_engine=llama_query_engine_rest,
    description="Useful for non-code-related questions about the LLama LLM created by Meta",
    name="LLamaGeneralTool",
)

# Initialize the OpenAIAgent with the system message and the two tools.
# No RouterQueryEngine is involved: the agent's own LLM decides which tool(s) to call.
agent = OpenAIAgent.from_tools(
    llm=gpt_4o_llm,  # LLM that drives the agent
    tools=[llama_tool_code, llama_tool_rest],
    system_prompt=system_message_openai_agent,
)

I0000 00:00:1723473141.439669 5318658 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [53]:
# Test the agent with a code-related question and list the tools it called.
response = agent.chat("How do I fine-tune the LLama model? Write the code for it.")
for source in response.sources:
 print(source.tool_name)

LLamaGeneralTool
LLamaCodeTool


In [54]:
# Test the agent with a non-code question and list the tools it called.
response = agent.chat("What is the relationship between Llama and Meta?")
for source in response.sources:
 print(source.tool_name)

LLamaGeneralTool
