Spaces:
Sleeping
Sleeping
File size: 12,272 Bytes
05eeb7e |
|
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ffc66704-4649-455a-ba18-39506faac1b7",
"metadata": {},
"outputs": [],
"source": [
"# Install the packages this notebook imports (langchain, langchain_huggingface, torch)\n",
"# plus the backends used further down: pypdf for PyPDFLoader, chromadb for Chroma and\n",
"# sentence-transformers for the embedding model.\n",
"# %pip (instead of a bare `pip`) installs into the environment of the running kernel;\n",
"# -q keeps the long resolver log out of the saved notebook.\n",
"%pip install -q langchain langchain-huggingface sentence-transformers torch pypdf chromadb"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c800f63a-d5d0-4bbe-8885-c1c9f55e43ff",
"metadata": {},
"outputs": [],
"source": [
"from langchain_huggingface import HuggingFaceEndpoint"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7489c4dd-3a4b-421a-bc92-91de6721f2d7",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Model repository handed to HuggingFaceEndpoint.\n",
"# Alternative model id kept for reference: \"tiiuae/falcon-7b-instruct\"\n",
"model_id = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
"\n",
"# LLM used by the retrieval chain built later in the notebook.\n",
"llm_hub = HuggingFaceEndpoint(repo_id=model_id, temperature=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d78e9a22-c752-4637-9701-b50374d3cdcc",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"\n",
"# Run the embedding model on the first GPU when one is visible, otherwise on the CPU.\n",
"DEVICE = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# all-MiniLM-L6-v2 is a plain sentence-transformers model, not an INSTRUCTOR model,\n",
"# so it is loaded through the generic HuggingFaceEmbeddings wrapper rather than\n",
"# HuggingFaceInstructEmbeddings.\n",
"embeddings = HuggingFaceEmbeddings(\n",
"    model_name=\"sentence-transformers/all-MiniLM-L6-v2\",\n",
"    model_kwargs={\"device\": DEVICE},\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8955a5c5-8821-414f-bd53-d87c7c77ca6c",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import RetrievalQA\n",
"from langchain.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.vectorstores import Chroma\n",
"\n",
"# Load the PDF into LangChain documents.\n",
"loader = PyPDFLoader(\"maths_for_cs.pdf\")\n",
"documents = loader.load()\n",
"\n",
"# Split the documents into overlapping chunks.\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
"    chunk_size=1024,\n",
"    chunk_overlap=64,\n",
")\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"# Build a Chroma embeddings database from the chunks.\n",
"db = Chroma.from_documents(texts, embedding=embeddings)\n",
"\n",
"# Build the QA chain, which combines the LLM with a retriever over the database.\n",
"# A vector-store retriever uses similarity search by default; search_type=\"mmr\" selects\n",
"# maximum marginal relevance search instead, when the underlying store supports it.\n",
"# search_kwargs tunes the retrieval: 'k' is how many search results are sent to the LLM.\n",
"conversation_retrieval_chain = RetrievalQA.from_chain_type(\n",
"    llm=llm_hub,\n",
"    chain_type=\"stuff\",\n",
"    retriever=db.as_retriever(\n",
"        search_type=\"mmr\",\n",
"        search_kwargs={'k': 6, 'lambda_mult': 0.25},\n",
"    ),\n",
"    return_source_documents=False,\n",
"    input_key=\"question\",\n",
"    # To use a prompt template, additionally pass: chain_type_kwargs={\"prompt\": prompt}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "519e6573-278d-4a02-9159-d6f68cac68e2",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'question': 'what is the content inside?', 'chat_history': '', 'result': ' The content inside is the text of a book or document, which is divided into 14 sections or chapters. The text appears to be about probability theory and statistics, and it includes examples, definitions, and propositions. The content is likely intended for students or professionals in the field of statistics or mathematics. The text does not appear to be a specific book or document, but rather a sample or excerpt from a larger work. The content is written in a formal and technical style, suggesting that it is intended for an audience with a strong background in mathematics and statistics. The text includes mathematical notation and symbols, such as the use of Greek letters and mathematical operators. The content appears to be a mix of theoretical and practical information, with examples and definitions provided to illustrate key concepts. Overall, the content appears to be a sample or excerpt from a textbook or academic paper on probability theory and statistics.'}\n"
]
}
],
"source": [
"# Query the model.\n",
"# The chain reads the \"question\" key; \"chat_history\" is sent as an empty string and is\n",
"# echoed back unchanged in the returned dict (see the printed output).\n",
"output = conversation_retrieval_chain.invoke({\"question\": \"what is the content inside?\", \"chat_history\": \"\"})\n",
"print(output)\n",
"\n",
"# Keep only the generated answer text for the next cell.\n",
"answer = output[\"result\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21b43f4f-f368-47f3-b975-bf8dbe67c8cc",
"metadata": {},
"outputs": [],
"source": [
"# Show only the answer text extracted from the chain output in the previous cell.\n",
"print(answer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08928f9a-c17d-43b4-adc0-9401cb3a19ba",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|