Spaces:
Sleeping
Sleeping
File size: 12,272 Bytes
05eeb7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"id": "ffc66704-4649-455a-ba18-39506faac1b7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: langchain in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (0.1.12)Note: you may need to restart the kernel to use updated packages.\n",
"\n",
"Requirement already satisfied: PyYAML>=5.3 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (6.0.1)\n",
"Requirement already satisfied: SQLAlchemy<3,>=1.4 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (2.0.30)\n",
"Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (3.9.5)\n",
"Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (0.6.7)\n",
"Requirement already satisfied: jsonpatch<2.0,>=1.33 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (1.33)\n",
"Requirement already satisfied: langchain-community<0.1,>=0.0.28 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (0.0.38)\n",
"Requirement already satisfied: langchain-core<0.2.0,>=0.1.31 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (0.1.52)\n",
"Requirement already satisfied: langchain-text-splitters<0.1,>=0.0.1 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (0.0.2)\n",
"Requirement already satisfied: langsmith<0.2.0,>=0.1.17 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (0.1.77)\n",
"Requirement already satisfied: numpy<2,>=1 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (1.26.4)\n",
"Requirement already satisfied: pydantic<3,>=1 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (2.7.4)\n",
"Requirement already satisfied: requests<3,>=2 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (2.32.3)\n",
"Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain) (8.3.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n",
"Requirement already satisfied: attrs>=17.3.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.4.1)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n",
"Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.21.3)\n",
"Requirement already satisfied: typing-inspect<1,>=0.4.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n",
"Requirement already satisfied: jsonpointer>=1.9 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from jsonpatch<2.0,>=1.33->langchain) (3.0.0)\n",
"Requirement already satisfied: packaging<24.0,>=23.2 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langchain-core<0.2.0,>=0.1.31->langchain) (23.2)\n",
"Requirement already satisfied: orjson<4.0.0,>=3.9.14 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from langsmith<0.2.0,>=0.1.17->langchain) (3.10.5)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from pydantic<3,>=1->langchain) (0.7.0)\n",
"Requirement already satisfied: pydantic-core==2.18.4 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from pydantic<3,>=1->langchain) (2.18.4)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from pydantic<3,>=1->langchain) (4.12.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from requests<3,>=2->langchain) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from requests<3,>=2->langchain) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from requests<3,>=2->langchain) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from requests<3,>=2->langchain) (2024.6.2)\n",
"Requirement already satisfied: greenlet!=0.4.17 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from SQLAlchemy<3,>=1.4->langchain) (3.0.3)\n",
"Requirement already satisfied: mypy-extensions>=0.3.0 in c:\\users\\dtkmn\\.pyenv\\pyenv-win\\versions\\3.11.0\\lib\\site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n"
]
}
],
"source": [
"pip install langchain"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "c800f63a-d5d0-4bbe-8885-c1c9f55e43ff",
"metadata": {},
"outputs": [],
"source": [
"from langchain_huggingface import HuggingFaceEndpoint"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7489c4dd-3a4b-421a-bc92-91de6721f2d7",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# model_id = \"tiiuae/falcon-7b-instruct\"\n",
"model_id = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n",
"llm_hub = HuggingFaceEndpoint(\n",
" repo_id=model_id,\n",
" temperature=0.1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "d78e9a22-c752-4637-9701-b50374d3cdcc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"load INSTRUCTOR_Transformer\n",
"max_seq_length 512\n"
]
}
],
"source": [
"import torch\n",
"DEVICE = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
"from langchain.embeddings import HuggingFaceInstructEmbeddings\n",
"embeddings = HuggingFaceInstructEmbeddings(\n",
" model_name=\"sentence-transformers/all-MiniLM-L6-v2\", model_kwargs={\"device\": DEVICE}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8955a5c5-8821-414f-bd53-d87c7c77ca6c",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import PyPDFLoader\n",
"loader = PyPDFLoader(\"maths_for_cs.pdf\")\n",
"documents = loader.load()\n",
"\n",
"# Split the document into chunks\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"# Create an embeddings database using Chroma from the split text chunks.\n",
"from langchain.vectorstores import Chroma\n",
"db = Chroma.from_documents(texts, embedding=embeddings)\n",
"\n",
"# --> Build the QA chain, which utilizes the LLM and retriever for answering questions. \n",
"# By default, the vectorstore retriever uses similarity search. \n",
"# If the underlying vectorstore support maximum marginal relevance search, you can specify that as the search type (search_type=\"mmr\").\n",
"# You can also specify search kwargs like k to use when doing retrieval. k represent how many search results send to llm\n",
"from langchain.chains import RetrievalQA\n",
"conversation_retrieval_chain = RetrievalQA.from_chain_type(\n",
" llm=llm_hub,\n",
" chain_type=\"stuff\",\n",
" retriever=db.as_retriever(search_type=\"mmr\", search_kwargs={'k': 6, 'lambda_mult': 0.25}),\n",
" return_source_documents=False,\n",
" input_key = \"question\"\n",
" # chain_type_kwargs={\"prompt\": prompt} # if you are using prompt template, you need to uncomment this part\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "519e6573-278d-4a02-9159-d6f68cac68e2",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'question': 'what is the content inside?', 'chat_history': '', 'result': ' The content inside is the text of a book or document, which is divided into 14 sections or chapters. The text appears to be about probability theory and statistics, and it includes examples, definitions, and propositions. The content is likely intended for students or professionals in the field of statistics or mathematics. The text does not appear to be a specific book or document, but rather a sample or excerpt from a larger work. The content is written in a formal and technical style, suggesting that it is intended for an audience with a strong background in mathematics and statistics. The text includes mathematical notation and symbols, such as the use of Greek letters and mathematical operators. The content appears to be a mix of theoretical and practical information, with examples and definitions provided to illustrate key concepts. Overall, the content appears to be a sample or excerpt from a textbook or academic paper on probability theory and statistics.'}\n"
]
}
],
"source": [
"# Query the model\n",
"global chat_history\n",
"output = conversation_retrieval_chain.invoke({\"question\": \"what is the content inside?\", \"chat_history\": \"\"})\n",
"print(output)\n",
"answer = output[\"result\"]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "21b43f4f-f368-47f3-b975-bf8dbe67c8cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" The content inside is the text of a book or document, which appears to be a statistics or probability textbook. The text discusses concepts such as partitions, propositions, and events of interest, and includes examples and definitions. It does not appear to be a specific chapter or section title. If you are looking for a specific piece of information, you may need to read the text more closely or search for a specific keyword or phrase.\n"
]
}
],
"source": [
"print(answer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08928f9a-c17d-43b4-adc0-9401cb3a19ba",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|