yinkiu602 commited on
Commit
5bbd8cb
·
unverified ·
1 Parent(s): a5bc86d

misc: New collection for new data + upload flow notebook

Browse files

Updated the requirements.txt to prevent the "Client.__init__() got an unexpected keyword argument 'proxies'" error

Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +1 -1
  3. requirements.txt +1 -0
  4. upload.ipynb +219 -0
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  .env
 
2
  **/__pycache__
3
  awesumcare_data
4
  TestData
 
1
  .env
2
+ .venv
3
  **/__pycache__
4
  awesumcare_data
5
  TestData
app.py CHANGED
@@ -33,7 +33,7 @@ llama_index.core.set_global_handler("arize_phoenix")
33
  openai.api_key = os.getenv("OPENAI_API_KEY")
34
 
35
  IS_LOAD_FROM_VECTOR_STORE = True
36
- VDB_COLLECTION_NAME = "demo-v7"
37
  MODEL_NAME = ChatbotVersion.CHATGPT_4O.value
38
 
39
  CHUNK_SIZE = 8191
 
33
  openai.api_key = os.getenv("OPENAI_API_KEY")
34
 
35
  IS_LOAD_FROM_VECTOR_STORE = True
36
+ VDB_COLLECTION_NAME = "demo-v8"
37
  MODEL_NAME = ChatbotVersion.CHATGPT_4O.value
38
 
39
  CHUNK_SIZE = 8191
requirements.txt CHANGED
@@ -6,6 +6,7 @@ python-dotenv==1.0.1
6
  qdrant-client==1.10.1
7
  arize-phoenix==4.14.1
8
  fastapi>=0.80
 
9
  llama-index-vector-stores-qdrant==0.2.14
10
  llama-index-agent-openai==0.2.9
11
  llama-index-embeddings-azure-openai==0.1.11
 
6
  qdrant-client==1.10.1
7
  arize-phoenix==4.14.1
8
  fastapi>=0.80
9
+ httpx==0.27.2
10
  llama-index-vector-stores-qdrant==0.2.14
11
  llama-index-agent-openai==0.2.9
12
  llama-index-embeddings-azure-openai==0.1.11
upload.ipynb ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Awesum Care dataset upload flow.\n",
8
+ "\n",
9
+ "### This section describes the flow for turning information from text to vectors for RAG. The vector db used below is locally hosted. To upload to production, change the qdrant config.\n",
10
+ "\n",
11
+ "1. Put the data into a text file (.pdf/.docx/.txt/.md), then put them into a subdirectory. (/awesumcare_data in this example).\n",
12
+ "2. Convert them to embeddings.\n",
13
+ "3. Verify locally that it works.\n",
14
+ "\n",
15
+ "(If you do not have the old data files)\n",
16
+ "\n",
17
+ "4. Create a duplicate of existing collection.\n",
18
+ "5. Deploy new collection with snapshots and upload new data"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {},
24
+ "source": [
25
+ "### Read the data"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": 1,
31
+ "metadata": {},
32
+ "outputs": [
33
+ {
34
+ "name": "stderr",
35
+ "output_type": "stream",
36
+ "text": [
37
+ "[nltk_data] Downloading package punkt to\n",
38
+ "[nltk_data] C:\\Users\\josh\\AppData\\Roaming\\nltk_data...\n",
39
+ "[nltk_data] Package punkt is already up-to-date!\n",
40
+ "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
41
+ "[nltk_data] C:\\Users\\josh\\AppData\\Roaming\\nltk_data...\n",
42
+ "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
43
+ "[nltk_data] date!\n",
44
+ "[nltk_data] Downloading package punkt to\n",
45
+ "[nltk_data] C:\\Users\\josh\\AppData\\Roaming\\nltk_data...\n",
46
+ "[nltk_data] Package punkt is already up-to-date!\n",
47
+ "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
48
+ "[nltk_data] C:\\Users\\josh\\AppData\\Roaming\\nltk_data...\n",
49
+ "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
50
+ "[nltk_data] date!\n",
51
+ "[nltk_data] Downloading package punkt to\n",
52
+ "[nltk_data] C:\\Users\\josh\\AppData\\Roaming\\nltk_data...\n",
53
+ "[nltk_data] Package punkt is already up-to-date!\n",
54
+ "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
55
+ "[nltk_data] C:\\Users\\josh\\AppData\\Roaming\\nltk_data...\n",
56
+ "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
57
+ "[nltk_data] date!\n"
58
+ ]
59
+ }
60
+ ],
61
+ "source": [
62
+ "from llama_index.core import SimpleDirectoryReader\n",
63
+ "\n",
64
+ "from custom_io import MarkdownReader, UnstructuredReader, default_file_metadata_func\n",
65
+ "\n",
66
+ "dir_reader = SimpleDirectoryReader(\n",
67
+ " \"./awesumcare_data\",\n",
68
+ " file_extractor={\n",
69
+ " \".pdf\": UnstructuredReader(),\n",
70
+ " \".docx\": UnstructuredReader(),\n",
71
+ " \".pptx\": UnstructuredReader(),\n",
72
+ " \".md\": MarkdownReader(),\n",
73
+ " },\n",
74
+ " recursive=True,\n",
75
+ " exclude=[\"*.png\", \"*.pptx\", \"*.docx\", \"*.pdf\"],\n",
76
+ " file_metadata=default_file_metadata_func,\n",
77
+ ")\n",
78
+ "\n",
79
+ "documents = dir_reader.load_data()"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "markdown",
84
+ "metadata": {},
85
+ "source": [
86
+ "Create the embedding client and feed it to the IngestionPipeline"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {},
93
+ "outputs": [
94
+ {
95
+ "name": "stderr",
96
+ "output_type": "stream",
97
+ "text": [
98
+ "WARNING:root:Payload indexes have no effect in the local Qdrant. Please use server Qdrant if you need payload indexes.\n"
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "from llama_index.core import VectorStoreIndex\n",
104
+ "from llama_index.core.ingestion import IngestionPipeline\n",
105
+ "from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding\n",
106
+ "from llama_index.vector_stores.qdrant import QdrantVectorStore\n",
107
+ "\n",
108
+ "\n",
109
+ "import qdrant_client\n",
110
+ "import nest_asyncio\n",
111
+ "\n",
112
+ "client = qdrant_client.QdrantClient(location=\":memory:\")\n",
113
+ "vector_store = QdrantVectorStore(client=client, collection_name=\"test_store\")\n",
114
+ "\n",
115
+ "embedding_client = AzureOpenAIEmbedding(\n",
116
+ " deployment_name=\"text-embedding-ada-002\",\n",
117
+ " api_key=\"\",\n",
118
+ " azure_endpoint=\"\",\n",
119
+ " api_version=\"2024-02-01\",\n",
120
+ ")\n",
121
+ "\n",
122
+ "pipeline = IngestionPipeline(\n",
123
+ " transformations=[\n",
124
+ " embedding_client,\n",
125
+ " ],\n",
126
+ " vector_store=vector_store,\n",
127
+ ")\n",
128
+ "\n",
129
+ "# Need this for the code to run in my jupyter notebook. Not sure if needed in a different env.\n",
130
+ "nest_asyncio.apply()\n",
131
+ "\n",
132
+ "# Ingest directly into a vector db\n",
133
+ "pipeline.run(documents=documents)\n",
134
+ "\n",
135
+ "# Create your index\n",
136
+ "index = VectorStoreIndex.from_vector_store(\n",
137
+ " vector_store=vector_store, embed_model=embedding_client\n",
138
+ ")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "markdown",
143
+ "metadata": {},
144
+ "source": [
145
+ "Verify embedding result:"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 3,
151
+ "metadata": {},
152
+ "outputs": [
153
+ {
154
+ "name": "stdout",
155
+ "output_type": "stream",
156
+ "text": [
157
+ "7. 見證人可以是親人嗎?\n",
158
+ "\n",
159
+ " 見證人不能是遺囑的受益人或受益人的配偶。如果親人是遺囑的見證人,那他們便不能在遺囑中有利益關係,意思是親人不能同時作為遺囑的見證人及受益人。見證人必須是年滿18歲且具有完全行為能力的成年人。\n",
160
+ "18. 在遺囑裡分配共同擁有一物業(長命契),”如何分配所持有的物業”一欄應如何填寫?\n",
161
+ "\n",
162
+ " 在填寫遺囑時,如果涉及共同擁有的物業(例如長命契),需要特別注意如何分配這部分資產。共同擁有的物業通常有兩種形式:聯權共有(長命契)和分權共有(分權契)。\n",
163
+ " ### 聯權共有(長命契):\n",
164
+ " > - 在聯權共有的情況下,當其中一位擁有人去世,其持有的份額會自動轉移給其他聯權共有人,這稱為「生者繼承權」。這種情況下,該物業的份額通常不會在遺囑中分配,因為它自動轉移給其他共有人。\n",
165
+ " ### 分權共有(分權契):\n",
166
+ " > - 在分權共有的情況下,每位共有人擁有物業的特定份額,並且這些份額可以在遺囑中分配給指定的受益人。\n",
167
+ " > - 在填寫遺囑的「如何分配所持有的物業」一欄時,可以就分權共有的物業分配,寫上:我所持有的 [物業地址] 的 [百分比或具體份額],應分配給 [受益人姓名]。\n",
168
+ "8. 遺囑一定需要見證人嗎?沒有見證人的遺囑有效嗎?\n",
169
+ "\n",
170
+ " 是的,遺囑必須有見證人。根據《遺囑條例》(第30章)第5(1)(c)條,一份具有法律效力的遺囑必須在兩名年滿18歲且非受益人的獨立見證人面前簽署和加上日期。如果遺囑沒有見證人,則該遺囑可能會被視為無效。請注意,如果立遺囑人年紀較大,或已開始有腦退化症狀,最好找醫生見證。此外,如果遺產承辦處對非律師製作的遺囑的真實性有任何疑問,可能會要求見證人簽署並作出有關見證遺囑的聲明。\n"
171
+ ]
172
+ }
173
+ ],
174
+ "source": [
175
+ "from qdrant_client.http import models\n",
176
+ "import random\n",
177
+ "\n",
178
+ "# You can comment out this line to reuse the same query vector after a run if you find the results unsatisfactory.\n",
179
+ "# This allows you to compare the results after modifying the document.\n",
180
+ "query_vector=[random.random() for _ in range(1536)]\n",
181
+ "\n",
182
+ "res = client.search(\n",
183
+ " collection_name=\"test_store\",\n",
184
+ " search_params=models.SearchParams(hnsw_ef=128, exact=False),\n",
185
+ " # create a list of random float with 1536 elements\n",
186
+ " query_vector=query_vector,\n",
187
+ " limit=3,\n",
188
+ ")\n",
189
+ "\n",
190
+ "# Need this line, or will have error: \"NameError: name 'null' is not defined\"\n",
191
+ "null = None\n",
192
+ "print(eval(res[0].payload[\"_node_content\"])[\"text\"])\n",
193
+ "print(eval(res[1].payload[\"_node_content\"])[\"text\"])\n",
194
+ "print(eval(res[2].payload[\"_node_content\"])[\"text\"])"
195
+ ]
196
+ }
197
+ ],
198
+ "metadata": {
199
+ "kernelspec": {
200
+ "display_name": ".venv",
201
+ "language": "python",
202
+ "name": "python3"
203
+ },
204
+ "language_info": {
205
+ "codemirror_mode": {
206
+ "name": "ipython",
207
+ "version": 3
208
+ },
209
+ "file_extension": ".py",
210
+ "mimetype": "text/x-python",
211
+ "name": "python",
212
+ "nbconvert_exporter": "python",
213
+ "pygments_lexer": "ipython3",
214
+ "version": "3.11.3"
215
+ }
216
+ },
217
+ "nbformat": 4,
218
+ "nbformat_minor": 2
219
+ }