tregu0458 committed on
Commit
daa68f6
1 Parent(s): e385e29

Upload pdf_2_dify_workflow.ipynb

Browse files
Files changed (1) hide show
  1. pdf_2_dify_workflow.ipynb +241 -0
pdf_2_dify_workflow.ipynb ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "id": "2RCxpRzpqPrB"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "!pip install gradio unstructured[pdf] langchain-community"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "source": [
31
+ "import getpass\n",
32
+ "\n",
33
+ "DIFY_BASE_URL = getpass.getpass(\"DIFY_BASE_URL:\")\n",
34
+ "DIFY_API_KEY_MYWORKFLOW = getpass.getpass(\"DIFY_API_KEY_MYWORKFLOW: \")\n",
35
+ "\n"
36
+ ],
37
+ "metadata": {
38
+ "id": "iDbRDVzHqZh8"
39
+ },
40
+ "execution_count": null,
41
+ "outputs": []
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "source": [
46
+ "import requests\n",
47
+ "import json\n",
48
+ "url = DIFY_BASE_URL + \"/workflows/run\"\n",
49
+ "\n",
50
+ "headers = {\n",
51
+ " \"Content-Type\": \"application/json\",\n",
52
+ " \"Authorization\": f\"Bearer {DIFY_API_KEY_MYWORKFLOW}\"\n",
53
+ "}\n",
54
+ "\n",
55
+ "data = {\n",
56
+ " \"inputs\": {\n",
57
+ " \"url\":\"\",\n",
58
+ " \"knowledge\":\"\"\"\n",
59
+ " 本作の悪役。千年以上前に生まれた最初の鬼。鬼達の絶対的支配者で、自身の血を人間に与え大量の鬼を作り出した。炭治郎の家族を殺し、禰󠄀豆子を鬼に変えた仇である。\n",
60
+ "鬼達を血に仕込んだ呪いで支配し、「あの方」と呼ばれ恐れられている。外見や攻撃は自由自在で、不死身の鬼を殺すことができる。性格は冷酷非情かつ支配的で、自らの意志に沿わない者は決して許さず、忠実に従っていた下弦の鬼達を些細なことで何ら躊躇なく惨殺したり、報告に来た猗窩座に理不尽な叱責を与えるなどしている。珠世からはその人物像を「いつも何かに怯えている臆病者」と皮肉られている。癇癪で暴力を振るったり、自分を棚に上げた言動をすることも多い。\n",
61
+ "\n",
62
+ " \"\"\",\n",
63
+ " },\n",
64
+ " \"query\": \"\", # クエリ(オプション)\n",
65
+ " \"response_mode\": \"streaming\", # ストリーミング応答\n",
66
+ " \"user\": \"abc_123\", # ユーザーID\n",
67
+ "}\n",
68
+ "\n",
69
+ "# Send the request exactly once; every POST starts a new workflow run.\n",
70
+ "with requests.post(url, headers=headers, json=data, stream=True) as response:\n",
71
+ "    response.raise_for_status()  # Abort on HTTP errors before reading the stream\n",
72
+ "\n",
73
+ "    assistant_message = \"\"\n",
74
+ "    outputs = {}\n",
75
+ "\n",
76
+ "    # Server-sent events are separated by a blank line.\n",
77
+ "    for chunk in response.iter_lines(delimiter=b\"\\n\\n\"):\n",
78
+ "        chunk_data = chunk.decode(\"utf-8\").strip() if chunk else \"\"\n",
79
+ "        if not chunk_data.startswith(\"data:\"):\n",
80
+ "            continue  # Ignore empty chunks and non-data fields\n",
81
+ "        json_data = chunk_data[len(\"data:\"):].strip()  # The space after \"data:\" is optional\n",
82
+ "        if not json_data:\n",
83
+ "            continue\n",
84
+ "        result = json.loads(json_data)\n",
85
+ "        event = result.get(\"event\")\n",
86
+ "        event_data = result.get(\"data\") or {}  # \"data\" may be missing or null\n",
87
+ "        if event == \"text_chunk\":\n",
88
+ "            answer = str(event_data.get(\"text\", \"\"))\n",
89
+ "            assistant_message += answer\n",
90
+ "            print(answer, end=\"\", flush=True)\n",
91
+ "        elif event == \"workflow_finished\":\n",
92
+ "            outputs = event_data\n",
93
+ "print(assistant_message, outputs)"
94
+ ],
95
+ "metadata": {
96
+ "id": "_1LzsFX7rve1"
97
+ },
98
+ "execution_count": null,
99
+ "outputs": []
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "source": [
104
+ "import gradio as gr\n",
105
+ "import requests\n",
106
+ "from langchain_community.document_loaders import UnstructuredPDFLoader\n",
107
+ "import json\n",
108
+ "\n",
109
+ "def run_workflow(message):\n",
110
+ "    \"\"\"Extract text from an uploaded PDF and stream a Dify workflow run over it.\n",
111
+ "\n",
112
+ "    Args:\n",
113
+ "        message (dict): MultimodalTextbox value; 'files' is read, 'text' is not used.\n",
114
+ "\n",
115
+ "    Yields:\n",
116
+ "        tuple: (raw_text, assistant_message, data), one item per output component.\n",
117
+ "            Errors are yielded as well: a generator's return value never reaches the consumer.\n",
118
+ "    \"\"\"\n",
119
+ "    try:\n",
120
+ "        files = message.get('files') or []\n",
121
+ "        # Check before indexing so an empty upload gives a message, not an IndexError.\n",
122
+ "        if not files:\n",
123
+ "            yield \"PDFファイルを選択してください。\", \"\", {}\n",
124
+ "            return\n",
125
+ "        # NOTE(review): some Gradio versions pass dicts with a 'path' key -- confirm.\n",
126
+ "        file = files[0]['path'] if isinstance(files[0], dict) else files[0]\n",
127
+ "\n",
128
+ "        # Load the PDF and join every returned document, not just the first one.\n",
129
+ "        documents = UnstructuredPDFLoader(file).load()\n",
130
+ "        raw_text = \"\\n\\n\".join(doc.page_content for doc in documents)\n",
131
+ "\n",
132
+ "        yield raw_text, \"loading...\", {}\n",
133
+ "\n",
134
+ "        url = DIFY_BASE_URL + \"/workflows/run\"\n",
135
+ "        headers = {\n",
136
+ "            \"Content-Type\": \"application/json\",\n",
137
+ "            \"Authorization\": f\"Bearer {DIFY_API_KEY_MYWORKFLOW}\"\n",
138
+ "        }\n",
139
+ "        payload = {\n",
140
+ "            \"inputs\": {\"url\": \"\", \"knowledge\": raw_text},\n",
141
+ "            \"query\": \"\",\n",
142
+ "            \"response_mode\": \"streaming\",\n",
143
+ "            \"user\": \"abc_123\",\n",
144
+ "        }\n",
145
+ "\n",
146
+ "        assistant_message = \"\"\n",
147
+ "        outputs = {}\n",
148
+ "\n",
149
+ "        # The context manager releases the streaming connection even on errors.\n",
150
+ "        with requests.post(url, headers=headers, json=payload, stream=True) as response:\n",
151
+ "            response.raise_for_status()\n",
152
+ "            # Server-sent events are separated by a blank line.\n",
153
+ "            for chunk in response.iter_lines(delimiter=b\"\\n\\n\"):\n",
154
+ "                chunk_data = chunk.decode(\"utf-8\").strip() if chunk else \"\"\n",
155
+ "                if not chunk_data.startswith(\"data:\"):\n",
156
+ "                    continue\n",
157
+ "                json_data = chunk_data[len(\"data:\"):].strip()\n",
158
+ "                if not json_data:\n",
159
+ "                    continue\n",
160
+ "                result = json.loads(json_data)\n",
161
+ "                event = result.get(\"event\")\n",
162
+ "                event_data = result.get(\"data\") or {}  # \"data\" may be missing or null\n",
163
+ "                if event == \"text_chunk\":\n",
164
+ "                    assistant_message += str(event_data.get(\"text\", \"\"))\n",
165
+ "                    yield raw_text, assistant_message, event_data\n",
166
+ "                elif event == \"workflow_finished\":\n",
167
+ "                    outputs = event_data\n",
168
+ "                    yield raw_text, assistant_message, outputs\n",
169
+ "\n",
170
+ "    except Exception as e:\n",
171
+ "        error_message = str(e)\n",
172
+ "        print(f\"Error: {error_message}\")\n",
173
+ "        # Yield (not return) so the error is actually delivered to the outputs.\n",
174
+ "        yield \"error\", error_message, {}\n",
175
+ "\n",
176
+ "# Gradioインターフェイスの設定\n",
177
+ "iface = gr.Interface(\n",
178
+ " fn=run_workflow,\n",
179
+ " inputs=[gr.MultimodalTextbox(label=\"PDFファイルをアップロード\", file_types=[\".pdf\"], interactive=True)],\n",
180
+ " outputs=[\n",
181
+ " gr.Textbox(label=\"生テキスト\", show_copy_button=True, max_lines=5),\n",
182
+ " gr.Markdown(),\n",
183
+ " gr.JSON()\n",
184
+ " ],\n",
185
+ " title=\"PDF to Dify Workflow\",\n",
186
+ " description=\"PDFファイルを入力すると、Dify APIのワークフローによって処理された結果が表示されます。\",\n",
187
+ " article=\"\"\"\n",
188
+ "\n",
189
+ " © 2024 @tregu0458. All rights reserved.\n",
190
+ "\n",
191
+ " ## 使用コンポーネント\n",
192
+ " - dify\n",
193
+ " - gradio\n",
194
+ " - langchain_community.document_loaders\n",
195
+ "\n",
196
+ " ## 今回のworkflowの仕様\n",
197
+ " ### 入力\n",
198
+ " - url\n",
199
+ " - knowledge\n",
200
+ " ### 出力\n",
201
+ " - result\n",
202
+ " - row_content\n",
203
+ " - url\n",
204
+ " ### LLM\n",
205
+ " - gemini-1.5-flash\n",
206
+ " ```\n",
207
+ " PDFファイルを入力として受け取り、Dify APIのワークフローを使用してファイルを処理し、結果を返す関数。\n",
208
+ " Args:\n",
209
+ " message (dict): 入力メッセージ。以下のキーを含む辞書。\n",
210
+ " - 'files' (list): アップロードされたPDFファイルのリスト。\n",
211
+ " - 'text' (str): テキストメッセージ。\n",
212
+ " Yields:\n",
213
+ " tuple: 以下の要素を含むタプル。\n",
214
+ " - raw_text (str): PDFファイルから抽出された生テキスト。\n",
215
+ " - assistant_message (str): アシスタントからのメッセージ。\n",
216
+ " - outputs (dict): APIレスポンスのデータ。\n",
217
+ " Returns:\n",
218
+ " tuple: 以下の要素を含むタプル。\n",
219
+ " - status (str): 処理の状態。\"error\" または \"\" (空文字列)。\n",
220
+ " - error_message (str): エラーメッセージ (エラーが発生した場合)。\n",
221
+ " - data (dict): APIレスポンスのデータ。\n",
222
+ " Raises:\n",
223
+ " Exception: 処理中にエラーが発生した場合。\n",
224
+ " Notes:\n",
225
+ " - 関数は非同期的に実行され、処理の進行状況に応じて段階的に結果を返す。\n",
226
+ " - `yield` を使用して、処理の途中経過を表示しながら、最終的な結果を返す。\n",
227
+ " ```\n",
228
+ " \"\"\"\n",
229
+ ")\n",
230
+ "\n",
231
+ "if __name__ == \"__main__\":\n",
232
+ " iface.queue().launch()"
233
+ ],
234
+ "metadata": {
235
+ "id": "yx9f1RwJtFi9"
236
+ },
237
+ "execution_count": null,
238
+ "outputs": []
239
+ }
240
+ ]
241
+ }