pkumc committed on
Commit
ae92d51
1 Parent(s): 0080880

Upload folder using huggingface_hub

.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/codeStyles/Project.xml ADDED
@@ -0,0 +1,7 @@
+ <component name="ProjectCodeStyleConfiguration">
+   <code_scheme name="Project" version="173">
+     <ScalaCodeStyleSettings>
+       <option name="MULTILINE_STRING_CLOSING_QUOTES_ON_NEW_LINE" value="true" />
+     </ScalaCodeStyleSettings>
+   </code_scheme>
+ </component>
.idea/codeStyles/codeStyleConfig.xml ADDED
@@ -0,0 +1,5 @@
+ <component name="ProjectCodeStyleConfiguration">
+   <state>
+     <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
+   </state>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" languageLevel="JDK_16" project-jdk-name="Python 3.10 (py3)" project-jdk-type="Python SDK">
+     <output url="file://$PROJECT_DIR$/out" />
+   </component>
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/PyTest.iml" filepath="$PROJECT_DIR$/PyTest.iml" />
+     </modules>
+   </component>
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,215 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="105163d0-50fb-4a71-ba3b-6920eac49287" name="Changes" comment="" />
8
+ <option name="SHOW_DIALOG" value="false" />
9
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
10
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
11
+ <option name="LAST_RESOLUTION" value="IGNORE" />
12
+ </component>
13
+ <component name="CodeStyleSettingsInfer">
14
+ <option name="done" value="true" />
15
+ </component>
16
+ <component name="FileTemplateManagerImpl">
17
+ <option name="RECENT_TEMPLATES">
18
+ <list>
19
+ <option value="Python Script" />
20
+ <option value="HTML File" />
21
+ </list>
22
+ </option>
23
+ </component>
24
+ <component name="HighlightingSettingsPerFile">
25
+ <setting file="file://$PROJECT_DIR$/main.py" root0="FORCE_HIGHLIGHTING" />
26
+ <setting file="file://$PROJECT_DIR$/test1.py" root0="FORCE_HIGHLIGHTING" />
27
+ </component>
28
+ <component name="MarkdownSettingsMigration">
29
+ <option name="stateVersion" value="1" />
30
+ </component>
31
+ <component name="ProjectCodeStyleSettingsMigration">
32
+ <option name="version" value="2" />
33
+ </component>
34
+ <component name="ProjectId" id="1woOJMFsJmGkkv90kso4jUXOCPq" />
35
+ <component name="ProjectViewState">
36
+ <option name="hideEmptyMiddlePackages" value="true" />
37
+ <option name="showLibraryContents" value="true" />
38
+ </component>
39
+ <component name="PropertiesComponent">{
40
+ &quot;keyToString&quot;: {
41
+ &quot;DefaultHtmlFileTemplate&quot;: &quot;HTML File&quot;,
42
+ &quot;last_opened_file_path&quot;: &quot;/Users/machi/IdeaProjects/PyTest&quot;,
43
+ &quot;project.structure.last.edited&quot;: &quot;Project&quot;,
44
+ &quot;project.structure.proportion&quot;: &quot;0.0&quot;,
45
+ &quot;project.structure.side.proportion&quot;: &quot;0.0&quot;,
46
+ &quot;settings.editor.selected.configurable&quot;: &quot;preferences.pluginManager&quot;
47
+ }
48
+ }</component>
49
+ <component name="RecentsManager">
50
+ <key name="CopyFile.RECENT_KEYS">
51
+ <recent name="$PROJECT_DIR$" />
52
+ </key>
53
+ <key name="MoveFile.RECENT_KEYS">
54
+ <recent name="$PROJECT_DIR$/accuracy" />
55
+ <recent name="$PROJECT_DIR$" />
56
+ </key>
57
+ </component>
58
+ <component name="RunManager" selected="Python.test2">
59
+ <configuration name="flask_app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
60
+ <module name="PyTest" />
61
+ <option name="INTERPRETER_OPTIONS" value="" />
62
+ <option name="PARENT_ENVS" value="true" />
63
+ <envs>
64
+ <env name="PYTHONUNBUFFERED" value="1" />
65
+ </envs>
66
+ <option name="SDK_HOME" value="$USER_HOME$/miniconda3/envs/py3/bin/python" />
67
+ <option name="SDK_NAME" value="Python 3.10 (py3)" />
68
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
69
+ <option name="IS_MODULE_SDK" value="true" />
70
+ <option name="ADD_CONTENT_ROOTS" value="true" />
71
+ <option name="ADD_SOURCE_ROOTS" value="true" />
72
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/flask_app.py" />
73
+ <option name="PARAMETERS" value="" />
74
+ <option name="SHOW_COMMAND_LINE" value="false" />
75
+ <option name="EMULATE_TERMINAL" value="false" />
76
+ <option name="MODULE_MODE" value="false" />
77
+ <option name="REDIRECT_INPUT" value="false" />
78
+ <option name="INPUT_FILE" value="" />
79
+ <method v="2" />
80
+ </configuration>
81
+ <configuration name="main" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
82
+ <module name="PyTest" />
83
+ <option name="INTERPRETER_OPTIONS" value="" />
84
+ <option name="PARENT_ENVS" value="true" />
85
+ <envs>
86
+ <env name="PYTHONUNBUFFERED" value="1" />
87
+ </envs>
88
+ <option name="SDK_HOME" value="$USER_HOME$/miniconda3/envs/py2/bin/python" />
89
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
90
+ <option name="IS_MODULE_SDK" value="true" />
91
+ <option name="ADD_CONTENT_ROOTS" value="true" />
92
+ <option name="ADD_SOURCE_ROOTS" value="true" />
93
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/main.py" />
94
+ <option name="PARAMETERS" value="" />
95
+ <option name="SHOW_COMMAND_LINE" value="false" />
96
+ <option name="EMULATE_TERMINAL" value="false" />
97
+ <option name="MODULE_MODE" value="false" />
98
+ <option name="REDIRECT_INPUT" value="false" />
99
+ <option name="INPUT_FILE" value="" />
100
+ <method v="2" />
101
+ </configuration>
102
+ <configuration name="run" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
103
+ <module name="PyTest" />
104
+ <option name="INTERPRETER_OPTIONS" value="" />
105
+ <option name="PARENT_ENVS" value="true" />
106
+ <envs>
107
+ <env name="PYTHONUNBUFFERED" value="1" />
108
+ </envs>
109
+ <option name="SDK_HOME" value="" />
110
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
111
+ <option name="IS_MODULE_SDK" value="true" />
112
+ <option name="ADD_CONTENT_ROOTS" value="true" />
113
+ <option name="ADD_SOURCE_ROOTS" value="true" />
114
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/run.py" />
115
+ <option name="PARAMETERS" value="" />
116
+ <option name="SHOW_COMMAND_LINE" value="false" />
117
+ <option name="EMULATE_TERMINAL" value="false" />
118
+ <option name="MODULE_MODE" value="false" />
119
+ <option name="REDIRECT_INPUT" value="false" />
120
+ <option name="INPUT_FILE" value="" />
121
+ <method v="2" />
122
+ </configuration>
123
+ <configuration name="test1" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
124
+ <module name="PyTest" />
125
+ <option name="INTERPRETER_OPTIONS" value="" />
126
+ <option name="PARENT_ENVS" value="true" />
127
+ <envs>
128
+ <env name="PYTHONUNBUFFERED" value="1" />
129
+ </envs>
130
+ <option name="SDK_HOME" value="$USER_HOME$/miniconda3/envs/py3/bin/python" />
131
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
132
+ <option name="IS_MODULE_SDK" value="false" />
133
+ <option name="ADD_CONTENT_ROOTS" value="true" />
134
+ <option name="ADD_SOURCE_ROOTS" value="true" />
135
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/test1.py" />
136
+ <option name="PARAMETERS" value="" />
137
+ <option name="SHOW_COMMAND_LINE" value="false" />
138
+ <option name="EMULATE_TERMINAL" value="false" />
139
+ <option name="MODULE_MODE" value="false" />
140
+ <option name="REDIRECT_INPUT" value="false" />
141
+ <option name="INPUT_FILE" value="" />
142
+ <method v="2" />
143
+ </configuration>
144
+ <configuration name="test2" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
145
+ <module name="PyTest" />
146
+ <option name="INTERPRETER_OPTIONS" value="" />
147
+ <option name="PARENT_ENVS" value="true" />
148
+ <envs>
149
+ <env name="PYTHONUNBUFFERED" value="1" />
150
+ </envs>
151
+ <option name="SDK_HOME" value="" />
152
+ <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
153
+ <option name="IS_MODULE_SDK" value="true" />
154
+ <option name="ADD_CONTENT_ROOTS" value="true" />
155
+ <option name="ADD_SOURCE_ROOTS" value="true" />
156
+ <option name="SCRIPT_NAME" value="$PROJECT_DIR$/test2.py" />
157
+ <option name="PARAMETERS" value="" />
158
+ <option name="SHOW_COMMAND_LINE" value="false" />
159
+ <option name="EMULATE_TERMINAL" value="false" />
160
+ <option name="MODULE_MODE" value="false" />
161
+ <option name="REDIRECT_INPUT" value="false" />
162
+ <option name="INPUT_FILE" value="" />
163
+ <method v="2" />
164
+ </configuration>
165
+ <list>
166
+ <item itemvalue="Python.flask_app" />
167
+ <item itemvalue="Python.test2" />
168
+ <item itemvalue="Python.test1" />
169
+ <item itemvalue="Python.main" />
170
+ <item itemvalue="Python.run" />
171
+ </list>
172
+ <recent_temporary>
173
+ <list>
174
+ <item itemvalue="Python.test2" />
175
+ <item itemvalue="Python.flask_app" />
176
+ <item itemvalue="Python.test1" />
177
+ <item itemvalue="Python.main" />
178
+ </list>
179
+ </recent_temporary>
180
+ </component>
181
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
182
+ <component name="TaskManager">
183
+ <task active="true" id="Default" summary="Default task">
184
+ <changelist id="105163d0-50fb-4a71-ba3b-6920eac49287" name="Changes" comment="" />
185
+ <created>1629121407956</created>
186
+ <option name="number" value="Default" />
187
+ <option name="presentableId" value="Default" />
188
+ <updated>1629121407956</updated>
189
+ </task>
190
+ <servers />
191
+ </component>
192
+ <component name="XDebuggerManager">
193
+ <breakpoint-manager>
194
+ <breakpoints>
195
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
196
+ <url>file://$USER_HOME$/miniconda3/envs/py3/lib/python3.10/site-packages/datasets/load.py</url>
197
+ <line>1752</line>
198
+ <option name="timeStamp" value="2" />
199
+ </line-breakpoint>
200
+ <line-breakpoint enabled="true" suspend="THREAD" type="python-line">
201
+ <url>file://$USER_HOME$/miniconda3/envs/py3/lib/python3.10/site-packages/datasets/load.py</url>
202
+ <line>1733</line>
203
+ <option name="timeStamp" value="3" />
204
+ </line-breakpoint>
205
+ </breakpoints>
206
+ </breakpoint-manager>
207
+ <watches-manager>
208
+ <configuration name="PythonConfigurationType">
209
+ <watch expression="t.element_spec" />
210
+ <watch expression="t.element_spec" />
211
+ <watch expression="cnt[0]" />
212
+ </configuration>
213
+ </watches-manager>
214
+ </component>
215
+ </project>
PyTest.iml ADDED
@@ -0,0 +1,9 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager" inherit-compiler-output="true">
+     <exclude-output />
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
README.md CHANGED
@@ -1,12 +1,6 @@
  ---
  title: PyTest
- emoji: 📉
- colorFrom: purple
- colorTo: purple
+ app_file: test2.py
  sdk: gradio
- sdk_version: 3.44.3
- app_file: app.py
- pinned: false
+ sdk_version: 3.36.1
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/flask_app.cpython-310.pyc ADDED
Binary file (1.75 kB).
 
__pycache__/test1.cpython-310.pyc ADDED
Binary file (1.04 kB).
 
accuracy/accuracy.py ADDED
@@ -0,0 +1,106 @@
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Accuracy metric."""
15
+
16
+ import datasets
17
+ from sklearn.metrics import accuracy_score
18
+
19
+ import evaluate
20
+
21
+
22
+ _DESCRIPTION = """
23
+ Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
24
+ Accuracy = (TP + TN) / (TP + TN + FP + FN)
25
+ Where:
26
+ TP: True positive
27
+ TN: True negative
28
+ FP: False positive
29
+ FN: False negative
30
+ """
31
+
32
+
33
+ _KWARGS_DESCRIPTION = """
34
+ Args:
35
+ predictions (`list` of `int`): Predicted labels.
36
+ references (`list` of `int`): Ground truth labels.
37
+ normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
38
+ sample_weight (`list` of `float`): Sample weights Defaults to None.
39
+
40
+ Returns:
41
+ accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `False`. A higher score means higher accuracy.
42
+
43
+ Examples:
44
+
45
+ Example 1-A simple example
46
+ >>> accuracy_metric = evaluate.load("accuracy")
47
+ >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
48
+ >>> print(results)
49
+ {'accuracy': 0.5}
50
+
51
+ Example 2-The same as Example 1, except with `normalize` set to `False`.
52
+ >>> accuracy_metric = evaluate.load("accuracy")
53
+ >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
54
+ >>> print(results)
55
+ {'accuracy': 3.0}
56
+
57
+ Example 3-The same as Example 1, except with `sample_weight` set.
58
+ >>> accuracy_metric = evaluate.load("accuracy")
59
+ >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
60
+ >>> print(results)
61
+ {'accuracy': 0.8778625954198473}
62
+ """
63
+
64
+
65
+ _CITATION = """
66
+ @article{scikit-learn,
67
+ title={Scikit-learn: Machine Learning in {P}ython},
68
+ author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
69
+ and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
70
+ and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
71
+ Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
72
+ journal={Journal of Machine Learning Research},
73
+ volume={12},
74
+ pages={2825--2830},
75
+ year={2011}
76
+ }
77
+ """
78
+
79
+
80
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
81
+ class Accuracy(evaluate.Metric):
82
+ def _info(self):
83
+ return evaluate.MetricInfo(
84
+ description=_DESCRIPTION,
85
+ citation=_CITATION,
86
+ inputs_description=_KWARGS_DESCRIPTION,
87
+ features=datasets.Features(
88
+ {
89
+ "predictions": datasets.Sequence(datasets.Value("int32")),
90
+ "references": datasets.Sequence(datasets.Value("int32")),
91
+ }
92
+ if self.config_name == "multilabel"
93
+ else {
94
+ "predictions": datasets.Value("int32"),
95
+ "references": datasets.Value("int32"),
96
+ }
97
+ ),
98
+ reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
99
+ )
100
+
101
+ def _compute(self, predictions, references, normalize=True, sample_weight=None):
102
+ return {
103
+ "accuracy": float(
104
+ accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
105
+ )
106
+ }
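
For context, a minimal usage sketch for the metric script above; this is an illustration, not part of the commit, and it assumes the `evaluate` and `scikit-learn` packages are installed and that the script is loaded from this repo's local accuracy/ folder:

    import evaluate

    # Load the metric from the local script rather than the Hub (assumed local path).
    accuracy = evaluate.load("accuracy/accuracy.py")
    result = accuracy.compute(references=[0, 1, 1, 0], predictions=[0, 1, 0, 0])
    print(result)  # expected: {'accuracy': 0.75}
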
covid.py ADDED
@@ -0,0 +1,128 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import sys
4
+ sys.path.append('/Users/machi/Library/Python/3.8/lib/python/site-packages')
5
+
6
+ import os
7
+ import asyncio
8
+ # from pyppeteer import launcher
9
+ # # Before importing launch, disable --enable-automation to prevent webdriver detection
10
+ # launcher.AUTOMATION_ARGS.remove("--enable-automation")
11
+
12
+ from pyppeteer import launch
13
+ from bs4 import BeautifulSoup
14
+ import re
15
+ import time
16
+
17
+ async def pyppteer_fetchUrl(url):
18
+ browser = await launch({'headless': False,'dumpio':True, 'autoClose':True})
19
+ page = await browser.newPage()
20
+
21
+ # await page.setDefaultNavigationTimeout(60000)
22
+ await page.goto(url)
23
+ await asyncio.wait([page.waitForNavigation()])
24
+ str = await page.content()
25
+ await browser.close()
26
+ return str
27
+
28
+ def fetchUrl(url):
29
+ return asyncio.get_event_loop().run_until_complete(pyppteer_fetchUrl(url))
30
+
31
+ def getPageUrl():
32
+ for page in range(1,5):
33
+ if page == 1:
34
+ yield 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd.shtml'
35
+ else:
36
+ url = 'http://www.nhc.gov.cn/xcs/yqtb/list_gzbd_'+ str(page) +'.shtml'
37
+ yield url
38
+
39
+ def getTitleUrl(html):
40
+
41
+ bsobj = BeautifulSoup(html,'html.parser')
42
+ titleList = bsobj.find('div', attrs={"class":"list"}).ul.find_all("li")
43
+ for item in titleList:
44
+ link = "http://www.nhc.gov.cn" + item.a["href"];
45
+ title = item.a["title"]
46
+ date = item.span.text
47
+ yield title, link, date
48
+
49
+ def getInfo(pat, s):
50
+ res = re.search(pat, s)
51
+ if res:
52
+ return res.group(1)
53
+ return '0'
54
+
55
+ def getContent(html):
56
+
57
+ bsobj = BeautifulSoup(html,'html.parser')
58
+ cnt = bsobj.find('div', attrs={"id":"xw_box"}).find_all("p")
59
+ res = []
60
+
61
+ if cnt:
62
+ # Parse from the first paragraph
63
+ s = cnt[0].text
64
+ res.append(getInfo(r'新增确诊病例(\d+)例', s))
65
+ res.append(getInfo(r'本土病例(\d+)例', s))
66
+ res.append(getInfo(r'新增死亡病例(\d+)例', s))
67
+
68
+ # Parse from the second paragraph
69
+ s = cnt[1].text
70
+ res.append(getInfo(r'新增治愈出院病例(\d+)例', s))
71
+
72
+ # Parse from the fifth paragraph
73
+ s = cnt[4].text
74
+ res.append(getInfo(r'新增无症状感染者(\d+)例', s))
75
+ res.append(getInfo(r'本土(\d+)例', s))
76
+
77
+ return res
78
+
79
+ def saveFile(path, filename, content):
80
+
81
+ if not os.path.exists(path):
82
+ os.makedirs(path)
83
+
84
+ # Save the file
85
+ with open(path + filename + ".txt", 'w', encoding='utf-8') as f:
86
+ f.write(content)
87
+
88
+ if "__main__" == __name__:
89
+ # print(getInfo(r'新增死亡病例(\d+)例', '无新增死亡病例。'))
90
+ # s = '4月28日0—24时,31个省(自治区、直辖市)和新疆生产建设兵团报告新增确诊病例5659例。其中境外输入病例13例(广东3例,北京2例,上海2例,福建2例,黑龙江1例,浙江1例,广西1例,四川1例),含2例由无症状感染者转为确诊病例(浙江1例,福建1例);本土病例5646例(上海5487例,北京47例,吉林42例,浙江31例,山东7例,广东7例,黑龙江4例,江西4例,内蒙古3例,江苏3例,四川3例,河南2例,辽宁1例,福建1例,湖南1例,广西1例,重庆1例,云南1例),含5125例由无症状感染者转为确诊病例(上海5062例,吉林31例,浙江28例,辽宁1例,山东1例,河南1例,云南1例)。新增死亡病例52例,均为本土病例,在上海;无新增疑似病例。'
91
+ # res = re.search( r'新增确诊病例(\d+)例', s)
92
+ # print(res.group(1))
93
+ #
94
+ # res = re.search( r'本土病例.*),', s)
95
+ # print(res.group())
96
+ #
97
+ # res = re.search( r'新增死亡病例\d+例', s)
98
+ # print(res.group())
99
+ #
100
+ # res = re.search( r'新增治愈出院病例\d+例', s)
101
+ # print(res.group())
102
+ #
103
+ with open('/Users/machi/Desktop/covid.csv', 'w') as f:
104
+ header = ','.join(['日期', '新增确诊病例', '本土新增确诊病例', '新增死亡病例', '新增治愈出院病例', '新增无症状感染者', '本土新增无症状感染者'])
105
+ f.write(header + '\n')
106
+
107
+ for url in getPageUrl():
108
+ print(url)
109
+ try:
110
+ s =fetchUrl(url)
111
+ except:
112
+ continue
113
+
114
+ for title,link,date in getTitleUrl(s):
115
+ print(title,link)
116
+ # time.sleep(5)
117
+ try:
118
+ html =fetchUrl(link)
119
+ content = getContent(html)
120
+
121
+ s = ','.join([date] + content)
122
+ f.write(s + '\n')
123
+ print('%s write finish' % date)
124
+ except Exception as e:
125
+ print('%s process failed' % date, e)
126
+ continue
127
+
128
+ # break
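
As a quick illustration of how the getInfo helper in covid.py behaves, here is a small, hypothetical trace using the sample sentence kept in the commented-out block above (not part of the commit):

    import re

    def getInfo(pat, s):
        res = re.search(pat, s)
        return res.group(1) if res else '0'

    s = '...报告新增确诊病例5659例。...'                          # excerpt of the commented sample above
    print(getInfo(r'新增确诊病例(\d+)例', s))                    # -> 5659
    print(getInfo(r'新增死亡病例(\d+)例', '无新增死亡病例。'))    # -> 0 (no digits, falls back to the default)
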
flask_app.py ADDED
@@ -0,0 +1,20 @@
+ from flask import Flask, request
+ from flask_restful import Resource, Api
+
+ app = Flask(__name__)
+ api = Api(app)
+
+ todos = {}
+
+ class TodoSimple(Resource):
+     def get(self, todo_id):
+         return {todo_id: todos[todo_id]}
+
+     def put(self, todo_id):
+         todos[todo_id] = request.form['data']
+         return {todo_id: todos[todo_id]}
+
+ api.add_resource(TodoSimple, '/<string:todo_id>')
+
+ if __name__ == '__main__':
+     app.run(debug=True)
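
A minimal client sketch for the TodoSimple resource defined above, assuming the Flask dev server is running locally on its default port 5000 and that the `requests` package is available (illustration only, not part of the commit):

    import requests

    # Create/update a todo, then read it back.
    resp = requests.put("http://127.0.0.1:5000/todo1", data={"data": "Remember the milk"})
    print(resp.json())   # {'todo1': 'Remember the milk'}

    resp = requests.get("http://127.0.0.1:5000/todo1")
    print(resp.json())   # {'todo1': 'Remember the milk'}
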
main.py ADDED
@@ -0,0 +1,196 @@
1
+ # -*- coding: utf-8 -*-
2
+ # from __future__ import print_function
3
+ import sys
4
+ import tensorflow as tf
5
+ # import tensorflow_datasets as tfds
6
+ import numpy as np
7
+ import json
8
+
9
+ tf.enable_eager_execution()
10
+
11
+ def test():
12
+ # mirrored_strategy = tf.distribute.MirroredStrategy()
13
+ # # Add the mirrored strategy to the config
14
+ # config = tf.estimator.RunConfig(train_distribute=mirrored_strategy, eval_distribute=mirrored_strategy)
15
+ # Pass the config into the model
16
+ regressor = tf.estimator.LinearRegressor(
17
+ feature_columns=[tf.feature_column.numeric_column('feats')],
18
+ optimizer='SGD'
19
+ # ,config=config
20
+ )
21
+ def input_fn():
22
+ dataset = tf.data.Dataset.from_tensors(({"feats":[1.]}, [1.]))
23
+ return dataset.repeat(1000).batch(10)
24
+
25
+ # Train normally, evaluate normally
26
+ regressor.train(input_fn=input_fn
27
+ , steps=20
28
+ )
29
+ regressor.evaluate(input_fn=input_fn
30
+ # , steps=10
31
+ )
32
+
33
+ def parse_from_json(config_path):
34
+ """ parse feature columns from feature config path
35
+
36
+ Args:
37
+ config_path: string, a feature config path
38
+ """
39
+ total = 0
40
+ correct = 0
41
+ with open(config_path, "r") as f:
42
+ config = json.load(f)
43
+
44
+ feature_names = set()
45
+ features = config["features"]
46
+ for feature in features:
47
+ feature_name = feature['feature_name']
48
+ if '#' in feature_name:
49
+ feature_name = feature_name.split('#')[0]
50
+ feature_names.add(feature_name)
51
+ return feature_names
52
+
53
+ #convert model's format from *.pb to *.pbtxt
54
+ def parse_model_2_txt(saved_model_dir ,output_file):
55
+ from tensorflow.python.saved_model import loader_impl
56
+ from google.protobuf import text_format
57
+ saved_model = loader_impl._parse_saved_model(saved_model_dir)
58
+ with open(output_file, 'w') as f:
59
+ f.write(text_format.MessageToString(saved_model))
60
+
61
+ # parse_model_2_txt('/Users/machi/git/internal/starship_galaxy/model_zoo/scheduler/2022q2combo/old', '/Users/machi/git/internal/starship_galaxy/model_zoo/scheduler/2022q2combo/old/saved_model.pbtxt')
62
+
63
+ import os
64
+ def build_serving_input_new():
65
+ import pickle
66
+ with tf.gfile.Open('feature_desc.pkl', mode='rb') as f:
67
+ feature_dec = pickle.load(f)
68
+ sep_placeholder = {}
69
+ for name, desc in feature_dec.items():
70
+ if 'sg_poi_click_time_gap_seq_2d' in name:
71
+ print(desc)
72
+
73
+ # return sep_placeholder
74
+
75
+ def read_schema(file):
76
+ d = {}
77
+ with open(file) as f:
78
+ for line in f:
79
+ line = line.strip()
80
+ fds = line.split(' ')
81
+ d[fds[0]] = fds[1]
82
+ return d
83
+
84
+
85
+
86
+ def sparse_tensor():
87
+ indices_tf = tf.constant([[0, 0], [0, 1], [1, 1], [2, 2]], dtype=tf.int64)
88
+ values_tf = tf.constant([1, 2, 3, 4], dtype=tf.int32)
89
+ dense_shape_tf = tf.constant([3, 3], dtype=tf.int64)
90
+
91
+ sparse_tf = tf.SparseTensor(indices=indices_tf,
92
+ values=values_tf,
93
+ dense_shape=dense_shape_tf)
94
+ dense_tf = tf.sparse_tensor_to_dense(sparse_tf)
95
+
96
+ # print(dense_tf)
97
+
98
+
99
+ user_tf = tf.constant([1, 2, 3], dtype=tf.int32, shape=[3, 1])
100
+
101
+ # Each row is one session, and each row contains a different number of samples. In the example below there are 3 sessions: the 1st session has 3 samples, the 2nd has 2, and the 3rd has 1
102
+ # b holds the non_common features
103
+ b = tf.constant([[1, 2, 1], [0, 3, 2], [0, 0, 4]])
104
+
105
+ # a holds the common feature: one value for each of the 3 sessions
106
+ a = tf.constant([1, 2, 3], shape=[3, 1])
107
+
108
+ # Tile a to the same shape as b
109
+ a = tf.tile(a, tf.constant([1, 3]))
110
+ print(a)
111
+
112
+ # Get the indices of the non-zero elements of b
113
+ indices = tf.where(tf.not_equal(b, 0))
114
+ print(indices)
115
+
116
+ # Concatenate the values of a and b at the non-zero indices, i.e. the samples after expansion
117
+ c = tf.concat(values=[tf.expand_dims(tf.gather_nd(a, indices), axis=1), tf.expand_dims(tf.gather_nd(b, indices), axis=1)], axis=1)
118
+ print(c)
119
+
120
+
121
+ def kkv_attention(query, key, value, mask=None):
122
+ # Transpose key and value matrices
123
+ key_transpose = tf.transpose(key, perm=[0, 2, 1])
124
+ value_transpose = tf.transpose(value, perm=[0, 2, 1])
125
+
126
+ # Compute dot product between query and key
127
+ logits = tf.matmul(query, key_transpose)
128
+
129
+ # Apply mask (if provided) to logits
130
+ if mask is not None:
131
+ logits += mask
132
+
133
+ # Apply softmax activation to obtain attention scores
134
+ attention_scores = tf.nn.softmax(logits, axis=-1)
135
+
136
+ # Apply attention scores to value to obtain context vector
137
+ context_vector = tf.matmul(attention_scores, value_transpose)
138
+
139
+ # Transpose back the output
140
+ context_vector = tf.transpose(context_vector, perm=[0, 2, 1])
141
+
142
+ return context_vector, attention_scores
143
+
144
+ # write kkv attention function
145
+ def write_kkv_attention(query, key, value, mask=None):
146
+ # Transpose key and value matrices
147
+ # key_transpose = tf.transpose(key, perm=[0, 2, 1])
148
+ # value_transpose = tf.transpose(value, perm=[0, 2, 1])
149
+
150
+ # Compute dot product between query and key
151
+ logits = tf.matmul(query, key)
152
+
153
+ # Apply mask (if provided) to logits
154
+ if mask is not None:
155
+ logits += mask
156
+
157
+ # Apply softmax activation to obtain attention scores
158
+ attention_scores = tf.nn.softmax(logits, axis=-1)
159
+
160
+ # Apply attention scores to value to obtain context vector
161
+ context_vector = tf.matmul(attention_scores, value)
162
+
163
+ # Transpose back the output
164
+ # context_vector = tf.transpose(context_vector, perm=[0, 2, 1])
165
+
166
+ return context_vector, attention_scores
167
+
168
+ # test write_kkv_attention
169
+ def test_write_kkv_attention():
170
+ # define query and key matrices
171
+ query = tf.constant([[-0.1250, 0.0000, -0.5000, 0.5000, 0.0000]])
172
+
173
+ key = tf.constant([[ -0.1250, 0.0000, -0.5000, 0.5000, 0.0000],
174
+ [-0.5000, 0.0000, 0.5000, 0.5000, 0.0000],
175
+ [-0.2500, -0.5000, 0.0000, 0.5000, 0.2500],
176
+ [ 0.0000, 0.0000, 0.0000, 0.5000, 0.5000],
177
+ [ 0.5000, 0.5000, 0.0000, -0.5000, 0.5000]])
178
+
179
+ value = tf.constant([[-0.5000, 0.0000, 0.5000, 0.5000, 0.0000],
180
+ [-0.5000, 0.0000, 0.5000, 0.5000, 0.0000],
181
+ [-0.5000, 0.0000, 0.5000, 0.5000, 0.0000],
182
+ [ 0.0000, 0.0000, 0.5000, 0.5000, 0.5000],
183
+ [ 0.5000, 0.5000, 0.0000, -0.5000, 0.5000]])
184
+
185
+
186
+ mask = None
187
+
188
+ # call write_kkv_attention and obtain context vector and attention scores
189
+ context_vector, attention_scores = write_kkv_attention(query, key, value,mask)
190
+
191
+ # print results
192
+ print(context_vector)
193
+ print(attention_scores)
194
+
195
+
196
+ print('123', 1)
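
As a sanity check on the common/non-common feature expansion at module level in main.py, here is the worked trace implied by the constants defined there (values follow directly from that snippet):

    # b (non_common features)   = [[1, 2, 1], [0, 3, 2], [0, 0, 4]]
    # a after tf.tile           = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
    # indices where b != 0      = [[0, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 2]]
    # c (expanded samples)      = [[1, 1], [1, 2], [1, 1], [2, 3], [2, 2], [3, 4]]
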
run.py ADDED
@@ -0,0 +1,119 @@
1
+ import numpy as np
2
+ import torch
3
+ import dgl
4
+ import dgl.function as fn
5
+ import dgl.nn as dglnn
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ class RGCN(nn.Module):
10
+ def __init__(self, in_feats, hid_feats, out_feats, rel_names):
11
+ super().__init__()
12
+ # Instantiate HeteroGraphConv: in_feats is the input feature dimension, out_feats is the output feature dimension, and aggregate is the type of aggregation function
13
+ self.conv1 = dglnn.HeteroGraphConv({
14
+ rel: dglnn.GraphConv(in_feats[rel], hid_feats)
15
+ for rel in rel_names}, aggregate='sum')
16
+ self.conv2 = dglnn.HeteroGraphConv({
17
+ rel: dglnn.GraphConv(hid_feats, out_feats)
18
+ for rel in rel_names}, aggregate='sum')
19
+
20
+ def forward(self, graph, inputs):
21
+ # The input is a dictionary of node features
22
+ h = self.conv1(graph, inputs)
23
+ h = {k: F.relu(v) for k, v in h.items()}
24
+ h = self.conv2(graph, h)
25
+ return h
26
+
27
+ class HeteroDotProductPredictor(nn.Module):
28
+ def forward(self, graph, h, etype):
29
+ # h is the node representation computed for each edge type of the heterogeneous graph (as in section 5.1)
30
+ with graph.local_scope():
31
+ graph.ndata['h'] = h
32
+ graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
33
+ return graph.edges[etype].data['score']
34
+
35
+
36
+ class Model(nn.Module):
37
+ def __init__(self, in_features, hidden_features, out_features, rel_names):
38
+ super().__init__()
39
+ self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
40
+ self.pred = HeteroDotProductPredictor()
41
+ def forward(self, g, neg_g, x, etype):
42
+ h = self.sage(g, x)
43
+ return self.pred(g, h, etype), self.pred(neg_g, h, etype)
44
+
45
+
46
+ def construct_negative_graph(graph, k, etype):
47
+ utype, _, vtype = etype
48
+ src, dst = graph.edges(etype=etype)
49
+ neg_src = src.repeat_interleave(k)
50
+ neg_dst = torch.randint(0, graph.num_nodes(vtype), (len(src) * k,))
51
+ return dgl.heterograph(
52
+ {etype: (neg_src, neg_dst)},
53
+ num_nodes_dict={ntype: graph.num_nodes(ntype) for ntype in graph.ntypes})
54
+
55
+
56
+ def compute_loss(pos_score, neg_score):
57
+ # Margin loss
58
+ n_edges = pos_score.shape[0]
59
+ return (1 - pos_score.unsqueeze(1) + neg_score.view(n_edges, -1)).clamp(min=0).mean()
60
+
61
+
62
+
63
+ n_users = 1000
64
+ n_items = 500
65
+ n_follows = 3000
66
+ n_clicks = 5000
67
+ n_dislikes = 500
68
+ n_hetero_features_user = 10
69
+ n_hetero_features_item = 5
70
+ n_user_classes = 5
71
+ n_max_clicks = 10
72
+
73
+ follow_src = np.random.randint(0, n_users, n_follows)
74
+ follow_dst = np.random.randint(0, n_users, n_follows)
75
+ click_src = np.random.randint(0, n_users, n_clicks)
76
+ click_dst = np.random.randint(0, n_items, n_clicks)
77
+ dislike_src = np.random.randint(0, n_users, n_dislikes)
78
+ dislike_dst = np.random.randint(0, n_items, n_dislikes)
79
+
80
+ hetero_graph = dgl.heterograph({
81
+ ('user', 'follow', 'user'): (follow_src, follow_dst),
82
+ ('user', 'followed-by', 'user'): (follow_dst, follow_src),
83
+ ('user', 'click', 'item'): (click_src, click_dst),
84
+ ('item', 'clicked-by', 'user'): (click_dst, click_src),
85
+ ('user', 'dislike', 'item'): (dislike_src, dislike_dst),
86
+ ('item', 'disliked-by', 'user'): (dislike_dst, dislike_src)})
87
+
88
+ hetero_graph.nodes['user'].data['feature'] = torch.randn(n_users, n_hetero_features_user)
89
+ hetero_graph.nodes['item'].data['feature'] = torch.randn(n_items, n_hetero_features_item)
90
+ hetero_graph.nodes['user'].data['label'] = torch.randint(0, n_user_classes, (n_users,))
91
+ hetero_graph.edges['click'].data['label'] = torch.randint(1, n_max_clicks, (n_clicks,)).float()
92
+ # Randomly generate training-set masks on nodes of type 'user' and edges of type 'click'
93
+ hetero_graph.nodes['user'].data['train_mask'] = torch.zeros(n_users, dtype=torch.bool).bernoulli(0.6)
94
+ hetero_graph.edges['click'].data['train_mask'] = torch.zeros(n_clicks, dtype=torch.bool).bernoulli(0.6)
95
+
96
+ # print(hetero_graph)
97
+ hetero_features_dims = {
98
+ 'follow': n_hetero_features_user,
99
+ 'followed-by': n_hetero_features_user,
100
+ 'click': n_hetero_features_user,
101
+ 'clicked-by': n_hetero_features_item,
102
+ 'dislike': n_hetero_features_user,
103
+ 'disliked-by': n_hetero_features_item
104
+ }
105
+
106
+ k = 5
107
+ model = Model(hetero_features_dims, 20, 5, hetero_graph.etypes)
108
+ user_feats = hetero_graph.nodes['user'].data['feature']
109
+ item_feats = hetero_graph.nodes['item'].data['feature']
110
+ node_features = {'user': user_feats, 'item': item_feats}
111
+ opt = torch.optim.Adam(model.parameters())
112
+ for epoch in range(10):
113
+ negative_graph = construct_negative_graph(hetero_graph, k, ('user', 'click', 'item'))
114
+ pos_score, neg_score = model(hetero_graph, negative_graph, node_features, ('user', 'click', 'item'))
115
+ loss = compute_loss(pos_score, neg_score)
116
+ opt.zero_grad()
117
+ loss.backward()
118
+ opt.step()
119
+ print(loss.item())
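
The training loop above only reports the margin loss. A hypothetical evaluation sketch (not part of the original script) that scores the trained model with ROC-AUC on a freshly sampled negative graph, reusing the names defined in run.py:

    from sklearn.metrics import roc_auc_score

    # Score positive vs. sampled negative 'click' edges with the trained model.
    with torch.no_grad():
        eval_neg = construct_negative_graph(hetero_graph, k, ('user', 'click', 'item'))
        pos_score, neg_score = model(hetero_graph, eval_neg, node_features, ('user', 'click', 'item'))
        scores = torch.cat([pos_score.view(-1), neg_score.view(-1)]).numpy()
        labels = np.concatenate([np.ones(len(pos_score)), np.zeros(len(neg_score))])
        print('click-edge AUC:', roc_auc_score(labels, scores))
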
run.sh ADDED
@@ -0,0 +1 @@
+ python -m main
run_wm_rgcn-20220407.py ADDED
@@ -0,0 +1,577 @@
1
+ """
2
+ Modeling Relational Data with Graph Convolutional Networks
3
+ Paper: https://arxiv.org/abs/1703.06103
4
+ Code: https://github.com/tkipf/relational-gcn
5
+ Difference compared to tkipf/relation-gcn
6
+ * l2norm applied to all weights
7
+ * remove nodes that won't be touched
8
+ """
9
+ import argparse, gc
10
+ import numpy as np
11
+ import time
12
+ import torch as th
13
+ import torch.nn as nn
14
+ import dgl.function as fn
15
+ import torch.nn.functional as F
16
+ import dgl
17
+ import dgl.multiprocessing as mp
18
+ from torch.nn.parallel import DistributedDataParallel
19
+ from dgl import DGLGraph
20
+ from functools import partial
21
+
22
+ from dgl.data.rdf import AIFBDataset
23
+ from src.skeleton.graph_builder import StandaloneGraphBuilder
24
+ from src.skeleton.train_type import SamplingGraphTraining
25
+ from src.application.rgcn.rgcn import RelGraphEmbedLayer, EntityClassify
26
+ from dgl.contrib.hostmap_tensor import HostMapTensor
27
+ from src.skeleton.dataloader import Dataloader
28
+ import tqdm
29
+
30
+ from sklearn.metrics import roc_auc_score
31
+ # from torch.utils.tensorboard import SummaryWriter
32
+
33
+ '''
+ This is a single-machine heterogeneous-graph node classification demo:
+
+ Suitable for:
+ -- graphs with a fairly large amount of data, e.g. 1M-100M nodes and 10M-1B edges.
+
+ class RgcnGraphBuilder is responsible for loading the data
+ class RgcnTrainer is responsible for training and prediction
+ class RgcnTrainingDataLoader is responsible for training-time sampling and iterating over the data
+
+ If users need to make changes, they only need to:
+
+ 1. Modify RgcnGraphBuilder.build_dataset, which separates training, prediction and test data from the DGL graph
+ 2. Modify RgcnTrainer.train, which is responsible for the training logic
+ 3. Modify RgcnTrainer.evaluate, which is responsible for the offline prediction logic
+ 4. Modify RgcnTrainingDataLoader.init, which returns an iterator used to traverse the dataset
+
+ The AIFB dataset is used here for accuracy parity checks (epoch=50, batch_size=128)
+ Community AIFB node classification test accuracy: Final Test Accuracy: 0.9250 | Test loss: 0.3929
+ Platform AIFB node classification test accuracy: Final Test Accuracy: 0.9250 | Test loss: 0.2953
+ '''
54
+ class RgcnGraphBuilder(StandaloneGraphBuilder):
55
+
56
+ def build_dataset(self, g):
57
+
58
+ hg = g
59
+ # category = self.flags.category
60
+ num_classes = self.flags.num_classes
61
+
62
+ num_rels = len(hg.canonical_etypes)
63
+ num_of_ntype = len(hg.ntypes)
64
+
65
+ # train_mask = hg.nodes[category].data.pop(self.flags.train_mask)
66
+ # test_mask = hg.nodes[category].data.pop(self.flags.test_mask)
67
+ # labels = hg.nodes[category].data.pop(self.flags.label)
68
+
69
+ eids = th.arange(g.number_of_edges())
70
+ #eids = np.random.permutation(eids)
71
+ val_size = int(len(eids) * 0.1)
72
+ test_size = int(len(eids) * 0.2)
73
+ # train_size = g.number_of_edges() - val_size - test_size
74
+ # valid_eids = eids[:val_size]
75
+ # test_eids = eids[val_size: val_size + test_size]
76
+ # train_eids = eids[val_size + test_size:]
77
+
78
+ valid_eids = dgl.contrib.HostMapTensor('valid_eids', eids[:val_size])
79
+ test_eids = dgl.contrib.HostMapTensor('test_eids', eids[val_size: val_size + test_size])
80
+ train_eids = dgl.contrib.HostMapTensor('train_eids', eids[val_size + test_size:])
81
+
82
+ # train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
83
+ # test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
84
+
85
+ # val_idx = train_idx
86
+
87
+ node_feats = {}
88
+ for ntype in hg.ntypes:
89
+ if len(hg.nodes[ntype].data) == 0 or self.flags.node_feats is False:
90
+ node_feats[str(hg.get_ntype_id(ntype))] = hg.number_of_nodes(ntype)
91
+ else:
92
+ assert len(hg.nodes[ntype].data) == 1
93
+ feat = hg.nodes[ntype].data.pop(self.flags.feat)
94
+ if feat is not None:
95
+ feats = HostMapTensor(ntype + '__' + self.flags.feat, feat)
96
+ node_feats[str(hg.get_ntype_id(ntype))] = feats
97
+
98
+ # get target category id
99
+ # category_id = len(hg.ntypes)
100
+ # for i, ntype in enumerate(hg.ntypes):
101
+ # if ntype == category:
102
+ # category_id = i
103
+ # print('{}:{}'.format(i, ntype))
104
+
105
+ g = dgl.to_homogeneous(hg)
106
+ ntype_tensor = g.ndata[dgl.NTYPE]
107
+ ntype_tensor.share_memory_()
108
+ etype_tensor = g.edata[dgl.ETYPE]
109
+ etype_tensor = dgl.contrib.HostMapTensor('etype_tensor', etype_tensor)
110
+
111
+ typeid_tensor = g.ndata[dgl.NID]
112
+ typeid_tensor.share_memory_()
113
+
114
+
115
+
116
+
117
+ #ntype_tensor = dgl.contrib.HostMapTensor('ntype_tensor', g.ndata[dgl.NTYPE])
118
+ #etype_tensor = dgl.contrib.HostMapTensor('etype_tensor', g.edata[dgl.ETYPE])
119
+ #typeid_tensor = dgl.contrib.HostMapTensor('typeid_tensor', g.edata[dgl.NID])
120
+
121
+ # node_ids = th.arange(g.number_of_nodes())
122
+
123
+ # # find out the target node ids
124
+ # node_tids = g.ndata[dgl.NTYPE]
125
+ # loc = (node_tids == category_id)
126
+ # target_idx = node_ids[loc]
127
+ # target_idx.share_memory_()
128
+ # train_idx.share_memory_()
129
+ # val_idx.share_memory_()
130
+ # test_idx.share_memory_()
131
+
132
+ # # This is a graph with multiple node types, so we want a way to map
133
+ # # our target node from their global node numberings, back to their
134
+ # # numberings within their type. This is used when taking the nodes in a
135
+ # # mini-batch, and looking up their type-specific labels
136
+ # inv_target = th.empty(node_ids.shape,
137
+ # dtype=node_ids.dtype)
138
+ # inv_target.share_memory_()
139
+ # inv_target[target_idx] = th.arange(0, target_idx.shape[0],
140
+ # dtype=inv_target.dtype)
141
+
142
+ # Create csr/coo/csc formats before launching training processes with multi-gpu.
143
+ # This avoids creating certain formats in each sub-process, which saves momory and CPU.
144
+ g.create_formats_()
145
+
146
+ g = g.shared_memory('g')
147
+
148
+ return g, node_feats, num_of_ntype, num_classes, num_rels, train_eids, valid_eids, test_eids, ntype_tensor, etype_tensor, typeid_tensor
149
+
150
+
151
+ class RgcnTrainer(SamplingGraphTraining):
152
+
153
+ def train(self, g, dataset, device, n_gpus, proc_id, **kwargs):
154
+
155
+ dev_id = -1 if n_gpus == 0 else device.index
156
+ queue = kwargs['queue'] if n_gpus > 1 else None
157
+
158
+ g, node_feats, num_of_ntype, num_classes, num_rels, train_eids, valid_eids, test_eids, ntype_tensor, etype_tensor, typeid_tensor = dataset
159
+
160
+ node_tids = ntype_tensor
161
+ world_size = n_gpus
162
+
163
+ if n_gpus > 0:
164
+
165
+ etype_tensor.uva(device)
166
+
167
+ for key in node_feats:
168
+ if not isinstance(node_feats[key], int):
169
+ node_feats[key].uva(device)
170
+
171
+ if n_gpus == 1:
172
+ g = g.to(device)
173
+
174
+ if n_gpus > 1:
175
+
176
+ g = g.uva(device)
177
+ dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
178
+ master_ip='127.0.0.1', master_port=self.flags.master_port)
179
+
180
+ th.distributed.init_process_group(backend=self.flags.communication_backend,
181
+ init_method=dist_init_method,
182
+ world_size=world_size,
183
+ rank=proc_id)
184
+
185
+ # node features
186
+ # None for one-hot feature, if not none, it should be the feature tensor.
187
+ embed_layer = RelGraphEmbedLayer(dev_id if self.flags.embedding_gpu or not self.flags.dgl_sparse else -1,
188
+ dev_id,
189
+ g.number_of_nodes(),
190
+ node_tids,
191
+ num_of_ntype,
192
+ node_feats,
193
+ self.flags.num_hidden,
194
+ dgl_sparse=self.flags.dgl_sparse)
195
+
196
+ # Set up the objective (loss) function
197
+ loss_fcn = CrossEntropyLoss()
198
+
199
+ # create model
200
+ # all model params are in device.
201
+ model = EntityClassify(dev_id,
202
+ g.number_of_nodes(),
203
+ self.flags.num_hidden,
204
+ num_classes,
205
+ num_rels,
206
+ num_bases=self.flags.num_bases,
207
+ num_hidden_layers=self.flags.num_layers - 2,
208
+ dropout=self.flags.dropout,
209
+ use_self_loop=self.flags.use_self_loop,
210
+ low_mem=self.flags.low_mem,
211
+ layer_norm=self.flags.layer_norm)
212
+
213
+ if n_gpus == 1:
214
+ th.cuda.set_device(dev_id)
215
+ model.cuda(dev_id)
216
+ if self.flags.dgl_sparse:
217
+ embed_layer.cuda(dev_id)
218
+
219
+ elif n_gpus > 1:
220
+ if dev_id >= 0:
221
+ model.cuda(dev_id)
222
+ model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
223
+ if self.flags.dgl_sparse:
224
+ embed_layer.cuda(dev_id)
225
+ if len(list(embed_layer.parameters())) > 0:
226
+ embed_layer = DistributedDataParallel(embed_layer, device_ids=[dev_id], output_device=dev_id)
227
+ else:
228
+ if len(list(embed_layer.parameters())) > 0:
229
+ embed_layer = DistributedDataParallel(embed_layer, device_ids=None, output_device=None)
230
+
231
+ # optimizer
232
+ dense_params = list(model.parameters())
233
+ if self.flags.node_feats:
234
+ if n_gpus > 1:
235
+ dense_params += list(embed_layer.module.embeds.parameters())
236
+ else:
237
+ dense_params += list(embed_layer.embeds.parameters())
238
+ optimizer = th.optim.Adam(dense_params, lr=self.flags.lr, weight_decay=self.flags.l2norm)
239
+
240
+ if self.flags.dgl_sparse:
241
+ all_params = list(model.parameters()) + list(embed_layer.parameters())
242
+ optimizer = th.optim.Adam(all_params, lr=self.flags.lr, weight_decay=self.flags.l2norm)
243
+ if n_gpus > 1 and isinstance(embed_layer, DistributedDataParallel):
244
+ dgl_emb = embed_layer.module.dgl_emb
245
+ else:
246
+ dgl_emb = embed_layer.dgl_emb
247
+ emb_optimizer = dgl.optim.SparseAdam(params=dgl_emb, lr=self.flags.sparse_lr, eps=1e-8) if len(dgl_emb) > 0 else None
248
+ else:
249
+ if n_gpus > 1:
250
+ embs = list(embed_layer.module.node_embeds.parameters())
251
+ else:
252
+ embs = list(embed_layer.node_embeds.parameters())
253
+ emb_optimizer = th.optim.SparseAdam(embs, lr=self.flags.sparse_lr) if len(embs) > 0 else None
254
+
255
+ ntype_tensor = ntype_tensor.to(device)
256
+ # etype_tensor = etype_tensor.to(device)
257
+ typeid_tensor = typeid_tensor.to(device)
258
+
259
+ # train_eids = train_eids.to(device)
260
+ # valid_eids = valid_eids.to(device)
261
+ # test_eids = test_eids.to(device)
262
+
263
+ dataset = train_eids, valid_eids, test_eids, device
264
+ dataloader = RgcnTrainingDataLoader(self.flags).init(g, dataset)
265
+ loader, val_loader, test_loader = dataloader
266
+
267
+ # training loop
268
+ print("start training...")
269
+ forward_time = []
270
+ backward_time = []
271
+
272
+ train_time = 0
273
+ validation_time = 0
274
+ test_time = 0
275
+ last_val_acc = 0.0
276
+ do_test = False
277
+
278
+ for epoch in range(self.flags.num_epochs):
279
+
280
+ if n_gpus > 1:
281
+ loader.set_epoch(epoch)
282
+
283
+ tstart = time.time()
284
+ model.train()
285
+ embed_layer.train()
286
+
287
+ # for i, sample_data in enumerate(loader):
288
+ for i, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(loader):
289
+
290
+ # input_nodes, seeds, blocks = sample_data
291
+ # # map the seed nodes back to their type-specific ids, so that they
292
+ # # can be used to look up their respective labels
293
+ # seeds = inv_target[seeds]
294
+
295
+ for block in blocks:
296
+ gen_norm(block, ntype_tensor, etype_tensor, typeid_tensor)
297
+
298
+ t0 = time.time()
299
+ feats = embed_layer(blocks[0].srcdata[dgl.NID],
300
+ blocks[0].srcdata['ntype'],
301
+ blocks[0].srcdata['type_id'],
302
+ node_feats)
303
+ blocks = [block.long().to(device) for block in blocks]
304
+ # logits = model(blocks, feats)
305
+
306
+ pos_graph = pos_graph.to(device)
307
+ neg_graph = neg_graph.to(device)
308
+ batch_pred = model(blocks, feats)
309
+
310
+ f_step = time.time()
311
+ loss = loss_fcn(batch_pred, pos_graph, neg_graph)
312
+
313
+
314
+ # loss = F.cross_entropy(logits, labels[seeds])
315
+ # writer.add_scalar('loss', loss, global_step)
316
+ t1 = time.time()
317
+ optimizer.zero_grad()
318
+ if emb_optimizer is not None:
319
+ emb_optimizer.zero_grad()
320
+
321
+ loss.backward()
322
+ if emb_optimizer is not None:
323
+ emb_optimizer.step()
324
+ optimizer.step()
325
+ t2 = time.time()
326
+
327
+ forward_time.append(t1 - t0)
328
+ backward_time.append(t2 - t1)
329
+ # train_acc = th.sum(logits.argmax(dim=1) == labels[seeds]).item() / len(seeds)
330
+ if i % 100 == 0 and proc_id == 0:
331
+ print("Train Loss: {:.4f}".
332
+ format(loss.item()))
333
+ # writer.add_scalar('train_acc', train_acc, global_step)
334
+ # global_step += 1
335
+
336
+ print("Epoch {:05d}:{:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
337
+ format(epoch, self.flags.num_epochs, forward_time[-1], backward_time[-1]))
338
+ tend = time.time()
339
+ train_time += (tend - tstart)
340
+
341
+ # val_acc, val_loss, validation_time = self._evaluate(n_gpus, labels, queue, proc_id, model, embed_layer,
342
+ # val_loader, node_feats, inv_target, 'Validation')
343
+
344
+ # do_test = val_acc > last_val_acc
345
+ # last_val_acc = val_acc
346
+
347
+ # if n_gpus > 1:
348
+ # th.distributed.barrier()
349
+ # if proc_id == 0:
350
+ # for i in range(1, n_gpus):
351
+ # queue.put(do_test)
352
+ # else:
353
+ # do_test = queue.get()
354
+
355
+ # if epoch == self.flags.num_epochs - 1 or (epoch > 0 and do_test):
356
+ # test_acc, test_loss, test_time = self._evaluate(n_gpus, labels, queue, proc_id, model, embed_layer,
357
+ # test_loader, node_feats, inv_target, 'Test')
358
+ # if n_gpus > 1:
359
+ # th.distributed.barrier()
360
+
361
+ print("{}/{} Mean forward time: {:4f}".format(proc_id, n_gpus,
362
+ np.mean(forward_time[len(forward_time) // 4:])))
363
+ print("{}/{} Mean backward time: {:4f}".format(proc_id, n_gpus,
364
+ np.mean(backward_time[len(backward_time) // 4:])))
365
+ # if proc_id == 0:
366
+ # print("Final Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss))
367
+ # print("Train {}s, valid {}s, test {}s".format(train_time, validation_time, test_time))
368
+
369
+ def _evaluate(self, n_gpus, labels, queue, proc_id, model, embed_layer,
370
+ data_loader, node_feats, inv_target, mode):
371
+
372
+ tstart = time.time()
373
+ time_cost = 0
374
+ acc = 0
375
+ loss = 0
376
+ logits, seeds = evaluate(model, embed_layer,
377
+ data_loader, node_feats,
378
+ inv_target)
379
+ if queue is not None:
380
+ queue.put((logits, seeds))
381
+
382
+ if proc_id == 0:
383
+ loss, acc = self._collect_eval(n_gpus, labels, queue) if queue is not None else \
384
+ (F.cross_entropy(logits, labels[seeds].cpu()).item(), \
385
+ th.sum(logits.argmax(dim=1) == labels[seeds].cpu()).item() / len(seeds))
386
+
387
+ print("{} Accuracy: {:.4f} | {} loss: {:.4f}".format(mode, acc, mode, loss))
388
+
389
+ tend = time.time()
390
+ time_cost = (tend-tstart)
391
+ return acc, loss, time_cost
392
+
393
+ def _collect_eval(self, n_gpus, labels, queue):
394
+
395
+ eval_logits = []
396
+ eval_seeds = []
397
+ for i in range(n_gpus):
398
+
399
+ log = queue.get()
400
+ eval_l, eval_s = log
401
+ eval_logits.append(eval_l)
402
+ eval_seeds.append(eval_s)
403
+
404
+ eval_logits = th.cat(eval_logits)
405
+ eval_seeds = th.cat(eval_seeds)
406
+ eval_loss = F.cross_entropy(eval_logits, labels[eval_seeds].cpu()).item()
407
+ eval_acc = th.sum(eval_logits.argmax(dim=1) == labels[eval_seeds].cpu()).item() / len(eval_seeds)
408
+ return eval_loss, eval_acc
409
+
410
+ class RgcnTrainingDataLoader(Dataloader):
411
+
412
+ def init(self, g, dataset):
413
+
414
+ train_eids, valid_eids, test_eids, device = dataset
415
+
416
+ # target_idx = target_idx.to(device)
417
+
418
+ # Find out how many GPUs there are
419
+ n_gpus = len(list(map(int, self.flags.gpu.split(','))))
420
+
421
+ # Number of neighbors sampled per layer
422
+ fanouts = [int(fanout) for fanout in self.flags.fanout.split(',')]
423
+
424
+ sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
425
+
426
+ loader = dgl.dataloading.EdgeDataLoader(
427
+ g, train_eids, sampler,
428
+ negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
429
+ batch_size=self.flags.batch_size,
430
+ device=device,
431
+ use_ddp=n_gpus > 1,
432
+ shuffle=True,
433
+ drop_last=False,
434
+ num_workers=self.flags.num_workers)
435
+
436
+ val_loader = dgl.dataloading.EdgeDataLoader(
437
+ g, valid_eids, sampler,
438
+ negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
439
+ batch_size=self.flags.batch_size,
440
+ device=device,
441
+ use_ddp=n_gpus > 1,
442
+ shuffle=False,
443
+ drop_last=False,
444
+ num_workers=self.flags.num_workers)
445
+
446
+ test_loader = dgl.dataloading.EdgeDataLoader(
447
+ g, test_eids, sampler,
448
+ negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
449
+ batch_size=self.flags.batch_size,
450
+ device=device,
451
+ use_ddp=n_gpus > 1,
452
+ shuffle=True,
453
+ drop_last=False,
454
+ num_workers=self.flags.num_workers)
455
+
456
+ # loader = dgl.dataloading.NodeDataLoader(
457
+ # g,
458
+ # target_idx[train_idx],
459
+ # sampler,
460
+ # use_ddp=n_gpus > 1,
461
+ # device=device if self.flags.num_workers == 0 else None,
462
+ # batch_size=self.flags.batch_size,
463
+ # shuffle=True,
464
+ # drop_last=False,
465
+ # num_workers=self.flags.num_workers)
466
+
467
+ # # validation sampler
468
+ # val_loader = dgl.dataloading.NodeDataLoader(
469
+ # g,
470
+ # target_idx[val_idx],
471
+ # sampler,
472
+ # use_ddp=n_gpus > 1,
473
+ # device=device if self.flags.num_workers == 0 else None,
474
+ # batch_size=self.flags.batch_size,
475
+ # shuffle=False,
476
+ # drop_last=False,
477
+ # num_workers=self.flags.num_workers)
478
+
479
+ # # test sampler
480
+ # test_sampler = dgl.dataloading.MultiLayerNeighborSampler([-1] * self.flags.num_layers)
481
+ # test_loader = dgl.dataloading.NodeDataLoader(
482
+ # g,
483
+ # target_idx[test_idx],
484
+ # test_sampler,
485
+ # use_ddp=n_gpus > 1,
486
+ # device=device if self.flags.num_workers == 0 else None,
487
+ # batch_size=self.flags.eval_batch_size,
488
+ # shuffle=False,
489
+ # drop_last=False,
490
+ # num_workers=self.flags.num_workers)
491
+
492
+ return loader, val_loader, test_loader
493
+
494
+
495
+ def gen_norm(g, ntype_tensor, etype_tensor, typeid_tensor):
496
+
497
+ _, v, eid = g.all_edges(form='all')
498
+ _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
499
+ degrees = count[inverse_index]
500
+ norm = th.ones(eid.shape[0], device=eid.device) / degrees
501
+ norm = norm.unsqueeze(1)
502
+ g.edata['norm'] = norm
503
+
504
+ g.srcdata['ntype'] = ntype_tensor[g.srcdata[dgl.NID]]
505
+ g.edata['etype'] = etype_tensor[eid]
506
+ g.srcdata['type_id'] = typeid_tensor[g.srcdata[dgl.NID]]
507
+
508
+
509
+ def evaluate(model, embed_layer, eval_loader, node_feats, inv_target, ntype_tensor, etype_tensor, typeid_tensor):
510
+
511
+ model.eval()
512
+ embed_layer.eval()
513
+ eval_logits = []
514
+ eval_seeds = []
515
+
516
+ with th.no_grad():
517
+ th.cuda.empty_cache()
518
+ for i, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(eval_loader):
519
+
520
+ for block in blocks:
521
+ gen_norm(block, ntype_tensor, etype_tensor, typeid_tensor)
522
+
523
+ feats = embed_layer(blocks[0].srcdata[dgl.NID],
524
+ blocks[0].srcdata['ntype'],
525
+ blocks[0].srcdata['type_id'],
526
+ node_feats)
527
+ logits = model(blocks, feats)
528
+
529
+ loss_fcn = AUC()
530
+ auc = loss_fcn(logits, pos_graph, neg_graph)
531
+ print("valid auc: {:.4f}".
532
+ format(auc.item()))
533
+
534
+ # eval_logits.append(logits.cpu())
535
+
536
+ eval_logits = th.cat(eval_logits)
537
+ eval_seeds = th.cat(eval_seeds)
538
+
539
+ return eval_logits, eval_seeds
540
+
541
+
542
+ class CrossEntropyLoss(nn.Module):
543
+
544
+ def forward(self, block_outputs, pos_graph, neg_graph):
545
+
546
+ with pos_graph.local_scope():
547
+ pos_graph.ndata['h'] = block_outputs
548
+ pos_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
549
+ pos_score = pos_graph.edata['score']
550
+ with neg_graph.local_scope():
551
+ neg_graph.ndata['h'] = block_outputs
552
+ neg_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
553
+ neg_score = neg_graph.edata['score']
554
+
555
+ score = th.cat([pos_score, neg_score])
556
+ label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)]).long()
557
+ loss = F.binary_cross_entropy_with_logits(score, label.float())
558
+ return loss
559
+
560
+
561
+ class AUC(nn.Module):
562
+
563
+ def forward(self, block_outputs, pos_graph, neg_graph):
564
+
565
+ with pos_graph.local_scope():
566
+ pos_graph.ndata['h'] = block_outputs
567
+ pos_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
568
+ pos_score = pos_graph.edata['score']
569
+ with neg_graph.local_scope():
570
+ neg_graph.ndata['h'] = block_outputs
571
+ neg_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
572
+ neg_score = neg_graph.edata['score']
573
+
574
+ score = th.cat([pos_score, neg_score]).numpy()
575
+ label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)]).numpy()
576
+
577
+ return roc_auc_score(label, score)
run_wm_rgcn.py ADDED
@@ -0,0 +1,568 @@
1
+ """
2
+ Modeling Relational Data with Graph Convolutional Networks
3
+ Paper: https://arxiv.org/abs/1703.06103
4
+ Code: https://github.com/tkipf/relational-gcn
5
+ Difference compared to tkipf/relation-gcn
6
+ * l2norm applied to all weights
7
+ * remove nodes that won't be touched
8
+ """
9
+ import argparse, gc
10
+ import numpy as np
11
+ import time
12
+ import torch as th
13
+ import torch.nn as nn
14
+ import dgl.function as fn
15
+ import torch.nn.functional as F
16
+ import dgl
17
+ import dgl.multiprocessing as mp
18
+ from torch.nn.parallel import DistributedDataParallel
19
+ from dgl import DGLGraph
20
+ from functools import partial
21
+
22
+ from dgl.data.rdf import AIFBDataset
23
+ from src.skeleton.graph_builder import StandaloneGraphBuilder
24
+ from src.skeleton.train_type import SamplingGraphTraining
25
+ from src.application.rgcn.rgcn import RelGraphEmbedLayer, EntityClassify
26
+ from dgl.contrib.hostmap_tensor import HostMapTensor
27
+ from src.skeleton.dataloader import Dataloader
28
+ import tqdm
29
+
30
+ from sklearn.metrics import roc_auc_score
31
+ # from torch.utils.tensorboard import SummaryWriter
32
+
33
+ '''
34
+ This is a single-machine heterogeneous-graph node classification demo:
35
+
36
+ Suitable for:
37
+ -- graphs with a relatively large amount of data, e.g. 1M-100M nodes and 10M-1B edges.
38
+
39
+ class RgcnGraphBuilder is responsible for loading the data
40
+ class RgcnTrainer is responsible for training and prediction
41
+ class RgcnTrainingDataLoader is responsible for training-time sampling and iterating over the data
42
+
43
+ If users need to make changes, they only need to:
44
+
45
+ 1. Modify RgcnGraphBuilder.build_dataset, which splits the DGL graph into training, prediction and test data
46
+ 2. Modify RgcnTrainer.train, which implements the training logic
47
+ 3. Modify RgcnTrainer.evaluate, which implements the offline prediction logic
48
+ 4. Modify RgcnTrainingDataLoader.init, which returns an iterator used to traverse the dataset
49
+
50
+ The AIFB dataset is used here for accuracy alignment (epoch=50, batch_size=128)
51
+ Community (open-source) AIFB node classification test-set accuracy: Final Test Accuracy: 0.9250 | Test loss: 0.3929
52
+ Platform AIFB node classification test-set accuracy: Final Test Accuracy: 0.9250 | Test loss: 0.2953
53
+ '''
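For orientation, a minimal, hypothetical sketch of how the three classes described above are meant to be wired together. The constructor arguments and the single-GPU entry values are assumptions (mirroring the `RgcnTrainingDataLoader(self.flags)` call used inside `train`), not a documented platform API:

    import torch as th

    # flags: an argparse.Namespace carrying the options referenced throughout this file (assumed)
    # hg:    the heterogeneous DGL graph to train on (loading it is the platform's job)
    builder = RgcnGraphBuilder(flags)            # assumed to take the parsed flags
    dataset = builder.build_dataset(hg)          # (g, node_feats, ..., ntype/etype/typeid tensors)

    device = th.device('cuda:0') if flags.gpu != '-1' else th.device('cpu')
    trainer = RgcnTrainer(flags)                 # assumed to take the parsed flags as well
    trainer.train(dataset[0], dataset, device, n_gpus=1, proc_id=0)   # single-process run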
54
+ class RgcnGraphBuilder(StandaloneGraphBuilder):
55
+
56
+ def build_dataset(self, g):
57
+
58
+ hg = g
59
+ # category = self.flags.category
60
+ num_classes = self.flags.num_classes
61
+
62
+ num_rels = len(hg.canonical_etypes)
63
+ num_of_ntype = len(hg.ntypes)
64
+
65
+ # train_mask = hg.nodes[category].data.pop(self.flags.train_mask)
66
+ # test_mask = hg.nodes[category].data.pop(self.flags.test_mask)
67
+ # labels = hg.nodes[category].data.pop(self.flags.label)
68
+
69
+ eids = th.arange(g.number_of_edges())
70
+ #eids = np.random.permutation(eids)
71
+ val_size = int(len(eids) * 0.1)
72
+ test_size = int(len(eids) * 0.2)
73
+ # train_size = g.number_of_edges() - val_size - test_size
74
+ valid_eids = eids[:val_size]
75
+ test_eids = eids[val_size: val_size + test_size]
76
+ train_eids = eids[val_size + test_size:]
77
+
78
+
79
+ # train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
80
+ # test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
81
+
82
+ # val_idx = train_idx
83
+
84
+ node_feats = {}
85
+ for ntype in hg.ntypes:
86
+ if len(hg.nodes[ntype].data) == 0 or self.flags.node_feats is False:
87
+ node_feats[str(hg.get_ntype_id(ntype))] = hg.number_of_nodes(ntype)
88
+ else:
89
+ assert len(hg.nodes[ntype].data) == 1
90
+ feat = hg.nodes[ntype].data.pop(self.flags.feat)
91
+ if feat is not None:
92
+ feats = HostMapTensor(ntype + '__' + self.flags.feat, feat)
93
+ node_feats[str(hg.get_ntype_id(ntype))] = feats
94
+
95
+ # get target category id
96
+ # category_id = len(hg.ntypes)
97
+ # for i, ntype in enumerate(hg.ntypes):
98
+ # if ntype == category:
99
+ # category_id = i
100
+ # print('{}:{}'.format(i, ntype))
101
+
102
+ g = dgl.to_homogeneous(hg)
103
+ ntype_tensor = g.ndata[dgl.NTYPE]
104
+ ntype_tensor.share_memory_()
105
+ etype_tensor = g.edata[dgl.ETYPE]
106
+ etype_tensor.share_memory_()
107
+ typeid_tensor = g.ndata[dgl.NID]
108
+ typeid_tensor.share_memory_()
109
+
110
+ #ntype_tensor = dgl.contrib.HostMapTensor('ntype_tensor', g.ndata[dgl.NTYPE])
111
+ #etype_tensor = dgl.contrib.HostMapTensor('etype_tensor', g.edata[dgl.ETYPE])
112
+ #typeid_tensor = dgl.contrib.HostMapTensor('typeid_tensor', g.edata[dgl.NID])
113
+
114
+ # node_ids = th.arange(g.number_of_nodes())
115
+
116
+ # # find out the target node ids
117
+ # node_tids = g.ndata[dgl.NTYPE]
118
+ # loc = (node_tids == category_id)
119
+ # target_idx = node_ids[loc]
120
+ # target_idx.share_memory_()
121
+ # train_idx.share_memory_()
122
+ # val_idx.share_memory_()
123
+ # test_idx.share_memory_()
124
+
125
+ # # This is a graph with multiple node types, so we want a way to map
126
+ # # our target node from their global node numberings, back to their
127
+ # # numberings within their type. This is used when taking the nodes in a
128
+ # # mini-batch, and looking up their type-specific labels
129
+ # inv_target = th.empty(node_ids.shape,
130
+ # dtype=node_ids.dtype)
131
+ # inv_target.share_memory_()
132
+ # inv_target[target_idx] = th.arange(0, target_idx.shape[0],
133
+ # dtype=inv_target.dtype)
134
+
135
+ # Create csr/coo/csc formats before launching training processes with multi-gpu.
136
+    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
137
+ g.create_formats_()
138
+
139
+ g = g.shared_memory('g')
140
+
141
+ return g, node_feats, num_of_ntype, num_classes, num_rels, train_eids, valid_eids, test_eids, ntype_tensor, etype_tensor, typeid_tensor
142
+
143
+
144
+ class RgcnTrainer(SamplingGraphTraining):
145
+
146
+ def train(self, g, dataset, device, n_gpus, proc_id, **kwargs):
147
+
148
+ dev_id = -1 if n_gpus == 0 else device.index
149
+ queue = kwargs['queue'] if n_gpus > 1 else None
150
+
151
+ g, node_feats, num_of_ntype, num_classes, num_rels, train_eids, valid_eids, test_eids, ntype_tensor, etype_tensor, typeid_tensor = dataset
152
+
153
+ node_tids = ntype_tensor
154
+ world_size = n_gpus
155
+
156
+ if n_gpus > 0:
157
+ for key in node_feats:
158
+ if not isinstance(node_feats[key], int):
159
+ node_feats[key].uva(device)
160
+
161
+ if n_gpus == 1:
162
+ g = g.to(device)
163
+
164
+ if n_gpus > 1:
165
+
166
+ g = g.uva(device)
167
+ dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
168
+ master_ip='127.0.0.1', master_port=self.flags.master_port)
169
+
170
+ th.distributed.init_process_group(backend=self.flags.communication_backend,
171
+ init_method=dist_init_method,
172
+ world_size=world_size,
173
+ rank=proc_id)
174
+
175
+ # node features
176
+ # None for one-hot feature, if not none, it should be the feature tensor.
177
+ embed_layer = RelGraphEmbedLayer(dev_id if self.flags.embedding_gpu or not self.flags.dgl_sparse else -1,
178
+ dev_id,
179
+ g.number_of_nodes(),
180
+ node_tids,
181
+ num_of_ntype,
182
+ node_feats,
183
+ self.flags.num_hidden,
184
+ dgl_sparse=self.flags.dgl_sparse)
185
+
186
+ # 设置目标函数
187
+ loss_fcn = CrossEntropyLoss()
188
+
189
+ # create model
190
+ # all model params are in device.
191
+ model = EntityClassify(dev_id,
192
+ g.number_of_nodes(),
193
+ self.flags.num_hidden,
194
+ num_classes,
195
+ num_rels,
196
+ num_bases=self.flags.num_bases,
197
+ num_hidden_layers=self.flags.num_layers - 2,
198
+ dropout=self.flags.dropout,
199
+ use_self_loop=self.flags.use_self_loop,
200
+ low_mem=self.flags.low_mem,
201
+ layer_norm=self.flags.layer_norm)
202
+
203
+ if n_gpus == 1:
204
+ th.cuda.set_device(dev_id)
205
+ #labels = labels.to(dev_id)
206
+ model.cuda(dev_id)
207
+ if self.flags.dgl_sparse:
208
+ embed_layer.cuda(dev_id)
209
+
210
+ elif n_gpus > 1:
211
+ #labels = labels.to(dev_id)
212
+ if dev_id >= 0:
213
+ model.cuda(dev_id)
214
+ model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
215
+ if self.flags.dgl_sparse:
216
+ embed_layer.cuda(dev_id)
217
+ if len(list(embed_layer.parameters())) > 0:
218
+ embed_layer = DistributedDataParallel(embed_layer, device_ids=[dev_id], output_device=dev_id)
219
+ else:
220
+ if len(list(embed_layer.parameters())) > 0:
221
+ embed_layer = DistributedDataParallel(embed_layer, device_ids=None, output_device=None)
222
+
223
+ # optimizer
224
+ dense_params = list(model.parameters())
225
+ if self.flags.node_feats:
226
+ if n_gpus > 1:
227
+ dense_params += list(embed_layer.module.embeds.parameters())
228
+ else:
229
+ dense_params += list(embed_layer.embeds.parameters())
230
+ optimizer = th.optim.Adam(dense_params, lr=self.flags.lr, weight_decay=self.flags.l2norm)
231
+
232
+ if self.flags.dgl_sparse:
233
+ all_params = list(model.parameters()) + list(embed_layer.parameters())
234
+ optimizer = th.optim.Adam(all_params, lr=self.flags.lr, weight_decay=self.flags.l2norm)
235
+ if n_gpus > 1 and isinstance(embed_layer, DistributedDataParallel):
236
+ dgl_emb = embed_layer.module.dgl_emb
237
+ else:
238
+ dgl_emb = embed_layer.dgl_emb
239
+ emb_optimizer = dgl.optim.SparseAdam(params=dgl_emb, lr=self.flags.sparse_lr, eps=1e-8) if len(dgl_emb) > 0 else None
240
+ else:
241
+ if n_gpus > 1:
242
+ embs = list(embed_layer.module.node_embeds.parameters())
243
+ else:
244
+ embs = list(embed_layer.node_embeds.parameters())
245
+ emb_optimizer = th.optim.SparseAdam(embs, lr=self.flags.sparse_lr) if len(embs) > 0 else None
246
+
247
+ ntype_tensor = ntype_tensor.to(device)
248
+ etype_tensor = etype_tensor.to(device)
249
+ typeid_tensor = typeid_tensor.to(device)
250
+ train_eids = train_eids.to(device)
251
+ valid_eids = valid_eids.to(device)
252
+ test_eids = test_eids.to(device)
253
+
254
+ dataset = train_eids, valid_eids, test_eids, device
255
+ dataloader = RgcnTrainingDataLoader(self.flags).init(g, dataset)
256
+ loader, val_loader, test_loader = dataloader
257
+
258
+ # training loop
259
+ print("start training...")
260
+ forward_time = []
261
+ backward_time = []
262
+
263
+ train_time = 0
264
+ validation_time = 0
265
+ test_time = 0
266
+ last_val_acc = 0.0
267
+ do_test = False
268
+
269
+ for epoch in range(self.flags.num_epochs):
270
+
271
+ if n_gpus > 1:
272
+ loader.set_epoch(epoch)
273
+
274
+ tstart = time.time()
275
+ model.train()
276
+ embed_layer.train()
277
+
278
+ # for i, sample_data in enumerate(loader):
279
+ for i, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(loader):
280
+
281
+ # input_nodes, seeds, blocks = sample_data
282
+ # # map the seed nodes back to their type-specific ids, so that they
283
+ # # can be used to look up their respective labels
284
+ # seeds = inv_target[seeds]
285
+
286
+ for block in blocks:
287
+ gen_norm(block, ntype_tensor, etype_tensor, typeid_tensor)
288
+
289
+ t0 = time.time()
290
+ feats = embed_layer(blocks[0].srcdata[dgl.NID],
291
+ blocks[0].srcdata['ntype'],
292
+ blocks[0].srcdata['type_id'],
293
+ node_feats)
294
+ blocks = [block.int().to(device) for block in blocks]
295
+ # logits = model(blocks, feats)
296
+
297
+ pos_graph = pos_graph.to(device)
298
+ neg_graph = neg_graph.to(device)
299
+ batch_pred = model(blocks, feats)
300
+
301
+ f_step = time.time()
302
+ loss = loss_fcn(batch_pred, pos_graph, neg_graph)
303
+
304
+
305
+ # loss = F.cross_entropy(logits, labels[seeds])
306
+ # writer.add_scalar('loss', loss, global_step)
307
+ t1 = time.time()
308
+ optimizer.zero_grad()
309
+ if emb_optimizer is not None:
310
+ emb_optimizer.zero_grad()
311
+
312
+ loss.backward()
313
+ if emb_optimizer is not None:
314
+ emb_optimizer.step()
315
+ optimizer.step()
316
+ t2 = time.time()
317
+
318
+ forward_time.append(t1 - t0)
319
+ backward_time.append(t2 - t1)
320
+ # train_acc = th.sum(logits.argmax(dim=1) == labels[seeds]).item() / len(seeds)
321
+ if i % 100 == 0 and proc_id == 0:
322
+ print("Train Loss: {:.4f}".
323
+ format(loss.item()))
324
+ # writer.add_scalar('train_acc', train_acc, global_step)
325
+ # global_step += 1
326
+
327
+ print("Epoch {:05d}:{:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
328
+ format(epoch, self.flags.num_epochs, forward_time[-1], backward_time[-1]))
329
+ tend = time.time()
330
+ train_time += (tend - tstart)
331
+
332
+ # val_acc, val_loss, validation_time = self._evaluate(n_gpus, labels, queue, proc_id, model, embed_layer,
333
+ # val_loader, node_feats, inv_target, 'Validation')
334
+
335
+ # do_test = val_acc > last_val_acc
336
+ # last_val_acc = val_acc
337
+
338
+ # if n_gpus > 1:
339
+ # th.distributed.barrier()
340
+ # if proc_id == 0:
341
+ # for i in range(1, n_gpus):
342
+ # queue.put(do_test)
343
+ # else:
344
+ # do_test = queue.get()
345
+
346
+ # if epoch == self.flags.num_epochs - 1 or (epoch > 0 and do_test):
347
+ # test_acc, test_loss, test_time = self._evaluate(n_gpus, labels, queue, proc_id, model, embed_layer,
348
+ # test_loader, node_feats, inv_target, 'Test')
349
+ # if n_gpus > 1:
350
+ # th.distributed.barrier()
351
+
352
+ print("{}/{} Mean forward time: {:4f}".format(proc_id, n_gpus,
353
+ np.mean(forward_time[len(forward_time) // 4:])))
354
+ print("{}/{} Mean backward time: {:4f}".format(proc_id, n_gpus,
355
+ np.mean(backward_time[len(backward_time) // 4:])))
356
+ # if proc_id == 0:
357
+ # print("Final Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss))
358
+ # print("Train {}s, valid {}s, test {}s".format(train_time, validation_time, test_time))
359
+
360
+ def _evaluate(self, n_gpus, labels, queue, proc_id, model, embed_layer,
361
+ data_loader, node_feats, inv_target, mode):
362
+
363
+ tstart = time.time()
364
+ time_cost = 0
365
+ acc = 0
366
+ loss = 0
367
+ logits, seeds = evaluate(model, embed_layer,
368
+ data_loader, node_feats,
369
+ inv_target)
370
+ if queue is not None:
371
+ queue.put((logits, seeds))
372
+
373
+ if proc_id == 0:
374
+ loss, acc = self._collect_eval(n_gpus, labels, queue) if queue is not None else \
375
+ (F.cross_entropy(logits, labels[seeds].cpu()).item(), \
376
+ th.sum(logits.argmax(dim=1) == labels[seeds].cpu()).item() / len(seeds))
377
+
378
+ print("{} Accuracy: {:.4f} | {} loss: {:.4f}".format(mode, acc, mode, loss))
379
+
380
+ tend = time.time()
381
+ time_cost = (tend-tstart)
382
+ return acc, loss, time_cost
383
+
384
+ def _collect_eval(self, n_gpus, labels, queue):
385
+
386
+ eval_logits = []
387
+ eval_seeds = []
388
+ for i in range(n_gpus):
389
+
390
+ log = queue.get()
391
+ eval_l, eval_s = log
392
+ eval_logits.append(eval_l)
393
+ eval_seeds.append(eval_s)
394
+
395
+ eval_logits = th.cat(eval_logits)
396
+ eval_seeds = th.cat(eval_seeds)
397
+ eval_loss = F.cross_entropy(eval_logits, labels[eval_seeds].cpu()).item()
398
+ eval_acc = th.sum(eval_logits.argmax(dim=1) == labels[eval_seeds].cpu()).item() / len(eval_seeds)
399
+ return eval_loss, eval_acc
400
+
401
+ class RgcnTrainingDataLoader(Dataloader):
402
+
403
+ def init(self, g, dataset):
404
+
405
+ train_eids, valid_eids, test_eids, device = dataset
406
+
407
+ # target_idx = target_idx.to(device)
408
+
409
+        # determine how many GPUs are in use
410
+ n_gpus = len(list(map(int, self.flags.gpu.split(','))))
411
+
412
+        # number of sampled neighbors per layer (fan-out)
413
+ fanouts = [int(fanout) for fanout in self.flags.fanout.split(',')]
414
+
415
+ sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
416
+
417
+ loader = dgl.dataloading.EdgeDataLoader(
418
+ g, train_eids, sampler,
419
+ negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
420
+ batch_size=self.flags.batch_size,
421
+ device=device,
422
+ use_ddp=n_gpus > 1,
423
+ shuffle=True,
424
+ drop_last=False,
425
+ num_workers=self.flags.num_workers)
426
+
427
+ val_loader = dgl.dataloading.EdgeDataLoader(
428
+ g, valid_eids, sampler,
429
+ negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
430
+ batch_size=self.flags.batch_size,
431
+ device=device,
432
+ use_ddp=n_gpus > 1,
433
+ shuffle=False,
434
+ drop_last=False,
435
+ num_workers=self.flags.num_workers)
436
+
437
+ test_loader = dgl.dataloading.EdgeDataLoader(
438
+ g, test_eids, sampler,
439
+ negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
440
+ batch_size=self.flags.batch_size,
441
+ device=device,
442
+ use_ddp=n_gpus > 1,
443
+ shuffle=True,
444
+ drop_last=False,
445
+ num_workers=self.flags.num_workers)
446
+
447
+ # loader = dgl.dataloading.NodeDataLoader(
448
+ # g,
449
+ # target_idx[train_idx],
450
+ # sampler,
451
+ # use_ddp=n_gpus > 1,
452
+ # device=device if self.flags.num_workers == 0 else None,
453
+ # batch_size=self.flags.batch_size,
454
+ # shuffle=True,
455
+ # drop_last=False,
456
+ # num_workers=self.flags.num_workers)
457
+
458
+ # # validation sampler
459
+ # val_loader = dgl.dataloading.NodeDataLoader(
460
+ # g,
461
+ # target_idx[val_idx],
462
+ # sampler,
463
+ # use_ddp=n_gpus > 1,
464
+ # device=device if self.flags.num_workers == 0 else None,
465
+ # batch_size=self.flags.batch_size,
466
+ # shuffle=False,
467
+ # drop_last=False,
468
+ # num_workers=self.flags.num_workers)
469
+
470
+ # # test sampler
471
+ # test_sampler = dgl.dataloading.MultiLayerNeighborSampler([-1] * self.flags.num_layers)
472
+ # test_loader = dgl.dataloading.NodeDataLoader(
473
+ # g,
474
+ # target_idx[test_idx],
475
+ # test_sampler,
476
+ # use_ddp=n_gpus > 1,
477
+ # device=device if self.flags.num_workers == 0 else None,
478
+ # batch_size=self.flags.eval_batch_size,
479
+ # shuffle=False,
480
+ # drop_last=False,
481
+ # num_workers=self.flags.num_workers)
482
+
483
+ return loader, val_loader, test_loader
484
+
485
+
486
+ def gen_norm(g, ntype_tensor, etype_tensor, typeid_tensor):
487
+
488
+ _, v, eid = g.all_edges(form='all')
489
+ _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
490
+ degrees = count[inverse_index]
491
+ norm = th.ones(eid.shape[0], device=eid.device) / degrees
492
+ norm = norm.unsqueeze(1)
493
+ g.edata['norm'] = norm
494
+
495
+ g.srcdata['ntype'] = ntype_tensor[g.srcdata[dgl.NID]]
496
+ g.edata['etype'] = etype_tensor[eid]
497
+ g.srcdata['type_id'] = typeid_tensor[g.srcdata[dgl.NID]]
498
+
499
+
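The per-edge `norm` attached by `gen_norm` above is simply the reciprocal of each destination node's in-degree within the sampled block. A small torch-only illustration of that computation (toy destination IDs, not taken from this repo):

    import torch as th

    v = th.tensor([0, 1, 1, 2, 2, 2])                  # destination node of each sampled edge
    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
    degrees = count[inverse_index]                      # in-degree of each edge's destination node
    norm = (th.ones(v.shape[0]) / degrees).unsqueeze(1)
    # norm -> [[1.0000], [0.5000], [0.5000], [0.3333], [0.3333], [0.3333]]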
500
+ def evaluate(model, embed_layer, eval_loader, node_feats, inv_target):
501
+
502
+ model.eval()
503
+ embed_layer.eval()
504
+ eval_logits = []
505
+ eval_seeds = []
506
+
507
+ with th.no_grad():
508
+ th.cuda.empty_cache()
509
+ for i, (input_nodes, pos_graph, neg_graph, blocks) in enumerate(eval_loader):
510
+
511
+ for block in blocks:
512
+                gen_norm(block, ntype_tensor, etype_tensor, typeid_tensor)  # matches gen_norm's signature; these type tensors are assumed to be reachable here
513
+
514
+ feats = embed_layer(blocks[0].srcdata[dgl.NID],
515
+ blocks[0].srcdata['ntype'],
516
+ blocks[0].srcdata['type_id'],
517
+ node_feats)
518
+ logits = model(blocks, feats)
519
+
520
+ loss_fcn = AUC()
521
+ auc = loss_fcn(logits, pos_graph, neg_graph)
522
+ print("valid auc: {:.4f}".
523
+ format(auc.item()))
524
+
525
+ # eval_logits.append(logits.cpu())
526
+
527
+        eval_logits = th.cat(eval_logits) if eval_logits else th.empty(0)
528
+        eval_seeds = th.cat(eval_seeds) if eval_seeds else th.empty(0, dtype=th.long)
529
+
530
+ return eval_logits, eval_seeds
531
+
532
+
533
+ class CrossEntropyLoss(nn.Module):
534
+
535
+ def forward(self, block_outputs, pos_graph, neg_graph):
536
+
537
+ with pos_graph.local_scope():
538
+ pos_graph.ndata['h'] = block_outputs
539
+ pos_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
540
+ pos_score = pos_graph.edata['score']
541
+ with neg_graph.local_scope():
542
+ neg_graph.ndata['h'] = block_outputs
543
+ neg_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
544
+ neg_score = neg_graph.edata['score']
545
+
546
+ score = th.cat([pos_score, neg_score])
547
+ label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)]).long()
548
+ loss = F.binary_cross_entropy_with_logits(score, label.float())
549
+ return loss
550
+
551
+
552
+ class AUC(nn.Module):
553
+
554
+ def forward(self, block_outputs, pos_graph, neg_graph):
555
+
556
+ with pos_graph.local_scope():
557
+ pos_graph.ndata['h'] = block_outputs
558
+ pos_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
559
+ pos_score = pos_graph.edata['score']
560
+ with neg_graph.local_scope():
561
+ neg_graph.ndata['h'] = block_outputs
562
+ neg_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
563
+ neg_score = neg_graph.edata['score']
564
+
565
+ score = th.cat([pos_score, neg_score]).numpy()
566
+ label = th.cat([th.ones_like(pos_score), th.zeros_like(neg_score)]).numpy()
567
+
568
+ return roc_auc_score(label, score)
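For reference, a minimal sketch of how the `CrossEntropyLoss` and `AUC` modules defined above score a mini-batch: node embeddings are dotted across every edge of the positive and negative graphs, with positive edges labelled 1 and negative edges 0. The toy graphs and embeddings below are illustrative only; the two modules (plus `dgl.function` and `roc_auc_score`) are assumed to be in scope as defined in this file, and everything stays on CPU so the `.numpy()` call in `AUC` is valid:

    import dgl
    import torch as th

    # toy positive / negative edge graphs over the same four nodes
    pos_graph = dgl.graph(([0, 1, 2], [1, 2, 3]), num_nodes=4)
    neg_graph = dgl.graph(([0, 1, 2], [3, 0, 1]), num_nodes=4)
    h = th.randn(4, 16)                                  # embeddings the model would produce for these nodes

    loss = CrossEntropyLoss()(h, pos_graph, neg_graph)   # BCE-with-logits over the u·v edge scores
    auc = AUC()(h, pos_graph, neg_graph)                 # roc_auc_score over the same scores
    print(float(loss), float(auc))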
test1.py ADDED
@@ -0,0 +1,27 @@

1
+ from typing import Union
2
+
3
+ from fastapi import FastAPI
4
+ from pydantic import BaseModel
5
+
6
+ app = FastAPI()
7
+
8
+
9
+ class Item(BaseModel):
10
+ name: str
11
+ price: float
12
+ is_offer: Union[bool, None] = None
13
+
14
+
15
+ @app.get("/")
16
+ def read_root():
17
+ return {"Hello": "World"}
18
+
19
+
20
+ @app.get("/items/{item_id}")
21
+ def read_item(item_id: int, q: Union[str, None] = None):
22
+ return {"item_id": item_id, "q": q}
23
+
24
+
25
+ @app.put("/items/{item_id}")
26
+ def update_item(item_id: int, item: Item):
27
+ return {"item_name": item.name, "item_id": item_id}
test2.py ADDED
@@ -0,0 +1,8 @@
1
+ import gradio as gr
2
+
3
+ def greet(name):
4
+ return "Hello " + name + "!"
5
+
6
+ demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
+
8
+ demo.launch()
~/Desktop/roberta-base/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
~/Desktop/roberta-base/models--roberta-base/blobs/5606f48548d99a9829d10a96cd364b816b02cd21 ADDED
The diff for this file is too large to render. See raw diff
 
~/Desktop/roberta-base/models--roberta-base/blobs/5606f48548d99a9829d10a96cd364b816b02cd21.lock ADDED
File without changes
~/Desktop/roberta-base/models--roberta-base/blobs/8db5e7ac5bfc9ec8b613b776009300fe3685d957 ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-05,
14
+ "max_position_embeddings": 514,
15
+ "model_type": "roberta",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 1,
19
+ "type_vocab_size": 1,
20
+ "vocab_size": 50265
21
+ }
~/Desktop/roberta-base/models--roberta-base/blobs/8db5e7ac5bfc9ec8b613b776009300fe3685d957.lock ADDED
File without changes
~/Desktop/roberta-base/models--roberta-base/refs/main ADDED
@@ -0,0 +1 @@
1
+ bc2764f8af2e92b6eb5679868df33e224075ca68
~/Desktop/roberta-base/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
~/Desktop/roberta-base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
~/Desktop/roberta-base/tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "cls_token": "<s>",
5
+ "eos_token": "</s>",
6
+ "errors": "replace",
7
+ "mask_token": "<mask>",
8
+ "model_max_length": 512,
9
+ "pad_token": "<pad>",
10
+ "sep_token": "</s>",
11
+ "special_tokens_map_file": null,
12
+ "tokenizer_class": "RobertaTokenizer",
13
+ "trim_offsets": true,
14
+ "unk_token": "<unk>"
15
+ }
~/Desktop/roberta-base/vocab.json ADDED
The diff for this file is too large to render. See raw diff