Lihuchen committed on
Commit
f3f272c
1 Parent(s): 38de4aa

add example

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
.idea/AcroBERT.iml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="TestRunnerService">
9
+ <option name="PROJECT_TEST_RUNNER" value="pytest" />
10
+ </component>
11
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="10">
8
+ <item index="0" class="java.lang.String" itemvalue="scipy" />
9
+ <item index="1" class="java.lang.String" itemvalue="tensorflow" />
10
+ <item index="2" class="java.lang.String" itemvalue="tensorflow-estimator" />
11
+ <item index="3" class="java.lang.String" itemvalue="tensorboard" />
12
+ <item index="4" class="java.lang.String" itemvalue="Keras" />
13
+ <item index="5" class="java.lang.String" itemvalue="numpy" />
14
+ <item index="6" class="java.lang.String" itemvalue="t" />
15
+ <item index="7" class="java.lang.String" itemvalue="torch" />
16
+ <item index="8" class="java.lang.String" itemvalue="python-Levenshtein" />
17
+ <item index="9" class="java.lang.String" itemvalue="pytorch-metric-learning" />
18
+ </list>
19
+ </value>
20
+ </option>
21
+ </inspection_tool>
22
+ </profile>
23
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/AcroBERT.iml" filepath="$PROJECT_DIR$/.idea/AcroBERT.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
__pycache__/constant.cpython-38.pyc ADDED
Binary file (49.2 kB). View file
 
__pycache__/maddog.cpython-38.pyc ADDED
Binary file (25.1 kB). View file
 
__pycache__/utils.cpython-38.pyc ADDED
Binary file (7.67 kB). View file
 
acrobert.py CHANGED
@@ -8,8 +8,7 @@ from maddog import Extractor
8
  import spacy
9
  import constant
10
 
11
- import spacy.cli
12
- spacy.cli.download("en_core_web_sm")
13
  nlp = spacy.load("en_core_web_sm")
14
  ruleExtractor = Extractor()
15
  kb = utils.load_acronym_kb('acronym_kb.json')
@@ -40,14 +39,15 @@ def softmax(elements):
40
 
41
 
42
  def predict(topk, model, short_form, context, batch_size, acronym_kb, device):
43
- ori_candidate = utils.get_candidate(acronym_kb, short_form, can_num=10)
44
  long_terms = [str.lower(can) for can in ori_candidate]
45
  scores = cal_score(model.model, model.tokenizer, long_terms, context, batch_size, device)
46
  #indexes = [np.argmax(scores)]
47
  topk = min(len(scores), topk)
48
  indexes = np.array(scores).argsort()[::-1][:topk]
49
  names = [ori_candidate[i] for i in indexes]
50
- return names
 
51
 
52
 
53
  def cal_score(model, tokenizer, long_forms, contexts, batch_size, device):
@@ -79,13 +79,16 @@ def acrobert(sentence, model, device):
79
  tokens = [t.text for t in nlp(sentence) if len(t.text.strip()) > 0]
80
  rulebased_pairs = ruleExtractor.extract(tokens, constant.RULES)
81
 
82
- results = list()
83
  for acronym in rulebased_pairs.keys():
84
  if rulebased_pairs[acronym][0] != '':
85
  results.append((acronym, rulebased_pairs[acronym][0]))
86
  else:
87
- pred = predict(1, model, acronym, sentence, batch_size=10, acronym_kb=kb, device=device)
88
- results.append((acronym, pred[0]))
 
 
 
89
  return results
90
 
91
 
@@ -124,6 +127,8 @@ if __name__ == '__main__':
124
  # be discredited and diminished in the public ’s eye. More often than not, PR is
125
  # a preemptive process. Celebrity publicists are paid lots of money to keep certain
126
  # stories out of the news."""
127
- sentence = "AI is the ability of a digital computer or computer-controlled robot to perform tasks commonly associated with intelligent beings, including NLP that processes text or document"
 
 
128
  results = acronym_linker(sentence)
129
  print(results)
 
8
  import spacy
9
  import constant
10
 
11
+
 
12
  nlp = spacy.load("en_core_web_sm")
13
  ruleExtractor = Extractor()
14
  kb = utils.load_acronym_kb('acronym_kb.json')
 
39
 
40
 
41
  def predict(topk, model, short_form, context, batch_size, acronym_kb, device):
42
+ ori_candidate = utils.get_candidate(acronym_kb, short_form, can_num=20)
43
  long_terms = [str.lower(can) for can in ori_candidate]
44
  scores = cal_score(model.model, model.tokenizer, long_terms, context, batch_size, device)
45
  #indexes = [np.argmax(scores)]
46
  topk = min(len(scores), topk)
47
  indexes = np.array(scores).argsort()[::-1][:topk]
48
  names = [ori_candidate[i] for i in indexes]
49
+ confidences = [round(scores[i], 3) for i in indexes]
50
+ return names, confidences
51
 
52
 
53
  def cal_score(model, tokenizer, long_forms, contexts, batch_size, device):
 
79
  tokens = [t.text for t in nlp(sentence) if len(t.text.strip()) > 0]
80
  rulebased_pairs = ruleExtractor.extract(tokens, constant.RULES)
81
 
82
+ results = dict()
83
  for acronym in rulebased_pairs.keys():
84
  if rulebased_pairs[acronym][0] != '':
85
  results.append((acronym, rulebased_pairs[acronym][0]))
86
  else:
87
+ pred, scores = predict(5, model, acronym, sentence, batch_size=10, acronym_kb=kb, device=device)
88
+ output = list(zip(pred, scores))
89
+ #print(output)
90
+ results[acronym] = output
91
+ #results.append((acronym, pred[0], scores[0]))
92
  return results
93
 
94
 
 
127
  # be discredited and diminished in the public ’s eye. More often than not, PR is
128
  # a preemptive process. Celebrity publicists are paid lots of money to keep certain
129
  # stories out of the news."""
130
+ sentence = """
131
+ AI is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence.
132
+ """
133
  results = acronym_linker(sentence)
134
  print(results)
app.py CHANGED
@@ -6,5 +6,16 @@ def greet(sentence):
6
  results = acronym_linker(sentence, mode='acrobert')
7
  return results
8
 
 
 
 
 
 
 
 
 
 
 
 
9
  iface = gr.Interface(fn=greet, inputs="text", outputs="text")
10
  iface.launch()
 
6
  results = acronym_linker(sentence, mode='acrobert')
7
  return results
8
 
9
+
10
+ sample_list = [
11
+ "AI is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence. ",
12
+ """A whistleblower like monologist Mike Daisey gets targeted as a scapegoat who must
13
+ be discredited and diminished in the public eyes. More often than not, PR is
14
+ a preemptive process. Celebrity publicists are paid lots of money to keep certain
15
+ stories out of the news.""",
16
+ "This new genome assembly and the annotation are tagged as a RefSeq genome by NCBI and thus provide substantially enhanced genomic resources for future research involving S. scovelli.",
17
+ "In this study , we found that miR-34a demonstrated greater expression in the lungs of patients with IPF and in mice with experimental pulmonary fibrosis , with its primary localization in lung fibroblasts.",
18
+ ]
19
+
20
  iface = gr.Interface(fn=greet, inputs="text", outputs="text")
21
  iface.launch()