Spaces:

Lihuchen
/

AcroBERT

Running

App Files Community

Lihuchen committed on Mar 31, 2023

Commit

f3f272c

•

1 Parent(s): 38de4aa

add example

Browse files

Files changed (12) hide show

.idea/.gitignore +3 -0
.idea/AcroBERT.iml +11 -0
.idea/inspectionProfiles/Project_Default.xml +23 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
__pycache__/constant.cpython-38.pyc +0 -0
__pycache__/maddog.cpython-38.pyc +0 -0
__pycache__/utils.cpython-38.pyc +0 -0
acrobert.py +13 -8
app.py +11 -0

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

.idea/AcroBERT.iml ADDED Viewed

	@@ -0,0 +1,11 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="pytest" />
+  </component>
+</module>

.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,23 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="10">
+            <item index="0" class="java.lang.String" itemvalue="scipy" />
+            <item index="1" class="java.lang.String" itemvalue="tensorflow" />
+            <item index="2" class="java.lang.String" itemvalue="tensorflow-estimator" />
+            <item index="3" class="java.lang.String" itemvalue="tensorboard" />
+            <item index="4" class="java.lang.String" itemvalue="Keras" />
+            <item index="5" class="java.lang.String" itemvalue="numpy" />
+            <item index="6" class="java.lang.String" itemvalue="t" />
+            <item index="7" class="java.lang.String" itemvalue="torch" />
+            <item index="8" class="java.lang.String" itemvalue="python-Levenshtein" />
+            <item index="9" class="java.lang.String" itemvalue="pytorch-metric-learning" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/AcroBERT.iml" filepath="$PROJECT_DIR$/.idea/AcroBERT.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

__pycache__/constant.cpython-38.pyc ADDED Viewed

Binary file (49.2 kB). View file

__pycache__/maddog.cpython-38.pyc ADDED Viewed

Binary file (25.1 kB). View file

__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (7.67 kB). View file

acrobert.py CHANGED Viewed

@@ -8,8 +8,7 @@ from maddog import Extractor
 import spacy
 import constant
-import spacy.cli
-spacy.cli.download("en_core_web_sm")
 nlp = spacy.load("en_core_web_sm")
 ruleExtractor = Extractor()
 kb = utils.load_acronym_kb('acronym_kb.json')
@@ -40,14 +39,15 @@ def softmax(elements):
 def predict(topk, model, short_form, context, batch_size, acronym_kb, device):
-    ori_candidate = utils.get_candidate(acronym_kb, short_form, can_num=10)
     long_terms = [str.lower(can) for can in ori_candidate]
     scores = cal_score(model.model, model.tokenizer, long_terms, context, batch_size, device)
     #indexes = [np.argmax(scores)]
     topk = min(len(scores), topk)
     indexes = np.array(scores).argsort()[::-1][:topk]
     names = [ori_candidate[i] for i in indexes]
-    return names
 def cal_score(model, tokenizer, long_forms, contexts, batch_size, device):
@@ -79,13 +79,16 @@ def acrobert(sentence, model, device):
     tokens = [t.text for t in nlp(sentence) if len(t.text.strip()) > 0]
     rulebased_pairs = ruleExtractor.extract(tokens, constant.RULES)
-    results = list()
     for acronym in rulebased_pairs.keys():
         if rulebased_pairs[acronym][0] != '':
             results.append((acronym, rulebased_pairs[acronym][0]))
         else:
-            pred = predict(1, model, acronym, sentence, batch_size=10, acronym_kb=kb, device=device)
-            results.append((acronym, pred[0]))
     return results
@@ -124,6 +127,8 @@ if __name__ == '__main__':
 # be discredited and diminished in the public ’s eye. More often than not, PR is
 # a preemptive process. Celebrity publicists are paid lots of money to keep certain
 # stories out of the news."""
-    sentence = "AI is the ability of a digital computer or computer-controlled robot to perform tasks commonly associated with intelligent beings, including NLP that processes text or document"
     results = acronym_linker(sentence)
     print(results)

 import spacy
 import constant
 nlp = spacy.load("en_core_web_sm")
 ruleExtractor = Extractor()
 kb = utils.load_acronym_kb('acronym_kb.json')
 def predict(topk, model, short_form, context, batch_size, acronym_kb, device):
+    ori_candidate = utils.get_candidate(acronym_kb, short_form, can_num=20)
     long_terms = [str.lower(can) for can in ori_candidate]
     scores = cal_score(model.model, model.tokenizer, long_terms, context, batch_size, device)
     #indexes = [np.argmax(scores)]
     topk = min(len(scores), topk)
     indexes = np.array(scores).argsort()[::-1][:topk]
     names = [ori_candidate[i] for i in indexes]
+    confidences = [round(scores[i], 3) for i in indexes]
+    return names, confidences
 def cal_score(model, tokenizer, long_forms, contexts, batch_size, device):
     tokens = [t.text for t in nlp(sentence) if len(t.text.strip()) > 0]
     rulebased_pairs = ruleExtractor.extract(tokens, constant.RULES)
+    results = dict()
     for acronym in rulebased_pairs.keys():
         if rulebased_pairs[acronym][0] != '':
             results.append((acronym, rulebased_pairs[acronym][0]))
         else:
+            pred, scores = predict(5, model, acronym, sentence, batch_size=10, acronym_kb=kb, device=device)
+            output = list(zip(pred, scores))
+            #print(output)
+            results[acronym] = output
+            #results.append((acronym, pred[0], scores[0]))
     return results
 # be discredited and diminished in the public ’s eye. More often than not, PR is
 # a preemptive process. Celebrity publicists are paid lots of money to keep certain
 # stories out of the news."""
+    sentence = """
+    AI is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence.
+    """
     results = acronym_linker(sentence)
     print(results)

app.py CHANGED Viewed

@@ -6,5 +6,16 @@ def greet(sentence):
     results = acronym_linker(sentence, mode='acrobert')
     return results
 iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()

     results = acronym_linker(sentence, mode='acrobert')
     return results
+sample_list = [
+    "AI is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence. ",
+    """A whistleblower like monologist Mike Daisey gets targeted as a scapegoat who must
+    be discredited and diminished in the public eyes. More often than not, PR is
+    a preemptive process. Celebrity publicists are paid lots of money to keep certain
+    stories out of the news.""",
+    "This new genome assembly and the annotation are tagged as a RefSeq genome by NCBI and thus provide substantially enhanced genomic resources for future research involving S. scovelli.",
+    "In this study , we found that miR-34a demonstrated greater expression in the lungs of patients with IPF and in mice with experimental pulmonary fibrosis , with its primary localization in lung fibroblasts.",
+]
 iface = gr.Interface(fn=greet, inputs="text", outputs="text")
 iface.launch()