finiteautomata committed on
Commit
8739181
1 Parent(s): d01b5c5

First commit

Browse files
Files changed (3) hide show
  1. .gitignore +136 -0
  2. app.py +82 -0
  3. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+ data/*
131
+ config/*.ini
132
+ *.bin
133
+ **/**/hs_clf
134
+ models/*
135
+ wandb/*
136
+ *.pt
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Streamlit app to highlight NER entities
2
+ import random
3
+ import streamlit as st
4
+ from datasets import load_dataset
5
+ from annotated_text import annotated_text
6
+
7
# Load data
# Both datasets are fetched by name with `load_dataset`. Further down, the
# script indexes each one by its "train" split and renders every example
# through its "ner_output" field.
# NOTE(review): the render loop labels `ds` as "robertuito" and `ds_2` as
# "roberta-large" -- presumably the NER model that produced each dataset's
# predictions; confirm against the dataset cards.
ds = load_dataset("hs-knowledge/hateval_ner")
ds_2 = load_dataset("hs-knowledge/hateval_ner_2")
10
+
11
+ # Show highlighted ner entities in a tweet
12
+
13
+
14
def _group_ner_tokens(tokens, labels):
    """Group labelled tokens into consecutive plain-text and entity chunks.

    Args:
        tokens: Sequence of token strings.
        labels: Sequence of labels aligned with ``tokens``. Each label is
            ``"O"`` (outside any entity), ``"B-<type>"`` (starts an entity),
            ``"I-<type>"`` (continues an entity), a raw ``"<type>"`` without
            prefix, or ``None`` (no prediction available for that token).

    Returns:
        A list whose items are either a plain ``str`` (text outside any
        entity) or a ``(text, entity_type)`` tuple, in reading order.
    """
    # Each run is a (token_list, entity_type) pair; entity_type None = plain.
    runs = []

    for token, label in zip(tokens, labels):
        if label is None:
            # No label for this token (perhaps the input was too long for
            # the model); the token is skipped, as it always was.
            continue

        if label == "O":
            entity_type, starts_entity = None, False
        elif label.startswith("B-"):
            entity_type, starts_entity = label[2:], True
        elif label.startswith("I-"):
            entity_type, starts_entity = label[2:], False
        else:
            # Label without a B-/I- prefix: the label itself is the type.
            entity_type, starts_entity = label, False

        # Extend the current run only when the token continues it: same type
        # and not an explicit "B-" start. In particular an "I-" tag that
        # follows plain text or a different entity type opens a new run, so
        # it no longer drags the preceding text into the highlight.
        if runs and not starts_entity and runs[-1][1] == entity_type:
            runs[-1][0].append(token)
        else:
            runs.append(([token], entity_type))

    chunks = []
    for run_tokens, entity_type in runs:
        text = " ".join(run_tokens).strip()
        # Plain text is passed as a bare string, entities as (text, type).
        chunks.append(text if entity_type is None else (text, entity_type))
    return chunks


def display_text(example):
    """Render one example with its NER entities highlighted.

    Args:
        example: Mapping with a ``"ner_output"`` entry, itself a mapping
            holding the aligned ``"tokens"`` and ``"labels"`` sequences.

    Returns:
        None. The grouped chunks are handed to ``annotated_text``.
    """
    ner_output = example["ner_output"]
    chunks = _group_ner_tokens(ner_output["tokens"], ner_output["labels"])
    annotated_text(*chunks)
67
+
68
# Show a random sample of examples, rendering both models' output for each.
SAMPLE_SIZE = 300

# The same indices are applied to both datasets, so they must be valid in
# both; bounding by the shorter split also keeps the sample size legal for
# small datasets.
# NOTE(review): this assumes row i of `ds` and row i of `ds_2` describe the
# same tweet -- confirm against how the two datasets were built.
num_examples = min(len(ds["train"]), len(ds_2["train"]))

# random.sample draws without replacement, so no example is shown twice
# (random.choices could repeat indices).
elements = random.sample(range(num_examples), k=min(SAMPLE_SIZE, num_examples))
ds["train"] = ds["train"].select(elements)
ds_2["train"] = ds_2["train"].select(elements)

for ex1, ex2 in zip(ds["train"], ds_2["train"]):
    st.write("====================================")
    st.write("NER model: robertuito", "\n")
    display_text(ex1)
    st.write("NER model: roberta-large", "\n")
    display_text(ex2)
    st.write("\n")
    st.write(f"Original text: {ex1['text']}")
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ datasets==2.9.0
2
+ streamlit==1.18.0
3
+ st-annotated-text==3.0.0