andrewzamai committed
Commit 26b46c0
Parent: b164e8a

Update README.md

Files changed (1)
  1. README.md +57 -3
README.md CHANGED
@@ -1,3 +1,57 @@
- ---
- license: llama2
- ---
+ ---
+ license: llama2
+ ---
+
+ \begin{table*}[htb]
+ \centering
+ \footnotesize
+ \resizebox{\textwidth}{!}{
+ \begin{tabular}{llc|cc|ccccc|c}
+ \toprule
+ \textbf{Model} & \textbf{Backbone} & \textbf{\#Params} &
+ \multicolumn{2}{c|}{\textbf{MIT}} & \multicolumn{5}{c|}{\textbf{CrossNER}} & \textbf{AVG}\\
+  &  &  & Movie & Restaurant & AI & Literature & Music & Politics & Science & \\
+ \midrule
+ ChatGPT & gpt-3.5-turbo & - & 5.3 & 32.8 & 52.4 & 39.8 & 66.6 & 68.5 & 67.0 & 47.5\\
+ InstructUIE & Flan-T5-xxl & 11B & 63.0 & 21.0 & 49.0 & 47.2 & 53.2 & 48.2 & 49.3 & 47.3\\
+ UniNER-type & LLaMA-1 & 7B & 42.4 & 31.7 & 53.5 & 59.4 & 65.0 & 60.8 & 61.1 & 53.4\\
+ UniNER-def & LLaMA-1 & 7B & 27.1 & 27.9 & 44.5 & 49.2 & 55.8 & 57.5 & 52.9 & 45.0\\
+ UniNER-type+sup. & LLaMA-1 & 7B & 61.2 & 35.2 & 62.9 & 64.9 & 70.6 & 66.9 & 70.8 & 61.8\\
+ GoLLIE & Code-LLaMA & 7B & 63.0 & 43.4 & 59.1 & 62.7 & 67.8 & 57.2 & 55.5 & 58.4\\
+ GLiNER-L & DeBERTa-v3 & 0.3B & 57.2 & 42.9 & 57.2 & 64.4 & 69.6 & 72.6 & 62.6 & 60.9\\
+ GNER-T5 & Flan-T5-xxl & 11B & 62.5 & 51.0 & 68.2 & 68.7 & 81.2 & 75.1 & 76.7 & 69.1\\
+ GNER-LLaMA & LLaMA-1 & 7B & 68.6 & 47.5 & 63.1 & 68.2 & 75.7 & 69.4 & 69.9 & 66.1\\
+ %FullPileNER w/o D\&G & LLaMA-2-7B-chat & 49.8 & 33.7 & 52.9 & 60.2 & 67.7 & 60.3 & 60.1 & 55.0\\
+ \midrule
+ %our-391x5-FDef & LLaMA-2-7B-chat & 47.2 & 39.3 & 51.0 & 57.3 & 56.9 & 56.3 & 51.8 & $51.4 \pm {x}$\\
+ SLIMER w/o D\&G & LLaMA-2-chat & 7B & $46.4\pm{1.8}$ & $36.3\pm{2.1}$ & $49.6\pm{3.2}$ & $58.4\pm{1.7}$ & $56.8\pm{2.1}$ & $57.9\pm{2.1}$ & $53.8\pm{1.7}$ & $51.3\pm{2.0}$\\
+ %our-391x5-TDef & LLaMA-2-7B-chat & 48.6 & 38.1 & 52.2 & 57.5 & 58.4 & 62.2 & 56.0 & $53.3 \pm {x}$\\
+ \textbf{SLIMER} & \textbf{LLaMA-2-chat} & \textbf{7B} & $\textbf{50.9}\pm\textbf{0.9}$ & $\textbf{38.2}\pm\textbf{0.3}$ & $\textbf{50.1}\pm\textbf{2.4}$ & $\textbf{58.7}\pm\textbf{0.2}$ & $\textbf{60.0}\pm\textbf{0.5}$ & $\textbf{63.9}\pm\textbf{1.0}$ & $\textbf{56.3}\pm\textbf{0.6}$ & $\textbf{54.0}\pm\textbf{0.5}$\\
+ \bottomrule
+ \end{tabular}
+ }
+ \caption{Comparison of out-of-domain (OOD) performance for SLIMER and state-of-the-art models on the MIT and CrossNER benchmarks. With the exception of UniNER-def, all competitors' results are taken from their respective papers, as listed in Section~\ref{sec:compared_models}.}
+ \label{tab:MIT_CrossNER_comparison}
+
+ %\caption{Out-of-domain evaluation results on the CrossNER and MIT datasets. While most models have been trained on large NER datasets, either by collecting existing human-annotated datasets or by using synthetically annotated data comprising many samples and up to 13020 different NEs, we show that with only 5 positive and 5 negative samples per NE, drawn from only the 391 most frequent NEs in pileNER, we can achieve competitive performance on the MIT and CrossNER datasets. In particular, while other models have already seen most of the NEs (for models trained on pileNER-type, the overlap can be considered 100\%), the majority of the NEs are new to our model. Nevertheless, it shows strong generalizability, not only to out-of-domain inputs but also to never-before-seen Named Entities. When guidelines are employed, the model further improves its performance by 1.9 points. A LLaMA-2-7B model was trained on the full pileNER-type dataset for comparison with UniNER-type-7B, which is instead based on a LLaMA-1 model and uses a different instruction-tuning template.}
+ \end{table*}
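
As a sanity check on the AVG column (assuming, since the table does not state it, that AVG is the unweighted mean of the seven per-dataset F1 scores), the bold SLIMER row works out to

\[
\mathrm{AVG}_{\text{SLIMER}} = \frac{50.9 + 38.2 + 50.1 + 58.7 + 60.0 + 63.9 + 56.3}{7} = \frac{378.1}{7} \approx 54.0,
\]

matching the reported $54.0 \pm 0.5$; likewise, SLIMER w/o D\&G gives $359.2 / 7 \approx 51.3$. Under the same reading, the training set described in the draft caption would amount to roughly $391 \times (5 + 5) = 3910$ instruction samples.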