Readme and docstring updates
Browse files
- FairEval.py +37 -45
- README.md +88 -42
FairEval.py
CHANGED
@@ -45,61 +45,53 @@ _DESCRIPTION = """\
 New evaluation method that more accurately reflects true annotation quality by ensuring that every error is counted
 only once - avoiding the penalty to close-to-target annotations happening in traditional evaluation.
 In addition to the traditional categories of true positives (TP), false positives (FP), and false negatives
-(FN), the new method takes into account
-of boundary errors: BES (the system's annotation is smaller than the target span), BEL (the system's annotation is
-larger than the target span) and BEO (the system span overlaps with the target span)
+(FN), the new method takes into account more fine-grained error types: labeling errors (LE), boundary errors (BE),
+and labeling-boundary errors (LBE).
 """

 _KWARGS_DESCRIPTION = """
+Outputs the error count (TP, FP, etc.) and resulting scores (Precision, Recall and F1) from a reference list of
+spans compared against a predicted one. The user can choose to see traditional or fair error counts and scores by
+switching the argument 'mode'.
+For the computation of the fair metrics from the error count please refer to: https://aclanthology.org/2022.lrec-1.150.pdf
 Args:
-    predictions: list of
+    predictions: a list of lists of predicted labels, i.e. estimated targets as returned by a tagger.
+    references: list of ground truth reference labels. Predicted sentences must have the same number of tokens as the references.
+    mode: 'fair' or 'traditional'. Controls the desired output. 'Traditional' is equivalent to seqeval's metrics. The default value is 'fair'.
+    error_format: 'count' or 'proportion'. Controls the desired output for TP, FP, BE, LE, etc. 'count' gives the absolute count per parameter. 'proportion' gives the percentage with respect to the total errors that each parameter represents. Default value is 'count'.
+    zero_division: which value to substitute as a metric value when encountering zero division. Should be one of [0, 1, "warn"]. "warn" acts as 0, but a warning is raised.
+    suffix: True if the IOB tag is a suffix (after type) instead of a prefix (before type), False otherwise. The default value is False, i.e. the IOB tag is a prefix (before type).
+    scheme: the target tagging scheme, which can be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU]. The default value is None.
 Returns:
     A dictionary with:
-    BES: segment of the BE where the prediction is smaller than the reference
-    BEL: segment of the BE where the prediction is larger than the reference
-    LBE: count of Label-and-Boundary Errors
-    Prec: fair precision
-    Rec: fair recall
-    F1: fair F1-score
+    - Overall error parameter count (or ratio) and resulting scores.
+    - A nested dictionary per label with its respective error parameter count (or ratio) and resulting scores.
+
+    If mode is 'traditional', the error parameters shown are the classical TP, FP and FN. If mode is 'fair', TP remains
+    the same, FP and FN follow the fair definition, and the additional errors BE, LE and LBE are shown.
+
 Examples:
-    >>> faireval = evaluate.load("
-    >>> pred = ['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
-    >>> ref = ['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
-    >>> results = faireval.compute(predictions=pred, references=ref)
+    >>> faireval = evaluate.load("hpi-dhc/FairEval")
+    >>> pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
+    >>> ref = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
+    >>> results = faireval.compute(predictions=pred, references=ref, mode='fair', error_format='count')
     >>> print(results)
-    {'TP': 1,
+    {'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'TP': 0, 'FP': 0, 'FN': 0, 'LE': 0, 'BE': 1, 'LBE': 0},
+     'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'TP': 1, 'FP': 0, 'FN': 0, 'LE': 0, 'BE': 0, 'LBE': 0},
+     'overall_precision': 0.6666666666666666,
+     'overall_recall': 0.6666666666666666,
+     'overall_f1': 0.6666666666666666,
+     'TP': 1,
+     'FP': 0,
+     'FN': 0,
+     'LE': 0,
+     'BE': 1,
+     'LBE': 0}
 """


 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class FairEvaluation(evaluate.Metric):
-    """Counts the number of redefined traditional errors (FP, FN), newly defined errors (BE, LE, LBE) and fine-grained
-    boundary errors (BES, BEL, BEO). Then computes the fair Precision, Recall and F1-Score. """

     def _info(self):
         return evaluate.MetricInfo(
@@ -128,10 +120,9 @@ class FairEvaluation(evaluate.Metric):
         scheme: Optional[str] = None,
         mode: Optional[str] = 'fair',
         error_format: Optional[str] = 'count',
-        sample_weight: Optional[List[int]] = None,
         zero_division: Union[str, int] = "warn",
     ):
-        """Returns the error counts and
+        """Returns the error parameter counts and scores"""
         # (1) SEQEVAL INPUT MANAGEMENT
         if scheme is not None:
             try:
@@ -223,6 +214,7 @@ class FairEvaluation(evaluate.Metric):


 def seq_to_fair(seq_sentences):
+    "Transforms input annotated sentences from seqeval span format to FairEval span format"
     out = []
     for seq_sentence in seq_sentences:
         sentence = []
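A note on the example in the updated docstring: the overall precision, recall and F1 of 0.666… follow directly from the error counts when each LE, BE and LBE is weighted by 0.5 in the precision and recall denominators, which is my reading of the weighting described in the referenced paper. A minimal sketch, not part of the commit; the `fair_scores` helper is illustrative and not part of FairEval's API:

```python
# Illustrative sketch: reproduce the fair scores in the docstring example from the
# error counts, assuming the 0.5 weighting of LE/BE/LBE from Ortmann (2022).
def fair_scores(TP, FP, FN, LE, BE, LBE):
    half_errors = 0.5 * (LE + BE + LBE)
    precision = TP / (TP + FP + half_errors) if (TP + FP + half_errors) > 0 else 0.0
    recall = TP / (TP + FN + half_errors) if (TP + FN + half_errors) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

print(fair_scores(TP=1, FP=0, FN=0, LE=0, BE=1, LBE=0))
# (0.6666666666666666, 0.6666666666666666, 0.6666666666666666), matching the example output above
```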
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: FairEval
 tags:
 - evaluate
 - metric
@@ -14,60 +14,106 @@ pinned: false
 
 ## Metric Description
 The traditional evaluation of NLP labeled spans with precision, recall, and F1-score leads to double penalties for
-close-to-correct annotations. As Manning (2006)
-undesirable effects when systems are optimized for these traditional metrics.
-Building on his ideas, Katrin Ortmann (2022) develops FairEval: a new evaluation method that more accurately reflects
-true annotation quality by ensuring that every error is counted only once. In addition to the traditional categories of
-true positives (TP), false positives (FP), and false negatives (FN), the new method takes into account the more
-fine-grained error types suggested by Manning: labeling errors (LE), boundary errors (BE), and labeling-boundary
-errors (LBE). Additionally, the system also distinguishes different types of boundary errors:
-- BES: the system's annotation is smaller than the target span
-- BEL: the system's annotation is larger than the target span
-- BEO: the system span overlaps with the target span
-
-For more information on the reasoning and computation of the fair metrics from the redefined error count please refer to the [original paper](https://aclanthology.org/2022.lrec-1.150.pdf).
+close-to-correct annotations. As [Manning (2006)](https://nlpers.blogspot.com/2006/08/doing-named-entity-recognition-dont.html)
+argues in an article about named entity recognition, this can lead to undesirable effects when systems are optimized for these traditional metrics.
+Building on his ideas, [Katrin Ortmann (2022)](https://aclanthology.org/2022.lrec-1.150.pdf) develops FairEval.
 
 ## How to Use
-The
+FairEval outputs the error count (TP, FP, etc.) and resulting scores (Precision, Recall and F1) from a reference list of
+spans compared against a predicted one. The user can choose to see traditional or fair error counts and scores by
+switching the argument **mode**.
+
+The minimal example is:
 
 ```python
+faireval = evaluate.load("hpi-dhc/FairEval")
+pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
+ref = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
+results = faireval.compute(predictions=pred, references=ref)
 ```
 
 ### Inputs
-- **references** *(list)*: list of
+FairEval handles input annotations the same way as seqeval. The supported formats are IOB1, IOB2, IOE1, IOE2 and IOBES.
+Predicted sentences must have the same number of tokens as the references.
+- **predictions** *(list)*: a list of lists of predicted labels, i.e. estimated targets as returned by a tagger.
+- **references** *(list)*: list of ground truth reference labels.
+
+The optional arguments are:
+- **mode** *(str)*: 'fair' or 'traditional'. Controls the desired output. 'Traditional' is equivalent to seqeval's metrics. The default value is 'fair'.
+- **error_format** *(str)*: 'count' or 'proportion'. Controls the desired output for TP, FP, BE, LE, etc. 'count' gives the absolute count per parameter. 'proportion' gives the percentage with respect to the total errors that each parameter represents. Default value is 'count'.
+- **zero_division** *(str)*: which value to substitute as a metric value when encountering zero division. Should be one of [0, 1, "warn"]. "warn" acts as 0, but a warning is raised.
+- **suffix** *(boolean)*: True if the IOB tag is a suffix (after type) instead of a prefix (before type), False otherwise. The default value is False, i.e. the IOB tag is a prefix (before type).
+- **scheme** *(str)*: the target tagging scheme, which can be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU]. The default value is None.
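Since mode='traditional' is documented above as equivalent to seqeval's metrics, a quick sanity check is to run both metrics on the same input. This is only a sketch, assuming both metrics load from the Hub under the ids shown and that seqeval exposes its usual `overall_*` keys:

```python
import evaluate

# Sketch: compare FairEval in traditional mode against seqeval on the same data.
faireval = evaluate.load("hpi-dhc/FairEval")
seqeval = evaluate.load("seqeval")

pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]
ref = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']]

trad = faireval.compute(predictions=pred, references=ref, mode='traditional')
seq = seqeval.compute(predictions=pred, references=ref)
print(trad['overall_f1'], seq['overall_f1'])  # the two values should agree
```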
 
 ### Output Values
 A dictionary with:
-- TP: count of True Positives
-- FP: count of False Positives
-- FN: count of False Negatives
-- LE: count of Labeling Errors
-- BE: count of Boundary Errors
-- BEO: segment of the BE where the prediction overlaps with the reference
-- BES: segment of the BE where the prediction is smaller than the reference
-- BEL: segment of the BE where the prediction is larger than the reference
-- LBE: count of Label-and-Boundary Errors
-- Prec: fair precision
-- Rec: fair recall
-- F1: fair F1-score
+- Overall error parameter count (or ratio) and resulting scores.
+- A nested dictionary per label with its respective error parameter count (or ratio) and resulting scores.
 
+If mode is 'traditional', the error parameters shown are the classical TP, FP and FN. If mode is 'fair', TP remains
+the same, FP and FN follow the fair definition, and the additional errors BE, LE and LBE are shown.
 
-*Under construction*
 
 ### Examples
+Consider the following input annotated sentences:
+```python
+>>> r1 = ['O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER']
+>>> p1 = ['O', 'O', 'B-PER', 'I-PER', 'O', 'O']       # 1 FN
+>>>
+>>> r2 = ['O', 'B-INT', 'B-OUT']
+>>> p2 = ['B-INT', 'I-INT', 'B-OUT']                  # 1 BE
+>>>
+>>> r3 = ['B-INT', 'I-INT', 'B-OUT']
+>>> p3 = ['B-OUT', 'O', 'B-PER']                      # 1 LBE, 1 LE
+>>>
+>>> y_true = [r1, r2, r3]
+>>> y_pred = [p1, p2, p3]
+```
+
+The output for different modes and error_formats is:
+```python
+>>> faireval.compute(predictions=y_pred, references=y_true, mode='traditional', error_format='count')
+{'PER': {'precision': 0.5, 'recall': 0.5, 'f1': 0.5, 'TP': 1, 'FP': 1, 'FN': 1},
+ 'INT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'TP': 0, 'FP': 1, 'FN': 2},
+ 'OUT': {'precision': 0.5, 'recall': 0.5, 'f1': 0.5, 'TP': 1, 'FP': 1, 'FN': 1},
+ 'overall_precision': 0.4,
+ 'overall_recall': 0.3333,
+ 'overall_f1': 0.3636,
+ 'TP': 2,
+ 'FP': 3,
+ 'FN': 4}
+```
+
+```python
+>>> faireval.compute(predictions=y_pred, references=y_true, mode='traditional', error_format='proportion')
+{'PER': {'precision': 0.5, 'recall': 0.5, 'f1': 0.5, 'TP': 1, 'FP': 0.1428, 'FN': 0.1428},
+ 'INT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'TP': 0, 'FP': 0.1428, 'FN': 0.2857},
+ 'OUT': {'precision': 0.5, 'recall': 0.5, 'f1': 0.5, 'TP': 1, 'FP': 0.1428, 'FN': 0.1428},
+ 'overall_precision': 0.4,
+ 'overall_recall': 0.3333,
+ 'overall_f1': 0.3636,
+ 'TP': 2,
+ 'FP': 0.4285,
+ 'FN': 0.5714}
+```
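How the 'proportion' values above relate to the counts: each error parameter is divided by the total number of errors (here FP + FN = 3 + 4 = 7 in traditional mode), while TP stays a count because it is not an error. A quick check of the figures above:

```python
# Sketch: the 'proportion' figures above are the error counts divided by the
# total number of errors (3 FP + 4 FN = 7 in traditional mode).
total_errors = 3 + 4
print(3 / total_errors)  # 0.428571... -> the overall 'FP' value above
print(4 / total_errors)  # 0.571428... -> the overall 'FN' value above
print(1 / total_errors)  # 0.142857... -> per-label 'FP'/'FN' for PER and OUT
print(2 / total_errors)  # 0.285714... -> 'FN' for INT
```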
+
+```python
+>>> faireval.compute(predictions=y_pred, references=y_true, mode='fair', error_format='count')
+{'PER': {'precision': 1.0, 'recall': 0.5, 'f1': 0.6666, 'TP': 1, 'FP': 0, 'FN': 1, 'LE': 0, 'BE': 0, 'LBE': 0},
+ 'INT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'TP': 0, 'FP': 0, 'FN': 0, 'LE': 0, 'BE': 1, 'LBE': 1},
+ 'OUT': {'precision': 0.6666, 'recall': 0.6666, 'f1': 0.6666, 'TP': 1, 'FP': 0, 'FN': 0, 'LE': 1, 'BE': 0, 'LBE': 0},
+ 'overall_precision': 0.5714,
+ 'overall_recall': 0.4444444444444444,
+ 'overall_f1': 0.5,
+ 'TP': 2,
+ 'FP': 0,
+ 'FN': 1,
+ 'LE': 1,
+ 'BE': 1,
+ 'LBE': 1}
+```
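The overall fair scores in this last block can likewise be reproduced from the error counts with the 0.5 weighting of LE, BE and LBE sketched after the FairEval.py diff above (again an illustrative reading of the paper's definition, not the module's code):

```python
# Sketch: overall fair scores from the counts above (TP=2, FP=0, FN=1, LE=1, BE=1, LBE=1).
TP, FP, FN, LE, BE, LBE = 2, 0, 1, 1, 1, 1
precision = TP / (TP + FP + 0.5 * (LE + BE + LBE))  # 0.5714...
recall = TP / (TP + FN + 0.5 * (LE + BE + LBE))     # 0.4444...
f1 = 2 * precision * recall / (precision + recall)  # 0.5
print(precision, recall, f1)
```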
+
+#### Values from Popular Papers
+*Examples, preferably with links to leaderboards or publications, of papers that have reported this metric, along with the values they have reported.*
 
 *Under construction*
 