""" version ported from https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
Notes:
1) The default area thresholds here follows the values defined in COCO, that is,
small: area <= 32**2
medium: 32**2 <= area <= 96**2
large: 96**2 <= area.
If area is not specified, all areas are considered.
2) COCO's ground truths contain an 'area' attribute that is associated with the segmented area if
segmentation-level information exists. While coco uses this 'area' attribute to distinguish between
'small', 'medium', and 'large' objects, this implementation simply uses the associated bounding box
area to filter the ground truths.
3) COCO uses floating point bounding boxes, thus, the calculation of the box area
for IoU purposes is the simple open-ended delta (x2 - x1) * (y2 - y1).
PASCALVOC uses integer-based bounding boxes, and the area includes the outer edge,
that is, (x2 - x1 + 1) * (y2 - y1 + 1). This implementation assumes the open-ended (former)
convention for area calculation.
"""
from collections import defaultdict

import numpy as np
import torch

from src.bounding_box import BBFormat


def get_coco_summary(groundtruth_bbs, detected_bbs):
"""Calculate the 12 standard metrics used in COCOEval,
AP, AP50, AP75,
AR1, AR10, AR100,
APsmall, APmedium, APlarge,
ARsmall, ARmedium, ARlarge.
    When no ground truth can be associated with a particular class (NPOS == 0),
    that class is removed from the average calculation.
    If no metrics at all are available for a given calculation, NaN is returned.
Parameters
----------
groundtruth_bbs : list
A list containing objects of type BoundingBox representing the ground-truth bounding boxes.
detected_bbs : list
A list containing objects of type BoundingBox representing the detected bounding boxes.
    Returns
    -------
    dict
        A dictionary with one entry for each metric.
"""
# separate bbs per image X class
_bbs = _group_detections(detected_bbs, groundtruth_bbs)
# pairwise ious
_ious = {k: _compute_ious(**v) for k, v in _bbs.items()}
def _evaluate(iou_threshold, max_dets, area_range):
# accumulate evaluations on a per-class basis
_evals = defaultdict(lambda: {"scores": [], "matched": [], "NP": []})
for img_id, class_id in _bbs:
ev = _evaluate_image(
_bbs[img_id, class_id]["dt"],
_bbs[img_id, class_id]["gt"],
_ious[img_id, class_id],
iou_threshold,
max_dets,
area_range,
)
acc = _evals[class_id]
acc["scores"].append(ev["scores"])
acc["matched"].append(ev["matched"])
acc["NP"].append(ev["NP"])
# now reduce accumulations
for class_id in _evals:
acc = _evals[class_id]
acc["scores"] = np.concatenate(acc["scores"])
acc["matched"] = np.concatenate(acc["matched"]).astype(np.bool)
acc["NP"] = np.sum(acc["NP"])
res = []
# run ap calculation per-class
for class_id in _evals:
ev = _evals[class_id]
res.append({
"class": class_id,
**_compute_ap_recall(ev["scores"], ev["matched"], ev["NP"]),
})
return res
iou_thresholds = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
# compute simple AP with all thresholds, using up to 100 dets, and all areas
full = {
i: _evaluate(iou_threshold=i, max_dets=100, area_range=(0, np.inf))
for i in iou_thresholds
}
AP50 = np.mean([x['AP'] for x in full[0.50] if x['AP'] is not None])
AP75 = np.mean([x['AP'] for x in full[0.75] if x['AP'] is not None])
AP = np.mean([x['AP'] for k in full for x in full[k] if x['AP'] is not None])
# max recall for 100 dets can also be calculated here
AR100 = np.mean(
[x['TP'] / x['total positives'] for k in full for x in full[k] if x['TP'] is not None])
small = {
i: _evaluate(iou_threshold=i, max_dets=100, area_range=(0, 32**2))
for i in iou_thresholds
}
APsmall = [x['AP'] for k in small for x in small[k] if x['AP'] is not None]
APsmall = np.nan if APsmall == [] else np.mean(APsmall)
ARsmall = [
x['TP'] / x['total positives'] for k in small for x in small[k] if x['TP'] is not None
]
ARsmall = np.nan if ARsmall == [] else np.mean(ARsmall)
medium = {
i: _evaluate(iou_threshold=i, max_dets=100, area_range=(32**2, 96**2))
for i in iou_thresholds
}
APmedium = [x['AP'] for k in medium for x in medium[k] if x['AP'] is not None]
APmedium = np.nan if APmedium == [] else np.mean(APmedium)
ARmedium = [
x['TP'] / x['total positives'] for k in medium for x in medium[k] if x['TP'] is not None
]
ARmedium = np.nan if ARmedium == [] else np.mean(ARmedium)
large = {
i: _evaluate(iou_threshold=i, max_dets=100, area_range=(96**2, np.inf))
for i in iou_thresholds
}
APlarge = [x['AP'] for k in large for x in large[k] if x['AP'] is not None]
APlarge = np.nan if APlarge == [] else np.mean(APlarge)
ARlarge = [
x['TP'] / x['total positives'] for k in large for x in large[k] if x['TP'] is not None
]
ARlarge = np.nan if ARlarge == [] else np.mean(ARlarge)
max_det1 = {
i: _evaluate(iou_threshold=i, max_dets=1, area_range=(0, np.inf))
for i in iou_thresholds
}
AR1 = np.mean([
x['TP'] / x['total positives'] for k in max_det1 for x in max_det1[k] if x['TP'] is not None
])
max_det10 = {
i: _evaluate(iou_threshold=i, max_dets=10, area_range=(0, np.inf))
for i in iou_thresholds
}
AR10 = np.mean([
x['TP'] / x['total positives'] for k in max_det10 for x in max_det10[k]
if x['TP'] is not None
])
return {
"AP": AP,
"AP50": AP50,
"AP75": AP75,
"APsmall": APsmall,
"APmedium": APmedium,
"APlarge": APlarge,
"AR1": AR1,
"AR10": AR10,
"AR100": AR100,
"ARsmall": ARsmall,
"ARmedium": ARmedium,
"ARlarge": ARlarge
}
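
# Minimal usage sketch for get_coco_summary, kept as a comment so importing this
# module stays side-effect free. The input construction is hypothetical; build the
# lists with the actual BoundingBox class from src.bounding_box:
#
#   gts = [...]  # ground-truth BoundingBox objects
#   dts = [...]  # detected BoundingBox objects, each carrying a confidence score
#   summary = get_coco_summary(gts, dts)
#   print(summary["AP"], summary["AP50"], summary["AR100"])
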
def get_coco_metrics(
groundtruth_bbs,
detected_bbs,
iou_threshold=0.5,
area_range=(0, np.inf),
max_dets=100,
):
""" Calculate the Average Precision and Recall metrics as in COCO's official implementation
given an IOU threshold, area range and maximum number of detections.
Parameters
----------
groundtruth_bbs : list
A list containing objects of type BoundingBox representing the ground-truth bounding boxes.
detected_bbs : list
A list containing objects of type BoundingBox representing the detected bounding boxes.
iou_threshold : float
Intersection Over Union (IOU) value used to consider a TP detection.
area_range : (numerical x numerical)
Lower and upper bounds on annotation areas that should be considered.
max_dets : int
Upper bound on the number of detections to be considered for each class in an image.
    Returns
    -------
    dict
        A dictionary with one entry per class. The keys of each per-class entry are:
dict['class']: class representing the current dictionary;
dict['precision']: array with the precision values;
dict['recall']: array with the recall values;
dict['AP']: average precision;
dict['interpolated precision']: interpolated precision values;
dict['interpolated recall']: interpolated recall values;
dict['total positives']: total number of ground truth positives;
        dict['TP']: total number of True Positive detections;
        dict['FP']: total number of False Positive detections;
        dict['conf_scr']: the (torch) confidence scores of the considered detections;
        if there was no valid ground truth for a specific class (total positives == 0),
        all the metric keys default to None.
"""
# separate bbs per image X class
_bbs = _group_detections(detected_bbs, groundtruth_bbs)
# pairwise ious
_ious = {k: _compute_ious(**v) for k, v in _bbs.items()}
# accumulate evaluations on a per-class basis
_evals = defaultdict(lambda: {"scores": [], "matched": [], "NP": []})
for img_id, class_id in _bbs:
ev = _evaluate_image(
_bbs[img_id, class_id]["dt"],
_bbs[img_id, class_id]["gt"],
_ious[img_id, class_id],
iou_threshold,
max_dets,
area_range,
)
acc = _evals[class_id]
acc["scores"].extend(ev["scores"])
acc["matched"].extend(ev["matched"])
acc["NP"].append(ev["NP"])
# now reduce accumulations
for class_id in _evals:
acc = _evals[class_id]
acc["scores"] = torch.stack(acc["scores"]) if acc["scores"] else torch.tensor([])
acc["matched"] = np.array(acc["matched"]).astype(np.bool)
acc["NP"] = np.sum(acc["NP"])
res = {}
# run ap calculation per-class
for class_id in _evals:
ev = _evals[class_id]
res[class_id] = {
"class": class_id,
"conf_scr": ev["scores"],
**_compute_ap_recall(ev["scores"].detach().cpu().numpy(), ev["matched"], ev["NP"])
}
return res
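
# As with get_coco_summary, a commented usage sketch (inputs are hypothetical):
#
#   per_class = get_coco_metrics(gts, dts, iou_threshold=0.5, max_dets=100)
#   for class_id, m in per_class.items():
#       print(class_id, m["AP"], m["TP"], m["FP"])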


def _group_detections(dt, gt):
    """ Group dts and gts on a per image x class basis. """
bb_info = defaultdict(lambda: {"dt": [], "gt": []})
for d in dt:
i_id = d.get_image_name()
c_id = d.get_class_id()
bb_info[i_id, c_id]["dt"].append(d)
for g in gt:
i_id = g.get_image_name()
c_id = g.get_class_id()
bb_info[i_id, c_id]["gt"].append(g)
return bb_info


def _get_area(a):
    """ COCO does not consider the outer edge as included in the bbox. """
x, y, x2, y2 = a.get_absolute_bounding_box(format=BBFormat.XYX2Y2)
return (x2 - x) * (y2 - y)


def _jaccard(a, b):
xa, ya, x2a, y2a = a.get_absolute_bounding_box(format=BBFormat.XYX2Y2)
xb, yb, x2b, y2b = b.get_absolute_bounding_box(format=BBFormat.XYX2Y2)
# innermost left x
xi = max(xa, xb)
# innermost right x
x2i = min(x2a, x2b)
# same for y
yi = max(ya, yb)
y2i = min(y2a, y2b)
# calculate areas
Aa = max(x2a - xa, 0) * max(y2a - ya, 0)
Ab = max(x2b - xb, 0) * max(y2b - yb, 0)
Ai = max(x2i - xi, 0) * max(y2i - yi, 0)
return Ai / (Aa + Ab - Ai)
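
# Worked example: for boxes (0, 0, 2, 2) and (1, 1, 3, 3) the intersection is
# (1, 1, 2, 2) with area 1, each box has area 4, so IoU = 1 / (4 + 4 - 1) ≈ 0.143.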


def _compute_ious(dt, gt):
    """ Compute pairwise IoUs between detections and ground truths. """
ious = np.zeros((len(dt), len(gt)))
for g_idx, g in enumerate(gt):
for d_idx, d in enumerate(dt):
ious[d_idx, g_idx] = _jaccard(d, g)
return ious


def _evaluate_image(dt, gt, ious, iou_threshold, max_dets=None, area_range=None):
    """ Use COCO's method to associate detections to ground truths. """
    # sort dts by decreasing confidence (confidences are torch tensors here, hence
    # .item()); the stable sort keeps the original order on ties, as COCO does
    dt_sort = np.argsort([-d.get_confidence().item() for d in dt], kind="stable")
# sort list of dts and chop by max dets
dt = [dt[idx] for idx in dt_sort[:max_dets]]
ious = ious[dt_sort[:max_dets]]
# generate ignored gt list by area_range
def _is_ignore(bb):
if area_range is None:
return False
return not (area_range[0] <= _get_area(bb) <= area_range[1])
gt_ignore = [_is_ignore(g) for g in gt]
# sort gts by ignore last
gt_sort = np.argsort(gt_ignore, kind="stable")
gt = [gt[idx] for idx in gt_sort]
gt_ignore = [gt_ignore[idx] for idx in gt_sort]
ious = ious[:, gt_sort]
gtm = {}
dtm = {}
for d_idx, d in enumerate(dt):
# information about best match so far (m=-1 -> unmatched)
iou = min(iou_threshold, 1 - 1e-10)
m = -1
for g_idx, g in enumerate(gt):
            # if this gt is already matched, skip it (crowd gts, which COCO allows
            # to match more than once, are not handled by this implementation)
            if g_idx in gtm:
                continue
            # if dt already matched to a regular gt and we have reached the ignored
            # gts (sorted last), stop looking
            if m > -1 and not gt_ignore[m] and gt_ignore[g_idx]:
                break
# continue to next gt unless better match made
if ious[d_idx, g_idx] < iou:
continue
# if match successful and best so far, store appropriately
iou = ious[d_idx, g_idx]
m = g_idx
# if match made store id of match for both dt and gt
if m == -1:
continue
dtm[d_idx] = m
gtm[m] = d_idx
# generate ignore list for dts
dt_ignore = [
gt_ignore[dtm[d_idx]] if d_idx in dtm else _is_ignore(d) for d_idx, d in enumerate(dt)
]
# get score for non-ignored dts
scores = [dt[d_idx].get_confidence() for d_idx in range(len(dt)) if not dt_ignore[d_idx]]
matched = [d_idx in dtm for d_idx in range(len(dt)) if not dt_ignore[d_idx]]
n_gts = len([g_idx for g_idx in range(len(gt)) if not gt_ignore[g_idx]])
return {"scores": scores, "matched": matched, "NP": n_gts}


def _compute_ap_recall(scores, matched, NP, recall_thresholds=None):
    """ This curve-tracing method has some quirks that do not appear when only unique
    confidence thresholds are used (e.g. in Scikit-learn's implementation); however, to stay
    consistent with the official evaluator, COCO's method is reproduced here. """
if NP == 0:
return {
"precision": None,
"recall": None,
"AP": None,
"interpolated precision": None,
"interpolated recall": None,
"total positives": None,
"TP": None,
"FP": None
}
# by default evaluate on 101 recall levels
if recall_thresholds is None:
recall_thresholds = np.linspace(0.0,
1.00,
int(np.round((1.00 - 0.0) / 0.01)) + 1,
endpoint=True)
# sort in descending score order
inds = np.argsort(-scores, kind="stable")
scores = scores[inds]
matched = matched[inds]
tp = np.cumsum(matched)
fp = np.cumsum(~matched)
rc = tp / NP
pr = tp / (tp + fp)
# make precision monotonically decreasing
i_pr = np.maximum.accumulate(pr[::-1])[::-1]
rec_idx = np.searchsorted(rc, recall_thresholds, side="left")
n_recalls = len(recall_thresholds)
# get interpolated precision values at the evaluation thresholds
i_pr = np.array([i_pr[r] if r < len(i_pr) else 0 for r in rec_idx])
return {
"precision": pr,
"recall": rc,
"AP": np.mean(i_pr),
"interpolated precision": i_pr,
"interpolated recall": recall_thresholds,
"total positives": NP,
"TP": tp[-1] if len(tp) != 0 else 0,
"FP": fp[-1] if len(fp) != 0 else 0
}
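

if __name__ == "__main__":
    # Toy sanity check for _compute_ap_recall (illustrative only, not part of the
    # original module): three detections in descending confidence order, of which
    # the first and third match a ground truth, against NP = 2 positives.
    toy_scores = np.array([0.9, 0.8, 0.7])
    toy_matched = np.array([True, False, True])
    toy_res = _compute_ap_recall(toy_scores, toy_matched, NP=2)
    # recall reaches 2/2 = 1.0; precision at the matched cutoffs is 1/1 and 2/3, so
    # the 101-point interpolated AP lands between 2/3 and 1.0 (~0.835 here).
    print("AP:", toy_res["AP"], "TP:", toy_res["TP"], "FP:", toy_res["FP"])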