path = untar_data(URLs.IMAGEWOOF_320)
lbl_dict = dict(
n02086240= 'Shih-Tzu',
n02087394= 'Rhodesian ridgeback',
n02088364= 'Beagle',
n02089973= 'English foxhound',
n02093754= 'Australian terrier',
n02096294= 'Border terrier',
n02099601= 'Golden retriever',
n02105641= 'Old English sheepdog',
n02111889= 'Samoyed',
n02115641= 'Dingo'
)
dblock = DataBlock(blocks=(ImageBlock,CategoryBlock),
get_items=get_image_files,
splitter=GrandparentSplitter(valid_name='val'),
get_y=Pipeline([parent_label,lbl_dict.__getitem__]),
item_tfms=Resize(320),
batch_tfms=[*aug_transforms(size=224),Normalize.from_stats(*imagenet_stats)])
dls = dblock.dataloaders(path,bs=128)
dls.show_batch()
You can find the model weights here. It's a ResNet-34, trained for 9 epochs, reaching around 96% accuracy.
exp_name='resnet34'
save_model = SaveModelCallback(monitor='error_rate',fname=exp_name)
learn = cnn_learner(dls,resnet34,metrics=error_rate,model_dir='/content/models',opt_func=ranger)
# learn.load(exp_name)
learn.fit_flat_cos(5,lr=1e-3,cbs=save_model)
learn.unfreeze()
learn.fit_flat_cos(5,lr=1e-4,cbs=save_model)
Grad-CAM
- gradient of the score for class $c$, $y^c$, w.r.t. the feature map activations $A^k$, global-average-pooled over the width and height dimensions
$$\alpha^c_k = \frac{1}{Z}\sum_i\sum_j \frac{\partial y^c}{\partial A^k_{ij}}$$
- weighted combination of the forward activation maps, followed by a ReLU $$L^c_{Grad\text{-}CAM} = ReLU\left(\sum_k \alpha^c_k A^k\right)$$
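To make the two equations concrete, here is a minimal, self-contained sketch on dummy PyTorch tensors; the shapes and the stand-in linear head are purely illustrative and not the notebook's model. The cells below then compute the same thing for the trained model using fastai hooks.
import torch
import torch.nn.functional as F

# Dummy feature maps A^k: 512 maps of size 7x7 (illustrative shapes only)
A = torch.randn(512, 7, 7, requires_grad=True)
# Stand-in head: global average pool + linear layer -> 10 class scores y^c
w = torch.randn(10, 512)
scores = w @ A.mean(dim=(1, 2))
c = scores.argmax()

# Gradient of y^c w.r.t. the feature map activations A^k
grads, = torch.autograd.grad(scores[c], A)
# alpha^c_k: global-average-pool the gradients over i, j
alpha = grads.mean(dim=(1, 2))                         # shape (512,)
# L^c_Grad-CAM: ReLU of the weighted combination of the maps
gcam = F.relu((alpha[:, None, None] * A).sum(dim=0))   # shape (7, 7)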
m = learn.model.eval()
We don't want to permanently change the shuffle behaviour of the original valid dataloader, so we turn shuffling on just long enough to grab a random batch and then turn it back off.
valid_dl = dls.valid
valid_dl.shuffle=True
xb,yb = valid_dl.one_batch()
dls.show_batch((xb,yb))
valid_dl.shuffle=False
idx=3
xb,yb = xb[idx][None],yb[idx][None]
x_dec,y_dec = valid_dl.decode_batch((xb,yb))[0]
show_image(x_dec,title=y_dec,figsize=(5,5));
hook,hook_g = hook_output(m[0]), hook_output(m[0],grad=True)
m.zero_grad()
preds = m(xb)
preds[0,preds.argmax().item()].backward(retain_graph=True)
dls.vocab[preds.argmax().item()]
hook.stored[0].shape, hook_g.stored[0].shape
acts, grads = hook.stored[0], hook_g.stored[0][0]
alpha = grads.mean((1,2),keepdim=True); alpha.shape
gcam = F.relu((alpha * acts).sum(0)); gcam.shape
def generate_gradcam(model,xb,yb=None,layer_idx:list=[0],with_pred=False):
    """Compute Grad-CAM for a given image
    `xb,yb`: input batch
    `layer_idx`: list of indices to reach the target layer
    """
    m = model.eval()
    hook_layer = get_module(m,layer_idx)
    with hook_output(hook_layer,grad=True) as hook_g:
        with hook_output(hook_layer) as hook:
            m.zero_grad()
            y_pred = m(xb)
            if yb is None:
                y = y_pred.argmax().item()
            else: y = yb.item()
            y_pred[0,y].backward(retain_graph=True)
    acts  = hook.stored[0]     # activations of the target layer: (C,H,W)
    grads = hook_g.stored[0]   # gradients w.r.t. those activations: (1,C,H,W)
    alpha = grads.mean((2,3))  # global-average-pool the gradients: (1,C)
    gcam  = F.relu(torch.einsum('ab,bcd->acd',alpha,acts))[0]   # (H,W)
    if with_pred: return gcam,y
    return gcam
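get_module is a small helper defined elsewhere in this notebook's utilities; roughly, it follows the list of child-module indices in layer_idx down to the target layer. A rough sketch of what it might look like (the actual implementation may differ):
# Rough sketch of a get_module-style helper (actual implementation may differ):
# follow an index path such as [0, -2] down the module tree.
def get_module_sketch(model, layer_idx):
    module = model
    for i in layer_idx:
        module = list(module.children())[i]
    return module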
@delegates(show_heatmap)
def show_gradcam(dl:DataLoader,xb,yb,gcam,sz=224,merge=True,**kwargs):
    x_dec,y_dec = dl.decode_batch((xb,yb))[0]
    imsize = 5 if merge else 7
    _,axs = subplots(1,1 if merge else 2,figsize=(imsize,imsize))
    show_image(x_dec,ax=axs[0],title=y_dec)
    alpha = 0.6 if merge else 1.
    show_heatmap(gcam,sz=sz,ax=axs[int(not merge)],alpha=alpha,**kwargs)
gcam = generate_gradcam(m,xb,yb)
show_gradcam(valid_dl,xb,yb,gcam)
show_gradcam(valid_dl,xb,yb,gcam,merge=False,interpolation='spline36')
Custom test image
The fastai2-specific steps have already been described in the CAM notebook, so we borrow the same code here.
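Among the borrowed helpers is create_batch, used a few cells below to turn a single image file plus a manually chosen label index into a one-item batch. A rough sketch of what such a helper could look like (the real implementation lives in the CAM notebook and may differ):
# Rough sketch of a create_batch-style helper (the real one is in the CAM
# notebook and may differ): build a one-item batch from an image file and a
# label index taken from dls.vocab.
def create_batch_sketch(dls, fname, label_idx):
    test_dl = dls.test_dl([fname])            # applies the valid-set transforms
    xb = test_dl.one_batch()[0]
    yb = torch.tensor([label_idx], device=xb.device)
    return xb, yb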
url = 'https://t2conline.com/wp-content/uploads/2020/01/shutterstock_1124417876.jpg'
fname = 'test-shih-tzu.jpg'
download_url(url,dest=fname)
img = PILImage.create(fname); img.show(figsize=(5,5));
for idx,label in enumerate(dls.vocab):
    print(f'{idx:<2} : {label}')
xb,yb = create_batch(dls,fname,9)
gcam = generate_gradcam(learn.model,xb,yb)
show_gradcam(valid_dl,xb,yb,gcam)
show_gradcam(valid_dl,xb,yb,gcam,merge=False,interpolation='spline36')
The novelty with Grad-CAM is that we can look at the activations of any layer. Let's first take a brief look at the architecture and then decide which layer to visualize.
arch_summary(learn.model,verbose=True)
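arch_summary is another small custom helper (defined outside this section); roughly, it enumerates a model's child modules so we can pick an index path such as [0, -2] for layer_idx. A sketch of what it might do (the actual output format may differ):
# Rough sketch of an arch_summary-style helper (actual implementation may differ):
# list the top-level children and, if verbose, their sub-modules with indices.
def arch_summary_sketch(model, verbose=False):
    for i, child in enumerate(model.children()):
        print(f'{i:<3}{child.__class__.__name__}')
        if verbose:
            for j, sub in enumerate(child.children()):
                print(f'    {j:<3}{sub.__class__.__name__}')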
With m[0], we just looked at the activations of the first sequential layer. Let's now visualize the second-last block of that first sequential layer.
gcam = generate_gradcam(learn.model,xb,yb,layer_idx=[0,-2])
show_gradcam(valid_dl,xb,yb,gcam)
It seems the model was not yet sure at this layer, but in the very next layer it was pretty confident about where the subject is (as shown above).
Let's try one more. This time, "Australian Terrier".
url2 = "http://www.pets4homes.co.uk/images/breeds/197/large/0cdc3b81526ed5c3fa2cdf13b2d1cc36.jpg"
fname2 = 'test-australian-terrier.jpg'
download_url(url2,dest=fname2,overwrite=True)
img_2 = PILImage.create(fname2); img_2.show(figsize=(5,5));
xb,yb = create_batch(dls,fname2,0)
gcam,y_pred = generate_gradcam(learn.model,xb,yb,with_pred=True)
print(f"Prediction: {dls.vocab[y_pred]}")
show_gradcam(valid_dl,xb,yb,gcam)
Interesting! The model was able to classify it correctly, but it considered features that we normally wouldn't have thought of. In particular, I've seen across many examples that the mane hair is especially important in the case of "Australian Terrier". I'm no expert at identifying dog breeds, though; these are just my observations.
show_gradcam(valid_dl,xb,yb,gcam,merge=False,interpolation='spline36')
GuidedBackprop
Reference implementations:
- utkuozbulak/pytorch-cnn-visualizations
- https://www.kaggle.com/sironghuang/understanding-pytorch-hooks
There are certain naming conventions followed for the hooks:
Forward Hook →
- input : current layer's input
- output : current layer's output
Backward Hook ←
- grad_in : gradient of the loss wrt the layer's input (this is what gets passed back to the previous layer in the forward order)
- grad_out : gradient of the loss wrt the layer's output (this is what arrives from the next layer in the forward order)
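With those conventions in mind, the core of guided backpropagation is a backward hook on every ReLU that keeps only the positive part of the gradient flowing back (on top of ReLU's own backward, which already zeroes positions where the forward activation was negative). The GuidedBackprop class used below comes from the reference implementations above; here is only a minimal sketch of the hook idea, not its exact code.
import torch
import torch.nn as nn

# Minimal sketch of the guided-backprop ReLU hook (the GuidedBackprop class
# used below may implement this differently).
def guided_relu_hook(module, grad_in, grad_out):
    # grad_in[0] is the gradient w.r.t. the ReLU's input after its normal
    # backward; additionally clamp it so only positive gradients flow back.
    return (torch.clamp(grad_in[0], min=0.0),)

# Register on every ReLU; with xb.requires_grad_() set, a forward+backward
# pass then yields the guided gradients in xb.grad. Remove the hooks afterwards.
handles = [mod.register_backward_hook(guided_relu_hook)
           for mod in learn.model.modules() if isinstance(mod, nn.ReLU)]
# for h in handles: h.remove()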
import gc
gbviz = GuidedBackprop(learn.model)
gbprop = gbviz.guided_backprop(xb,yb)
show_image(min_max_scale(gbprop),figsize=(5,5));
show_image(to_grayscale(gbprop),figsize=(5,5),cmap='gray')
Guided backprop is looking at their faces, but as we observed, there's something more important than the dog's face in the case of "Australian Terrier", which Grad-CAM was able to discriminate. Let's fuse both of them.
gcam_up = F.interpolate(gcam[None,None],size=(224,224),mode='bilinear',align_corners=True)[0]
guided_gcam = gcam_up * gbprop
show_image(min_max_scale(guided_gcam),figsize=(5,5));
There you go! We got the class-discriminative features. Of course, this was just experimental. I'll come up with fine-grained classification examples some other day. Stay tuned!