In [None]:
# Install requirements (shell command executed through the IPython `!` escape).
# NOTE(review): `requests` and `PIL` are imported in the next cell but are not listed
# here -- presumably they arrive as transitive dependencies; confirm on a clean environment.
!pip install numpy scikit-image scipy scikit-learn matplotlib tqdm tensorflow torch torchvision

In [None]:
import itertools
import json
import math
import os
import random
import tarfile
import time
import urllib.request
import zipfile
from shutil import copyfile

import numpy as np
import requests
import torch
from PIL import Image
from matplotlib import pyplot as plt
from numpy import printoptions
from sklearn.manifold import TSNE
from sklearn.metrics import precision_score, recall_score, f1_score
from torch import nn
from torch.nn import Parameter
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torch.utils.tensorboard import SummaryWriter
from torchvision import models
from torchvision import transforms
from tqdm import tqdm

In [None]:
# Fix all seeds to make experiments reproducible.
SEED = 2020
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible GPU, not only the current one (the model may be wrapped
# in nn.DataParallel later). This call is a no-op when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which may
# otherwise pick different convolution algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# We use the .tar.gz archive from this (https://github.com/thuml/HashNet/tree/master/pytorch#datasets)
# GitHub repository to speed up image loading (instead of loading it from Flickr).
# Let's download and extract it.
img_folder = 'images'
if not os.path.exists(img_folder):
    def download_file_from_google_drive(id, destination):
        """Stream the Google Drive file identified by `id` into the local file `destination`.

        Raises requests.HTTPError when the final response carries an HTTP error status.
        """
        def get_confirm_token(response):
            # Return the value of the first cookie whose name starts with
            # 'download_warning', or None when there is no such cookie.
            for key, value in response.cookies.items():
                if key.startswith('download_warning'):
                    return value
            return None

        def save_response_content(response, destination):
            # Write the response body to disk chunk by chunk.
            CHUNK_SIZE = 32768
            with open(destination, "wb") as f:
                for chunk in tqdm(response.iter_content(CHUNK_SIZE), desc='Image downloading'):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)

        URL = "https://docs.google.com/uc?export=download"
        # The context manager releases the pooled connections once the download is over.
        with requests.Session() as session:
            response = session.get(URL, params={'id': id}, stream=True)
            token = get_confirm_token(response)

            if token:
                # Repeat the request with the confirmation token taken from the cookie.
                params = {'id': id, 'confirm': token}
                response = session.get(URL, params=params, stream=True)
            # Fail loudly instead of saving an HTTP error page as if it were the archive.
            response.raise_for_status()
            save_response_content(response, destination)

    file_id = '0B7IzDz-4yH_HMFdiSE44R1lselE'
    path_to_tar_file = str(time.time()) + '.tar.gz'
    try:
        download_file_from_google_drive(file_id, path_to_tar_file)
        print('Extraction')
        with tarfile.open(path_to_tar_file) as tar_ref:
            # NOTE(review): extractall() trusts the member paths stored in the archive.
            # dirname('images') is the empty string, so fall back to the current directory.
            tar_ref.extractall(os.path.dirname(img_folder) or '.')
    finally:
        # Never leave the temporary archive behind, even if download or extraction failed.
        if os.path.exists(path_to_tar_file):
            os.remove(path_to_tar_file)
# Also, copy our pre-processed annotations to the dataset folder.
copyfile('../PyTorch-Multi-Label-Image-Classification-Image-Tagging/nus_wide/small_test.json', os.path.join(img_folder, 'small_test.json'))
copyfile('../PyTorch-Multi-Label-Image-Classification-Image-Tagging/nus_wide/small_train.json', os.path.join(img_folder, 'small_train.json'))

In [None]:
# We want to represent our label names as vectors in order to use them as features further.
# To do that we decided to use GloVe model (https://nlp.stanford.edu/projects/glove/).
# Let's download GloVe model trained on a Wikipedia Text Corpus.
glove_zip_name = 'glove.6B.zip'
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
# For our purposes, we use a model where each word is encoded by a vector of length 300
target_model_name = 'glove.6B.300d.txt'
if not os.path.exists(target_model_name):
    try:
        with urllib.request.urlopen(glove_url) as dl_file, open(glove_zip_name, 'wb') as out_file:
            # Stream the archive to disk in 1 MiB chunks instead of buffering the
            # whole download in memory with a single read().
            for chunk in iter(lambda: dl_file.read(1 << 20), b''):
                out_file.write(chunk)
        # Extract only the model file we need from the zip archive.
        with zipfile.ZipFile(glove_zip_name) as zip_f:
            zip_f.extract(target_model_name)
    finally:
        # Remove the (possibly partial) archive whether or not the steps above succeeded.
        if os.path.exists(glove_zip_name):
            os.remove(glove_zip_name)

In [None]:
# Now load GloVe model.
# Every line is split on whitespace: the first field is the word, the remaining
# fields are parsed as the float32 components of its vector.
embeddings_dict = {}

# Read the same file that the download cell produced, instead of repeating its name.
with open(target_model_name, 'r', encoding="utf-8") as f:
    for line in f:
        word, *components = line.split()
        embeddings_dict[word] = np.asarray(components, "float32")

In [None]:
# Look up the GloVe embedding of every label in our target label subset.
small_labels = [
    'house', 'birds', 'sun', 'valley',
    'nighttime', 'boats', 'mountain', 'tree', 'snow', 'beach', 'vehicle', 'rocks',
    'reflection', 'sunset', 'road', 'flowers', 'ocean', 'lake', 'window', 'plants',
    'buildings', 'grass', 'water', 'animal', 'person', 'clouds', 'sky',
]
# `.tolist()` turns each numpy vector into plain Python floats so that it can be dumped to JSON.
vectorized_labels = []
for label in small_labels:
    vectorized_labels.append(embeddings_dict[label].tolist())

# Persist the vectors for further use.
word_2_vec_path = 'word_2_vec_glow_classes.json'
label_vectors_payload = {'vect_labels': vectorized_labels}
with open(word_2_vec_path, 'w') as fp:
    json.dump(label_vectors_payload, fp, indent=3)


In [None]:
# Let's check how well GloVe represents label names from our dataset.
# It would be hard to visualize vectors with 300 values, but luckily we have t-SNE for that.
# This function builds a t-SNE model (https://www.learnopencv.com/t-sne-for-feature-visualization/)
# for label embeddings and visualizes them.
def tsne_plot(tokens, labels):
    """Embed `tokens` into 2D with t-SNE and draw one annotated point per token.

    tokens: the vectors to embed; handed to `TSNE.fit_transform` unchanged.
    labels: annotation texts; `labels[i]` is drawn next to the i-th embedded point.
    """
    projector = TSNE(perplexity=2, n_components=2, init='pca', n_iter=25000, random_state=2020, n_jobs=4)
    embedded = projector.fit_transform(tokens)

    plt.figure(figsize=(13, 13))
    # Each point gets its own scatter() call, followed by its text annotation.
    for idx, point in enumerate(embedded):
        px, py = point[0], point[1]
        plt.scatter(px, py)
        plt.annotate(
            labels[idx],
            xy=(px, py),
            xytext=(5, 2),
            size=15,
            textcoords='offset points',
            ha='right',
            va='bottom',
        )
    plt.show()

In [None]:
# Now we can draw the t-SNE visualization: one point per label embedding,
# annotated with the corresponding label name.
tsne_plot(vectorized_labels, small_labels)

In [None]:
# The Dataset class for NUS-WIDE is the same as in our previous post. The only difference
# is that we need to load vectorized representations of labels too.
class NusDatasetGCN(Dataset):
    """Image dataset that yields (image, multi-hot label vector, label embeddings).

    data_path:  folder that the image names from the annotation file are relative to.
    anno_path:  JSON file with a 'labels' list and a 'samples' list; every sample has
                an 'image_name' and a list of 'image_labels'.
    transforms: callable applied to the loaded PIL image, or None to return it as is.
    w2v_path:   JSON file whose 'vect_labels' entry holds the label embeddings.
    """

    def __init__(self, data_path, anno_path, transforms, w2v_path):
        self.transforms = transforms
        self.data_path = data_path
        with open(anno_path) as fp:
            json_data = json.load(fp)
        samples = json_data['samples']
        self.classes = json_data['labels']

        print('loading', anno_path)
        self.imgs = [sample['image_name'] for sample in samples]
        # One float vector per sample: 1.0 where the class is among the sample's labels.
        self.annos = [
            np.array([cls in sample['image_labels'] for cls in self.classes], dtype=float)
            for sample in samples
        ]
        # Load vectorized labels for GCN from json.
        with open(w2v_path) as fp:
            self.gcn_inp = np.array(json.load(fp)['vect_labels'], dtype=float)

    def __getitem__(self, item):
        anno = self.annos[item]
        img_path = os.path.join(self.data_path, self.imgs[item])
        # Close the file handle right after decoding, and always hand out a 3-channel
        # image: grayscale/CMYK/RGBA files would not match the 3-value mean/std used
        # for normalization.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert('RGB')
        if self.transforms is not None:
            img = self.transforms(img)
        # The same embedding matrix is returned with every item.
        return img, anno, self.gcn_inp

    def __len__(self):
        return len(self.imgs)

In [None]:
# Let's take a look at the data we have. To do that we load both splits without
# augmentations (the transforms argument is None).
val_annotation_file = os.path.join(img_folder, 'small_test.json')
train_annotation_file = os.path.join(img_folder, 'small_train.json')
dataset_val = NusDatasetGCN(img_folder, val_annotation_file, None, word_2_vec_path)
dataset_train = NusDatasetGCN(img_folder, train_annotation_file, None, word_2_vec_path)

# A simple function for visualization.
def show_sample(img, binary_img_labels, _):
    """Show `img` with the names of its active labels as the title.

    The third argument is ignored; it only allows a whole dataset item
    (image, labels, label embeddings) to be unpacked straight into the call.
    """
    # Convert the binary labels back to the text representation.
    active_ids = np.nonzero(binary_img_labels > 0)[0]
    label_names = np.array(dataset_val.classes)[active_ids]
    plt.imshow(img)
    plt.title(', '.join(label_names))
    plt.axis('off')
    plt.show()

# Visualize a few hand-picked validation samples.
preview_ids = (13, 15, 22, 29, 57, 127)
for sample_id in preview_ids:
    preview_img, preview_labels, preview_gcn_inp = dataset_val[sample_id]
    show_sample(preview_img, preview_labels, preview_gcn_inp)

In [None]:
# Calculate label distribution for the entire dataset (train + test).
samples = np.array(dataset_val.annos + dataset_train.annos)
class_names = np.array(dataset_val.classes)
with printoptions(precision=3, suppress=True):
    class_counts = samples.sum(axis=0)
    # Order labels by ascending frequency; the stable sort keeps equally
    # frequent labels in their original class order.
    sorted_ids = np.argsort(class_counts, kind='stable')
    sorted_counts = class_counts[sorted_ids]
    sorted_names = class_names[sorted_ids]
    print('Label distribution (count, class name):', list(zip(sorted_counts.astype(int), sorted_names)))
    bar_positions = range(len(dataset_val.classes))
    plt.barh(bar_positions, width=sorted_counts)
    plt.yticks(bar_positions, sorted_names)
    plt.gca().margins(y=0)
    plt.grid()
    plt.title('Label distribution')
    plt.show()

In [None]:
# To proceed with the training we first need the label co-occurrence statistics
# from which the adjacency matrix is built.
adj_matrix_path = 'adjacency_matrix.json'
# Per-label totals over the training annotations.
nums = np.array(dataset_train.annos).sum(axis=0)
label_len = len(small_labels)
adj = np.zeros((label_len, label_len), dtype=int)
# Walk over the whole training set and count, for each sample, every unordered
# pair of labels that appear together. Both (a, b) and (b, a) are incremented,
# so the matrix stays symmetric and its diagonal stays zero.
for sample in dataset_train.annos:
    sample_idx = np.nonzero(sample > 0)[0]
    for first, second in itertools.combinations(sample_idx, 2):
        adj[first, second] += 1
        adj[second, first] += 1

# Save the statistics for further use.
cooccurrence_stats = {
    'nums': nums.tolist(),
    'adj': adj.tolist(),
}
with open(adj_matrix_path, 'w') as fp:
    json.dump(cooccurrence_stats, fp, indent=3)

In [None]:
# We use implementation of GCN from github repository:
# https://github.com/Megvii-Nanjing/ML-GCN/blob/master/models.py#L7
class GraphConvolution(nn.Module):
    """
        Simple GCN layer, similar to https://arxiv.org/abs/1609.02907

        The forward pass computes ``adj @ (input @ weight)`` and adds ``bias``
        when the layer was created with ``bias=True``.
    """

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Allocated uninitialized; the values are set by reset_parameters() below.
        self.weight = Parameter(torch.Tensor(in_features, out_features), requires_grad=True)
        if not bias:
            # Register the name so that `self.bias` exists and is None.
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.Tensor(1, 1, out_features), requires_grad=True)
        self.reset_parameters()

    def reset_parameters(self):
        """Draw weight (and bias, if present) uniformly from [-b, b), b = 1 / sqrt(out_features)."""
        bound = 1. / math.sqrt(self.weight.size(1))
        for param in (self.weight, self.bias):
            if param is not None:
                param.data.uniform_(-bound, bound)

    def forward(self, input, adj):
        # Both operands are cast to float32 before the feature transform.
        support = torch.matmul(input.float(), self.weight.float())
        output = torch.matmul(adj, support)
        return output if self.bias is None else output + self.bias

    def __repr__(self):
        return '{} ({} -> {})'.format(self.__class__.__name__, self.in_features, self.out_features)

# Create adjacency matrix from statistics.
def gen_A(num_classes, t, p, adj_data):
    """Turn label co-occurrence statistics into a weighted adjacency matrix.

    num_classes: number of labels (size of the identity matrix that is added).
    t:           threshold applied to the normalized co-occurrence values.
    p:           weight that is distributed over the kept entries of each column.
    adj_data:    mapping with 'adj' (square co-occurrence counts) and 'nums'
                 (per-label counts).

    Returns a (num_classes, num_classes) numpy array.
    """
    adj = np.array(adj_data['adj']).astype(np.float32)
    nums = np.array(adj_data['nums']).astype(np.float32)
    nums = nums[:, np.newaxis]
    # Divide row i by nums[i].
    adj = adj / nums
    # Binarize: entries below the threshold are dropped, the rest become 1.
    adj[adj < t] = 0
    adj[adj >= t] = 1
    # Scale every column so that its kept entries sum to (almost exactly) p;
    # the epsilon guards columns without any kept entry.
    adj = adj * p / (adj.sum(0, keepdims=True) + 1e-6)
    # `np.int` was removed in NumPy 1.24; the builtin `int` selects the same dtype.
    adj = adj + np.identity(num_classes, dtype=int)
    return adj

# Apply adjacency matrix re-normalization.
def gen_adj(A):
    """Return ``(A @ D).T @ D`` where ``D = diag(A.sum(1) ** -0.5)``, in the dtype of `A`."""
    inv_sqrt_row_sum = A.sum(1).float().pow(-0.5)
    D = torch.diag(inv_sqrt_row_sum).type_as(A)
    return (A @ D).t() @ D


class GCNResnext50(nn.Module):
    """ResNeXt-50 image encoder combined with a two-layer GCN over label embeddings.

    n_classes:  number of labels.
    adj_path:   JSON file with the co-occurrence statistics consumed by gen_A.
    in_channel: input feature size of the first graph convolution.
    t, p:       threshold and weight forwarded to gen_A.
    """

    def __init__(self, n_classes, adj_path, in_channel=300, t=0.1, p=0.25):
        super(GCNResnext50, self).__init__()
        self.sigm = nn.Sigmoid()

        # Pretrained backbone whose final fully connected layer is replaced by an
        # identity, so it outputs its features instead of class scores.
        backbone = models.resnext50_32x4d(pretrained=True)
        backbone.fc = nn.Identity()
        self.features = backbone
        self.num_classes = n_classes

        self.gc1 = GraphConvolution(in_channel, 1024)
        self.gc2 = GraphConvolution(1024, 2048)
        self.relu = nn.LeakyReLU(0.2)
        # Load the statistics and build the adjacency matrix from them.
        with open(adj_path) as fp:
            adj_stats = json.load(fp)
        adjacency = gen_A(n_classes, t, p, adj_stats)
        # Stored as a parameter that does not require gradients.
        self.A = Parameter(torch.from_numpy(adjacency).float(), requires_grad=False)

    def forward(self, imgs, inp):
        # Get visual features from the image, flattened to (batch, features).
        feature = self.features(imgs)
        feature = feature.view(feature.size(0), -1)

        # Get graph features from the label graph. Only the first entry of `inp`
        # is used for the whole batch.
        inp = inp[0].squeeze()
        adj = gen_adj(self.A).detach()
        x = self.relu(self.gc1(inp, adj))
        x = self.gc2(x, adj)

        # We multiply the features from GCN and CNN in order to take into account
        # the contribution to the prediction of classes from both the image and the graph.
        scores = torch.matmul(feature, x.transpose(0, 1))
        return self.sigm(scores)

In [None]:
# Use threshold to define predicted labels and invoke sklearn's metrics with different averaging strategies.
def calculate_metrics(pred, target, threshold=0.5):
    """Binarize `pred` at `threshold` and score it against `target`.

    Returns a dict keyed '<averaging>/<metric>' for averaging in
    (micro, macro, samples) and metric in (precision, recall, f1).
    """
    pred = np.array(pred > threshold, dtype=float)
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {}
    for averaging in ('micro', 'macro', 'samples'):
        for metric_name, scorer in scorers:
            key = '{}/{}'.format(averaging, metric_name)
            metrics[key] = scorer(y_true=target, y_pred=pred, average=averaging)
    return metrics

In [None]:
# Initialize the training parameters.
num_workers = 8 # Number of CPU processes for data preprocessing
lr = 5e-6 # Learning rate
batch_size = 32
save_freq = 1 # Save checkpoint frequency (epochs)
test_freq = 200 # Test model frequency (iterations)
max_epoch_number = 35 # Number of epochs for training 
# Note: on the small subset of data overfitting happens after 30-35 epochs.

# Per-channel statistics passed to transforms.Normalize.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Prefer the GPU, but fall back to the CPU so that this cell does not commit the
# notebook to a device that is not there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Save path for checkpoints.
# NOTE(review): the folder name is spelled 'chekpoints'; kept as is so that
# checkpoints written by earlier runs are still found.
save_path = 'chekpoints/'
# Save path for logs.
logdir = 'logs/'

# Run tensorboard: load the notebook extension, then start it on the log directory.
# `{logdir}` is expanded by IPython from the Python variable defined above.
%load_ext tensorboard
%tensorboard --logdir {logdir}

In [None]:
# Here is an auxiliary function for checkpoint saving.
def checkpoint_save(model, save_path, epoch):
    """Save the model weights to '<save_path>/checkpoint-<epoch as 6 digits>.pth'.

    When `model` exposes a `module` attribute (as an nn.DataParallel wrapper does),
    the state dict of that inner module is saved instead of the wrapper's.
    """
    checkpoint_file = os.path.join(save_path, 'checkpoint-{:06d}.pth'.format(epoch))
    unwrapped = model.module if 'module' in dir(model) else model
    torch.save(unwrapped.state_dict(), checkpoint_file)
    print('saved checkpoint:', checkpoint_file)

In [None]:
# Test preprocessing.
val_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

# Train preprocessing.
# Pixels uncovered by the random affine warp are filled with the mean color,
# expressed as integer 0-255 RGB values.
affine_fill = tuple(np.array(np.array(mean) * 255).astype(int).tolist())
train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(),
    # `fill` is the current name of the former `fillcolor` argument. The former
    # `resample=False` is dropped: current torchvision no longer accepts it, and
    # it selected nearest-neighbour resampling, which is the default interpolation.
    transforms.RandomAffine(degrees=20, translate=(0.2, 0.2), scale=(0.5, 1.5),
                            shear=None, fill=affine_fill),
    transforms.ToTensor(),
    transforms.Normalize(mean, std)
])

In [None]:
# Initialize the dataloaders for training.
test_annotations = os.path.join(img_folder, 'small_test.json')
train_annotations = os.path.join(img_folder, 'small_train.json')

test_dataset = NusDatasetGCN(img_folder, test_annotations, val_transform, word_2_vec_path)
train_dataset = NusDatasetGCN(img_folder, train_annotations, train_transform, word_2_vec_path)

# The training loader shuffles and drops the last incomplete batch.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True,
                              drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)

# Number of batches per training epoch. Taken from the loader itself so that it
# honours drop_last=True (a ceil() of len/batch_size would count the dropped batch).
num_train_batches = len(train_dataloader)

# Initialize the model.
model = GCNResnext50(len(train_dataset.classes), adj_matrix_path)
# Switch the model to the training mode and move it to the configured device.
model.train()
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# If more than one GPU is available we can use all of them to speed up the training.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

os.makedirs(save_path, exist_ok=True)

# Loss function.
criterion = nn.BCELoss()
# Tensorboard logger.
logger = SummaryWriter(logdir)

In [None]:
# Run training.
# Trains for exactly `max_epoch_number` epochs (epochs 0 .. max_epoch_number - 1).
# Every `test_freq` iterations the model is evaluated on the whole test set, and a
# checkpoint is written every `save_freq` epochs.
epoch = 0
iteration = 0
while epoch < max_epoch_number:
    batch_losses = []
    for batch_number, (imgs, targets, gcn_input) in enumerate(train_dataloader):
        imgs, targets, gcn_input = imgs.to(device), targets.to(device), gcn_input.to(device)
        optimizer.zero_grad()

        model_result = model(imgs, gcn_input)
        loss = criterion(model_result, targets.type(torch.float))

        batch_loss_value = loss.item()
        loss.backward()
        # In-place variant; the name without the trailing underscore is deprecated.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)

        optimizer.step()

        logger.add_scalar('train_loss', batch_loss_value, iteration)
        batch_losses.append(batch_loss_value)
        # Detach before converting: the scores are still attached to the autograd graph.
        train_scores = model_result.detach().cpu().numpy()
        result = calculate_metrics(train_scores, targets.cpu().numpy())
        for metric in result:
            logger.add_scalar('train/' + metric, result[metric], iteration)

        if iteration % test_freq == 0:
            model.eval()
            # Evaluation uses its own variable names so that the training batch
            # (imgs, targets, gcn_input, model_result) is not overwritten.
            with torch.no_grad():
                test_scores = []
                test_targets = []
                for test_imgs, batch_targets, test_gcn_input in test_dataloader:
                    test_gcn_input = test_gcn_input.to(device)
                    test_imgs = test_imgs.to(device)
                    model_batch_result = model(test_imgs, test_gcn_input)
                    test_scores.extend(model_batch_result.cpu().numpy())
                    test_targets.extend(batch_targets.cpu().numpy())

            result = calculate_metrics(np.array(test_scores), np.array(test_targets))
            for metric in result:
                logger.add_scalar('test/' + metric, result[metric], iteration)
            print("epoch:{:2d} iter:{:3d} test: "
                  "micro f1: {:.3f} "
                  "macro f1: {:.3f} "
                  "samples f1: {:.3f}".format(epoch, iteration,
                                              result['micro/f1'],
                                              result['macro/f1'],
                                              result['samples/f1']))

            model.train()
        iteration += 1

    loss_value = np.mean(batch_losses)
    print("epoch:{:2d} iter:{:3d} train: loss:{:.3f}".format(epoch, iteration, loss_value))
    if epoch % save_freq == 0:
        checkpoint_save(model, save_path, epoch)
    epoch += 1

# Make sure that every logged scalar reaches the event file.
logger.flush()

In [None]:
# Run inference on the test data.
model.eval()
# Take the class names from the dataset that the samples come from.
class_names = np.array(test_dataset.classes)
for sample_id in [1, 2, 3, 4, 6]:
    test_img, test_labels, gcn_input = test_dataset[sample_id]
    test_img_path = os.path.join(img_folder, test_dataset.imgs[sample_id])
    with torch.no_grad():
        # Add the batch dimension and move the inputs to the same device as the model,
        # rather than hard-coding CUDA.
        batch_img = test_img.unsqueeze(0).to(device)
        batch_gcn_input = torch.from_numpy(gcn_input).unsqueeze(0).to(device)
        raw_pred = model(batch_img, batch_gcn_input).cpu().numpy()[0]
        # Scores above 0.5 count as predicted labels.
        raw_pred = np.array(raw_pred > 0.5, dtype=float)

    predicted_labels = class_names[np.argwhere(raw_pred > 0)[:, 0]]
    if not len(predicted_labels):
        predicted_labels = ['no predictions']
    img_labels = class_names[np.argwhere(test_labels > 0)[:, 0]]
    # Show the original (untransformed) image; the file is closed once it is drawn.
    with Image.open(test_img_path) as original_img:
        plt.imshow(original_img)
    plt.title("Predicted labels: {} \nGT labels: {}".format(', '.join(predicted_labels), ', '.join(img_labels)))
    plt.axis('off')
    plt.show()
