Using molmapnets for multi-label classification with descriptors, fingerprints, or both, tested on the ClinTox dataset.
 

Per the dataset's own documentation:

Qualitative data of drugs approved by the FDA and those that have failed clinical trials for toxicity reasons.

%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
torch.set_default_dtype(torch.float64)

from torch.utils.data import Dataset, DataLoader, random_split
from chembench import dataset
from molmap import MolMap, feature
RDKit WARNING: [14:57:53] Enabling RDKit 2019.09.3 jupyter extensions
from molmapnets.data import SingleFeatureData, DoubleFeatureData
from molmapnets.models import MolMapMultiLabelClassification

Feature extraction

data = dataset.load_ClinTox()
total samples: 1478

Take a look at the data

data.df.head()
index smiles FDA_APPROVED CT_TOX
0 0 *C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC 1 0
1 1 [C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C... 1 0
2 2 [C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)... 1 0
3 3 [H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])... 1 0
4 4 [H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(... 1 0

This is a two-label classification data set, and each label (FDA_APPROVED and CT_TOX) is binary

data.df.FDA_APPROVED.nunique(dropna=False)
2
data.df.FDA_APPROVED.unique()
array([1, 0])
data.df.CT_TOX.nunique(dropna=False)
2
data.df.CT_TOX.unique()
array([0, 1])
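
Both labels are binary. Before modelling it can also be worth checking how balanced each label is; a quick sketch (output omitted, and the exact proportions depend on the dataset version):

data.df[['FDA_APPROVED', 'CT_TOX']].mean()  # fraction of positive samples per label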

Create feature map objects

bitsinfo = feature.fingerprint.Extraction().bitsinfo
flist = bitsinfo[bitsinfo.Subtypes.isin(['PubChemFP'])].IDs.tolist()

flist[:5]
['PubChemFP0', 'PubChemFP1', 'PubChemFP2', 'PubChemFP3', 'PubChemFP4']
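
If you are unsure which fingerprint subtypes are available, the bitsinfo frame can be inspected directly; a minimal sketch, assuming bitsinfo behaves like a regular pandas DataFrame (as the filtering above suggests):

bitsinfo.Subtypes.value_counts()  # available subtypes and how many bits each contributes
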
descriptor = MolMap(ftype='descriptor', metric='cosine',)
fingerprint = MolMap(ftype='fingerprint', fmap_type='scatter', flist=flist)
descriptor.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)
2021-07-23 14:58:08,834 - INFO - [bidd-molmap] - Applying grid feature map(assignment), this may take several minutes(1~30 min)
2021-07-23 14:58:11,974 - INFO - [bidd-molmap] - Finished
fingerprint.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)
2021-07-23 14:58:13,073 - INFO - [bidd-molmap] - Applying naive scatter feature map...
2021-07-23 14:58:13,095 - INFO - [bidd-molmap] - Finished

Transform the molecules into feature maps

X1 = descriptor.batch_transform(data.x)
X2 = fingerprint.batch_transform(data.x)
100%|##########| 1478/1478 [08:20<00:00,  3.61it/s]
100%|##########| 1478/1478 [02:07<00:00, 14.06it/s]
X1.shape
(1478, 37, 37, 13)
X2.shape
(1478, 52, 52, 1)

We also need the outcome variable, which comes as an array with one column per label

Y = data.y
Y.shape
(1478, 2)
Y[:5]
array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

Classification using only the descriptor map

single_feature = SingleFeatureData(Y, X1)
train, val, test = random_split(single_feature, [1184, 147, 147], generator=torch.Generator().manual_seed(7))
len(train), len(val), len(test)
(1184, 147, 147)
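
The split sizes above are hard-coded for the 1478 ClinTox molecules. A sketch of deriving the same 80/10/10 split from the dataset length instead, should you reuse this on another data set:

n = len(single_feature)
n_val = n_test = int(0.1 * n)   # 147 each for ClinTox
n_train = n - n_val - n_test    # 1184
train, val, test = random_split(single_feature, [n_train, n_val, n_test], generator=torch.Generator().manual_seed(7))
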
train_loader = DataLoader(train, batch_size=8, shuffle=True)
val_loader = DataLoader(val, batch_size=8, shuffle=True)
test_loader = DataLoader(test, batch_size=8, shuffle=True)

And we can get one batch of data by turning the data loader into an iterator

x, t = next(iter(train_loader))
t.shape
torch.Size([8, 2])
t
tensor([[1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.]])
x.shape
torch.Size([8, 13, 37, 37])
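
Note that batch_transform returned channels-last arrays of shape (N, 37, 37, 13), while the loader yields channels-first tensors of shape (batch, 13, 37, 37), as PyTorch convolutions expect; SingleFeatureData appears to take care of this reordering. A quick shape check with numpy directly (a sketch, not part of the pipeline):

np.transpose(X1[:8], (0, 3, 1, 2)).shape  # (8, 13, 37, 37), same layout as the loader batch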

Finally, with the data prepared, we can train the models. The runs below are smoke tests to show that the models work as expected; the training loop can certainly be tuned further for better results.

model = MolMapMultiLabelClassification(n_label=2)

epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
model(x)
tensor([[0.4808, 0.5071],
        [0.4809, 0.5071],
        [0.4808, 0.5071],
        [0.4809, 0.5071],
        [0.4809, 0.5071],
        [0.4808, 0.5071],
        [0.4808, 0.5071],
        [0.4808, 0.5071]], grad_fn=<SigmoidBackward>)
criterion(model(x), t)
tensor(0.7133, grad_fn=<BinaryCrossEntropyBackward>)
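
Each output column is an independent sigmoid probability, so BCELoss scores the two labels separately. As a sanity check, an untrained model predicting around 0.5 everywhere should give a loss close to -ln(0.5) ≈ 0.693, which is indeed close to the value above:

import math
-math.log(0.5)  # expected BCE loss for chance-level predictions
0.6931471805599453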

And the training loop

for epoch in range(epochs):

    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader):
        
        xb, yb = xb.to(device), yb.to(device)

        # zero gradients
        optimizer.zero_grad()

        # forward propagation
        pred = model(xb)

        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:    
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))

print('Training finished')
[Epoch: 1, Iter:    50] Training loss: 0.351
[Epoch: 1, Iter:   100] Training loss: 0.305
[Epoch: 2, Iter:    50] Training loss: 0.221
[Epoch: 2, Iter:   100] Training loss: 0.277
[Epoch: 3, Iter:    50] Training loss: 0.281
[Epoch: 3, Iter:   100] Training loss: 0.277
[Epoch: 4, Iter:    50] Training loss: 0.313
[Epoch: 4, Iter:   100] Training loss: 0.273
[Epoch: 5, Iter:    50] Training loss: 0.258
[Epoch: 5, Iter:   100] Training loss: 0.253
Training finished
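
The loop above only tracks training loss. One way to fine tune it, as mentioned earlier, is to evaluate on the validation set after every epoch and keep the best checkpoint; a minimal sketch, not run here (the file name best_model.pt is just illustrative):

best_val_loss = float('inf')
for epoch in range(epochs):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

    # validation pass at the end of each epoch
    model.eval()
    with torch.no_grad():
        val_loss = sum(criterion(model(xb.to(device)), yb.to(device)).item()
                       for xb, yb in val_loader) / len(val_loader)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')
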
((model(x) > 0.5).float() == t).sum().item()
14
model(x).nelement()
16
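
So 14 of the 16 individual label predictions in this batch are correct. This is per-label (element-wise) accuracy; for multi-label problems it can also be useful to report exact-match accuracy, where a sample only counts if both labels are right. A sketch on the same batch:

pred = (model(x) > 0.5).float()
per_label = (pred == t).float().mean().item()                # fraction of correct label predictions
exact_match = (pred == t).all(dim=1).float().mean().item()   # fraction of samples with both labels correct
per_label, exact_match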

And let's look at the prediction accuracy on the validation data set

correct = 0
total = 0

with torch.no_grad():
    for i, (xb, yb) in enumerate(val_loader):

        xb, yb = xb.to(device), yb.to(device)

        pred = model(xb)

        # accuracy calculation
        total += yb.nelement()
        correct += ((pred > 0.5).float()==yb).sum().item()

        
print('Accuracy of the network on the validation data: %d %%' % (100 * correct / total))
Accuracy of the network on the validation data: 93 %
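
The held-out test set was created above but not used yet; the same loop works for it unchanged. A sketch (calling model.eval() first in case any layers behave differently at inference time):

correct = 0
total = 0

model.eval()
with torch.no_grad():
    for xb, yb in test_loader:

        xb, yb = xb.to(device), yb.to(device)

        pred = model(xb)

        # accuracy calculation
        total += yb.nelement()
        correct += ((pred > 0.5).float() == yb).sum().item()

print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))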

Classification using both feature maps

Now we can feed both the feature maps to the model as a tuple

double_feature = DoubleFeatureData(Y, (X1, X2))

Split the data

train_double, val_double, test_double = random_split(double_feature, [1184, 147, 147], generator=torch.Generator().manual_seed(7))
len(train_double), len(val_double), len(test_double)
(1184, 147, 147)

Prepare batch data loader

train_loader_double = DataLoader(train_double, batch_size=8, shuffle=True)
val_loader_double = DataLoader(val_double, batch_size=8, shuffle=True)
test_loader_double = DataLoader(test_double, batch_size=8, shuffle=True)

And we can get one batch of data by turning the data loader into an iterator

x, t = next(iter(train_loader_double))
t.shape
torch.Size([8, 2])
x1, x2 = x
x1.shape, x2.shape
(torch.Size([8, 13, 37, 37]), torch.Size([8, 1, 52, 52]))

And now the multi-label classification model with both inputs. The two feature maps have different numbers of channels, which is why conv_in1 and conv_in2 are passed explicitly.

model_double = MolMapMultiLabelClassification(conv_in1=13, conv_in2=1, n_label=2)

epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_double.to(device)
optimizer = optim.Adam(model_double.parameters(), lr=0.001)
criterion = nn.BCELoss()

And the training loop

for epoch in range(epochs):

    running_loss = 0.0
    for i, ((x1, x2), yb) in enumerate(train_loader_double):

        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)

        # zero gradients
        optimizer.zero_grad()

        # forward propagation
        pred = model_double((x1, x2))

        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:    
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))

print('Training finished')
[Epoch: 1, Iter:    50] Training loss: 0.304
[Epoch: 1, Iter:   100] Training loss: 0.301
[Epoch: 2, Iter:    50] Training loss: 0.307
[Epoch: 2, Iter:   100] Training loss: 0.284
[Epoch: 3, Iter:    50] Training loss: 0.273
[Epoch: 3, Iter:   100] Training loss: 0.288
[Epoch: 4, Iter:    50] Training loss: 0.269
[Epoch: 4, Iter:   100] Training loss: 0.229
[Epoch: 5, Iter:    50] Training loss: 0.283
[Epoch: 5, Iter:   100] Training loss: 0.254
Training finished

Accuracy on the validation data set

correct = 0
total = 0

with torch.no_grad():
    for i, ((x1, x2), yb) in enumerate(val_loader_double):

        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)

        pred = model_double((x1, x2))

        # accuracy calculation
        total += yb.nelement()
        correct += ((pred > 0.5).float()==yb).sum().item()

        
print('Accuracy of the network on the validation data: %d %%' % (
    100 * correct / total))
Accuracy of the network on the validation data: 93 %
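
Since both labels are binary but not necessarily equally easy, it can also be informative to break the accuracy down per label rather than pooling all elements. A sketch on the validation loader for the two-input model, assuming the label columns follow the same order as in data.df (FDA_APPROVED first, CT_TOX second):

correct = torch.zeros(2)
total = 0

with torch.no_grad():
    for (x1, x2), yb in val_loader_double:

        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)

        pred = model_double((x1, x2))

        # per-label accuracy calculation
        correct += ((pred > 0.5).float() == yb).sum(dim=0).cpu()
        total += yb.shape[0]

print('FDA_APPROVED accuracy: %.3f' % (correct[0] / total).item())
print('CT_TOX accuracy: %.3f' % (correct[1] / total).item())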