Per its own documentation:
Qualitative data of drugs approved by the FDA and those that have failed clinical trials for toxicity reasons.
# IPython-only magic: disable Jedi completion (not valid outside a notebook).
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
# Default every new tensor to double precision for the rest of the session.
torch.set_default_dtype(torch.float64)
from torch.utils.data import Dataset, DataLoader, random_split
from chembench import dataset
from molmap import MolMap, feature
from molmapnets.data import SingleFeatureData, DoubleFeatureData
from molmapnets.models import MolMapMultiLabelClassification
# ClinTox: drugs approved by the FDA vs. those that failed trials for toxicity.
data = dataset.load_ClinTox()
Take a look at the data
# Preview the first rows of the ClinTox dataframe.
data.df.head()
This data set carries two binary classification targets (FDA_APPROVED and CT_TOX), i.e. a multi-label task
# Confirm each target is binary by counting and listing its distinct values
# (dropna=False so a NaN, if present, would show up in the count).
data.df['FDA_APPROVED'].nunique(dropna=False)
data.df['FDA_APPROVED'].unique()
data.df['CT_TOX'].nunique(dropna=False)
data.df['CT_TOX'].unique()
Create feature map objects
# Build the two feature-map objects: molecular descriptors, and the
# PubChem subset of the fingerprint bits.
extraction = feature.fingerprint.Extraction()
bitsinfo = extraction.bitsinfo
flist = bitsinfo.loc[bitsinfo.Subtypes == 'PubChemFP', 'IDs'].tolist()
flist[:5]
descriptor = MolMap(ftype='descriptor', metric='cosine')
fingerprint = MolMap(ftype='fingerprint', fmap_type='scatter', flist=flist)
# Arrange the features in 2-D with UMAP.
descriptor.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15)
fingerprint.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15)
Feature extraction
# Transform every molecule into its 2-D feature-map representation.
X1 = descriptor.batch_transform(data.x)
X2 = fingerprint.batch_transform(data.x)
X1.shape
X2.shape
We also need to transform the outcome variable
# Targets: one row per molecule — presumably two binary labels
# (FDA_APPROVED, CT_TOX); the shape check below confirms.
Y = data.y
Y.shape
Y[:5]
single_feature = SingleFeatureData(Y, X1)
# Roughly 80/10/10 split with hard-coded sizes (1184+147+147 = 1478);
# NOTE(review): these must sum to len(single_feature) — update if the data changes.
# The seeded generator makes the split reproducible.
train, val, test = random_split(single_feature, [1184, 147, 147], generator=torch.Generator().manual_seed(7))
len(train), len(val), len(test)
# Mini-batch loaders. Only the training split is shuffled; evaluation
# order does not affect the metrics, so keep val/test deterministic.
train_loader = DataLoader(train, batch_size=8, shuffle=True)
val_loader = DataLoader(val, batch_size=8, shuffle=False)
test_loader = DataLoader(test, batch_size=8, shuffle=False)
And we can get one batch of data by making the data loader iterable
# Pull one mini-batch to sanity-check shapes: x is the feature-map batch,
# t the corresponding label batch.
x, t = next(iter(train_loader))
t.shape
t
x.shape
Finally with the data prepared we can train the models. These are tests to show that the models work as expected, but we can certainly fine tune the training loop to achieve better results.
# Model, optimiser and loss for the single-feature run.
model = MolMapMultiLabelClassification(n_label=2)
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# BCELoss assumes the model outputs probabilities in [0, 1] — TODO confirm
# the model ends in a sigmoid.
criterion = nn.BCELoss()
# Move the sample batch onto the model's device first; otherwise this smoke
# test raises a device-mismatch RuntimeError whenever CUDA is available.
x, t = x.to(device), t.to(device)
model(x)
criterion(model(x), t)
And the training loop
# Training loop for the single-feature model.
for epoch in range(epochs):
    model.train()  # ensure dropout/batch-norm are in training mode
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader):
        xb, yb = xb.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model(xb)
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Accumulate and report the epoch-average loss every 50 batches.
        running_loss += loss.item()
        if (i + 1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i + 1)))
print('Training finished')
# Count correct predictions on the sample batch, thresholding at 0.5.
# Move the batch to the model's device first — otherwise this fails with a
# device-mismatch error when the model lives on CUDA.
x, t = x.to(device), t.to(device)
((model(x) > 0.5).float() == t).sum().item()
model(x).nelement()
And let's look at the prediction accuracy on validation data set
# Element-wise label accuracy on the validation split.
model.eval()  # inference mode: freeze dropout/batch-norm
correct = 0
total = 0
with torch.no_grad():
    for xb, yb in val_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        # accuracy calculation (counts each of the two labels separately)
        total += yb.nelement()
        correct += ((pred > 0.5).float() == yb).sum().item()
# Fixed wording: this loop evaluates the VALIDATION loader, not the test set.
print('Accuracy of the network on the validation data: %d %%' % (100 * correct / total))
# Pair both feature maps (descriptor + fingerprint) with the same targets
# for the dual-input model.
double_feature = DoubleFeatureData(Y, (X1, X2))
Split the data into training, validation and test sets
# Same hard-coded 1184/147/147 split and seed as the single-feature run,
# so both experiments see identical partitions.
train_double, val_double, test_double = random_split(double_feature, [1184, 147, 147], generator=torch.Generator().manual_seed(7))
len(train_double), len(val_double), len(test_double)
Prepare batch data loader
# Loaders for the dual-feature data. Shuffle only the training split;
# keep val/test deterministic (evaluation order is irrelevant).
train_loader_double = DataLoader(train_double, batch_size=8, shuffle=True)
val_loader_double = DataLoader(val_double, batch_size=8, shuffle=False)
test_loader_double = DataLoader(test_double, batch_size=8, shuffle=False)
And we can get one batch of data by making the data loader iterable
# One batch from the dual-feature loader: x is a (descriptor, fingerprint) pair.
x, t = next(iter(train_loader_double))
t.shape
x1, x2 = x
x1.shape, x2.shape
And multi-label classification. Different feature maps have different number of channels.
# Dual-input multi-label model: the two feature maps feed separate conv
# stems with different channel counts (descriptor: 13, fingerprint: 1).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
epochs = 5
model_double = MolMapMultiLabelClassification(conv_in1=13, conv_in2=1, n_label=2).to(device)
optimizer = optim.Adam(model_double.parameters(), lr=0.001)
criterion = nn.BCELoss()
And the training loop
# Training loop for the dual-input model (mirrors the single-input loop).
for epoch in range(epochs):
    model_double.train()  # ensure dropout/batch-norm are in training mode
    running_loss = 0.0
    for i, ((x1, x2), yb) in enumerate(train_loader_double):
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation on the (descriptor, fingerprint) pair
        pred = model_double((x1, x2))
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Accumulate and report the epoch-average loss every 50 batches.
        running_loss += loss.item()
        if (i + 1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i + 1)))
print('Training finished')
Accuracy on the validation data set
# Element-wise label accuracy of the dual-input model on the validation split.
model_double.eval()  # inference mode: freeze dropout/batch-norm
correct = 0
total = 0
with torch.no_grad():
    for (x1, x2), yb in val_loader_double:
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        pred = model_double((x1, x2))
        # accuracy calculation (counts each of the two labels separately)
        total += yb.nelement()
        correct += ((pred > 0.5).float() == yb).sum().item()
# Fixed wording: this loop evaluates the VALIDATION loader, not the test set.
print('Accuracy of the network on the validation data: %d %%' % (
    100 * correct / total))