Per its own documentation:
Beta-Secretase 1 (BACE) is a transmembrane aspartic-acid protease human protein encoded by the BACE1 gene. BACE is essential for the generation of beta-amyloid peptide in neural tissue [1], a component of amyloid plaques widely believed to be critical in the development of Alzheimer's, rendering BACE an attractive therapeutic target for this devastating disease [2].
The BACE dataset provided herein comprises small molecule inhibitors across a three-orders-of-magnitude (nM to μM) range of IC50s, along with previously undisclosed crystallographic structures. Specifically, we provide 154 BACE inhibitors for affinity, 20 for pose, and 34 for free energy prediction. This dataset was kindly provided by Novartis.
%config Completer.use_jedi = False
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
torch.set_default_dtype(torch.float64)
from torch.utils.data import Dataset, DataLoader, random_split
from chembench import dataset
from molmap import MolMap, feature
from molmapnets.data import SingleFeatureData, DoubleFeatureData
from molmapnets.models import MolMapMultiClassClassification
data = dataset.load_BACE()
Take a look at the data
data.df.head()
This is a two-class classification dataset
data.df.Class.nunique()
Create feature map objects
bitsinfo = feature.fingerprint.Extraction().bitsinfo
flist = bitsinfo[bitsinfo.Subtypes.isin(['PubChemFP'])].IDs.tolist()
flist[:5]
descriptor = MolMap(ftype='descriptor', metric='cosine',)
fingerprint = MolMap(ftype='fingerprint', fmap_type='scatter', flist=flist)
descriptor.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)
fingerprint.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)
Feature extraction
X1 = descriptor.batch_transform(data.x)
X2 = fingerprint.batch_transform(data.x)
We also need to transform the outcome variable
Y = pd.get_dummies(data.df['Class']).values
Y.shape
We can visualise the feature maps easily with MolMap, but the visualisations are omitted here to keep the notebook size manageable.
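If you do want a quick look without the full MolMap plotting output, one option is to render a single channel of a transformed array with matplotlib. This is just a minimal sketch, and it assumes batch_transform returns channel-last arrays of shape (n_samples, height, width, n_channels):
import matplotlib.pyplot as plt
# sketch only: first channel of the first molecule's descriptor map
# (assumes X1 has a channel-last layout)
plt.imshow(X1[0, :, :, 0], cmap='viridis')
plt.colorbar()
plt.title('Descriptor feature map, molecule 0, channel 0')
plt.show()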
single_feature = SingleFeatureData(Y, X1)
train, val, test = random_split(single_feature, [1113, 200, 200], generator=torch.Generator().manual_seed(7))
len(train), len(val), len(test)
train_loader = DataLoader(train, batch_size=8, shuffle=True)
val_loader = DataLoader(val, batch_size=8, shuffle=True)
test_loader = DataLoader(test, batch_size=8, shuffle=True)
And we can get one batch of data by creating an iterator over the data loader
x, t = next(iter(train_loader))
t.shape
torch.max(t, 1)[1]
x.shape
Finally, with the data prepared, we can train the models. These runs are only meant to show that the models work as expected; the training loop can certainly be fine-tuned to achieve better results.
model = MolMapMultiClassClassification(n_class=2)
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
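Note that the loss will be computed against integer class indices, while Y was one-hot encoded with pd.get_dummies, so the targets get converted back inside the training loop. On one batch the conversion looks like this:
t_idx = torch.max(t, 1)[1]  # same as t.argmax(dim=1): one-hot targets back to class indices
t_idx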
Predicted probabilities (the model outputs log-probabilities, so we exponentiate)
model(x.to(device)).exp()
And we can get the predicted class by
torch.max(model(x.to(device)).exp(), 1)[1]
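Since exp is monotonic, taking the argmax of the raw model output gives the same classes:
model(x.to(device)).argmax(dim=1)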
And the training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader):
        # convert one-hot targets to class indices for the loss function
        yb = torch.max(yb, 1)[1]
        xb, yb = xb.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model(xb)
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
And let's look at the prediction accuracy on the validation data set
correct = 0
total = 0
with torch.no_grad():
    for i, (xb, yb) in enumerate(val_loader):
        # convert one-hot targets to class indices
        yb = torch.max(yb, 1)[1]
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        pred = torch.max(pred, 1)[1]
        # accuracy calculation
        total += yb.size(0)
        correct += (pred==yb).sum().item()
print('Accuracy of the network on the validation data: %d %%' % (
    100 * correct / total))
Hmm not very good, but again we only trained for 5 epochs!
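One simple way to push the accuracy up is to keep training the same model for more epochs and track validation accuracy as we go. This is only a rough sketch reusing the loaders, optimizer and criterion defined above:
def evaluate(net, loader):
    # fraction of correctly classified samples in a data loader
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            yb = torch.max(yb, 1)[1]
            xb, yb = xb.to(device), yb.to(device)
            pred = torch.max(net(xb), 1)[1]
            total += yb.size(0)
            correct += (pred == yb).sum().item()
    return correct / total

# sketch only: continue training, reporting validation accuracy after each epoch
for epoch in range(20):
    for xb, yb in train_loader:
        yb = torch.max(yb, 1)[1]
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
    print('Epoch %d, validation accuracy: %.3f' % (epoch + 1, evaluate(model, val_loader)))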
double_feature = DoubleFeatureData(Y, (X1, X2))
train_double, val_double, test_double = random_split(double_feature, [1113, 200, 200], generator=torch.Generator().manual_seed(7))
len(train_double), len(val_double), len(test_double)
train_loader_double = DataLoader(train_double, batch_size=8, shuffle=True)
val_loader_double = DataLoader(val_double, batch_size=8, shuffle=True)
test_loader_double = DataLoader(test_double, batch_size=8, shuffle=True)
And again we can get one batch of data by creating an iterator over the data loader
x, t = next(iter(train_loader_double))
t.shape
x1, x2 = x
x1.shape, x2.shape
And classification. Different feature maps have different numbers of channels, which is why the channel counts are passed in explicitly here.
model_double = MolMapMultiClassClassification(conv_in1=13, conv_in2=1, n_class=2)
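Rather than hard-coding the channel counts, they could also be read off the transformed arrays, assuming batch_transform returns channel-last arrays:
conv_in1, conv_in2 = X1.shape[-1], X2.shape[-1]  # assumed channel-last layout
conv_in1, conv_in2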
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_double.to(device)
optimizer = optim.Adam(model_double.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
And the training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, ((x1, x2), yb) in enumerate(train_loader_double):
        # convert one-hot targets to class indices for the loss function
        yb = torch.max(yb, 1)[1]
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model_double((x1, x2))
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Accuracy on the validation data set
correct = 0
total = 0
with torch.no_grad():
    for i, ((x1, x2), yb) in enumerate(val_loader_double):
        # convert one-hot targets to class indices
        yb = torch.max(yb, 1)[1]
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        pred = model_double((x1, x2))
        pred = torch.max(pred, 1)[1]
        # accuracy calculation
        total += yb.size(0)
        correct += (pred==yb).sum().item()
print('Accuracy of the network on the validation data: %d %%' % (
    100 * correct / total))
Nice!
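As a last step, the held-out test set could be evaluated in exactly the same way; a minimal sketch for the double-feature model:
# sketch only: the same accuracy computation on the held-out test set
correct, total = 0, 0
with torch.no_grad():
    for (x1, x2), yb in test_loader_double:
        yb = torch.max(yb, 1)[1]
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        pred = torch.max(model_double((x1, x2)), 1)[1]
        total += yb.size(0)
        correct += (pred == yb).sum().item()
print('Accuracy of the network on the test data: %d %%' % (100 * correct / total))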