%config Completer.use_jedi = False
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_theme(palette='Set2')
colors = sns.color_palette()
colors
import torch
from torch import nn, optim
import torch.nn.functional as F
torch.set_default_dtype(torch.float64)
from torch.utils.data import Dataset, DataLoader, random_split
from chembench import dataset
from molmap import MolMap
from molmapnets.data import SingleFeatureData, DoubleFeatureData
from molmapnets.models import MolMapRegression
data = dataset.load_ESOL()
We have the SMILES (Simplified Molecular Input Line Entry System) strings for different compounds and their corresponding solubility measurements:
data.df.head()
Using MolMap we can extract features from the SMILES strings. We can specify the feature type ftype, the pairwise feature distance metric metric, and the feature grid arrangement method fmap_type:
MolMap?
descriptor = MolMap(ftype='descriptor', metric='cosine',)
fingerprint = MolMap(ftype='fingerprint', metric='cosine')
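Both objects above keep the default feature grid arrangement. If you want to set it explicitly, the constructor also takes fmap_type; as an assumption about the accepted values (the MolMap? help above shows the exact signature), 'grid' and 'scatter' are the two options:

descriptor_grid = MolMap(ftype='descriptor', metric='cosine', fmap_type='grid')  # assumed default; 'scatter' is the alternative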
After setting up the feature extraction method, we can use the .fit method of the feature object to learn the feature maps. During this step we specify the algorithm (method) used to embed the higher-dimensional features into a 2D representation:
descriptor.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=15,)
fingerprint.fit(verbose=0, method='umap', min_dist=0.1, n_neighbors=10,)
We can visualise the feature maps easily with MolMap, but the visualisations are omitted here to keep the notebook from becoming too heavy.
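If you do want to see them, the fitted MolMap objects provide plotting helpers; assuming the bidd-molmap API exposes plot_grid and plot_scatter as in its published examples, something along these lines should work:

descriptor.plot_grid()      # assumed bidd-molmap API: grid arrangement of the descriptor features
fingerprint.plot_scatter()  # assumed bidd-molmap API: 2D embedding scatter of the fingerprint features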
X = descriptor.batch_transform(data.x)
X.shape
In PyTorch the training data for computer vision problems takes the shape (n_channels, height, width), while the features extracted from MolMap take the shape (height, width, n_channels), so we first need to move the channels dimension in front of the feature map dimensions:
torch.movedim(torch.from_numpy(X), -1, 1).shape
Y = data.y
Y.shape
Now from these feature maps we can create a dataset suitable for training models in PyTorch:
esol = SingleFeatureData(data.y, X)
train, val, test = random_split(esol, [904,112,112], generator=torch.Generator().manual_seed(7))
len(train), len(val), len(test)
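SingleFeatureData wraps the targets and feature maps into a PyTorch Dataset so that DataLoader can batch them. Purely as an illustrative sketch (not the library's actual implementation), such a wrapper might look roughly like this, including the channel reordering discussed above:

class FeatureMapData(Dataset):
    """Hypothetical sketch of a (targets, feature maps) dataset; molmapnets' SingleFeatureData may differ."""
    def __init__(self, y, x):
        # (N, H, W, C) -> (N, C, H, W) so convolutions see channels first
        self.x = torch.movedim(torch.from_numpy(x), -1, 1)
        self.y = torch.from_numpy(np.asarray(y))

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]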
train_loader = DataLoader(train, batch_size=8, shuffle=True)
val_loader = DataLoader(val, batch_size=8, shuffle=True)
test_loader = DataLoader(test, batch_size=8, shuffle=True)
And we can get one batch of data by creating an iterator from the data loader:
x, t = next(iter(train_loader))
t
x.shape
Finally, with the data prepared, we can train the models. The runs below are only meant to show that the models work as expected; the hyperparameters can certainly be tuned further to achieve better results.
model = MolMapRegression()
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
And the training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader):
        xb, yb = xb.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model(xb)
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Loss on the validation data set:
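If the model contains layers such as dropout or batch normalisation (I have not checked MolMapRegression's internals), it should also be switched to evaluation mode before validation; calling it is safe either way:

model.eval()  # disables dropout / batch-norm updates, if any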
running_loss = 0.0
with torch.no_grad():
    for i, (xb, yb) in enumerate(val_loader):
        xb, yb = xb.to(device), yb.to(device)
        # forward propagation
        pred = model(xb)
        # loss calculation
        loss = criterion(pred, yb)
        running_loss += loss.item()
        if (i+1) % 3 == 0:
            print('[Iter: %5d] Validation loss: %.3f' %
                  (i + 1, running_loss / (i+1)))
X_fingerprint = fingerprint.batch_transform(data.x)
X_fingerprint.shape
Again, from these feature maps we can create a dataset suitable for training models in PyTorch:
esol_fingerprint = SingleFeatureData(data.y, X_fingerprint)
train_fingerprint, val_fingerprint, test_fingerprint = random_split(esol_fingerprint, [904,112,112], generator=torch.Generator().manual_seed(7))
len(train_fingerprint), len(val_fingerprint), len(test_fingerprint)
train_loader_fingerprint = DataLoader(train_fingerprint, batch_size=8, shuffle=True)
val_loader_fingerprint = DataLoader(val_fingerprint, batch_size=8, shuffle=True)
test_loader_fingerprint = DataLoader(test_fingerprint, batch_size=8, shuffle=True)
And we can get one batch of data by creating an iterator from the data loader:
x, t = next(iter(train_loader_fingerprint))
t.shape
x.shape
And regression. Different feature maps have different numbers of channels, so conv_in1 is set to match the fingerprint map:
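The required number of input channels is just the last dimension of the transformed array (13 for the descriptor map used above, 12 for the fingerprint map, which is why conv_in1=12 below):

X_fingerprint.shape[-1]  # number of channels in the fingerprint feature map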
model_fingerprint = MolMapRegression(conv_in1=12)
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_fingerprint.to(device)
optimizer = optim.Adam(model_fingerprint.parameters(), lr=0.001)
criterion = nn.MSELoss()
And the training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, (xb, yb) in enumerate(train_loader_fingerprint):
        xb, yb = xb.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model_fingerprint(xb)
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Loss on the validation data set:
running_loss = 0.0
with torch.no_grad():
    for i, (xb, yb) in enumerate(val_loader_fingerprint):
        xb, yb = xb.to(device), yb.to(device)
        # forward propagation
        pred = model_fingerprint(xb)
        # loss calculation
        loss = criterion(pred, yb)
        running_loss += loss.item()
        if (i+1) % 3 == 0:
            print('[Iter: %5d] Validation loss: %.3f' %
                  (i + 1, running_loss / (i+1)))
Now we can feed both feature maps to the model as a tuple:
double_feature = DoubleFeatureData(data.y, (X, X_fingerprint))
train_double, val_double, test_double = random_split(double_feature, [904,112,112], generator=torch.Generator().manual_seed(7))
len(train_double), len(val_double), len(test_double)
train_loader_double = DataLoader(train_double, batch_size=8, shuffle=True)
val_loader_double = DataLoader(val_double, batch_size=8, shuffle=True)
test_loader_double = DataLoader(test_double, batch_size=8, shuffle=True)
And we can get one batch of data by creating an iterator from the data loader:
x, t = next(iter(train_loader_double))
t.shape
x1, x2 = x
x1.shape, x2.shape
And regression. The two feature maps have different numbers of channels, so we set conv_in1 for the descriptor map (13 channels) and conv_in2 for the fingerprint map (12 channels):
model_double = MolMapRegression(conv_in1=13, conv_in2=12)
epochs = 5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_double.to(device)
optimizer = optim.Adam(model_double.parameters(), lr=0.001)
criterion = nn.MSELoss()
And the training loop
for epoch in range(epochs):
    running_loss = 0.0
    for i, ((x1, x2), yb) in enumerate(train_loader_double):
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        # zero gradients
        optimizer.zero_grad()
        # forward propagation
        pred = model_double((x1, x2))
        # loss calculation
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if (i+1) % 50 == 0:
            print('[Epoch: %d, Iter: %5d] Training loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / (i+1)))
print('Training finished')
Loss on the validation data set:
running_loss = 0.0
with torch.no_grad():
    for i, ((x1, x2), yb) in enumerate(val_loader_double):
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        # forward propagation
        pred = model_double((x1, x2))
        # loss calculation
        loss = criterion(pred, yb)
        running_loss += loss.item()
        if (i+1) % 3 == 0:
            print('[Iter: %5d] Validation loss: %.3f' %
                  (i + 1, running_loss / (i+1)))
print('Validation finished')
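The test loaders were created above but never used. Evaluating a trained model on the held-out test set follows the same pattern as validation; for instance, for the double-feature model (a sketch reporting the mean MSE over the test batches):

# Sketch: mean MSE of the double-feature model on the held-out test set
running_loss = 0.0
with torch.no_grad():
    for i, ((x1, x2), yb) in enumerate(test_loader_double):
        x1, x2, yb = x1.to(device), x2.to(device), yb.to(device)
        pred = model_double((x1, x2))
        running_loss += criterion(pred, yb).item()
print('Test loss: %.3f' % (running_loss / (i+1)))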