Buckler et al. 2009

This file aims to reproduce the findings of Buckler et al 2009, “The Genetic Architecture of Maize Flowering Time”.

It used data from panzea - Phenotypic data panzea_etal_2009_Science_flowering_time_data-090807
- Genotypic Data panzea7_publicSamples_imputedV5_AGPv4-181023.vcf.gz - Genomic Data …

use_gpu_num = 1

import os
import pandas as pd
import numpy as np
import re

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

# TODO fixme

device = "cuda" if torch.cuda.is_available() else "cpu"
if use_gpu_num in [0, 1]: 
    torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")

import tqdm

import plotly.graph_objects as go
import plotly.express as px

# [e for e in os.listdir() if re.match(".+\\.txt", e)]

/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Using cuda device

nam_overview = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/NAMSum0607FloweringTraitBLUPsAcross8Envs.txt')
nam_overview

	Geno_Code	Entry_ID	Group	pop	entry	Days_To_Anthesis_BLUP_Sum0607	Days_To_Silk_BLUP_Sum0607	ASI_BLUP_Sum0607
0	Z001E0001	04P1367A51A	Z001	1	1	75.5364	77.1298	1.4600
1	Z001E0002	04P1368A51A	Z001	1	2	76.9075	77.7945	1.3928
2	Z001E0003	04P1368B51A	Z001	1	3	75.2646	75.2555	0.8644
3	Z001E0004	04P1370B51A	Z001	1	4	73.6933	75.7604	2.0012
4	Z001E0005	04P1371B51A	Z001	1	5	79.2441	81.2611	1.8931
...	...	...	...	...	...	...	...	...
5458	Z027E0277	W64A	NaN	27	277	71.9008	73.9811	2.6756
5459	Z027E0278	WD	NaN	27	278	62.0212	60.5992	-0.5733
5460	Z027E0279	Wf9	NaN	27	279	71.9970	72.2319	0.8338
5461	Z027E0280	Yu796_NS	NaN	27	280	74.5107	73.9727	0.2935
5462	Z027E0282	Mo17	NaN	27	282	72.7428	75.5080	3.0455

5463 rows × 8 columns

data = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/markergenotypes062508.txt', skiprows=1
                    ).reset_index().rename(columns = {'index': 'Geno_Code'})
data

	Geno_Code	days2anthesis	days2silk	asi	pop	i0	i1	i2	i3	i4	...	i1096	i1097	i1098	i1099	i1100	i1101	i1102	i1103	i1104	i1105
0	Z001E0001	75.5364	77.1298	1.4600	1	0.0	0.0	0.0	0.0	0.0	...	2.0	2.0	2.0	2.0	2.0	1.0	0.0	0.0	0.0	0.0
1	Z001E0002	76.9075	77.7945	1.3928	1	2.0	2.0	2.0	2.0	2.0	...	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0
2	Z001E0003	75.2646	75.2555	0.8644	1	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	Z001E0004	73.6933	75.7604	2.0012	1	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	1.0	2.0	2.0	2.0	2.0
4	Z001E0005	79.2441	81.2611	1.8931	1	0.0	0.0	0.0	0.0	0.0	...	2.0	2.0	2.0	2.0	2.0	1.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
4694	Z026E0196	77.6523	80.2916	1.9698	26	0.0	0.0	0.0	0.0	0.0	...	2.0	2.0	2.0	2.0	2.0	2.0	1.0	1.0	1.0	1.0
4695	Z026E0197	78.5015	82.2767	3.2979	26	2.0	2.0	2.0	2.0	2.0	...	0.5	0.5	0.5	0.5	0.5	0.5	1.0	1.0	1.0	1.0
4696	Z026E0198	77.4219	79.7868	2.2208	26	1.0	1.0	1.0	1.0	2.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4697	Z026E0199	78.6712	82.8476	4.1247	26	2.0	2.0	0.0	0.0	0.0	...	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0
4698	Z026E0200	77.4937	82.4678	4.2915	26	0.0	0.0	0.0	2.0	2.0	...	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0	2.0

4699 rows × 1111 columns

px.scatter_matrix(data.loc[:, ['days2anthesis', 'days2silk', 'asi']])

/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/plotly/express/_core.py:279: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
  dims = [

d2a = np.array(data['days2anthesis'])
d2s = np.array(data['days2silk'])
asi = np.array(data['asi'])

xs = np.array(data.drop(columns = ['days2anthesis', 'days2silk', 'asi', 'pop', 'Geno_Code']))

n_obs = xs.shape[0]

np_seed = 9070707
rng = np.random.default_rng(np_seed)  # can be called without a seed

test_pr = 0.2

test_n = round(n_obs*test_pr)
idxs = np.linspace(0, n_obs-1, num = n_obs).astype(int)
rng.shuffle(idxs)

test_idxs = idxs[0:test_n]
train_idxs = idxs[test_n:-1]

# make up tensors
def calc_cs(x): return [np.mean(x, axis = 0), np.std(x, axis = 0)]

def apply_cs(xs, cs_dict_entry): return ((xs - cs_dict_entry[0]) / cs_dict_entry[0])

scale_dict = {
    'd2a':calc_cs(d2a[train_idxs]),
    'd2s':calc_cs(d2s[train_idxs]),
    'asi':calc_cs(asi[train_idxs]),
    'xs' :calc_cs(xs[train_idxs])
}

y1 = apply_cs(d2a, scale_dict['d2a'])
y2 = apply_cs(d2s, scale_dict['d2s'])
y3 = apply_cs(asi, scale_dict['asi'])

# No need to cs xs -- 0-2 scale
# apply_cs(xs, scale_dict['xs'])

y1_train = torch.from_numpy(y1[train_idxs]).to(device).float()[:, None]
y2_train = torch.from_numpy(y2[train_idxs]).to(device).float()[:, None]
y3_train = torch.from_numpy(y3[train_idxs]).to(device).float()[:, None]
xs_train = torch.from_numpy(xs[train_idxs]).to(device).float()

y1_test = torch.from_numpy(y1[test_idxs]).to(device).float()[:, None]
y2_test = torch.from_numpy(y2[test_idxs]).to(device).float()[:, None]
y3_test = torch.from_numpy(y3[test_idxs]).to(device).float()[:, None]
xs_test = torch.from_numpy(xs[test_idxs]).to(device).float()

class CustomDataset(Dataset):
    def __init__(self, y1, y2, y3, xs, transform = None, target_transform = None):
        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
        self.xs = xs
        self.transform = transform
        self.target_transform = target_transform    
    
    def __len__(self):
        return len(self.y1)
    
    def __getitem__(self, idx):
        y1_idx = self.y1[idx]
        y2_idx = self.y2[idx]
        y3_idx = self.y3[idx]
        xs_idx = self.xs[idx]
        
        if self.transform:
            xs_idx = self.transform(xs_idx)
            
        if self.target_transform:
            y1_idx = self.transform(y1_idx)
            y2_idx = self.transform(y2_idx)
            y3_idx = self.transform(y3_idx)
        return xs_idx, y1_idx, y2_idx, y3_idx

training_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_train,
        y2 = y2_train,
        y3 = y3_train,
        xs = xs_train
    ), 
    batch_size = 64, 
    shuffle = True)

testing_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_test,
        y2 = y2_test,
        y3 = y3_test,
        xs = xs_test
    ), 
    batch_size = 64, 
    shuffle = True)

xs.shape

(4699, 1106)

Version 1, Predict `y1` (Anthesis)

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()    
        self.x_network = nn.Sequential(
            nn.Linear(1106, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1))
        
    def forward(self, x):
        x_out = self.x_network(x)
        return x_out

model = NeuralNetwork().to(device)
# print(model)

xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
model(xs_i).shape # try prediction on one batch

torch.Size([64, 1])

def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
    size = len(dataloader.dataset)
    for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(xs_i)
        loss = loss_fn(pred, y1_i) # <----------------------------------------

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(y1_i) # <----------------
            if not silent:
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
def train_error(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss = 0

    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            train_loss += loss_fn(pred, y1_i).item() # <----------------------
            
    train_loss /= num_batches
    return(train_loss) 

            
def test_loop(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            test_loss += loss_fn(pred, y1_i).item() # <-----------------------

    test_loss /= num_batches
    if not silent:
        print(f"Test Error: Avg loss: {test_loss:>8f}")
    return(test_loss) 


def train_nn(
    training_dataloader,
    testing_dataloader,
    model,
    learning_rate = 1e-3,
    batch_size = 64,
    epochs = 500
):
    # Initialize the loss function
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
    loss_df['TrainMSE'] = np.nan
    loss_df['TestMSE']  = np.nan

    for t in tqdm.tqdm(range(epochs)):
        # print(f"Epoch {t+1}\n-------------------------------")
        train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

        loss_df.loc[loss_df.index == t, 'TrainMSE'
                   ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
        loss_df.loc[loss_df.index == t, 'TestMSE'
                   ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
    return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# ! conda install captum -c pytorch -y

# # imports from captum library
# from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
# from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation

# ig = IntegratedGradients(model)
# ig_nt = NoiseTunnel(ig)
# dl = DeepLift(model)
# gs = GradientShap(model)
# fa = FeatureAblation(model)

# ig_attr_test = ig.attribute(xs_test, n_steps=50)
# ig_nt_attr_test = ig_nt.attribute(xs_test)
# dl_attr_test = dl.attribute(xs_test)
# gs_attr_test = gs.attribute(xs_test, xs_train)
# fa_attr_test = fa.attribute(xs_test)

# [e.shape for e in [ig_attr_test,
# ig_nt_attr_test,
# dl_attr_test,
# gs_attr_test,
# fa_attr_test]]

# fig = go.Figure()
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = ig_nt_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = dl_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = gs_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = fa_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.show()

# len(dl_attr_test.cpu().detach().numpy().mean(axis = 0))

Version 2, Predict `y1` (Anthesis), `y2` (Silking), and `y3` (ASI)

Here each model will predict 3 values. The loss function is still mse, but the y tensors are concatenated

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()    
        self.x_network = nn.Sequential(
            nn.Linear(1106, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 3))
        
    def forward(self, x):
        x_out = self.x_network(x)
        return x_out

model = NeuralNetwork().to(device)
# print(model)

xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
model(xs_i).shape # try prediction on one batch

torch.Size([64, 3])

def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
    size = len(dataloader.dataset)
    for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(xs_i)
        loss = loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)) # <----------------------------------------

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(y1_i) # <----------------
            if not silent:
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
def train_error(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss = 0

    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            train_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <----------------------
            
    train_loss /= num_batches
    return(train_loss) 

            
def test_loop(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            test_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <-----------------------

    test_loss /= num_batches
    if not silent:
        print(f"Test Error: Avg loss: {test_loss:>8f}")
    return(test_loss) 


def train_nn(
    training_dataloader,
    testing_dataloader,
    model,
    learning_rate = 1e-3,
    batch_size = 64,
    epochs = 500
):
    # Initialize the loss function
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
    loss_df['TrainMSE'] = np.nan
    loss_df['TestMSE']  = np.nan

    for t in tqdm.tqdm(range(epochs)):
        # print(f"Epoch {t+1}\n-------------------------------")
        train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

        loss_df.loc[loss_df.index == t, 'TrainMSE'
                   ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
        loss_df.loc[loss_df.index == t, 'TestMSE'
                   ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
    return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 5000
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

'../ext_data/zma/panzea/phenotypes/'

'../ext_data/zma/panzea/phenotypes/'

# pd.read_table('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212.txt', low_memory = False)

# pd.read_excel('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212_TraitDescritptions.xlsx')

Version 1, Predict y1 (Anthesis)

Version 2, Predict y1 (Anthesis), y2 (Silking), and y3 (ASI)

Version 1, Predict `y1` (Anthesis)

Version 2, Predict `y1` (Anthesis), `y2` (Silking), and `y3` (ASI)