Tian et al. 2011

This file aims to reproduce the findings of Tian et al. 2011, “Genome-wide association study of leaf architecture in the maize nested association mapping population.”
use_gpu_num = 1

import os
import pandas as pd
import numpy as np
import re

# import torch
# from torch.utils.data import Dataset
# from torch.utils.data import DataLoader
# from torch import nn

# TODO: re-enable the torch imports and GPU setup above when the modeling sections below are revisited

# device = "cuda" if torch.cuda.is_available() else "cpu"
# if use_gpu_num in [0, 1]: 
#     torch.cuda.set_device(use_gpu_num)
# print(f"Using {device} device")

from tqdm import tqdm

import plotly.graph_objects as go
import plotly.express as px

# [e for e in os.listdir() if re.match(".+\\.txt", e)]
import dlgwas
from dlgwas.dna import *

from dlgwas.kegg import ensure_dir_path_exists
from dlgwas.kegg import get_cached_result
from dlgwas.kegg import put_cached_result
# set up directory for notebook artifacts
nb_name = '10_TianEtAl2011'
ensure_dir_path_exists(dir_path = '../models/'+nb_name)
ensure_dir_path_exists(dir_path = '../reports/'+nb_name)

Load Data

with open('../ext_data/zma/panzea/phenotypes/Tian_etal_2011_NatGen_leaf_pheno_data-110221/Tian_etal_2011_NatGen_readme.txt', 
          'r') as f:
    dat = f.read()
print(dat)
Tian F, Bradbury PJ, Brown PJ, Hung H, Sun Q, Flint-Garcia S, Rocheford TR, McMullen MD, Holland JB, Buckler ES. 2011. Genome-wide association study of leaf architecture in the maize nested association mapping population. Nature Genetics 43. http://dx.doi.org/doi:10.1038/ng.746

------------------------------------------------
From: Feng Tian
Sent: Sunday, November 21, 2010 1:03 PM
Subject: Leaf traits data

The file "Tian_etal2011NatGenet.leaf_trait_phenotype.xlsx" contains the phenotypes I used in the paper. In the paper, we used boxcox transformed upper leaf angle. This is included in the file. Before I started mapping, I removed 4 obvious outlier data points from the raw BLUP data from Jim (set them as missing): 
leaf length and width of Z002E0060
Upper leaf angle of Z017E0082 and Z022E0007
    
Feng Tian, Ph.D. 
Post-doctoral Associate
Cornell University
Institute for Genomic Diversity
175 Biotechnology Building
Ithaca, NY 14853-2703
 
Email:ft55@cornell.edu


data = pd.read_excel('../ext_data/zma/panzea/phenotypes/Tian_etal_2011_NatGen_leaf_pheno_data-110221/Tian_etal2011NatGenet.leaf_trait_phenotype.xlsx')
data
sample pop leaf_length leaf_width upper_leaf_angle leaf_angle_boxcox_transformed
0 Z001E0001 1 850.6304 88.0488 65.3152 9.620754e+06
1 Z001E0002 1 654.2202 95.8449 59.8256 6.659548e+06
2 Z001E0003 1 836.4517 93.4534 66.1322 1.013518e+07
3 Z001E0004 1 595.5967 100.3453 66.3374 1.026761e+07
4 Z001E0005 1 822.9404 95.9405 76.4436 1.860027e+07
... ... ... ... ... ... ...
4887 MO380 17 676.5082 82.9771 68.6168 1.182904e+07
4888 MO381 17 624.0939 77.0269 62.5098 8.004169e+06
4889 MO382 17 669.3213 81.5181 62.0530 7.761913e+06
4890 MO383 17 693.4092 91.2571 76.6321 1.879322e+07
4891 MO384 17 695.5563 80.4511 65.9832 1.003983e+07

4892 rows × 6 columns

Find marker data to use along with the phenotypic data

samples = list(set(data['sample']))
# this can take a while to calculate so it's worth caching
save_path = '../models/'+nb_name+'/samples_and_matches.pkl'

samples_and_matches = get_cached_result(save_path=save_path)

if samples_and_matches is None:
    samples_and_matches = [{
        'sample': sample,
        'matches': find_AGPv4(taxa = sample) } for sample in tqdm(samples)]

    put_cached_result(
        save_path = save_path,
        save_obj = samples_and_matches
    )
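
get_cached_result and put_cached_result come from dlgwas.kegg. As a minimal pickle-based sketch of what helpers like these typically do (illustrative only, assuming the library's behavior of returning None on a cache miss; not the library's actual code):

import pickle

def get_cached_result_sketch(save_path):
    # return the unpickled object on a cache hit, else None
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            return pickle.load(f)
    return None

def put_cached_result_sketch(save_path, save_obj):
    with open(save_path, 'wb') as f:
        pickle.dump(save_obj, f)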
# TODO: Some of these samples have multiple possible matches.
# For the time being I'm using the first one.
[e for e in samples_and_matches if len(e['matches']) > 1][0:10]
[{'sample': 'Z006E0119',
  'matches': ['Z006E0119__250065007', 'Z006E0119__250022599']},
 {'sample': 'Z025E0162',
  'matches': ['Z025E0162__250183159', 'Z025E0162__250027524']},
 {'sample': 'Z006E0023',
  'matches': ['Z006E0023__250022411', 'Z006E0023__250178848']},
 {'sample': 'Z001E0045',
  'matches': ['Z001E0045__250079822', 'Z001E0045__250021154']},
 {'sample': 'Z023E0068',
  'matches': ['Z023E0068__250026869', 'Z023E0068__250080182']},
 {'sample': 'Z024E0154',
  'matches': ['Z024E0154__250027303', 'Z024E0154__250183143']},
 {'sample': 'Z018E0103',
  'matches': ['Z018E0103__250025674', 'Z018E0103__250179096']},
 {'sample': 'Z001E0039',
  'matches': ['Z001E0039__250021192', 'Z001E0039__250079845']},
 {'sample': 'Z009E0109',
  'matches': ['Z009E0109__250023292', 'Z009E0109__250080072']},
 {'sample': 'Z019E0083',
  'matches': ['Z019E0083__250028399', 'Z019E0083__250179105']}]
samples_one_match = [e for e in samples_and_matches if len(e['matches']) == 1]

print("Warning: "+str(len(samples_and_matches)-len(samples_one_match)
    )+" samples ("+str(round(100*((len(samples_and_matches)-len(samples_one_match))/len(samples_and_matches))) 
    )+"%) have zero matches or more than one match in AGPv4. The first is being used.")
Warning: 1045 samples (21%) have zero matches or more than one match in AGPv4. The first is being used.

Filter data to only those with unambiguous genotypes

original_rows = data.shape[0]

# mask to restrict to only those samples with one or more GBS marker set in AGPv4
matched_samples = {e1['sample'] for e1 in samples_and_matches if len(e1['matches']) >= 1}
mask = [e in matched_samples for e in data['sample']]
data = data.loc[mask, :].reset_index(drop = True)
print(str(original_rows - data.shape[0])+' rows dropped.')
209 rows dropped.
# repeat for data completeness:
original_rows = data.shape[0]
# exclude rows with missing values in y
mask = (data.leaf_length.notna() &                    
        data.leaf_width.notna() &                     
        data.upper_leaf_angle.notna())

data = data.loc[mask, :].reset_index(drop = True)
print(str(original_rows - data.shape[0])+' rows dropped.')
7 rows dropped.
# ys = data.loc[:, ['leaf_length', 'leaf_width', 'upper_leaf_angle']]

# geno_sample = data.loc[:, 'sample']
# pop_sample  = data.loc[:, 'pop']
# Sample within Group: count rows per (pop, sample) to check for duplicates
data.assign(n = 1).groupby(['pop', 'sample']).agg(nsum = ('n', 'sum')).reset_index().sort_values('nsum')
pop sample nsum
0 1 Z001E0001 1.0
3121 18 Z018E0132 1.0
3120 18 Z018E0131 1.0
3119 18 Z018E0130 1.0
3118 18 Z018E0129 1.0
... ... ... ...
1555 9 Z009E0025 1.0
1554 9 Z009E0024 1.0
1553 9 Z009E0023 1.0
1569 9 Z009E0039 1.0
4675 26 Z026E0200 1.0

4676 rows × 3 columns

# But sample alone is also usable as a uid
data.assign(n = 1).groupby(['sample']).agg(nsum = ('n', 'sum')).reset_index().sort_values('nsum')
sample nsum
0 Z001E0001 1.0
3121 Z018E0132 1.0
3120 Z018E0131 1.0
3119 Z018E0130 1.0
3118 Z018E0129 1.0
... ... ...
1555 Z009E0025 1.0
1554 Z009E0024 1.0
1553 Z009E0023 1.0
1569 Z009E0039 1.0
4675 Z026E0200 1.0

4676 rows × 2 columns

Retrieve Marker data

Initial approach: Convert lists to arrays

The original approach to converting marker lists to NumPy arrays was straightforward but required looping over the marker lists, taking about a second per sample; with 4676 samples that puts the full conversion at roughly an hour.

n_samples = len(list(data['sample']))

markers = np.zeros(shape = (
    n_samples, 
    len(get_AGPv4(taxa = 'taxa'))-1, # don't include taxa
    4
))

for i in tqdm(range(n_samples)):
    search_taxa = data['sample'][i]
    markers[i, :, :] = list_to_ACGT(
            in_seq = get_AGPv4(
                taxa = find_AGPv4(
                    taxa = search_taxa)[0] 
            )[1:]
        )
# Re-initialize `markers` so the timing cells below can run without the full loop above
n_samples = len(list(data['sample']))

markers = np.zeros(shape = (
    n_samples, 
    len(get_AGPv4(taxa = 'taxa'))-1, # don't include taxa
    4
))



import time
times_list = []

for i in tqdm(range(2)):
    times = []
    times += [time.time()]
    search_taxa = data['sample'][i]
    times += [time.time()] #---- 0
    aa = find_AGPv4(taxa = search_taxa)[0]
    times += [time.time()] #---- 1
    bb = get_AGPv4(taxa =  aa)[1:]
    times += [time.time()] #---- 2
    cc = list_to_ACGT(in_seq = bb) # <-- This is where almost all of the time is coming from
    times += [time.time()] #---- 3
    markers[i, :, :] = cc
    times += [time.time()] #---- 4
    times_list += [times]
100%|█████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.05s/it]
# per-step durations for each timed iteration
diff_times = np.diff(np.asarray(times_list), axis = 1)
px.imshow(diff_times)

Improved approach: Convert dataframes to arrays

In addition to caching the data, alternate approaches are available. Here I tested a function that works on a dataframe rather than a list.

# What about a dataframe based version of list_to_ACGT?

def df_to_ACGT(
    in_df, # This should be a dataframe containing samples and SNPs
    sample_axis, # this is an int with the axis of the samples. If samples are not in the 0th axis they will be swapped and returned there.
    progress = False,
    silent = False
):
    # Note: samples may be on the second axis; if so, the axes are swapped before returning.
    irows, jcols = in_df.shape

    # Convert IUPAC codes into pr ACGT -------------------------------------------
    encode_dict = {
        #     https://www.bioinformatics.org/sms/iupac.html
        #     A     C     G     T
        'A': [1,    0,    0,    0   ],
        'C': [0,    1,    0,    0   ],
        'G': [0,    0,    1,    0   ],
        'T': [0,    0,    0,    1   ],
        'K': [0,    0,    0.5,  0.5 ],
        'M': [0.5,  0.5,  0,    0   ],
        'N': [0.25, 0.25, 0.25, 0.25],
        'R': [0.5,  0,    0.5,  0   ],
        'S': [0,    0.5,  0.5,  0   ],
        'W': [0.5,  0,    0,    0.5 ],
        'Y': [0,    0.5,  0,    0.5 ],
        #     Other values (assumed empty)
        #     A     C     G     T
    #      '': [0,    0,    0,    0   ],
    #     '-': [0,    0,    0,    0   ],
    #     '0': [0,    0,    0,    0   ],
    }

    # fix newline in last row
    for j in range(jcols):
        in_df[j] = in_df[j].str.replace('\n', '')

    # check all values (not only the last column) against the encoding dictionary
    not_in_dict = [e for e in set(in_df.values.ravel()) if e not in encode_dict]

    if not_in_dict != []:
        if silent != True:
            print("Warning: The following are not in the encoding dictionary and will be set as missing.\n"+str(not_in_dict))

    # output matrix
    GMat = np.zeros(shape = [irows,
                             jcols, 
                             4])
    # convert all nucleotides to probabilities
    if progress == True:
        for nucleotide in tqdm(encode_dict.keys()):
            mask = (in_df == nucleotide)
            GMat[mask, :] = encode_dict[nucleotide] 
    else:
        for nucleotide in encode_dict.keys():
            mask = (in_df == nucleotide)
            GMat[mask, :] = encode_dict[nucleotide] 

    # if needed, rotate to put samples on the 0th axis
    if sample_axis != 0:
        GMat = np.swapaxes(GMat, 0, sample_axis)

    return GMat
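
As a quick self-contained illustration of the encoding, here is a toy example (the two “samples” and three sites below are made up, not drawn from AGPv4):

# toy data: columns are samples, rows are sites
toy = pd.DataFrame({0: ['A', 'R', 'N'], 1: ['T', 'C', '-']})
toy_arr = df_to_ACGT(in_df = toy, sample_axis = 1, silent = True)
print(toy_arr.shape)   # (2, 3, 4): sample x site x pr(ACGT)
print(toy_arr[0, 1])   # 'R' (A or G) -> [0.5 0.  0.5 0. ]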

Confirm that these are equivalent:

search_taxa = data['sample'][i] # i is left over from the timing loop above
aa = find_AGPv4(taxa = search_taxa)[0]
bb = get_AGPv4(taxa =  aa)[1:]
cc = list_to_ACGT(in_seq = bb) 

dd = df_to_ACGT(
    in_df = pd.DataFrame(bb),# This should be a dataframe containing samples and SNPs
    sample_axis = 0, # this is an int with the axis of the samples. If samples are not in the 0th axis they will be swapped and returned there.
    progress = False
)

cc.shape == dd.squeeze().shape and np.allclose(cc, dd.squeeze())
Warning: The following are not in the encoding dictionary and will be set as missing.
['-', '0']
True

Check if this is faster. Previous version takes ~1sec/iter.

j = 2

vals = pd.concat([pd.DataFrame(get_AGPv4(taxa = find_AGPv4(taxa = data['sample'][i])[0])[1:]
                       ) for i in tqdm(range(j))], axis = 1)
vals.columns = list(range(j))

vals.shape

vals = df_to_ACGT(
    in_df = vals,# This should be a dataframe containing samples and SNPs
    sample_axis = 1# this is an int with the axis of the samples. If samples are not in the 0th axis they will be swapped and returned there.
)
100%|█████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 20.76it/s]
Warning: The following are not in the encoding dictionary and will be set as missing.
['-', '0']

This is slow as well. It is an improvement over the list-based version, but not an impressive one.

# save_path = '../models/'+nb_name+'/markers/'

# for e in os.listdir(save_path):
#     np.savez_compressed(save_path+e[0:-1]+'z',
#                         np.load(save_path+e))

# # np.savez_compressed('../models/'+nb_name+'/markers/test_zip.npz', vals[0, :, :])
# numpy.savez_compressed
# Loading and processing is the time consuming part. 
save_path = '../models/'+nb_name+'/markers/'
ensure_dir_path_exists(save_path)

n_samples = len(list(data['sample']))

for i in tqdm(range(n_samples)):
    save_file_path = save_path+'m'+str(i)+'.npz'
    if not os.path.exists(save_file_path):
        markers = pd.DataFrame(get_AGPv4(taxa = find_AGPv4(taxa = data['sample'][i])[0])[1:])    

        markers = df_to_ACGT(
            in_df = markers,# This should be a dataframe containing samples and SNPs
            sample_axis = 1,# this is an int with the axis of the samples. If samples are not in the 0th axis they will be swapped and returned there.
            silent = True
        )

        np.savez_compressed(save_file_path, markers)
# 4683/4683 [59:54<00:00,  1.30it/s]
100%|███████████████████████████████████████████████████████████████████████████| 4676/4676 [00:00<00:00, 470663.92it/s]

Apply Hilbert Transform

# This should be done on as many samples as possible at one time:
# transforming 1 sample takes ~7 s while transforming 11 takes ~7.74 s.
# markers_hilbert = np_3d_to_hilbert(
#     in_seq = np.load('../models/10_TianEtAl2011/markers/m0.npy')
# )
# load_path = '../models/'+nb_name+'/markers/'
# save_path = '../models/'+nb_name+'/hilbert/'
# ensure_dir_path_exists(save_path)

# cached_markers = os.listdir(load_path)
# cached_hilbert_markers = os.listdir(save_path)
# # only process markers that haven't been transformed already
# cached_markers = [ee for ee in cached_markers if ee not in ['m'+e[1:] for e in cached_hilbert_markers]]

# # if need be, this can always be chunked
# # process these as a large array for speed.
# print('Applying Hilbert Transformation.')
# markers_hilbert = np_3d_to_hilbert(
#     in_seq = np.concatenate([np.load(load_path+e) for e in cached_markers])
# )

# # then save each sample separately so they can be loaded as needed.
# print('Saving.')
# for i in tqdm(range(len(cached_markers))):
#     np.save(save_path+'h'+cached_markers[i][1:], markers_hilbert[i])
import math

chunk_size = 500

load_path = '../models/'+nb_name+'/markers/'
save_path = '../models/'+nb_name+'/hilbert/'
ensure_dir_path_exists(save_path)

cached_markers = os.listdir(load_path)
cached_hilbert_markers = os.listdir(save_path)
# only process markers that haven't been transformed already
cached_markers = [ee for ee in cached_markers if ee not in ['m'+e[1:] for e in cached_hilbert_markers]]

n_chunks = math.ceil(len(cached_markers)/chunk_size)
for ith_chunk in range(n_chunks):

    print(str(ith_chunk)+'/'+str(n_chunks))
    chunk_start = ith_chunk*chunk_size
    chunk_stop = min((ith_chunk+1)*chunk_size, 
                     len(cached_markers) ) # Last chunk may not be evenly divisible


    cached_marker_slice = cached_markers[chunk_start:chunk_stop]
    # process each chunk as one large array for speed.
    print('Applying Hilbert Transformation.')

    markers_hilbert = np_3d_to_hilbert(
        #                                                v--- zipped np arrays use position (arr_0, arr_1, ...) if no name is passed
        in_seq = np.concatenate([np.load(load_path+e)['arr_0'] for e in cached_marker_slice ]) 
    )

    # then save each sample separately so they can be loaded as needed.
    print('Saving.')
    for i in tqdm(range(len(cached_marker_slice))):
        np.savez_compressed(save_path+'h'+cached_marker_slice[i][1:-1]+'z', markers_hilbert[i])
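
np_3d_to_hilbert comes from dlgwas.dna, so its internals are not shown here. The idea is to lay a 1D marker sequence onto a 2D grid along a Hilbert curve, so markers close together in the sequence stay close together in the image. As an illustrative sketch only (the library's actual implementation may differ), the classic index-to-coordinate routine looks like this:

def hilbert_d2xy(n, d):
    # map distance d along the curve to (x, y) on an n x n grid (n a power of 2)
    x = y = 0
    s = 1
    while s < n:
        rx = 1 & (d // 2)
        ry = 1 & (d ^ rx)
        if ry == 0:  # rotate the quadrant
            if rx == 1:
                x, y = s - 1 - x, s - 1 - y
            x, y = y, x
        x += s * rx
        y += s * ry
        d //= 4
        s *= 2
    return x, y

# a (943455, 4) marker array fits on a (1024, 1024, 4) image, since 1024*1024 = 1,048,576 >= 943,455
[hilbert_d2xy(4, d) for d in range(4)] # [(0, 0), (1, 0), (1, 1), (0, 1)]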

Misc

# Useful for converting between the physical location and site
AGPv4_site = pd.read_table('../data/zma/panzea/genotypes/GBS/v27/'+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()
Site Name Chromosome Position
0 0 S1_6370 1 52399
1 1 S1_8210 1 54239
2 2 S1_8376 1 54405
3 3 S1_9889 1 55917
4 4 S1_9899 1 55927
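
With this table loaded, mapping a physical position to the nearest site is a simple lookup. A minimal sketch (the helper name is mine, not part of dlgwas):

def nearest_site(chrom, position, site_df = AGPv4_site):
    # restrict to the chromosome, then take the marker closest to `position`
    sub = site_df.loc[site_df['Chromosome'] == chrom, :]
    idx = (sub['Position'] - position).abs().idxmin()
    return sub.loc[idx, ['Site', 'Name', 'Chromosome', 'Position']]

nearest_site(chrom = 1, position = 54000) # from the head above: S1_8210 at 54239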

Taxa Groupings

taxa_groupings = pd.read_table('../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
taxa_groupings = taxa_groupings.loc[:, ['Taxa', 'Tassel4SampleName', 'Population']]

taxa_groupings[['sample', 'sample2']] = taxa_groupings['Taxa'].str.split(':', expand = True)

taxa_groupings = taxa_groupings.loc[:, ['sample', 'Population']].drop_duplicates()

# Restrict to those in data
taxa_groupings = data[['sample']].merge(taxa_groupings, how = 'left')
taxa_groupings
sample Population
0 Z001E0001 B73 x B97
1 Z001E0002 B73 x B97
2 Z001E0003 B73 x B97
3 Z001E0004 B73 x B97
4 Z001E0005 B73 x B97
... ... ...
4671 Z026E0196 B73 x Tzi8
4672 Z026E0197 B73 x Tzi8
4673 Z026E0198 B73 x Tzi8
4674 Z026E0199 B73 x Tzi8
4675 Z026E0200 B73 x Tzi8

4676 rows × 2 columns

temp = sorted(set(taxa_groupings.Population))
temp
['B73 x B97',
 'B73 x CML103',
 'B73 x CML228',
 'B73 x CML247',
 'B73 x CML277',
 'B73 x CML322',
 'B73 x CML333',
 'B73 x CML52',
 'B73 x CML69',
 'B73 x Hp301',
 'B73 x Il14H',
 'B73 x Ki11',
 'B73 x Ki3',
 'B73 x Ky21',
 'B73 x M162W',
 'B73 x M37W',
 'B73 x MS71',
 'B73 x Mo18W',
 'B73 x NC350',
 'B73 x NC358',
 'B73 x Oh43',
 'B73 x Oh7B',
 'B73 x P39',
 'B73 x Tx303',
 'B73 x Tzi8']
temp = taxa_groupings.copy()
temp['sample'] = 1

fig = px.treemap(temp, 
                 path=[px.Constant("All Populations:"), 'Population'], values='sample')
# fig.update_traces(root_color="lightgrey")
# fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()
# fig = px.treemap(taxa_groupings.loc[:, ['Population', 'sample']], 
#                  path=[px.Constant("All"), 'Population', 'sample'])
# # fig.update_traces(root_color="lightgrey")
# fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
# fig.show()

Finalize Data

# Define holdout sets (Populations)

uniq_pop = list(set(taxa_groupings['Population'])) # NOTE: set order varies between runs; sort for reproducible holdout indices
print(str(len(uniq_pop))+" Unique Holdout Groups.")
taxa_groupings['Holdout'] = None
for i in range(len(uniq_pop)):
    mask = (taxa_groupings['Population'] == uniq_pop[i])
    taxa_groupings.loc[mask, 'Holdout'] = i

taxa_groupings
25 Unique Holdout Groups.
sample Population Holdout
0 Z001E0001 B73 x B97 1
1 Z001E0002 B73 x B97 1
2 Z001E0003 B73 x B97 1
3 Z001E0004 B73 x B97 1
4 Z001E0005 B73 x B97 1
... ... ... ...
4671 Z026E0196 B73 x Tzi8 9
4672 Z026E0197 B73 x Tzi8 9
4673 Z026E0198 B73 x Tzi8 9
4674 Z026E0199 B73 x Tzi8 9
4675 Z026E0200 B73 x Tzi8 9

4676 rows × 3 columns
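
For reference, pd.factorize is an equivalent one-liner; note it assigns integer codes in order of first appearance rather than set order:

# taxa_groupings['Holdout'] = pd.factorize(taxa_groupings['Population'])[0]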

taxa_groupings.to_csv('../models/'+nb_name+'/taxa_groupings.csv')

data.to_csv('../models/'+nb_name+'/clean_data.csv')
Holdout_Int = 0
print("Holding out: "+uniq_pop[Holdout_Int])

mask = (taxa_groupings['Holdout'] == Holdout_Int)
train_idxs = list(taxa_groupings.loc[~mask, ].index)
test_idxs = list(taxa_groupings.loc[mask, ].index)
Holding out: B73 x MS71
# y1 = data['leaf_length']
# y2 = data['leaf_width']
# y3 = data['upper_leaf_angle']

Retrieve xs

Can we hold all the xs in memory? A ballpark estimate puts the full marker dataset at ~4.5 GB, so let's try it!
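
One way to sanity-check a ballpark like this, assuming float32 arrays and the shapes used above (actual usage depends on dtype, and the compressed .npz files on disk are much smaller than the in-memory arrays):

n_samples  = data.shape[0]                           # 4676 after filtering
raw_gb     = n_samples * 943455 * 4 * 4 / 1e9        # samples x sites x pr(ACGT) x float32 bytes
hilbert_gb = n_samples * 1024 * 1024 * 4 * 4 / 1e9   # samples x 1024x1024 grid x pr(ACGT) x float32 bytes
print(f"raw: ~{raw_gb:.0f} GB, hilbert: ~{hilbert_gb:.0f} GB")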

# # Non-Hilbert Version
# save_path = '../models/'+nb_name+'/markers/'
# xs = np.zeros(shape = (len(y1), 943455, 4))

# failed_idxs = []

# for i in tqdm(range(len(y1))):
#     save_file_path = save_path+'m'+str(i)+'.npz'
#     if os.path.exists(save_file_path):
#         xs[i, :, :] = np.load(save_file_path)['arr_0']
#     else:
#         failed_idxs += [i]
# if failed_idxs != []:
#     print(str(len(failed_idxs))+' indexes could not be retrieved. Examine `failed_idxs` for more information.')
# # Hilbert version
# save_path = '../models/'+nb_name+'/hilbert/'
# xs = np.zeros(shape = (len(y1), 1024, 1024, 4))

# failed_idxs = []

# for i in tqdm(range(len(y1))):
#     save_file_path = save_path+'h'+str(i)+'.npz'
#     if os.path.exists(save_file_path):
#         xs[i, :, :, :] = np.load(save_file_path)['arr_0']
#     else:
#         failed_idxs += [i]
# if failed_idxs != []:
#     print(str(len(failed_idxs))+' indexes could not be retrieved. Examine `failed_idxs` for more information.')
# def calc_cs(x): return [np.mean(x, axis = 0), np.std(x, axis = 0)]

# def apply_cs(xs, cs_dict_entry): return ((xs - cs_dict_entry[0]) / cs_dict_entry[1]) # subtract mean, divide by std

# scale_dict = {
#     'y1':calc_cs(y1[train_idxs]),
#     'y2':calc_cs(y2[train_idxs]),
#     'y3':calc_cs(y3[train_idxs])
# }
# y1 = apply_cs(y1, scale_dict['y1'])
# y2 = apply_cs(y2, scale_dict['y2'])
# y3 = apply_cs(y3, scale_dict['y3'])
Make tensors

# y1_train = torch.from_numpy(y1[train_idxs]).to(device).float()[:, None]
# y2_train = torch.from_numpy(y2[train_idxs]).to(device).float()[:, None]
# y3_train = torch.from_numpy(y3[train_idxs]).to(device).float()[:, None]
# xs_train = torch.from_numpy(xs[train_idxs]).to(device).float()

# y1_test = torch.from_numpy(y1[test_idxs]).to(device).float()[:, None]
# y2_test = torch.from_numpy(y2[test_idxs]).to(device).float()[:, None]
# y3_test = torch.from_numpy(y3[test_idxs]).to(device).float()[:, None]
# xs_test = torch.from_numpy(xs[test_idxs]).to(device).float()


# # No need to cs xs
# class CustomDataset(Dataset):
#     def __init__(self, y1, y2, y3, xs, transform = None, target_transform = None):
#         self.y1 = y1
#         self.y2 = y2
#         self.y3 = y3
#         self.xs = xs
#         self.transform = transform
#         self.target_transform = target_transform    
    
#     def __len__(self):
#         return len(self.y1)
    
#     def __getitem__(self, idx):
#         y1_idx = self.y1[idx]
#         y2_idx = self.y2[idx]
#         y3_idx = self.y3[idx]
#         xs_idx = self.xs[idx]
        
#         if self.transform:
#             xs_idx = self.transform(xs_idx)
            
#         if self.target_transform:
#             y1_idx = self.target_transform(y1_idx)
#             y2_idx = self.target_transform(y2_idx)
#             y3_idx = self.target_transform(y3_idx)
#         return xs_idx, y1_idx, y2_idx, y3_idx
# training_dataloader = DataLoader(
#     CustomDataset(
#         y1 = y1_train,
#         y2 = y2_train,
#         y3 = y3_train,
#         xs = xs_train
#     ), 
#     batch_size = 64, 
#     shuffle = True)

# testing_dataloader = DataLoader(
#     CustomDataset(
#         y1 = y1_test,
#         y2 = y2_test,
#         y3 = y3_test,
#         xs = xs_test
#     ), 
#     batch_size = 64, 
#     shuffle = True)

# xs.shape
# Leftover from the flowering-time notebook (Buckler et al. 2009), kept for reference:
# data = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/markergenotypes062508.txt', skiprows=1
#                     ).reset_index().rename(columns = {'index': 'Geno_Code'})
# data
# px.scatter_matrix(data.loc[:, ['days2anthesis', 'days2silk', 'asi']])
# d2a = np.array(data['days2anthesis'])
# d2s = np.array(data['days2silk'])
# asi = np.array(data['asi'])
# xs = np.array(data.drop(columns = ['days2anthesis', 'days2silk', 'asi', 'pop', 'Geno_Code']))

# n_obs = xs.shape[0]

# np_seed = 9070707
# rng = np.random.default_rng(np_seed)  # can be called without a seed

# test_pr = 0.2

# test_n = round(n_obs*test_pr)
# idxs = np.linspace(0, n_obs-1, num = n_obs).astype(int)
# rng.shuffle(idxs)

# test_idxs = idxs[0:test_n]
# train_idxs = idxs[test_n:-1]


Version 1, Predict y1 (Anthesis)

# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()    
#         self.x_network = nn.Sequential(
#             nn.Linear(1106, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
#             nn.Linear(64, 1))
        
#     def forward(self, x):
#         x_out = self.x_network(x)
#         return x_out

# model = NeuralNetwork().to(device)
# # print(model)

# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch

# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
#     size = len(dataloader.dataset)
#     for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
#         # Compute prediction and loss
#         pred = model(xs_i)
#         loss = loss_fn(pred, y1_i) # <----------------------------------------

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if batch % 100 == 0:
#             loss, current = loss.item(), batch * len(y1_i) # <----------------
#             if not silent:
#                 print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
# def train_error(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     train_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             train_loss += loss_fn(pred, y1_i).item() # <----------------------
            
#     train_loss /= num_batches
#     return(train_loss) 

            
# def test_loop(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     test_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             test_loss += loss_fn(pred, y1_i).item() # <-----------------------

#     test_loss /= num_batches
#     if not silent:
#         print(f"Test Error: Avg loss: {test_loss:>8f}")
#     return(test_loss) 


# def train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# ):
#     # Initialize the loss function
#     loss_fn = nn.MSELoss()
#     optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#     loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
#     loss_df['TrainMSE'] = np.nan
#     loss_df['TestMSE']  = np.nan

#     for t in tqdm(range(epochs)):
#         # print(f"Epoch {t+1}\n-------------------------------")
#         train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

#         loss_df.loc[loss_df.index == t, 'TrainMSE'
#                    ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
#         loss_df.loc[loss_df.index == t, 'TestMSE'
#                    ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
#     return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# # ! conda install captum -c pytorch -y


# # imports from captum library
# from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
# from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation



# ig = IntegratedGradients(model)
# ig_nt = NoiseTunnel(ig)
# dl = DeepLift(model)
# gs = GradientShap(model)
# fa = FeatureAblation(model)

# ig_attr_test = ig.attribute(xs_test, n_steps=50)
# ig_nt_attr_test = ig_nt.attribute(xs_test)
# dl_attr_test = dl.attribute(xs_test)
# gs_attr_test = gs.attribute(xs_test, xs_train)
# fa_attr_test = fa.attribute(xs_test)

# [e.shape for e in [ig_attr_test,
# ig_nt_attr_test,
# dl_attr_test,
# gs_attr_test,
# fa_attr_test]]

# fig = go.Figure()
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = ig_nt_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = dl_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = gs_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = fa_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.show()

# len(dl_attr_test.cpu().detach().numpy().mean(axis = 0))

# ## Version 2, Predict `y1` (Anthesis), `y2` (Silking), and `y3` (ASI)

# Here each model predicts 3 values. The loss function is still MSE, but the y tensors are concatenated column-wise.
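# As a minimal sketch of that concatenation (illustrative, with random stand-in tensors; assumes the torch imports at the top are re-enabled):
# pred = torch.randn(8, 3)                                   # batch of 8, three traits
# y1_i, y2_i, y3_i = (torch.randn(8, 1) for _ in range(3))   # per-trait column tensors
# target = torch.concat([y1_i, y2_i, y3_i], axis = 1)        # shape (8, 3), matching pred
# loss = nn.MSELoss()(pred, target)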

# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()    
#         self.x_network = nn.Sequential(
#             nn.Linear(1106, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
#             nn.Linear(64, 3))
        
#     def forward(self, x):
#         x_out = self.x_network(x)
#         return x_out

# model = NeuralNetwork().to(device)
# # print(model)

# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch

# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
#     size = len(dataloader.dataset)
#     for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
#         # Compute prediction and loss
#         pred = model(xs_i)
#         loss = loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)) # <----------------------------------------

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if batch % 100 == 0:
#             loss, current = loss.item(), batch * len(y1_i) # <----------------
#             if not silent:
#                 print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
# def train_error(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     train_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             train_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <----------------------
            
#     train_loss /= num_batches
#     return(train_loss) 

            
# def test_loop(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     test_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             test_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <-----------------------

#     test_loss /= num_batches
#     if not silent:
#         print(f"Test Error: Avg loss: {test_loss:>8f}")
#     return(test_loss) 


# def train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# ):
#     # Initialize the loss function
#     loss_fn = nn.MSELoss()
#     optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#     loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
#     loss_df['TrainMSE'] = np.nan
#     loss_df['TestMSE']  = np.nan

#     for t in tqdm(range(epochs)):
#         # print(f"Epoch {t+1}\n-------------------------------")
#         train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

#         loss_df.loc[loss_df.index == t, 'TrainMSE'
#                    ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
#         loss_df.loc[loss_df.index == t, 'TestMSE'
#                    ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
#     return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 5000
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# '../ext_data/zma/panzea/phenotypes/'

# # pd.read_table('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212.txt', low_memory = False)

# # pd.read_excel('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212_TraitDescritptions.xlsx')