Retrieve genomic data for phenotypes

This notebook will likely be broken into at least two
import os

import numpy as np
import pandas as pd

import plotly.express as px

# import hilbertcurve
# from hilbertcurve.hilbertcurve import HilbertCurve
import tqdm
from tqdm import tqdm

import dlgwas
from dlgwas.dna import *

# ! conda install openpyxl -y
# ! conda install hilbertcurve -y
# ! pip install hilbertcurve

Load phenotypic data to explore

For this I’m using data referenced in Wallace et al 2014 which is available through panzea. This study refers to data from 9 studies (including itself) as a source of phenotypes for the NAM data. This combination of a large set of published GWAS hits, phenotypes, and many rils makes it ideal for use here.

This file contains results I can use to check if my approaches are producing similar hits.

Wallace_etal_2014_PLoSGenet_GWAS_hits = pd.read_table('../ext_data/zma/panzea/GWASResults/Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt')
Wallace_etal_2014_PLoSGenet_GWAS_hits.head()
trait chr pos allele rmip source
0 100 Kernel weight 1 3364007 A/G 1 Hapmap1
1 100 Kernel weight 1 22247033 A/G 3 Hapmap1
2 100 Kernel weight 1 22987420 C/T 1 Hapmap2
3 100 Kernel weight 1 23056483 C/G 1 Hapmap2
4 100 Kernel weight 1 23066099 A/G 1 Hapmap2

This file on I think matches Buckler et al 2009.

temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx', 
                     skiprows=1
                    ).rename(columns = {
    'Unnamed: 1': 'ASI', 
    'Unnamed: 2': 'Days to Anthesis', 
    'Unnamed: 3': 'Days to Silk'
})
temp = temp.loc[temp.index == 0]
temp.head()
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed
  for idx, row in parser.parse():
Heritability of line BLUPs across all crosses (excluding association panel): ASI Days to Anthesis Days to Silk
0 Heritability of line BLUPs 0.775037 0.944619 0.942365
temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx', 
                     skiprows=4
                    ).rename(columns = {
    'Unnamed: 1': 'ASI', 
    'Unnamed: 2': 'Days to Anthesis', 
    'Unnamed: 3': 'Days to Silk'
})
temp.head()
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed
  for idx, row in parser.parse():
Heritability of line BLUPs within each cross: ASI Days to Anthesis Days to Silk
0 B73×B97 0.64151 0.770513 0.766082
1 B73×CML103 0.613736 0.777381 0.734942
2 B73×CML228 0.680939 0.903684 0.892442
3 B73×CML247 0.748584 0.900707 0.894291
4 B73×CML277 0.708526 0.918574 0.911493
# pull in some of the data that Wallace et al 2014 point to:

buckler_2009_path = '../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/'
os.listdir(buckler_2009_path)
['NAM_DaysToTassel.txt',
 'NAM_TasselingDate.txt',
 'markergenotypes062508.txt',
 'NAM_SilkingDate.txt',
 'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt',
 'markers061208.txt',
 'NAM_DaysToSilk.txt']
nam_dts = pd.read_table(buckler_2009_path+'NAM_DaysToSilk.txt', encoding="ISO-8859-1")
nam_dts_taxa = list(nam_dts.loc[:, 'accename'].drop_duplicates())
# Look for the right taxa
# genome_files.sort()
# genome_files[0:10]
# # I think we need to match everything before the __
# def find_AGPv4_genome(
#     taxa, # should be the desired taxa or a regex fragment (stopping before the __). E.g. 'B73' or 'B\d+'
#     **kwargs # optionally pass in a genome list (this allows for a different path or precomputing if we're finding a lot of genomes)
#     ):
#     if 'genome_files' not in kwargs.keys():
#         import os
#         genome_files = os.listdir(
#     '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
#     else:
#         genome_files = kwargs['genome_files']
#     import re
#     return( [e for e in genome_files if re.match(taxa+'__.+', e)] )
genome_files = os.listdir(
    '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
possible_matches = [{'taxa': e,
                     'matches': find_AGPv4(
                         taxa = e,
                         genome_files = genome_files
)} for e in tqdm(nam_dts_taxa)]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5369/5369 [01:25<00:00, 63.16it/s]
# how many have more than one match?
len(
[[len(e['matches']), e] for e in possible_matches if len(e['matches']) != 1])
1297
'Z018E0021'




# ith_taxa = '05-397:250007467'
# res = get_AGPv4(taxa_to_filename(taxa = ith_taxa))   # Retrieve record
# temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]  
# temp[res[0]] = res[1:]                               # Add Col. with Nucleotides
# temp.head()
'Z018E0021'
pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")
Geno_Code Entry_ID Group pop entry Days_To_Anthesis_BLUP_Sum0607 Days_To_Silk_BLUP_Sum0607 ASI_BLUP_Sum0607
0 Z001E0001 04P1367A51A Z001 1 1 75.5364 77.1298 1.4600
1 Z001E0002 04P1368A51A Z001 1 2 76.9075 77.7945 1.3928
2 Z001E0003 04P1368B51A Z001 1 3 75.2646 75.2555 0.8644
3 Z001E0004 04P1370B51A Z001 1 4 73.6933 75.7604 2.0012
4 Z001E0005 04P1371B51A Z001 1 5 79.2441 81.2611 1.8931
... ... ... ... ... ... ... ... ...
5458 Z027E0277 W64A NaN 27 277 71.9008 73.9811 2.6756
5459 Z027E0278 WD NaN 27 278 62.0212 60.5992 -0.5733
5460 Z027E0279 Wf9 NaN 27 279 71.9970 72.2319 0.8338
5461 Z027E0280 Yu796_NS NaN 27 280 74.5107 73.9727 0.2935
5462 Z027E0282 Mo17 NaN 27 282 72.7428 75.5080 3.0455

5463 rows × 8 columns

pd.read_table(buckler_2009_path+'NAM_TasselingDate.txt', encoding="ISO-8859-1")
loc_name year season pop block plot row plot_id entry_num pedigree seed_source accename value
0 Urbana, IL 2006 Summer 18 1 1 1 0650001 21 (B73×Mo18W)S5 05CL1572-blk Z018E0021 07/28/06
1 Urbana, IL 2006 Summer 18 1 2 2 0650002 41 (B73×Mo18W)S5 05FL4779-blk Z018E0041 07/31/06
2 Urbana, IL 2006 Summer 18 1 3 3 0650003 128 (B73×Mo18W)S5 24MM1520 Z018E0128 07/31/06
3 Urbana, IL 2006 Summer 18 1 4 4 0650004 146 (B73×Mo18W)S5 05P114751A Z018E0146 07/31/06
4 Urbana, IL 2006 Summer 18 1 5 5 0650005 105 (B73×Mo18W)S5 24MM1482 Z018E0105 08/02/06
... ... ... ... ... ... ... ... ... ... ... ... ... ...
43454 Columbia, MO 2007 Summer 12 3 16 2936 27M32936 179 (B73×Ki11)S5 05P025251A Z012E0179 7/25/2007
43455 Columbia, MO 2007 Summer 12 3 17 2937 27M32937 182 (B73×Ki11)S5 05P025351A Z012E0182 7/23/2007
43456 Columbia, MO 2007 Summer 12 3 18 2938 27M32938 180 (B73×Ki11)S5 05P029351A Z012E0180 7/23/2007
43457 Columbia, MO 2007 Summer 12 3 19 2939 27M32939 168 (B73×Ki11)S5 05P030451A Z012E0168 7/25/2007
43458 Columbia, MO 2007 Summer 12 3 20 2940 27M32940 8 (B73×Ki11)S5 05CL1149-blk Z012E0008 7/21/2007

43459 rows × 13 columns

# pd.read_table(buckler_2009_path+'markergenotypes062508.txt', encoding="ISO-8859-1")
pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")
Geno_Code Entry_ID Group pop entry Days_To_Anthesis_BLUP_Sum0607 Days_To_Silk_BLUP_Sum0607 ASI_BLUP_Sum0607
0 Z001E0001 04P1367A51A Z001 1 1 75.5364 77.1298 1.4600
1 Z001E0002 04P1368A51A Z001 1 2 76.9075 77.7945 1.3928
2 Z001E0003 04P1368B51A Z001 1 3 75.2646 75.2555 0.8644
3 Z001E0004 04P1370B51A Z001 1 4 73.6933 75.7604 2.0012
4 Z001E0005 04P1371B51A Z001 1 5 79.2441 81.2611 1.8931
... ... ... ... ... ... ... ... ...
5458 Z027E0277 W64A NaN 27 277 71.9008 73.9811 2.6756
5459 Z027E0278 WD NaN 27 278 62.0212 60.5992 -0.5733
5460 Z027E0279 Wf9 NaN 27 279 71.9970 72.2319 0.8338
5461 Z027E0280 Yu796_NS NaN 27 280 74.5107 73.9727 0.2935
5462 Z027E0282 Mo17 NaN 27 282 72.7428 75.5080 3.0455

5463 rows × 8 columns

# os.listdir('../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
AGPv4_path = '../data/zma/panzea/genotypes/GBS/v27/'
# Other than listing the taxa this isn't expected to be of much use for our purposes.
AGPv4_taxa=pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
AGPv4_taxa.head()
Taxa LibraryPrepID Status DNAPlate GENUS INBREEDF SPECIES DNASample Flowcell_Lane NumLanes ... GermplasmSet Barcode LibraryPlate Tassel4SampleName Population LibraryPlateWell SampleDNAWell OwnerEmail PEDIGREE SeedLot
0 05-397:250007467 250007467 public P3-GS-F Zea 0.95 mays 05-397 C00R8ABXX_4 1.0 ... Margaret Smith lines GTTGAA P3-GS-F 05-397:C00R8ABXX:4:250007467 Inbred F11 F11 esb33@cornell.edu NaN NaN
1 05-438:250007407 250007407 public P3-GS-F Zea 0.95 mays 05-438 C00R8ABXX_4 1.0 ... Margaret Smith lines GTATT P3-GS-F 05-438:C00R8ABXX:4:250007407 Inbred B03 B03 esb33@cornell.edu NaN NaN
2 12E:250032344 250032344 public Ames12 Zea 0.95 mays 12E 81FE8ABXX_4 1.0 ... Ames GCTGTGGA Ames12 12E:81FE8ABXX:4:250032344 inbred H08 H08 esb33@cornell.edu 12E NaN
3 207:250007202 250007202 public P1-GS-F Zea 0.95 mays 207 C00R8ABXX_2 1.0 ... Margaret Smith lines TACAT P1-GS-F 207:C00R8ABXX:2:250007202 Inbred E12 E12 esb33@cornell.edu NaN NaN
4 22612:250007466 250007466 public P3-GS-F Zea 0.95 mays 22612 C00R8ABXX_4 1.0 ... Margaret Smith lines GTACTT P3-GS-F 22612:C00R8ABXX:4:250007466 Inbred F10 F10 esb33@cornell.edu NaN NaN

5 rows × 21 columns

# Useful for converting between the physical location and site
AGPv4_site = pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()
Site Name Chromosome Position
0 0 S1_6370 1 52399
1 1 S1_8210 1 54239
2 2 S1_8376 1 54405
3 3 S1_9889 1 55917
4 4 S1_9899 1 55927

Retrieving a genome by taxa name:

# The genomes are in a folder with an identical name as their source table
table_directory = AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# Note however that the naming is altered to not use ':'
os.listdir(table_directory)[0:3]
['xztea', 'xzedh', 'Z020E0024__250026132']
taxa_to_filename(taxa = '05-397:250007467')
'05-397__250007467'
def get_AGPv4(taxa):
    with open(table_directory+taxa, 'r') as f:
        data = f.read()    
    data = data.split('\t')
    return(data)
get_AGPv4('05-397__250007467')[0:4]
['05-397:250007467', 'T', 'T', 'A']

In addition to returning a specific taxa, the table’s headers can be retieved with “taxa”.

get_AGPv4(taxa = 'taxa')[0:4]
['Taxa', '52399', '54239', '54405']

Converting between site and chromosome/position requires the AGPv4_site dataframe. A given record contains the taxa as well as the nucleotides, so with that entry excluded the chromosome / position can be paired up.

len(get_AGPv4(taxa = 'taxa')), AGPv4_site.shape
(943456, (943455, 4))
ith_taxa = '05-397:250007467'
res = get_AGPv4(taxa_to_filename(taxa = ith_taxa))   # Retrieve record
temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]  
temp[res[0]] = res[1:]                               # Add Col. with Nucleotides
temp.head()
Chromosome Position 05-397:250007467
0 1 52399 T
1 1 54239 T
2 1 54405 A
3 1 55917 N
4 1 55927 N

Look at SNP coverage

mask = (temp.Chromosome == 1)

temp_pos = temp.loc[mask, ['Position']]
temp_pos['Shift'] = 0
temp_pos.loc[1: , ['Shift']] = np.array(temp_pos.Position)[:-1]
temp_pos['Diff'] = temp_pos['Position'] - temp_pos['Shift']

temp_pos.loc[0, 'Diff'] = None
temp_pos
Position Shift Diff
0 52399 0 NaN
1 54239 52399 1840.0
2 54405 54239 166.0
3 55917 54405 1512.0
4 55927 55917 10.0
... ... ... ...
147145 306971046 306910117 60929.0
147146 306971061 306971046 15.0
147147 306971063 306971061 2.0
147148 306971073 306971063 10.0
147149 306971080 306971073 7.0

147150 rows × 3 columns

# px.histogram(temp_pos, x = 'Diff')

Demonstrate Hilbert Curve

# demonstrating the hilbert curve
temp = np.linspace(1, 100, num= 50)
# px.scatter(x = temp, y = [0 for e in range(len(temp))], color = temp)
px.imshow(temp.reshape((1, temp.shape[0])))
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
#                              n = 2    # dimensions
#                             )
# distances = list(range(len(temp)))

# points = hilbert_curve.points_from_distances(distances)
# # px.line(pd.DataFrame(points, columns = ['i', 'j']), x = 'i', y = 'j')
NameError: name 'HilbertCurve' is not defined
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan         #  empty values being used for visualization

# for i in range(len(temp)):
# #     print(i)
#     temp_mat[points[i][0], points[i][1]] = temp[i]
    
# # temp2 = pd.DataFrame(points, columns = ['i', 'j'])
# # temp2['value'] = temp
# # px.scatter(temp2, x = 'i', y = 'j', color = 'value')

# px.imshow(temp_mat)
NameError: name 'points' is not defined
# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
#                              n = 2    # dimensions
#                             )


# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists

# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan         #  empty values being used for visualization

# for i in range(len(temp)):
# #     print(i)
#     temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)
NameError: name 'HilbertCurve' is not defined

Hilbert curve for one sequence

# temp_pos['Present'] = 1
# temp_pos.shape[0]
147150
# def calc_needed_hilbert_p(n_needed = 1048576,
                        #                           max_p = 20):
                        #     out = None
                        #     for i in range(1, max_p):
                        #         if 4**i > n_needed:
                        #             out = i
                        #             break
                        #     return(out)

                        # calc_needed_hilbert_p(n_needed=147150)
9
# temp_pos['Position']
0             52399
1             54239
2             54405
3             55917
4             55927
            ...    
147145    306971046
147146    306971061
147147    306971063
147148    306971073
147149    306971080
Name: Position, Length: 147150, dtype: int64
# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
#                              n = 2    # dimensions
#                             )


# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists

# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan         #  empty values being used for visualization

# for i in range(len(temp)):
# #     print(i)
#     temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)