Retrieve genomic data for phenotypes

This notebook will likely be broken into at least two

import os

import numpy as np
import pandas as pd

import plotly.express as px

# import hilbertcurve
# from hilbertcurve.hilbertcurve import HilbertCurve
import tqdm
from tqdm import tqdm

import dlgwas
from dlgwas.dna import *

# ! conda install openpyxl -y
# ! conda install hilbertcurve -y
# ! pip install hilbertcurve

Load phenotypic data to explore

For this I’m using data referenced in Wallace et al 2014 which is available through panzea. This study refers to data from 9 studies (including itself) as a source of phenotypes for the NAM data. This combination of a large set of published GWAS hits, phenotypes, and many rils makes it ideal for use here.

This file contains results I can use to check if my approaches are producing similar hits.

Wallace_etal_2014_PLoSGenet_GWAS_hits = pd.read_table('../ext_data/zma/panzea/GWASResults/Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt')
Wallace_etal_2014_PLoSGenet_GWAS_hits.head()

	trait	chr	pos	allele	rmip	source
0	100 Kernel weight	1	3364007	A/G	1	Hapmap1
1	100 Kernel weight	1	22247033	A/G	3	Hapmap1
2	100 Kernel weight	1	22987420	C/T	1	Hapmap2
3	100 Kernel weight	1	23056483	C/G	1	Hapmap2
4	100 Kernel weight	1	23066099	A/G	1	Hapmap2

This file on I think matches Buckler et al 2009.

temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx', 
                     skiprows=1
                    ).rename(columns = {
    'Unnamed: 1': 'ASI', 
    'Unnamed: 2': 'Days to Anthesis', 
    'Unnamed: 3': 'Days to Silk'
})
temp = temp.loc[temp.index == 0]
temp.head()

/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed
  for idx, row in parser.parse():

	Heritability of line BLUPs across all crosses (excluding association panel):	ASI	Days to Anthesis	Days to Silk
0	Heritability of line BLUPs	0.775037	0.944619	0.942365

temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx', 
                     skiprows=4
                    ).rename(columns = {
    'Unnamed: 1': 'ASI', 
    'Unnamed: 2': 'Days to Anthesis', 
    'Unnamed: 3': 'Days to Silk'
})
temp.head()

/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed
  for idx, row in parser.parse():

	Heritability of line BLUPs within each cross:	ASI	Days to Anthesis	Days to Silk
0	B73×B97	0.64151	0.770513	0.766082
1	B73×CML103	0.613736	0.777381	0.734942
2	B73×CML228	0.680939	0.903684	0.892442
3	B73×CML247	0.748584	0.900707	0.894291
4	B73×CML277	0.708526	0.918574	0.911493

# pull in some of the data that Wallace et al 2014 point to:

buckler_2009_path = '../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/'
os.listdir(buckler_2009_path)

['NAM_DaysToTassel.txt',
 'NAM_TasselingDate.txt',
 'markergenotypes062508.txt',
 'NAM_SilkingDate.txt',
 'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt',
 'markers061208.txt',
 'NAM_DaysToSilk.txt']

nam_dts = pd.read_table(buckler_2009_path+'NAM_DaysToSilk.txt', encoding="ISO-8859-1")
nam_dts_taxa = list(nam_dts.loc[:, 'accename'].drop_duplicates())

# Look for the right taxa

# genome_files.sort()
# genome_files[0:10]

# # I think we need to match everything before the __
# def find_AGPv4_genome(
#     taxa, # should be the desired taxa or a regex fragment (stopping before the __). E.g. 'B73' or 'B\d+'
#     **kwargs # optionally pass in a genome list (this allows for a different path or precomputing if we're finding a lot of genomes)
#     ):
#     if 'genome_files' not in kwargs.keys():
#         import os
#         genome_files = os.listdir(
#     '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
#     else:
#         genome_files = kwargs['genome_files']
#     import re
#     return( [e for e in genome_files if re.match(taxa+'__.+', e)] )

genome_files = os.listdir(
    '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')

possible_matches = [{'taxa': e,
                     'matches': find_AGPv4(
                         taxa = e,
                         genome_files = genome_files
)} for e in tqdm(nam_dts_taxa)]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5369/5369 [01:25<00:00, 63.16it/s]

# how many have more than one match?
len(
[[len(e['matches']), e] for e in possible_matches if len(e['matches']) != 1])

'Z018E0021'




# ith_taxa = '05-397:250007467'
# res = get_AGPv4(taxa_to_filename(taxa = ith_taxa))   # Retrieve record
# temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]  
# temp[res[0]] = res[1:]                               # Add Col. with Nucleotides
# temp.head()

'Z018E0021'

pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")

	Geno_Code	Entry_ID	Group	pop	entry	Days_To_Anthesis_BLUP_Sum0607	Days_To_Silk_BLUP_Sum0607	ASI_BLUP_Sum0607
0	Z001E0001	04P1367A51A	Z001	1	1	75.5364	77.1298	1.4600
1	Z001E0002	04P1368A51A	Z001	1	2	76.9075	77.7945	1.3928
2	Z001E0003	04P1368B51A	Z001	1	3	75.2646	75.2555	0.8644
3	Z001E0004	04P1370B51A	Z001	1	4	73.6933	75.7604	2.0012
4	Z001E0005	04P1371B51A	Z001	1	5	79.2441	81.2611	1.8931
...	...	...	...	...	...	...	...	...
5458	Z027E0277	W64A	NaN	27	277	71.9008	73.9811	2.6756
5459	Z027E0278	WD	NaN	27	278	62.0212	60.5992	-0.5733
5460	Z027E0279	Wf9	NaN	27	279	71.9970	72.2319	0.8338
5461	Z027E0280	Yu796_NS	NaN	27	280	74.5107	73.9727	0.2935
5462	Z027E0282	Mo17	NaN	27	282	72.7428	75.5080	3.0455

5463 rows × 8 columns

pd.read_table(buckler_2009_path+'NAM_TasselingDate.txt', encoding="ISO-8859-1")

	loc_name	year	season	pop	block	plot	row	plot_id	entry_num	pedigree	seed_source	accename	value
0	Urbana, IL	2006	Summer	18	1	1	1	0650001	21	(B73×Mo18W)S5	05CL1572-blk	Z018E0021	07/28/06
1	Urbana, IL	2006	Summer	18	1	2	2	0650002	41	(B73×Mo18W)S5	05FL4779-blk	Z018E0041	07/31/06
2	Urbana, IL	2006	Summer	18	1	3	3	0650003	128	(B73×Mo18W)S5	24MM1520	Z018E0128	07/31/06
3	Urbana, IL	2006	Summer	18	1	4	4	0650004	146	(B73×Mo18W)S5	05P114751A	Z018E0146	07/31/06
4	Urbana, IL	2006	Summer	18	1	5	5	0650005	105	(B73×Mo18W)S5	24MM1482	Z018E0105	08/02/06
...	...	...	...	...	...	...	...	...	...	...	...	...	...
43454	Columbia, MO	2007	Summer	12	3	16	2936	27M32936	179	(B73×Ki11)S5	05P025251A	Z012E0179	7/25/2007
43455	Columbia, MO	2007	Summer	12	3	17	2937	27M32937	182	(B73×Ki11)S5	05P025351A	Z012E0182	7/23/2007
43456	Columbia, MO	2007	Summer	12	3	18	2938	27M32938	180	(B73×Ki11)S5	05P029351A	Z012E0180	7/23/2007
43457	Columbia, MO	2007	Summer	12	3	19	2939	27M32939	168	(B73×Ki11)S5	05P030451A	Z012E0168	7/25/2007
43458	Columbia, MO	2007	Summer	12	3	20	2940	27M32940	8	(B73×Ki11)S5	05CL1149-blk	Z012E0008	7/21/2007

43459 rows × 13 columns

# pd.read_table(buckler_2009_path+'markergenotypes062508.txt', encoding="ISO-8859-1")

pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")

	Geno_Code	Entry_ID	Group	pop	entry	Days_To_Anthesis_BLUP_Sum0607	Days_To_Silk_BLUP_Sum0607	ASI_BLUP_Sum0607
0	Z001E0001	04P1367A51A	Z001	1	1	75.5364	77.1298	1.4600
1	Z001E0002	04P1368A51A	Z001	1	2	76.9075	77.7945	1.3928
2	Z001E0003	04P1368B51A	Z001	1	3	75.2646	75.2555	0.8644
3	Z001E0004	04P1370B51A	Z001	1	4	73.6933	75.7604	2.0012
4	Z001E0005	04P1371B51A	Z001	1	5	79.2441	81.2611	1.8931
...	...	...	...	...	...	...	...	...
5458	Z027E0277	W64A	NaN	27	277	71.9008	73.9811	2.6756
5459	Z027E0278	WD	NaN	27	278	62.0212	60.5992	-0.5733
5460	Z027E0279	Wf9	NaN	27	279	71.9970	72.2319	0.8338
5461	Z027E0280	Yu796_NS	NaN	27	280	74.5107	73.9727	0.2935
5462	Z027E0282	Mo17	NaN	27	282	72.7428	75.5080	3.0455

5463 rows × 8 columns

# os.listdir('../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')

AGPv4_path = '../data/zma/panzea/genotypes/GBS/v27/'

# Other than listing the taxa this isn't expected to be of much use for our purposes.
AGPv4_taxa=pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
AGPv4_taxa.head()

	Taxa	LibraryPrepID	Status	DNAPlate	GENUS	INBREEDF	SPECIES	DNASample	Flowcell_Lane	NumLanes	...	GermplasmSet	Barcode	LibraryPlate	Tassel4SampleName	Population	LibraryPlateWell	SampleDNAWell	OwnerEmail	PEDIGREE	SeedLot
0	05-397:250007467	250007467	public	P3-GS-F	Zea	0.95	mays	05-397	C00R8ABXX_4	1.0	...	Margaret Smith lines	GTTGAA	P3-GS-F	05-397:C00R8ABXX:4:250007467	Inbred	F11	F11	esb33@cornell.edu	NaN	NaN
1	05-438:250007407	250007407	public	P3-GS-F	Zea	0.95	mays	05-438	C00R8ABXX_4	1.0	...	Margaret Smith lines	GTATT	P3-GS-F	05-438:C00R8ABXX:4:250007407	Inbred	B03	B03	esb33@cornell.edu	NaN	NaN
2	12E:250032344	250032344	public	Ames12	Zea	0.95	mays	12E	81FE8ABXX_4	1.0	...	Ames	GCTGTGGA	Ames12	12E:81FE8ABXX:4:250032344	inbred	H08	H08	esb33@cornell.edu	12E	NaN
3	207:250007202	250007202	public	P1-GS-F	Zea	0.95	mays	207	C00R8ABXX_2	1.0	...	Margaret Smith lines	TACAT	P1-GS-F	207:C00R8ABXX:2:250007202	Inbred	E12	E12	esb33@cornell.edu	NaN	NaN
4	22612:250007466	250007466	public	P3-GS-F	Zea	0.95	mays	22612	C00R8ABXX_4	1.0	...	Margaret Smith lines	GTACTT	P3-GS-F	22612:C00R8ABXX:4:250007466	Inbred	F10	F10	esb33@cornell.edu	NaN	NaN

5 rows × 21 columns

# Useful for converting between the physical location and site
AGPv4_site = pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()

	Site	Name	Chromosome	Position
0	0	S1_6370	1	52399
1	1	S1_8210	1	54239
2	2	S1_8376	1	54405
3	3	S1_9889	1	55917
4	4	S1_9899	1	55927

Retrieving a genome by taxa name:

# The genomes are in a folder with an identical name as their source table
table_directory = AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# Note however that the naming is altered to not use ':'
os.listdir(table_directory)[0:3]

['xztea', 'xzedh', 'Z020E0024__250026132']

taxa_to_filename(taxa = '05-397:250007467')

'05-397__250007467'

def get_AGPv4(taxa):
    with open(table_directory+taxa, 'r') as f:
        data = f.read()    
    data = data.split('\t')
    return(data)

get_AGPv4('05-397__250007467')[0:4]

['05-397:250007467', 'T', 'T', 'A']

In addition to returning a specific taxa, the table’s headers can be retieved with “taxa”.

get_AGPv4(taxa = 'taxa')[0:4]

['Taxa', '52399', '54239', '54405']

Converting between site and chromosome/position requires the AGPv4_site dataframe. A given record contains the taxa as well as the nucleotides, so with that entry excluded the chromosome / position can be paired up.

len(get_AGPv4(taxa = 'taxa')), AGPv4_site.shape

(943456, (943455, 4))

ith_taxa = '05-397:250007467'
res = get_AGPv4(taxa_to_filename(taxa = ith_taxa))   # Retrieve record
temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]  
temp[res[0]] = res[1:]                               # Add Col. with Nucleotides
temp.head()

	Chromosome	Position	05-397:250007467
0	1	52399	T
1	1	54239	T
2	1	54405	A
3	1	55917	N
4	1	55927	N

Look at SNP coverage

mask = (temp.Chromosome == 1)

temp_pos = temp.loc[mask, ['Position']]

temp_pos['Shift'] = 0
temp_pos.loc[1: , ['Shift']] = np.array(temp_pos.Position)[:-1]
temp_pos['Diff'] = temp_pos['Position'] - temp_pos['Shift']

temp_pos.loc[0, 'Diff'] = None

temp_pos

	Position	Shift	Diff
0	52399	0	NaN
1	54239	52399	1840.0
2	54405	54239	166.0
3	55917	54405	1512.0
4	55927	55917	10.0
...	...	...	...
147145	306971046	306910117	60929.0
147146	306971061	306971046	15.0
147147	306971063	306971061	2.0
147148	306971073	306971063	10.0
147149	306971080	306971073	7.0

147150 rows × 3 columns

# px.histogram(temp_pos, x = 'Diff')

Demonstrate Hilbert Curve

# demonstrating the hilbert curve
temp = np.linspace(1, 100, num= 50)
# px.scatter(x = temp, y = [0 for e in range(len(temp))], color = temp)
px.imshow(temp.reshape((1, temp.shape[0])))

# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
#                              n = 2    # dimensions
#                             )
# distances = list(range(len(temp)))

# points = hilbert_curve.points_from_distances(distances)
# # px.line(pd.DataFrame(points, columns = ['i', 'j']), x = 'i', y = 'j')

NameError: name 'HilbertCurve' is not defined

# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan         #  empty values being used for visualization

# for i in range(len(temp)):
# #     print(i)
#     temp_mat[points[i][0], points[i][1]] = temp[i]
    
# # temp2 = pd.DataFrame(points, columns = ['i', 'j'])
# # temp2['value'] = temp
# # px.scatter(temp2, x = 'i', y = 'j', color = 'value')

# px.imshow(temp_mat)

NameError: name 'points' is not defined

# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
#                              n = 2    # dimensions
#                             )


# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists

# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan         #  empty values being used for visualization

# for i in range(len(temp)):
# #     print(i)
#     temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)

NameError: name 'HilbertCurve' is not defined

Hilbert curve for one sequence

# temp_pos['Present'] = 1

# temp_pos.shape[0]

# def calc_needed_hilbert_p(n_needed = 1048576,
                        #                           max_p = 20):
                        #     out = None
                        #     for i in range(1, max_p):
                        #         if 4**i > n_needed:
                        #             out = i
                        #             break
                        #     return(out)

                        # calc_needed_hilbert_p(n_needed=147150)

# temp_pos['Position']

0             52399
1             54239
2             54405
3             55917
4             55927
            ...    
147145    306971046
147146    306971061
147147    306971063
147148    306971073
147149    306971080
Name: Position, Length: 147150, dtype: int64

# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
#                              n = 2    # dimensions
#                             )


# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists

# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan         #  empty values being used for visualization

# for i in range(len(temp)):
# #     print(i)
#     temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)