import os
import numpy as np
import pandas as pd
import plotly.express as px
# import hilbertcurve
# from hilbertcurve.hilbertcurve import HilbertCurve
import tqdm
from tqdm import tqdm
import dlgwas
from dlgwas.dna import *
# ! conda install openpyxl -y
# ! conda install hilbertcurve -y
# ! pip install hilbertcurveRetrieve genomic data for phenotypes
Load phenotypic data to explore
For this I’m using data referenced in Wallace et al 2014 which is available through panzea. This study refers to data from 9 studies (including itself) as a source of phenotypes for the NAM data. This combination of a large set of published GWAS hits, phenotypes, and many rils makes it ideal for use here.
This file contains results I can use to check if my approaches are producing similar hits.
Wallace_etal_2014_PLoSGenet_GWAS_hits = pd.read_table('../ext_data/zma/panzea/GWASResults/Wallace_etal_2014_PLoSGenet_GWAS_hits-150112.txt')
Wallace_etal_2014_PLoSGenet_GWAS_hits.head()| trait | chr | pos | allele | rmip | source | |
|---|---|---|---|---|---|---|
| 0 | 100 Kernel weight | 1 | 3364007 | A/G | 1 | Hapmap1 |
| 1 | 100 Kernel weight | 1 | 22247033 | A/G | 3 | Hapmap1 |
| 2 | 100 Kernel weight | 1 | 22987420 | C/T | 1 | Hapmap2 |
| 3 | 100 Kernel weight | 1 | 23056483 | C/G | 1 | Hapmap2 |
| 4 | 100 Kernel weight | 1 | 23066099 | A/G | 1 | Hapmap2 |
This file on I think matches Buckler et al 2009.
temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx',
skiprows=1
).rename(columns = {
'Unnamed: 1': 'ASI',
'Unnamed: 2': 'Days to Anthesis',
'Unnamed: 3': 'Days to Silk'
})
temp = temp.loc[temp.index == 0]
temp.head()/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed
for idx, row in parser.parse():
| Heritability of line BLUPs across all crosses (excluding association panel): | ASI | Days to Anthesis | Days to Silk | |
|---|---|---|---|---|
| 0 | Heritability of line BLUPs | 0.775037 | 0.944619 | 0.942365 |
temp = pd.read_excel('../ext_data/zma/panzea/GWASResults/JointGLMModels090324QTLLocations.xlsx',
skiprows=4
).rename(columns = {
'Unnamed: 1': 'ASI',
'Unnamed: 2': 'Days to Anthesis',
'Unnamed: 3': 'Days to Silk'
})
temp.head()/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/openpyxl/worksheet/_read_only.py:79: UserWarning: Unknown extension is not supported and will be removed
for idx, row in parser.parse():
| Heritability of line BLUPs within each cross: | ASI | Days to Anthesis | Days to Silk | |
|---|---|---|---|---|
| 0 | B73×B97 | 0.64151 | 0.770513 | 0.766082 |
| 1 | B73×CML103 | 0.613736 | 0.777381 | 0.734942 |
| 2 | B73×CML228 | 0.680939 | 0.903684 | 0.892442 |
| 3 | B73×CML247 | 0.748584 | 0.900707 | 0.894291 |
| 4 | B73×CML277 | 0.708526 | 0.918574 | 0.911493 |
# pull in some of the data that Wallace et al 2014 point to:
buckler_2009_path = '../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/'
os.listdir(buckler_2009_path)['NAM_DaysToTassel.txt',
'NAM_TasselingDate.txt',
'markergenotypes062508.txt',
'NAM_SilkingDate.txt',
'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt',
'markers061208.txt',
'NAM_DaysToSilk.txt']
nam_dts = pd.read_table(buckler_2009_path+'NAM_DaysToSilk.txt', encoding="ISO-8859-1")
nam_dts_taxa = list(nam_dts.loc[:, 'accename'].drop_duplicates())# Look for the right taxa# genome_files.sort()
# genome_files[0:10]# # I think we need to match everything before the __
# def find_AGPv4_genome(
# taxa, # should be the desired taxa or a regex fragment (stopping before the __). E.g. 'B73' or 'B\d+'
# **kwargs # optionally pass in a genome list (this allows for a different path or precomputing if we're finding a lot of genomes)
# ):
# if 'genome_files' not in kwargs.keys():
# import os
# genome_files = os.listdir(
# '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')
# else:
# genome_files = kwargs['genome_files']
# import re
# return( [e for e in genome_files if re.match(taxa+'__.+', e)] )genome_files = os.listdir(
'../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')possible_matches = [{'taxa': e,
'matches': find_AGPv4(
taxa = e,
genome_files = genome_files
)} for e in tqdm(nam_dts_taxa)]100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5369/5369 [01:25<00:00, 63.16it/s]
# how many have more than one match?
len(
[[len(e['matches']), e] for e in possible_matches if len(e['matches']) != 1])1297
'Z018E0021'
# ith_taxa = '05-397:250007467'
# res = get_AGPv4(taxa_to_filename(taxa = ith_taxa)) # Retrieve record
# temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]
# temp[res[0]] = res[1:] # Add Col. with Nucleotides
# temp.head()'Z018E0021'
pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")| Geno_Code | Entry_ID | Group | pop | entry | Days_To_Anthesis_BLUP_Sum0607 | Days_To_Silk_BLUP_Sum0607 | ASI_BLUP_Sum0607 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Z001E0001 | 04P1367A51A | Z001 | 1 | 1 | 75.5364 | 77.1298 | 1.4600 |
| 1 | Z001E0002 | 04P1368A51A | Z001 | 1 | 2 | 76.9075 | 77.7945 | 1.3928 |
| 2 | Z001E0003 | 04P1368B51A | Z001 | 1 | 3 | 75.2646 | 75.2555 | 0.8644 |
| 3 | Z001E0004 | 04P1370B51A | Z001 | 1 | 4 | 73.6933 | 75.7604 | 2.0012 |
| 4 | Z001E0005 | 04P1371B51A | Z001 | 1 | 5 | 79.2441 | 81.2611 | 1.8931 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5458 | Z027E0277 | W64A | NaN | 27 | 277 | 71.9008 | 73.9811 | 2.6756 |
| 5459 | Z027E0278 | WD | NaN | 27 | 278 | 62.0212 | 60.5992 | -0.5733 |
| 5460 | Z027E0279 | Wf9 | NaN | 27 | 279 | 71.9970 | 72.2319 | 0.8338 |
| 5461 | Z027E0280 | Yu796_NS | NaN | 27 | 280 | 74.5107 | 73.9727 | 0.2935 |
| 5462 | Z027E0282 | Mo17 | NaN | 27 | 282 | 72.7428 | 75.5080 | 3.0455 |
5463 rows × 8 columns
pd.read_table(buckler_2009_path+'NAM_TasselingDate.txt', encoding="ISO-8859-1")| loc_name | year | season | pop | block | plot | row | plot_id | entry_num | pedigree | seed_source | accename | value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Urbana, IL | 2006 | Summer | 18 | 1 | 1 | 1 | 0650001 | 21 | (B73×Mo18W)S5 | 05CL1572-blk | Z018E0021 | 07/28/06 |
| 1 | Urbana, IL | 2006 | Summer | 18 | 1 | 2 | 2 | 0650002 | 41 | (B73×Mo18W)S5 | 05FL4779-blk | Z018E0041 | 07/31/06 |
| 2 | Urbana, IL | 2006 | Summer | 18 | 1 | 3 | 3 | 0650003 | 128 | (B73×Mo18W)S5 | 24MM1520 | Z018E0128 | 07/31/06 |
| 3 | Urbana, IL | 2006 | Summer | 18 | 1 | 4 | 4 | 0650004 | 146 | (B73×Mo18W)S5 | 05P114751A | Z018E0146 | 07/31/06 |
| 4 | Urbana, IL | 2006 | Summer | 18 | 1 | 5 | 5 | 0650005 | 105 | (B73×Mo18W)S5 | 24MM1482 | Z018E0105 | 08/02/06 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 43454 | Columbia, MO | 2007 | Summer | 12 | 3 | 16 | 2936 | 27M32936 | 179 | (B73×Ki11)S5 | 05P025251A | Z012E0179 | 7/25/2007 |
| 43455 | Columbia, MO | 2007 | Summer | 12 | 3 | 17 | 2937 | 27M32937 | 182 | (B73×Ki11)S5 | 05P025351A | Z012E0182 | 7/23/2007 |
| 43456 | Columbia, MO | 2007 | Summer | 12 | 3 | 18 | 2938 | 27M32938 | 180 | (B73×Ki11)S5 | 05P029351A | Z012E0180 | 7/23/2007 |
| 43457 | Columbia, MO | 2007 | Summer | 12 | 3 | 19 | 2939 | 27M32939 | 168 | (B73×Ki11)S5 | 05P030451A | Z012E0168 | 7/25/2007 |
| 43458 | Columbia, MO | 2007 | Summer | 12 | 3 | 20 | 2940 | 27M32940 | 8 | (B73×Ki11)S5 | 05CL1149-blk | Z012E0008 | 7/21/2007 |
43459 rows × 13 columns
# pd.read_table(buckler_2009_path+'markergenotypes062508.txt', encoding="ISO-8859-1")pd.read_table(buckler_2009_path+'NAMSum0607FloweringTraitBLUPsAcross8Envs.txt', encoding="ISO-8859-1")| Geno_Code | Entry_ID | Group | pop | entry | Days_To_Anthesis_BLUP_Sum0607 | Days_To_Silk_BLUP_Sum0607 | ASI_BLUP_Sum0607 | |
|---|---|---|---|---|---|---|---|---|
| 0 | Z001E0001 | 04P1367A51A | Z001 | 1 | 1 | 75.5364 | 77.1298 | 1.4600 |
| 1 | Z001E0002 | 04P1368A51A | Z001 | 1 | 2 | 76.9075 | 77.7945 | 1.3928 |
| 2 | Z001E0003 | 04P1368B51A | Z001 | 1 | 3 | 75.2646 | 75.2555 | 0.8644 |
| 3 | Z001E0004 | 04P1370B51A | Z001 | 1 | 4 | 73.6933 | 75.7604 | 2.0012 |
| 4 | Z001E0005 | 04P1371B51A | Z001 | 1 | 5 | 79.2441 | 81.2611 | 1.8931 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5458 | Z027E0277 | W64A | NaN | 27 | 277 | 71.9008 | 73.9811 | 2.6756 |
| 5459 | Z027E0278 | WD | NaN | 27 | 278 | 62.0212 | 60.5992 | -0.5733 |
| 5460 | Z027E0279 | Wf9 | NaN | 27 | 279 | 71.9970 | 72.2319 | 0.8338 |
| 5461 | Z027E0280 | Yu796_NS | NaN | 27 | 280 | 74.5107 | 73.9727 | 0.2935 |
| 5462 | Z027E0282 | Mo17 | NaN | 27 | 282 | 72.7428 | 75.5080 | 3.0455 |
5463 rows × 8 columns
# os.listdir('../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/')AGPv4_path = '../data/zma/panzea/genotypes/GBS/v27/'# Other than listing the taxa this isn't expected to be of much use for our purposes.
AGPv4_taxa=pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
AGPv4_taxa.head()| Taxa | LibraryPrepID | Status | DNAPlate | GENUS | INBREEDF | SPECIES | DNASample | Flowcell_Lane | NumLanes | ... | GermplasmSet | Barcode | LibraryPlate | Tassel4SampleName | Population | LibraryPlateWell | SampleDNAWell | OwnerEmail | PEDIGREE | SeedLot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 05-397:250007467 | 250007467 | public | P3-GS-F | Zea | 0.95 | mays | 05-397 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTTGAA | P3-GS-F | 05-397:C00R8ABXX:4:250007467 | Inbred | F11 | F11 | esb33@cornell.edu | NaN | NaN |
| 1 | 05-438:250007407 | 250007407 | public | P3-GS-F | Zea | 0.95 | mays | 05-438 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTATT | P3-GS-F | 05-438:C00R8ABXX:4:250007407 | Inbred | B03 | B03 | esb33@cornell.edu | NaN | NaN |
| 2 | 12E:250032344 | 250032344 | public | Ames12 | Zea | 0.95 | mays | 12E | 81FE8ABXX_4 | 1.0 | ... | Ames | GCTGTGGA | Ames12 | 12E:81FE8ABXX:4:250032344 | inbred | H08 | H08 | esb33@cornell.edu | 12E | NaN |
| 3 | 207:250007202 | 250007202 | public | P1-GS-F | Zea | 0.95 | mays | 207 | C00R8ABXX_2 | 1.0 | ... | Margaret Smith lines | TACAT | P1-GS-F | 207:C00R8ABXX:2:250007202 | Inbred | E12 | E12 | esb33@cornell.edu | NaN | NaN |
| 4 | 22612:250007466 | 250007466 | public | P3-GS-F | Zea | 0.95 | mays | 22612 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTACTT | P3-GS-F | 22612:C00R8ABXX:4:250007466 | Inbred | F10 | F10 | esb33@cornell.edu | NaN | NaN |
5 rows × 21 columns
# Useful for converting between the physical location and site
AGPv4_site = pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()| Site | Name | Chromosome | Position | |
|---|---|---|---|---|
| 0 | 0 | S1_6370 | 1 | 52399 |
| 1 | 1 | S1_8210 | 1 | 54239 |
| 2 | 2 | S1_8376 | 1 | 54405 |
| 3 | 3 | S1_9889 | 1 | 55917 |
| 4 | 4 | S1_9899 | 1 | 55927 |
Retrieving a genome by taxa name:
# The genomes are in a folder with an identical name as their source table
table_directory = AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# Note however that the naming is altered to not use ':'
os.listdir(table_directory)[0:3]['xztea', 'xzedh', 'Z020E0024__250026132']
taxa_to_filename(taxa = '05-397:250007467')'05-397__250007467'
def get_AGPv4(taxa):
with open(table_directory+taxa, 'r') as f:
data = f.read()
data = data.split('\t')
return(data)get_AGPv4('05-397__250007467')[0:4]['05-397:250007467', 'T', 'T', 'A']
In addition to returning a specific taxa, the table’s headers can be retieved with “taxa”.
get_AGPv4(taxa = 'taxa')[0:4]['Taxa', '52399', '54239', '54405']
Converting between site and chromosome/position requires the AGPv4_site dataframe. A given record contains the taxa as well as the nucleotides, so with that entry excluded the chromosome / position can be paired up.
len(get_AGPv4(taxa = 'taxa')), AGPv4_site.shape(943456, (943455, 4))
ith_taxa = '05-397:250007467'
res = get_AGPv4(taxa_to_filename(taxa = ith_taxa)) # Retrieve record
temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]
temp[res[0]] = res[1:] # Add Col. with Nucleotides
temp.head()| Chromosome | Position | 05-397:250007467 | |
|---|---|---|---|
| 0 | 1 | 52399 | T |
| 1 | 1 | 54239 | T |
| 2 | 1 | 54405 | A |
| 3 | 1 | 55917 | N |
| 4 | 1 | 55927 | N |
Look at SNP coverage
mask = (temp.Chromosome == 1)
temp_pos = temp.loc[mask, ['Position']]temp_pos['Shift'] = 0
temp_pos.loc[1: , ['Shift']] = np.array(temp_pos.Position)[:-1]
temp_pos['Diff'] = temp_pos['Position'] - temp_pos['Shift']
temp_pos.loc[0, 'Diff'] = Nonetemp_pos| Position | Shift | Diff | |
|---|---|---|---|
| 0 | 52399 | 0 | NaN |
| 1 | 54239 | 52399 | 1840.0 |
| 2 | 54405 | 54239 | 166.0 |
| 3 | 55917 | 54405 | 1512.0 |
| 4 | 55927 | 55917 | 10.0 |
| ... | ... | ... | ... |
| 147145 | 306971046 | 306910117 | 60929.0 |
| 147146 | 306971061 | 306971046 | 15.0 |
| 147147 | 306971063 | 306971061 | 2.0 |
| 147148 | 306971073 | 306971063 | 10.0 |
| 147149 | 306971080 | 306971073 | 7.0 |
147150 rows × 3 columns
# px.histogram(temp_pos, x = 'Diff')Demonstrate Hilbert Curve
# demonstrating the hilbert curve
temp = np.linspace(1, 100, num= 50)
# px.scatter(x = temp, y = [0 for e in range(len(temp))], color = temp)
px.imshow(temp.reshape((1, temp.shape[0])))# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
# n = 2 # dimensions
# )
# distances = list(range(len(temp)))
# points = hilbert_curve.points_from_distances(distances)
# # px.line(pd.DataFrame(points, columns = ['i', 'j']), x = 'i', y = 'j')NameError: name 'HilbertCurve' is not defined
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
# for i in range(len(temp)):
# # print(i)
# temp_mat[points[i][0], points[i][1]] = temp[i]
# # temp2 = pd.DataFrame(points, columns = ['i', 'j'])
# # temp2['value'] = temp
# # px.scatter(temp2, x = 'i', y = 'j', color = 'value')
# px.imshow(temp_mat)NameError: name 'points' is not defined
# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
# n = 2 # dimensions
# )
# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists
# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
# for i in range(len(temp)):
# # print(i)
# temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)NameError: name 'HilbertCurve' is not defined
Hilbert curve for one sequence
# temp_pos['Present'] = 1# temp_pos.shape[0]147150
# def calc_needed_hilbert_p(n_needed = 1048576,
# max_p = 20):
# out = None
# for i in range(1, max_p):
# if 4**i > n_needed:
# out = i
# break
# return(out)
# calc_needed_hilbert_p(n_needed=147150)9
# temp_pos['Position']0 52399
1 54239
2 54405
3 55917
4 55927
...
147145 306971046
147146 306971061
147147 306971063
147148 306971073
147149 306971080
Name: Position, Length: 147150, dtype: int64
# # Data represented need not be continuous -- it need only have int positions
# # a sequence or a sequence with gaps can be encoded
# hilbert_curve = HilbertCurve(p = 10, # iterations i.e. hold 4^p positions
# n = 2 # dimensions
# )
# fake_dists = list(range(len(temp)))
# # Introdude a gap in the sequence
# fake_dists = [e if e>10 else e+5 for e in fake_dists]
# distances = fake_dists
# points = hilbert_curve.points_from_distances(distances)
# dim_0 = np.max(np.array(points)[:, 0])+1 # add 1 to account for 0 indexing
# dim_1 = np.max(np.array(points)[:, 1])+1
# temp_mat = np.zeros(shape = [dim_0, dim_1])
# temp_mat[temp_mat == 0] = np.nan # empty values being used for visualization
# for i in range(len(temp)):
# # print(i)
# temp_mat[points[i][0], points[i][1]] = temp[i]
# px.imshow(temp_mat)