# !cd ../ && pip install -e '.[dev]'Retrieve Nucleotide Data
import os
import numpy as np
import pandas as pd
import plotly.express as px
import hilbertcurve
from hilbertcurve.hilbertcurve import HilbertCurve
# !conda install openpyxl -y
# ! conda install h5py -y
# ! pip install hilbertcurveAccess marker records
Working with these records proved tricky. Ultimately I need the nucleotide data in a tensor, but after using tassel to save the data (ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023) as a table (along with position list and taxa list) it’s too big to easily load (>30Gb). As a work around to easily access specific genomes, I split the table into a separate file for the header and each genome so that these files can be read piecemeal. See the Readme below for more details.
read_txt
read_txt (path, **kwargs)
print_txt
print_txt (path)
AGPv4_path = '../data/zma/panzea/genotypes/GBS/v27/'print_txt(path = AGPv4_path+'Readme')Getting a script to split the table by line and rename all to the taxa name didn't work. It's possible that this is from sed, but it's not worth debugging. Instead I'm doing this manually which is what I should have done to begin with.
1. use split to produce the needed files
split -l 1 ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table.txt
2. move those to their own folder
mv xz* GBSv27_publicSamples_imputedV5_AGPv4-181023_Table/
3. go over all files in the folder, pull out column one, swap out the : for another character and rename it.
This last point was completed with the following shell script.
print_txt(path = AGPv4_path+'rename_all.sh')#!/usr/bin/bash
files_path='./ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
out_path='./ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
#out_path='./out/'
files=$(ls "$files_path")
#echo $files
for file in $files
do
#echo $file
taxa=$(awk '{print $1}' <<< cat "$files_path$file")
taxa_sub=$(sed -r 's/[:]/__/g' <<< "$taxa")
#echo $taxa_sub
cp $files_path$file $out_path$taxa_sub
#echo $taxa
done
With that done, and with the summary files from tassel (position and taxa), the genomes can be individually loaded as needed.
# Other than listing the taxa this isn't expected to be of much use for our purposes.
AGPv4_taxa=pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_TaxaList.txt')
AGPv4_taxa.head()| Taxa | LibraryPrepID | Status | DNAPlate | GENUS | INBREEDF | SPECIES | DNASample | Flowcell_Lane | NumLanes | ... | GermplasmSet | Barcode | LibraryPlate | Tassel4SampleName | Population | LibraryPlateWell | SampleDNAWell | OwnerEmail | PEDIGREE | SeedLot | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 05-397:250007467 | 250007467 | public | P3-GS-F | Zea | 0.95 | mays | 05-397 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTTGAA | P3-GS-F | 05-397:C00R8ABXX:4:250007467 | Inbred | F11 | F11 | esb33@cornell.edu | NaN | NaN |
| 1 | 05-438:250007407 | 250007407 | public | P3-GS-F | Zea | 0.95 | mays | 05-438 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTATT | P3-GS-F | 05-438:C00R8ABXX:4:250007407 | Inbred | B03 | B03 | esb33@cornell.edu | NaN | NaN |
| 2 | 12E:250032344 | 250032344 | public | Ames12 | Zea | 0.95 | mays | 12E | 81FE8ABXX_4 | 1.0 | ... | Ames | GCTGTGGA | Ames12 | 12E:81FE8ABXX:4:250032344 | inbred | H08 | H08 | esb33@cornell.edu | 12E | NaN |
| 3 | 207:250007202 | 250007202 | public | P1-GS-F | Zea | 0.95 | mays | 207 | C00R8ABXX_2 | 1.0 | ... | Margaret Smith lines | TACAT | P1-GS-F | 207:C00R8ABXX:2:250007202 | Inbred | E12 | E12 | esb33@cornell.edu | NaN | NaN |
| 4 | 22612:250007466 | 250007466 | public | P3-GS-F | Zea | 0.95 | mays | 22612 | C00R8ABXX_4 | 1.0 | ... | Margaret Smith lines | GTACTT | P3-GS-F | 22612:C00R8ABXX:4:250007466 | Inbred | F10 | F10 | esb33@cornell.edu | NaN | NaN |
5 rows × 21 columns
# Useful for converting between the physical location and site
AGPv4_site = pd.read_table(AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_PositionList.txt')
AGPv4_site.head()| Site | Name | Chromosome | Position | |
|---|---|---|---|---|
| 0 | 0 | S1_6370 | 1 | 52399 |
| 1 | 1 | S1_8210 | 1 | 54239 |
| 2 | 2 | S1_8376 | 1 | 54405 |
| 3 | 3 | S1_9889 | 1 | 55917 |
| 4 | 4 | S1_9899 | 1 | 55927 |
Retrieving a genome by taxa name:
# The genomes are in a folder with an identical name as their source table
table_directory = AGPv4_path+'ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# Note however that the naming is altered to not use ':'
os.listdir(table_directory)[0:3]['xztea', 'xzedh', 'Z020E0024__250026132']
taxa_to_filename
taxa_to_filename (taxa='05-397:250007467')
taxa_to_filename(taxa = '05-397:250007467')'05-397__250007467'
find_AGPv4
find_AGPv4 (taxa, **kwargs)
Search for existing marker sets __
| Details | |
|---|---|
| taxa | should be the desired taxa or a regex fragment (stopping before the __). E.g. ‘B73’ or ‘B’ |
| kwargs |
get_AGPv4
get_AGPv4 (taxa, **kwargs)
Retrieve an existing marker set
# def get_AGPv4(
# taxa,
# table_directory = '../data/zma/panzea/genotypes/GBS/v27/ZeaGBSv27_publicSamples_imputedV5_AGPv4-181023_Table/'
# ):
# with open(table_directory+taxa, 'r') as f:
# data = f.read()
# data = data.split('\t')
# return(data)get_AGPv4('05-397__250007467')[0:4]['05-397:250007467', 'T', 'T', 'A']
In addition to returning a specific taxa, the table’s headers can be retieved with “taxa”.
get_AGPv4(taxa = 'taxa')[0:4]['Taxa', '52399', '54239', '54405']
Converting between site and chromosome/position requires the AGPv4_site dataframe. A given record contains the taxa as well as the nucleotides, so with that entry excluded the chromosome / position can be paired up.
len(get_AGPv4(taxa = 'taxa')), AGPv4_site.shape(943456, (943455, 4))
ith_taxa = '05-397:250007467'
res = get_AGPv4(taxa_to_filename(taxa = ith_taxa)) # Retrieve record
temp = AGPv4_site.loc[:, ['Chromosome', 'Position']]
temp[res[0]] = res[1:] # Add Col. with Nucleotides
temp.head()| Chromosome | Position | 05-397:250007467 | |
|---|---|---|---|
| 0 | 1 | 52399 | T |
| 1 | 1 | 54239 | T |
| 2 | 1 | 54405 | A |
| 3 | 1 | 55917 | N |
| 4 | 1 | 55927 | N |
Look at SNP coverage
mask = (temp.Chromosome == 1)
temp_pos = temp.loc[mask, ['Position']]temp_pos['Shift'] = 0
temp_pos.loc[1: , ['Shift']] = np.array(temp_pos.Position)[:-1]
temp_pos['Diff'] = temp_pos['Position'] - temp_pos['Shift']
temp_pos.loc[0, 'Diff'] = Nonetemp_pos| Position | Shift | Diff | |
|---|---|---|---|
| 0 | 52399 | 0 | NaN |
| 1 | 54239 | 52399 | 1840.0 |
| 2 | 54405 | 54239 | 166.0 |
| 3 | 55917 | 54405 | 1512.0 |
| 4 | 55927 | 55917 | 10.0 |
| ... | ... | ... | ... |
| 147145 | 306971046 | 306910117 | 60929.0 |
| 147146 | 306971061 | 306971046 | 15.0 |
| 147147 | 306971063 | 306971061 | 2.0 |
| 147148 | 306971073 | 306971063 | 10.0 |
| 147149 | 306971080 | 306971073 | 7.0 |
147150 rows × 3 columns
# px.histogram(temp_pos, x = 'Diff')Encode a marker sequence into ATCG
res = get_AGPv4(taxa_to_filename(taxa = '05-397:250007467'))
res = res[1:] # drop taxalist_to_ACGT
list_to_ACGT (in_seq, progress=False)
| Type | Default | Details | |
|---|---|---|---|
| in_seq | This should be a list with strings corresponding to IUPAC codes e.g. [‘A’, ‘C’, ‘Y’] | ||
| progress | bool | False |
res = list_to_ACGT(in_seq = res)
res = res[0:1000]100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 14.64it/s]
Hilbert Curve Transform
res.shape(1000, 4)
calc_needed_hilbert_p
calc_needed_hilbert_p (n_needed=1048576, max_p=20)
np_2d_to_hilbert
np_2d_to_hilbert (in_seq)
| Details | |
|---|---|
| in_seq | This should be a 2d numpy array with dimensions of [sequence, channels] |
np_3d_to_hilbert
np_3d_to_hilbert (in_seq)
This is the 3d version of np_2d_to_hilbert. The goal is to process all of the samples of an array in one go.
| Details | |
|---|---|
| in_seq | This should be a 3d numpy array with dimensions of [samples, sequence, channels] |
demo = np_2d_to_hilbert(
in_seq = np.asarray([np.linspace(1, 100, num= 50),
np.linspace(100, 1, num= 50)]).T
)100%|███████████████████████████████████████████████████████████████████████████████| 50/50 [00:00<00:00, 168852.82it/s]
px.imshow(demo[:,:,0])px.imshow(demo[:,:,1])Apply to subset of real marker data
Explictly convert a taxa
taxa_to_filename(taxa = '05-397:250007467')'05-397__250007467'
Or search for a taxa
find_AGPv4(taxa = '05-397')['05-397__250007467']
Retrieve the sequence data
res = get_AGPv4(taxa_to_filename(taxa = '05-397:250007467'))
res = res[1:] # drop taxa
res[0:10]['T', 'T', 'A', 'N', 'N', 'N', 'N', 'C', 'C', 'N']
Convert from characters to encoded nucleotide probabilities
res = list_to_ACGT(in_seq = res)
res = res[0:1000]
res100%|███████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 15.00it/s]
array([[0., 0., 0., 1.],
[0., 0., 0., 1.],
[1., 0., 0., 0.],
...,
[0., 0., 0., 1.],
[1., 0., 0., 0.],
[0., 1., 0., 0.]])
Convert the sequence to a hilbert curve
# This will happen under the hood
# calc_needed_hilbert_p(n_needed=res.shape[0])
res_hilb = np_2d_to_hilbert(
in_seq = res
)100%|██████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1209081.58it/s]
px.imshow( res[0:20, 0:1] )px.imshow( res_hilb[:, :, 0] )px.imshow( res_hilb[:, :, 1] )