import os
import tqdm
from tqdm import tqdm
import re
import time
import sys
import requests
import numpy as np
import pandas as pd
import plotly.express as px
import pickle5 as pklSNPS to Networks 1: KEGG
Helpful Custom Functions
ensure_dir_path_exists
ensure_dir_path_exists (dir_path='../ext_data')
get_cached_result
get_cached_result (save_path)
put_cached_result
put_cached_result (save_path, save_obj)
# # This is the expected boiler plate for using the above functions.
# save_path = './demofile.txt'
# demo = get_cached_result(save_path=save_path)
# if None == samples_and_matches:
# demo = CODE TO MAKE DEMO HERE
# put_cached_result(
# save_path = save_path,
# save_obj = demo
# )Identify genes to be downloaded:
get_kegg_species_list
get_kegg_species_list (species='zma')
mkdf_kegg_species_list
mkdf_kegg_species_list (kegg_species_list)
The KEGG list contains a gene id that can be used to download further information. The chromosomal position will be key to match up collected SNPs to genes.
kegg_zma_list = get_kegg_species_list(species = 'zma')
kegg_list_zma = mkdf_kegg_species_list(kegg_species_list = kegg_zma_list)There are also entries that don’t start with a chromosome number. These are non-nuclear sequences from plastid (Pltd), Mitochondria (MT), etc.
no_chrm = [e for e in kegg_list_zma['chromosome_positon'] if re.match('\D.+', e)]
no_chrm = list(set(no_chrm))
kegg_list_zma.loc[kegg_list_zma.chromosome_positon.isin(no_chrm), ]| gene | seq_type | chromosome_positon | gene_type | |
|---|---|---|---|---|
| 35639 | zma:845199 | CDS | Pltd:complement(89..1150) | psbA, ZemaCp002; photosystem II protein D1 |
| 35640 | zma:845256 | tRNA | Pltd:complement(1386..3946) | trnK, ZemaCt121; tRNA-Lys |
| 35641 | zma:845178 | CDS | Pltd:complement(1674..3215) | matK, ZemaCp003; maturase K |
| 35642 | zma:845232 | CDS | Pltd:complement(4491..5604) | rps16, ZemaCp004; ribosomal protein S16 |
| 35643 | zma:845265 | tRNA | Pltd:complement(6773..6844) | trnQ, ZemaCt122; tRNA-Gln |
| ... | ... | ... | ... | ... |
| 37707 | zma:118475995 | rRNA | Unknown | 28S ribosomal RNA |
| 37708 | zma:118475996 | rRNA | Unknown | 5.8S ribosomal RNA |
| 37709 | zma:5951366 | CDS | MT | pBMSmt19_00005; hypothetical protein |
| 37710 | zma:5951368 | CDS | MT | pBMSmt19_00010; hypothetical protein |
| 37711 | zma:5951367 | CDS | MT | pBMSmt19_00015; hypothetical protein |
2066 rows × 4 columns
Download and cache KEGG entries
download_kegg_gene
download_kegg_gene (kegg_gene='zma:103644366', **kwargs)
Downloads kegg gene entry if it does not exist locally. Can optionally take a numeric value as sleep_for to sleep after downloading a file. Useful for controlling the rate requests being sent to the API.
read_kegg_gene
read_kegg_gene (kegg_gene='zma:103644366', **kwargs)
Reads in locally cached KEGG gene entries. Will download the requested entry if it doesn’t exist locally.
Example usage for one gene. Species is inferred and if it doesn’t exist in ext_data then it will automatically be downloaded with download_kegg_gene().
print(read_kegg_gene(kegg_gene = 'zma:103644366'))ENTRY 103644366 CDS T01088
NAME (RefSeq) uncharacterized protein LOC103644366
ORTHOLOGY K15032 mTERF domain-containing protein, mitochondrial
ORGANISM zma Zea mays (maize)
BRITE KEGG Orthology (KO) [BR:zma00001]
09180 Brite Hierarchies
09182 Protein families: genetic information processing
03012 Translation factors [BR:zma03012]
103644366
03029 Mitochondrial biogenesis [BR:zma03029]
103644366
Translation factors [BR:zma03012]
Eukaryotic type
Release factors
103644366
Mitochondrial biogenesis [BR:zma03029]
Mitochondrial DNA transcription, translation, and replication factors
Mitochondrial transcription and translation factors
Mitochondrial translation factors
103644366
POSITION 1:34607..40208
MOTIF Pfam: mTERF
DBLINKS NCBI-GeneID: 103644366
NCBI-ProteinID: XP_020400304
AASEQ 644
MPWIMHAGTEQRHAACGASLLWSSLQPSTVVMAAAAATFGFLHPPIRKPAVPPLYILRLP
TKPHSKTHPRSPPLLFLLLGRRRGGPIAAFPNTTSSSTNAPASPTYDVREAEAAVADLLR
EGGASADDAASIAARAPAYAAMLADGVRELDELGLWASWSSGARARLGLSGVVEMEMGRL
GFRRKVYLMGRSKPDHGVVPLLESLGMRLSSAKLIAPYVAAAGLTVLIDRVKFLKEMLFS
SSDYAILIGRNAKRMMTYLSIPADDALQSTLSFFEKMEARYGGVSMLGHGDVSFPYLIES
FPMLLLCSEDNHLKPLVDFLEHIGIPKPKIASVLLLFPPIILSDVENDIKPRIREWEKAG
IEQDYVSRMLLKYPWILSTSVIENYSQMLLFFNQKRISSTVLAIAVKSWPHILGSSSKRM
NSVLELFHVLGISKKMVVPVITSSPQLLLRKPDQFMQNVLFFREMGVDKKTTGKILCRSP
EIFASNVDNTLKKKIDFLTNFGVSKHHLPRIIRKYPELLLLDINCTLLPRMNYLLEMGLS
KKDLCSMIFRFSPLLGYSIELVMKPKLEFLLRTMKKPLKAVVEYPRYFSYSLEGKIKPRF
WVLQSRNIDCTLTEMLAKNDELFAEEYLGLGGLLEKPLQSSIGS
NTSEQ 1935
atgccatggataatgcacgcgggaacggaacaaagacacgcggcgtgcggggcctcgctc
ctctggtcttccctccagccctcgacggtggtcatggccgccgccgccgccactttcggc
ttcctccatcctccaatccggaaacctgcagtcccaccactgtacattctccggcttccc
accaaaccccactccaaaacgcaccctcgttctccccccctcctcttcctcctcctgggc
cgccgccggggaggccccatcgccgccttccccaacaccacatcttcatccacgaatgcc
cctgcctcgcccacctacgacgtccgggaggcagaggccgccgtcgcggatctcctccgc
gagggcggcgcctccgcggacgacgccgcctccatcgccgcgcgcgcgccggcctacgcc
gctatgctcgccgatggcgtccgcgagctggacgagcttggcctctgggcgtcgtggagc
tccggtgccagggcccggctgggcctcagcggggtcgtcgagatggagatggggaggctc
ggttttaggaggaaggtgtatctcatgggacggagcaagcctgaccacggcgtggtgccg
ctcctcgagagcttgggaatgcgtctctcctcggccaaactcatcgcgccttacgtcgcg
gctgcgggccttactgtgctgattgatagggttaagtttttgaaggaaatgttattttca
agcagtgattatgcaatactaattggaaggaatgctaagcggatgatgacatacttatca
atacctgcagatgatgcactccaaagtactttatctttttttgaaaaaatggaggctagg
tatggtggtgttagcatgttgggacatggagatgtgtcatttccttacctcattgaatca
tttccgatgcttcttctctgctcagaagataatcatctcaagccgttagttgattttctc
gagcacattggaattccaaagccaaagattgcatcagttctgctgctatttcctcctatc
attctttctgatgttgaaaatgatattaagcctaggattcgtgaatgggagaaggctggc
attgaacaagactatgttagtaggatgttgttgaagtatccatggattctttcaacgagt
gtgatagagaactacagtcaaatgctgttgtttttcaaccaaaaaaggatttccagtaca
gtcctcgctattgctgtgaaaagttggcctcatattcttggctcctcttcaaaaagaatg
aattcagttttggagctgtttcatgttctgggcatcagtaaaaaaatggtggttccagtc
attacatcaagtccacagttattactgagaaaacctgatcagtttatgcagaatgttttg
tttttcagagaaatgggtgttgataagaaaacaacaggaaaaattctgtgtcgttcgcct
gaaatatttgcttcaaacgtggataacaccctcaagaagaaaatcgattttcttaccaac
tttggtgtttctaaacatcatcttcctcgcatcattcggaagtatccagaacttttattg
ttggacataaattgtacattgctccctaggatgaactacttattggagatgggtttgtct
aagaaagatctgtgctcaatgatctttagattttccccacttctaggttacagtattgaa
cttgttatgaaaccaaagcttgagtttctgctaagaaccatgaagaagccacttaaagca
gttgtagaatacccaaggtacttcagttattcactcgaggggaagatcaaaccgcggttc
tgggtattgcagagtagaaacatagactgcactctgacagagatgttagcaaagaacgat
gaactctttgctgaagagtacttgggacttggaggattgctcgagaaacctctacaatca
agcataggcagttaa
///
To cache all genes on KEGG for an organism, loop over all entries in the KEGG list for that species.
for kegg_gene in tqdm(kegg_list_zma.gene):
try:
download_kegg_gene(kegg_gene = kegg_gene,
sleep_for = np.random.uniform(0.5, 1.5))
except:
print('Problem with '+kegg_gene)100%|██████████████████████████████████████████████████████████████████████████| 37712/37712 [00:00<00:00, 76079.21it/s]
Imported from 02
Load Zea mays gene list
kegg_zma_list = get_kegg_species_list(species = 'zma')kegg_zma_list = get_kegg_species_list(species = 'zma')kegg_list_zma = mkdf_kegg_species_list(kegg_species_list = kegg_zma_list)kegg_list_zma.head()| gene | seq_type | chromosome_positon | gene_type | |
|---|---|---|---|---|
| 0 | zma:103644366 | CDS | 1:34607..40208 | uncharacterized protein LOC103644366 |
| 1 | zma:100382519 | CDS | 1:complement(41254..46100) | uncharacterized protein LOC100382519 |
| 2 | zma:103649349 | CDS | 1:complement(92293..107876) | protein FAR1-RELATED SEQUENCE 5-like |
| 3 | zma:115032971 | CDS | 1:complement(187229..189591) | uncharacterized protein LOC115032971 |
| 4 | zma:103630223 | CDS | 1:complement(200404..203170) | granule-bound starch synthase 1b, chloroplasti... |
Read KEGG gene entries
Convert a single file into a dictionary
The gene files contain sections (ENTRY, NAME, AASEQ, etc.). To aid processing each of these sections will be separated into a key in a dictionary.
kegg_file = 'zma_100125650.txt'
# convert back to KEGG format
kegg_gene_name = kegg_file.replace('_', ':').replace('.txt', '')
# convert back to kegg format
r_text = read_kegg_gene(kegg_gene = kegg_gene_name)
# parsing gene entry
r_text_list = r_text.split('\n')
r_text_list[0:5]['ENTRY 100125650 CDS T01088',
'NAME (RefSeq) barren inflorescence 2',
'ORGANISM zma Zea mays (maize)',
'POSITION 1:177940567..177942323',
'MOTIF Pfam: Pkinase PK_Tyr_Ser-Thr Kinase-like Pkinase_fungal']
# Some files may not contain all sections so this list needs to be created for each file
section_names = [r_text_list[i][0:12].strip() for i in range(len(r_text_list)) ]
section_names = [e for e in section_names if e != '']
section_starts = [
[i for i in range(len(r_text_list)) if re.match('^'+e, r_text_list[i])][0]
for e in section_names]
section_names, section_starts(['ENTRY',
'NAME',
'ORGANISM',
'POSITION',
'MOTIF',
'DBLINKS',
'AASEQ',
'NTSEQ',
'///'],
[0, 1, 2, 3, 4, 5, 8, 18, 44])
# just split each file into text of it's sections
out = {}
# get lines associated with section
def _get_section_text(section_name = 'AASEQ'):
idx = section_names.index(section_name)
section_text = r_text_list[section_starts[idx]:section_starts[idx+1]]
# remove leading indent
section_text = [e[12:] for e in section_text]
return(section_text)
for section_name in section_names:
if section_name != '///': # end of file
out[section_name] = _get_section_text(section_name = section_name)out{'ENTRY': ['100125650 CDS T01088'],
'NAME': ['(RefSeq) barren inflorescence 2'],
'ORGANISM': ['zma Zea mays (maize)'],
'POSITION': ['1:177940567..177942323'],
'MOTIF': ['Pfam: Pkinase PK_Tyr_Ser-Thr Kinase-like Pkinase_fungal'],
'DBLINKS': ['NCBI-GeneID: 100125650',
'NCBI-ProteinID: NP_001106051',
'UniProt: A6MW92'],
'AASEQ': ['491',
'MDAAVRVPPALGNKTVTEVTPPPPPPAGEERLSDADTTASSTAAPNSSLSSASSAASLPR',
'CSSLSRLSFDCSPSAALSSSSAAAAAAAASSPAPAPARPHRAGDAAWAAIRAASASAAAP',
'LGPRDFRLLRRVGGGDVGTVYLCRLRAPPAPAPVCCLYAMKVVDRRVAAAKKKLEHAAAE',
'RRILRALDHPFLPTLFADFDAAPHFSCVVTEFCPGGDLHSLRHRMPNRRFPLPSARFYAA',
'EVLLALEYLHMMGIVYRDLKPENVLIRADGHIMLTDFDLSLQCTSTPSLEPCAAPEAAAA',
'SCFPDHLFRRRRARLRRAASARRPPTTLVAEPVEARSCSFVGTHEYVAPEVARGGPHGAA',
'VDWWALGVFLYELLHGRTPFAGADNEATLRNIARRPLSFPAAGAGDADARDLIARLLAKD',
'PRHRLGSRRGAADVKAHPFFRGLNFALLRSSRPPVVPAASRSPLHRSQSCSAARTRASKP',
'KPPPDTRFDLF'],
'NTSEQ': ['1476',
'atggacgccgcggtgcgcgtccccccggcgctcgggaacaagacggtgaccgaggtgacg',
'ccgccgccgccaccaccggcgggggaggagcggctgtcggacgccgacacgacggcgtcg',
'tcgacggcggcgcccaactcgagcctcagctcggccagcagcgccgccagcctgccgcgc',
'tgctccagcctgtcccgcctctccttcgactgctctccgtccgcggccctgtcctcttcc',
'tcggcggcggcggcggccgcggccgcgtcatcgccggcgccagcgccggcgcggccgcac',
'cgggcaggggacgcggcgtgggcggcgatccgcgcggcgtcggcgtcggccgcggcgccg',
'ctggggccgcgggacttcaggctgctgcgccgcgtgggcggcggcgacgtcggcaccgtg',
'tacctgtgccgcctcagggcgccacccgcgcccgcgcccgtctgctgcctgtacgcgatg',
'aaggtggtggaccggcgcgtggcggccgcgaagaagaagctggagcacgcggcggcggag',
'cggcggatcctgcgggcgctggaccatccgttcctgcccacgctcttcgccgacttcgac',
'gccgcgccgcacttctcctgcgtcgtcacggagttctgccccggcggggacctccactcg',
'ctccgccaccgcatgcccaaccgccgcttcccgctcccgtcagctcggttctacgcggcg',
'gaggtgttgctggcgctggagtacctgcacatgatgggcatcgtgtaccgcgacctcaag',
'ccggagaacgtgctgatccgcgcggacggccacatcatgctcacggacttcgacctgtcg',
'ctgcagtgcacgtcgacgccgtcgctcgagccgtgcgccgcccccgaggcggcggcggcg',
'tcctgcttcccggaccacctgttccgccgccggcgcgcgcgactccgccgtgccgcctcg',
'gcgcggcggccgccaacgaccctggtggcggagccggtggaggcgcggtcgtgctcgttc',
'gtgggcacgcacgagtacgtggcgcccgaggtggcccgcggcgggccccacggcgcggcc',
'gtcgactggtgggcgctcggcgtgttcctgtacgagctcctgcacgggcgcaccccgttc',
'gcgggcgccgacaacgaggccacgctccgcaacatcgcgcgccgcccgctgtccttcccc',
'gctgccggcgccggtgatgccgacgcgcgcgacctcatcgcccgcctcctcgccaaggac',
'ccgcgccaccggttggggtcccggcgcggcgccgccgacgtgaaggcgcacccgttcttc',
'cgcgggctcaacttcgcgctgctccggtcctcccgcccgcccgtcgtccccgccgcgtcg',
'cgctccccgctgcaccgctcgcagtcctgcagcgcggcgcgcacgagagcgtcgaagccg',
'aagccgccgccggacacccggttcgacctgttctga']}
Convert all downloaded gene files
def kegg_gene_to_dict(kegg_file = 'zma_100125650.txt',
**kwargs):
# convert back to kegg format
if 'encoding' in kwargs.keys():
r_text = read_kegg_gene(
kegg_gene= kegg_file.replace('_', ':').replace('.txt', ''),
encoding = kwargs['encoding']
)
else:
r_text = read_kegg_gene(
kegg_gene= kegg_file.replace('_', ':').replace('.txt', '') )
# parsing gene entry
r_text_list = r_text.split('\n')
# Some files may not contain all sections so this list needs to be created for each file
section_names = [r_text_list[i][0:12].strip() for i in range(len(r_text_list)) ]
section_names = [e for e in section_names if e != '']
section_starts = [
[i for i in range(len(r_text_list)) if re.match('^'+e, r_text_list[i])][0]
for e in section_names]
# just split each file into text of it's sections
out = {}
# get lines associated with section
def _get_section_text(section_name = 'AASEQ'):
idx = section_names.index(section_name)
section_text = r_text_list[section_starts[idx]:section_starts[idx+1]]
# remove leading indent
section_text = [e[12:] for e in section_text]
return(section_text)
for section_name in section_names:
if section_name != '///': # end of file
out[section_name] = _get_section_text(section_name = section_name)
return(out)save_dir = '../data/zma/kegg/'
save_path = save_dir+'cached_kegg_gene_files.pkl'
ensure_dir_path_exists(dir_path = save_dir)# 27258 zma_100037738.txt
# kegg_gene_to_dict(
# kegg_file = 'zma_100037738.txt',
# # encoding="ISO-8859-1"
# )# kegg_gene_files = os.listdir('../ext_data/zma/kegg/gene_entries/')
# lst_out = []
# for i in range(len(kegg_gene_files)):
# kegg_gene_file = kegg_gene_files[i]
# print(i, kegg_gene_file)
# lst_out += [
# kegg_gene_to_dict(
# kegg_file = kegg_gene_file,
# encoding="ISO-8859-1"
# )]if not os.path.exists(save_path):
# This is a pain. I would like to parallelize reading these files in, but since it only has to be done once per species doing so would be premature optimization.
# BUT it seems that nbdev is rerunning it as part of the CI workflow. So caching it makes sense.
kegg_gene_files = os.listdir('../ext_data/zma/kegg/gene_entries/')
# kegg_gene_entries = [kegg_gene_to_dict(kegg_file = kegg_gene_file) for kegg_gene_file in tqdm.tqdm(kegg_gene_files)]
kegg_gene_entries = [kegg_gene_to_dict(
kegg_file = kegg_gene_file,
encoding="ISO-8859-1"
) for kegg_gene_file in kegg_gene_files]
# 37712/37712 [07:12<00:00, 87.23it/s]
with open(save_path, 'wb') as handle:
pkl.dump(kegg_gene_entries,
handle,
protocol=pkl.HIGHEST_PROTOCOL)
else:
# Reading in data
with open(save_path, 'rb') as handle:
kegg_gene_entries = pkl.load(handle)Clean up KEGG gene dictionaries
The goal here is to use a finite-state machine to convert the lines for each section into a useful representation. This approach makes extending or altering this code easier.
Starting out I needed to find how many states were featured across all files.
# produce a flat list of keys then deduplicate them
kegg_gene_sections = [entry for sublist in [list(kegg_gene_entries[i].keys()) for i in range(len(kegg_gene_entries))]
for entry in sublist] # entry is defined here so without it the list comprehension fails instead of producing the list of sublists
kegg_gene_sections = list(set(kegg_gene_sections))
kegg_gene_sections['ORTHOLOGY',
'MODULE',
'DBLINKS',
'PATHWAY',
'ENTRY',
'MOTIF',
'AASEQ',
'SYMBOL',
'BRITE',
'ORGANISM',
'STRUCTURE',
'POSITION',
'NAME',
'NTSEQ']
To aid in writing the logic for each section, pull several examples as a reference.
# Helper Functions ---------------------------------------------------------------------
# helper fcn to pull indices and examples of sections
def get_section_examples(section = 'MOTIF', n = 5):
i_section_matches = [i for i in range(len(kegg_gene_entries)) if section in kegg_gene_entries[i].keys()]
out = [[i_section_matches[i],
kegg_gene_entries[i_section_matches[i]][section] ] for i in range(n)]
return(out)get_section_examples(section = 'SYMBOL', n = 5)[[89, ['MIR395l']],
[100, ['lox6']],
[133, ['HDA108']],
[147, ['MIR169o']],
[167, ['MIR169p']]]
Now create a function to process each section of the file or more precisely, each section type that needs a different treatment.
# Any non-hierarical list of attributes can go here. It will be transformed into a dict
def _gene_entry_flat_list(section_entry):
# split like so [['NCBI-GeneID', '103644366'], ['NCBI-ProteinID', 'XP_020400304']]
# then convert to dict
# {'NCBI-GeneID': '103644366', 'NCBI-ProteinID': 'XP_020400304'}
section_entry = [e.replace(section_name, '').strip().split(': ') for e in section_entry]
section_entry = dict(section_entry)
return(section_entry)# 'AASEQ', 'NTSEQ'
def _gene_seq_to_dict(seq_list):
out = {}
out['lenght'] = int(seq_list[0])
out['seq'] = ''.join(seq_list[1:])
return(out)The hardest one to think through was the “BRITE”. This section contains a tree categorizing the gene’s role(s). Here’s a sample excerpt:
KEGG Orthology (KO) [BR:zma00001]
09120 Genetic Information Processing
09121 Transcription
03020 RNA polymerase
100037782
09180 Brite Hierarchies
09182 Protein families: genetic information processing
03021 Transcription machinery [BR:zma03021]
100037782
# ...
The goal is to represent these data in a way that can be used to construct a hierarchy for all genes that are in a given sample. Level is defined by leading whitespace.
KEGG Orthology (KO) [BR:zma00001]
├ 09120 Genetic Information Processing
│ └ 09121 Transcription
│ └ 03020 RNA polymerase
│ └ 100037782
└ 09180 Brite Hierarchies
└ 09182 Protein families: genetic information processing
└ 03021 Transcription machinery [BR:zma03021]
└ 100037782
Parsing into a dictionary makes sense (and then creating a full hierarchy would be as simple as merging on keys) but building one is tricky to do since you can’t pass a list of keys into a dictionary. Instead of reading from the root to the leaf, we work backwards to find the path from leaf to root. This makes finding the right parent easy – track the indent level and go up one line skipping lines with an indent level >= the current level. These paths have redundant information (shared parents) but represent what is needed.
# Figuring out how to process BRITE entries
kegg_gene_entry = kegg_gene_entries[0]
section_entry = kegg_gene_entry['BRITE']
section_entry['KEGG Orthology (KO) [BR:zma00001]',
' 09180 Brite Hierarchies',
' 09183 Protein families: signaling and cellular processes',
' 02000 Transporters [BR:zma02000]',
' 100283122',
'Transporters [BR:zma02000]',
' Solute carrier family (SLC)',
' SLC50: Sugar efflux transporter',
' 100283122']
# get the leading whitespace for each line
indent_spaces = [re.findall('^ +', e)[0] if re.match('^ ', e) else [''][0] for e in section_entry]
indent_spaces = [len(e) for e in indent_spaces]
len(indent_spaces), indent_spaces(9, [0, 1, 2, 3, 4, 0, 1, 2, 3])
Because the lists are ordered, given a position one can start at a leaf and work back up to the root by walking backwards and looking for the next point at which the indent level is the expected level. IF it’s true that each of the leaves contain only an identifer number and no letters then the process is to 1. Find all leaves 1. Walk backwards from all leaves 1. Add a list of lists with these paths (could be used in a tidy format)
Below the process of building up the path from leaf to root and the indents at each step (as a sanity check) are shown.
i = 4 # position in list
indent = indent_spaces[i] # indent at that position
j = i
current_indent = indent
# work backawards to get the paths
j_backtrack = [j]
indent_backtrack = [current_indent]
while indent_backtrack[-1] > 0:
while current_indent != indent_backtrack[-1]-1:
j = j-1
current_indent = indent_spaces[j]
j_backtrack.extend([j])
indent_backtrack.extend([current_indent])
# indent
print(j_backtrack, indent_backtrack) #[4, 3] [4, 3]
[4, 3, 2] [4, 3, 2]
[4, 3, 2, 1] [4, 3, 2, 1]
[4, 3, 2, 1, 0] [4, 3, 2, 1, 0]
Wrap this in a function, then integrate into a processing function for BRITE
def _indent_backtrack_path(i, # position in list
indent_spaces
):
indent = indent_spaces[i] # indent at that position
j = i
current_indent = indent
# work backawards to get the paths
j_backtrack = [j]
indent_backtrack = [current_indent]
while indent_backtrack[-1] > 0:
while current_indent != indent_backtrack[-1]-1:
j = j-1
current_indent = indent_spaces[j]
j_backtrack.extend([j])
indent_backtrack.extend([current_indent])
# indent_backtrack is not needed beyond debugging
# confirm that the indent only decreases as you walk through the backtrack
indent_check = indent_backtrack[1:]+[-1]
indent_check = [True if indent_backtrack[i] > indent_check[i] else False for i in range(len(indent_backtrack))]
assert False not in indent_check
return(j_backtrack)def _gene_entry_BRITE(section_entry):
# get the leading whitespace for each line
indent_spaces = [re.findall('^ +', e)[0] if re.match('^ ', e) else [''][0] for e in section_entry]
indent_spaces = [len(e) for e in indent_spaces]
# find the leaves
# ['KEGG Orthology (KO) [BR:zma00001]',
# ' 09120 Genetic Information Processing',
# ' 09121 Transcription',
# ' 03020 RNA polymerase',
# ' 100037782' <------------- Whitespace followed by digits and no letters
leaf_idxs = [i for i in range(len(section_entry)) if re.match('^\s+\d+$', section_entry[i])]
# for each leaf get the backtrack path to the root
leaf_backtraces = [_indent_backtrack_path(i = leaf_idx,
indent_spaces = indent_spaces
) for leaf_idx in leaf_idxs]
# reverse the inner lists
[e.reverse() for e in leaf_backtraces]
out = [[section_entry[i] for i in leaf_backtraces[j]] for j in range(len(leaf_backtraces))]
# sample input
# ['KEGG Orthology (KO) [BR:zma00001]',
# ' 09120 Genetic Information Processing',
# ' 09121 Transcription',
# ' 03020 RNA polymerase',
# ' 100037782',
# ' 09180 Brite Hierarchies',
# ' 09182 Protein families: genetic information processing',
# ' 03021 Transcription machinery [BR:zma03021]',
# ' 100037782',
# 'Enzymes [BR:zma01000]',
# ' 2. Transferases',
# ' 2.7 Transferring phosphorus-containing groups',
# ' 2.7.7 Nucleotidyltransferases',
# ' 2.7.7.6 DNA-directed RNA polymerase',
# ' 100037782',
# 'Transcription machinery [BR:zma03021]',
# ' Eukaryotic type',
# ' RNA polymerase II system',
# ' RNA polymerase II',
# ' Pol IV and V specific subunits',
# ' 100037782']
# Sample output
# [['KEGG Orthology (KO) [BR:zma00001]',
# ' 09120 Genetic Information Processing',
# ' 09121 Transcription',
# ' 03020 RNA polymerase',
# ' 100037782'],
# ['KEGG Orthology (KO) [BR:zma00001]',
# ' 09180 Brite Hierarchies',
# ' 09182 Protein families: genetic information processing',
# ' 03021 Transcription machinery [BR:zma03021]',
# ' 100037782'],
# ['Enzymes [BR:zma01000]',
# ' 2. Transferases',
# ' 2.7 Transferring phosphorus-containing groups',
# ' 2.7.7 Nucleotidyltransferases',
# ' 2.7.7.6 DNA-directed RNA polymerase',
# ' 100037782'],
# ['Transcription machinery [BR:zma03021]',
# ' Eukaryotic type',
# ' RNA polymerase II system',
# ' RNA polymerase II',
# ' Pol IV and V specific subunits',
# ' 100037782']]
# and then strip whitespace
out = [[ee.strip() for ee in e] for e in out]
return(out)def parse_kegg_gene_entry(kegg_gene_entry):
for section in kegg_gene_entry.keys():
if type(kegg_gene_entry[section]) != list :
# This is a safeguard to prevent the code from breaking when rerun. All entries start out as list
# so if an entry isn't a list then it's already been transformed
pass
else:
if section in ['TEMPLATE']:
kegg_gene_entry[section] = kegg_gene_entry[section]
elif section in ['ENTRY', 'NAME', 'ORTHOLOGY', 'ORGANISM', 'POSITION', 'SYMBOL', 'STRUCTURE']:
kegg_gene_entry[section] = kegg_gene_entry[section][0]
elif section in ['PATHWAY', 'MODULE']:
# NOTE: PATHWAY contains two spaces between the identifier and name which is why I'm not splitting on the first instance of whitespace.
# 'zma00591 Linoleic acid metabolism'
# ^^ ^ ^
kegg_gene_entry[section] = dict([e.split(' ') for e in kegg_gene_entry[section]])
elif section in ['BRITE']:
# This dict is only expected to have a single key. Representing these data as a dict allows for the
# list checking logic above to still work.
kegg_gene_entry[section] = {"BRITE_PATHS": _gene_entry_BRITE(section_entry = kegg_gene_entry[section])}
# elif section in ['MODULE']:
# pass
# kegg_gene_entry[section] = _gene_entry_flat_list(section_entry=kegg_gene_entry[section])
elif section in ['MOTIF', 'DBLINKS']:
kegg_gene_entry[section] = _gene_entry_flat_list(section_entry=kegg_gene_entry[section])
elif section in ['AASEQ', 'NTSEQ']:
kegg_gene_entry[section] = _gene_seq_to_dict(seq_list = kegg_gene_entry[section])
else:
print("No behavior defined for "+section)
return(kegg_gene_entry)parsed_kegg_gene_entries = [parse_kegg_gene_entry(kegg_gene_entry = kegg_gene_entry
# ) for kegg_gene_entry in tqdm.tqdm(kegg_gene_entries)]
) for kegg_gene_entry in kegg_gene_entries]Write out cleaned data
save_dir = '../data/zma/kegg/'
ensure_dir_path_exists(dir_path = '../data/zma/kegg/')with open(save_dir+'kegg_gene_entries.pkl', 'wb') as handle:
pkl.dump(parsed_kegg_gene_entries,
handle,
protocol=pkl.HIGHEST_PROTOCOL)
# Reading in data
# with open('./data/kegg_gene_entries.pkl', 'rb') as handle:
# kegg_gene_entries = pkl.load(handle)Visualize KEGG data
# Restrict to only those with pathway
kegg_gene_brite = [e for e in parsed_kegg_gene_entries if 'BRITE' in e.keys()]tqdm.std.tqdm
i = 0
is_list = []
for i in tqdm(range(len(kegg_gene_brite))):
js_list = []
for j in range(len(kegg_gene_brite[i]['BRITE']['BRITE_PATHS'])):
entries_path = kegg_gene_brite[i]['BRITE']['BRITE_PATHS'][j]
if entries_path != []:
temp = pd.DataFrame(entries_path)
temp = temp.T
temp['ENTRY'] = kegg_gene_brite[i]['ENTRY']+'_'+str(j)
js_list = js_list + [temp]
if js_list != []:
is_list = is_list + [pd.concat(js_list)]
# is_list
BRITE_df = pd.concat(is_list)
BRITE_df = BRITE_df.drop_duplicates().reset_index().drop(columns = 'index')
BRITE_df100%|████████████████████████████████████████████████████████████████████████████| 11835/11835 [00:25<00:00, 456.36it/s]
| 0 | 1 | 2 | 3 | 4 | ENTRY | 5 | 6 | |
|---|---|---|---|---|---|---|---|---|
| 0 | KEGG Orthology (KO) [BR:zma00001] | 09180 Brite Hierarchies | 09183 Protein families: signaling and cellular... | 02000 Transporters [BR:zma02000] | 100283122 | 100283122 CDS T01088_0 | NaN | NaN |
| 1 | Transporters [BR:zma02000] | Solute carrier family (SLC) | SLC50: Sugar efflux transporter | 100283122 | NaN | 100283122 CDS T01088_1 | NaN | NaN |
| 2 | KEGG Orthology (KO) [BR:zma00001] | 09180 Brite Hierarchies | 09182 Protein families: genetic information pr... | 03019 Messenger RNA biogenesis [BR:zma03019] | 100384259 | 100384259 CDS T01088_0 | NaN | NaN |
| 3 | KEGG Orthology (KO) [BR:zma00001] | 09180 Brite Hierarchies | 09182 Protein families: genetic information pr... | 03036 Chromosome and associated proteins [BR:z... | 100384259 | 100384259 CDS T01088_1 | NaN | NaN |
| 4 | Messenger RNA biogenesis [BR:zma03019] | Eukaryotic type | mRNA surveillance and transport factors | mRNA cycle factors | Common to processing body (P body) and stress ... | 100384259 CDS T01088_2 | 100384259 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 40927 | Bacterial toxins [BR:zma02042] | Type II toxins: Membrane damaging toxins | Toxins that enzymatically damage the membrane | 100384643 | NaN | 100384643 CDS T01088_5 | NaN | NaN |
| 40928 | KEGG Orthology (KO) [BR:zma00001] | 09180 Brite Hierarchies | 09183 Protein families: signaling and cellular... | 02000 Transporters [BR:zma02000] | 100384530 | 100384530 CDS T01088_0 | NaN | NaN |
| 40929 | Transporters [BR:zma02000] | Other transporters | Electrochemical potential-driven transporters ... | 100384530 | NaN | 100384530 CDS T01088_1 | NaN | NaN |
| 40930 | KEGG Orthology (KO) [BR:zma00001] | 09100 Metabolism | 09105 Amino acid metabolism | 00400 Phenylalanine, tyrosine and tryptophan b... | 100284499 | 100284499 CDS T01088_0 | NaN | NaN |
| 40931 | Enzymes [BR:zma01000] | 2. Transferases | 2.5 Transferring alkyl or aryl groups, other ... | 2.5.1 Transferring alkyl or aryl groups, othe... | 2.5.1.54 3-deoxy-7-phosphoheptulonate synthase | 100284499 CDS T01088_1 | 100284499 | NaN |
40932 rows × 8 columns
#TODO find out what is causing levels deeper than 3 or the indices below (and others) to fail
fig = px.treemap(BRITE_df.loc[:
# (
# (BRITE_df.index != 7480) &
# (BRITE_df.index != 8131) &
# (BRITE_df.index != 8838) &
# (BRITE_df.index != 8837) &
# (BRITE_df.index != 8836)
# )
, :], path=[px.Constant("all"), 0, 1, 2, 3, #4, 5, 6
],
# values='a'
)
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()