This notebook explores the size and composition of 717 Genbank Pectobacteriaceae CAZomes.
Information of the complete method for this analysis, including augmenting the dataset, a README-walkthrough, and the output figure files, can be found in the GitHub repository.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import statistics
import re
import time
from copy import copy
from matplotlib.patches import Patch
from pathlib import Path
import adjustText
import statsmodels.api as sm
import upsetplot
from Bio import SeqIO
from saintBioutils.utilities.file_io.get_paths import get_file_paths
from saintBioutils.utilities.file_io import make_output_directory
from scipy.stats import ttest_ind, f_oneway
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from tqdm.notebook import tqdm
%matplotlib inline
# loading and parsing data
from cazomevolve.cazome.explore.parse_data import (
load_fgp_data,
load_tax_data,
add_tax_data_from_tax_df,
add_tax_column_from_row_index,
)
# functions for exploring the sizes of CAZomes
from cazomevolve.cazome.explore.cazome_sizes import (
calc_proteome_representation,
count_items_in_cazome,
get_proteome_sizes,
count_cazyme_fam_ratio,
)
# explore the frequency of CAZymes per CAZy class
from cazomevolve.cazome.explore.cazy_classes import calculate_class_sizes
# explore the frequencies of CAZy families and identify the co-cazome
from cazomevolve.cazome.explore.cazy_families import (
build_fam_freq_df,
build_row_colours,
build_family_clustermap,
identify_core_cazome,
plot_fam_boxplot,
build_fam_mean_freq_df,
get_group_specific_fams,
build_family_clustermap_multi_legend,
)
# functions to identify and explore CAZy families that are always present together
from cazomevolve.cazome.explore.cooccurring_families import (
identify_cooccurring_fams_corrM,
calc_cooccuring_fam_freqs,
identify_cooccurring_fam_pairs,
add_to_upsetplot_membership,
build_upsetplot,
get_upsetplot_grps,
add_upsetplot_grp_freqs,
build_upsetplot_matrix,
)
# functions to perform PCA
from cazomevolve.cazome.explore.pca import (
perform_pca,
plot_explained_variance,
plot_scree,
plot_pca,
plot_loadings,
)
To make the parent output directory, and further down to make output subdirectories, use the function make_output_directory from the saintBioutils package.
One positional argument is required: the path to the target output directory to be build - this must be a Path object.
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results'), force=True, nodelete=True)
Output directory ../results exists, nodelete is True. Adding output to output directory.
To import CAZy family annotations, load the tab delimited file listing cazy families, genomes and protein accessions, by providing the path to the 'gfp file' to load_gfp_data() function from cazomevolve.
fgp_file = "../data/cazomes/pecto_fam_genomes_proteins"
fgp_df = load_fgp_data(fgp_file)
fgp_df.head(3)
| Family | Genome | Protein | |
|---|---|---|---|
| 0 | CBM50 | GCA_003382565.3 | UEM40323.1 |
| 1 | GT35 | GCA_003382565.3 | UEM39157.1 |
| 2 | GH5 | GCA_003382565.3 | UEM41238.1 |
print(f"Total CAZymes (i.e. the number of unique protein IDs): {len(set(fgp_df['Protein']))}")
Total CAZymes (i.e. the number of unique protein IDs): 78132
Load in CSV containing taxonomic information, that was generated using the cazomevolve subcommand add_taxs, by providing a path to the file to load_tax_data() function from cazomevovle, and specify which taxonomic ranks (kingdom, phylum, etc.) are listed in the CSV file.
In this case, genus and species inforamtion was retrieved from the NCBI database and are stored in the CSV file.
tax_csv_path = "../data/cazomes/fg_genome_taxs.csv"
tax_df = load_tax_data(tax_csv_path, genus=True, species=True)
tax_df.head(3)
| Genome | Genus | Species | |
|---|---|---|---|
| 0 | GCA_922021645.1 | Pectobacterium | versatile |
| 1 | GCA_004296685.1 | Pectobacterium | versatile |
| 2 | GCA_018094705.1 | Pectobacterium | versatile |
Build dataframe of:
fgp_df = add_tax_data_from_tax_df(
fgp_df,
tax_df,
genus=True,
species=True,
)
fgp_df.head(3)
Collecting Genus data: 100%|████████████████████████████████████████████████████████████████| 83143/83143 [00:40<00:00, 2073.90it/s] Collecting Species data: 100%|██████████████████████████████████████████████████████████████| 83143/83143 [00:40<00:00, 2033.06it/s]
| Family | Genome | Protein | Genus | Species | |
|---|---|---|---|---|---|
| 0 | CBM50 | GCA_003382565.3 | UEM40323.1 | Pectobacterium | aquaticum |
| 1 | GT35 | GCA_003382565.3 | UEM39157.1 | Pectobacterium | aquaticum |
| 2 | GH5 | GCA_003382565.3 | UEM41238.1 | Pectobacterium | aquaticum |
print(f"Total CAZymes: {len(set(fgp_df['Protein']))}")
Total CAZymes: 78132
Explore and compare the sizes of the CAZomes, by calculating:
Use the count_items_in_cazome() function to retrieve the number of CAZymes and the number of CAZy families per genome, and the mean (and standard deviation (SD)) counts per genus.
# check all genomes are represented in the fgp_df
f"Examining {len(set(fgp_df['Genome']))} genomes"
'Examining 717 genomes'
print(f"Examining {len(set(fgp_df['Genus']))} genera:")
for genus in set(fgp_df['Genus']):
print(f'- {genus}')
Examining 8 genera: - Lonsdalea - Samsonia - Pectobacterium - Musicola - Brenneria - Affinibrenneria - Dickeya - Acerihabitans
# Calculate CAZymes per genome
cazome_sizes_dict, cazome_sizes_df = count_items_in_cazome(fgp_df, 'Protein', 'Genus', round_by=2)
cazome_sizes_df
Gathering CAZy families per genome: 100%|██████████████████████████████████████████████████| 83143/83143 [00:05<00:00, 15652.38it/s] Calculating num of Protein per genome and per Genus: 100%|██████████████████████████████████████████| 8/8 [00:00<00:00, 3528.70it/s]
| Genus | MeanProteins | SdProteins | NumOfGenomes | |
|---|---|---|---|---|
| 0 | Pectobacterium | 112.65 | 8.02 | 432 |
| 1 | Dickeya | 111.16 | 6.60 | 206 |
| 2 | Musicola | 92.25 | 2.28 | 4 |
| 3 | Brenneria | 87.79 | 7.46 | 33 |
| 4 | Lonsdalea | 77.15 | 4.70 | 39 |
| 5 | Acerihabitans | 106.00 | 0.00 | 1 |
| 6 | Affinibrenneria | 108.00 | 0.00 | 1 |
| 7 | Samsonia | 81.00 | 0.00 | 1 |
# calculate mean across pectobacteriaceae
pectobact_cazome_sizes = []
for genus in cazome_sizes_dict:
for genome in cazome_sizes_dict[genus]:
pectobact_cazome_sizes.append(cazome_sizes_dict[genus][genome]['numOfProteins'])
pd.concat(
[
cazome_sizes_df,
pd.DataFrame(
[[
'Pectobacteriaceae',
np.mean(pectobact_cazome_sizes),
np.std(pectobact_cazome_sizes),
len(set(fgp_df['Genome'])),
]],
columns=cazome_sizes_df.columns
),
],
axis=0,
)
| Genus | MeanProteins | SdProteins | NumOfGenomes | |
|---|---|---|---|---|
| 0 | Pectobacterium | 112.650000 | 8.020000 | 432 |
| 1 | Dickeya | 111.160000 | 6.600000 | 206 |
| 2 | Musicola | 92.250000 | 2.280000 | 4 |
| 3 | Brenneria | 87.790000 | 7.460000 | 33 |
| 4 | Lonsdalea | 77.150000 | 4.700000 | 39 |
| 5 | Acerihabitans | 106.000000 | 0.000000 | 1 |
| 6 | Affinibrenneria | 108.000000 | 0.000000 | 1 |
| 7 | Samsonia | 81.000000 | 0.000000 | 1 |
| 0 | Pectobacteriaceae | 108.970711 | 11.958225 | 717 |
print(f"The total number of CAZymes is {len(set(fgp_df['Protein']))}")
for genus in set(fgp_df['Genus']):
genus_df = fgp_df[fgp_df['Genus'] == genus]
print(f"The total number of {genus} CAZymes is {len(set(genus_df['Protein']))}")
The total number of CAZymes is 78132 The total number of Lonsdalea CAZymes is 3009 The total number of Samsonia CAZymes is 81 The total number of Pectobacterium CAZymes is 48663 The total number of Musicola CAZymes is 369 The total number of Brenneria CAZymes is 2897 The total number of Affinibrenneria CAZymes is 108 The total number of Dickeya CAZymes is 22899 The total number of Acerihabitans CAZymes is 106
# Calculate CAZy families per genome
cazome_fam_dict, cazome_fams_df = count_items_in_cazome(fgp_df, 'Family', 'Genus', round_by=2)
cazome_fams_df
Gathering CAZy families per genome: 100%|██████████████████████████████████████████████████| 83143/83143 [00:05<00:00, 15878.20it/s] Calculating num of Family per genome and per Genus: 100%|███████████████████████████████████████████| 8/8 [00:00<00:00, 3446.43it/s]
| Genus | MeanFamilys | SdFamilys | NumOfGenomes | |
|---|---|---|---|---|
| 0 | Pectobacterium | 62.08 | 3.46 | 432 |
| 1 | Dickeya | 59.05 | 3.04 | 206 |
| 2 | Musicola | 50.25 | 0.43 | 4 |
| 3 | Brenneria | 53.67 | 3.71 | 33 |
| 4 | Lonsdalea | 42.62 | 2.14 | 39 |
| 5 | Acerihabitans | 48.00 | 0.00 | 1 |
| 6 | Affinibrenneria | 48.00 | 0.00 | 1 |
| 7 | Samsonia | 49.00 | 0.00 | 1 |
# calculate mean across pectobacteriaceae
pectobact_fam_nums = []
for genus in cazome_fam_dict:
for genome in cazome_fam_dict[genus]:
pectobact_fam_nums.append(cazome_fam_dict[genus][genome]['numOfFamilys'])
pd.concat(
[
cazome_fams_df,
pd.DataFrame(
[[
'Pectobacteriaceae',
np.mean(pectobact_fam_nums),
np.std(pectobact_fam_nums),
len(set(fgp_df['Genome'])),
]],
columns=cazome_fams_df.columns
),
],
axis=0,
)
| Genus | MeanFamilys | SdFamilys | NumOfGenomes | |
|---|---|---|---|---|
| 0 | Pectobacterium | 62.080000 | 3.460000 | 432 |
| 1 | Dickeya | 59.050000 | 3.040000 | 206 |
| 2 | Musicola | 50.250000 | 0.430000 | 4 |
| 3 | Brenneria | 53.670000 | 3.710000 | 33 |
| 4 | Lonsdalea | 42.620000 | 2.140000 | 39 |
| 5 | Acerihabitans | 48.000000 | 0.000000 | 1 |
| 6 | Affinibrenneria | 48.000000 | 0.000000 | 1 |
| 7 | Samsonia | 49.000000 | 0.000000 | 1 |
| 0 | Pectobacteriaceae | 59.638773 | 5.733681 | 717 |
The CAZyme to CAZy families ratio can be used to determine whether a genus typically contains few or many CAZymes per CAZy family.
cazome_ratio_dict, cazome_ratio_df = count_cazyme_fam_ratio(fgp_df, 'Genus', round_by=2)
cazome_ratio_df
Gathering CAZymes and CAZy families per genome: 100%|██████████████████████████████████████| 83143/83143 [00:05<00:00, 15141.59it/s] Calculating CAZyme/CAZy family ratio: 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3781.63it/s]
| Genus | MeanCAZymeToFamRatio | SdCAZymeToFamRatio | NumOfGenomes | |
|---|---|---|---|---|
| 0 | Pectobacterium | 1.81 | 0.09 | 432 |
| 1 | Dickeya | 1.88 | 0.07 | 206 |
| 2 | Musicola | 1.84 | 0.04 | 4 |
| 3 | Brenneria | 1.64 | 0.07 | 33 |
| 4 | Lonsdalea | 1.81 | 0.07 | 39 |
| 5 | Acerihabitans | 2.21 | 0.00 | 1 |
| 6 | Affinibrenneria | 2.25 | 0.00 | 1 |
| 7 | Samsonia | 1.65 | 0.00 | 1 |
pecto_ratios = []
for genus in cazome_sizes_dict:
for genome in cazome_sizes_dict[genus]:
ratio = (
cazome_sizes_dict[genus][genome]['numOfProteins'] / cazome_fam_dict[genus][genome]['numOfFamilys']
)
pecto_ratios.append(ratio)
pd.concat(
[
cazome_ratio_df,
pd.DataFrame(
[[
'Pectobacteriaceae',
np.mean(pecto_ratios),
np.std(pecto_ratios),
len(set(fgp_df['Genome'])),
]],
columns=cazome_ratio_df.columns
)
],
axis=0,
)
| Genus | MeanCAZymeToFamRatio | SdCAZymeToFamRatio | NumOfGenomes | |
|---|---|---|---|---|
| 0 | Pectobacterium | 1.810000 | 0.09000 | 432 |
| 1 | Dickeya | 1.880000 | 0.07000 | 206 |
| 2 | Musicola | 1.840000 | 0.04000 | 4 |
| 3 | Brenneria | 1.640000 | 0.07000 | 33 |
| 4 | Lonsdalea | 1.810000 | 0.07000 | 39 |
| 5 | Acerihabitans | 2.210000 | 0.00000 | 1 |
| 6 | Affinibrenneria | 2.250000 | 0.00000 | 1 |
| 7 | Samsonia | 1.650000 | 0.00000 | 1 |
| 0 | Pectobacteriaceae | 1.826642 | 0.09796 | 717 |
Calculate the number of unique protein IDs, total, in each genome - i.e. not just CAZymes but all proteins.
# Get the size of the proteome (the number of protein acc) per genome
grp = 'Genus'
proteome_dir = "../data/proteomes"
proteome_dict = get_proteome_sizes(proteome_dir, fgp_df, grp)
Getting proteome sizes: 100%|█████████████████████████████████████████████████████████████████████| 717/717 [00:38<00:00, 18.52it/s]
# get total number of proteins across all proteomes
total_proteins = 0
for genus in proteome_dict:
for genome in proteome_dict[genus]:
total_proteins += proteome_dict[genus][genome]['numOfProteins']
print(f"Total number of proteins across all genomes: {total_proteins}")
Total number of proteins across all genomes: 2994018
Calculate the percentage of the proteome that is made up of the CAZome.
# Calculate the mean proteome size by genus and the proportion of the proteome represented by the CAZome
proteome_perc_df = calc_proteome_representation(proteome_dict, cazome_sizes_dict, grp, round_by=2)
proteome_perc_df
Getting proteome size: 100%|███████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 12069.94it/s] Calc proteome perc: 100%|███████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3333.77it/s]
| Genus | MeanProteomeSize | SdProteomeSize | MeanProteomePerc | SdProteomePerc | NumOfGenomes | |
|---|---|---|---|---|---|---|
| 0 | Pectobacterium | 4260.71 | 216.78 | 2.64 | 0.15 | 432 |
| 1 | Dickeya | 4176.86 | 155.23 | 2.66 | 0.11 | 206 |
| 2 | Musicola | 3992.00 | 55.76 | 2.31 | 0.05 | 4 |
| 3 | Brenneria | 4270.24 | 478.85 | 2.07 | 0.15 | 33 |
| 4 | Lonsdalea | 3142.28 | 132.55 | 2.45 | 0.09 | 39 |
| 5 | Acerihabitans | 4969.00 | 0.00 | 2.13 | 0.00 | 1 |
| 6 | Affinibrenneria | 5064.00 | 0.00 | 2.13 | 0.00 | 1 |
| 7 | Samsonia | 3489.00 | 0.00 | 2.32 | 0.00 | 1 |
pectobact_average = ['Pectobacteriaceae']
for col in proteome_perc_df.columns[1:]:
pectobact_average.append(np.mean(list(proteome_perc_df[col])))
pectobact_average[-1] == 660
df = pd.DataFrame([pectobact_average], columns=proteome_perc_df.columns)
pd.concat([proteome_perc_df, df], ignore_index=True, axis=0).round(2)
| Genus | MeanProteomeSize | SdProteomeSize | MeanProteomePerc | SdProteomePerc | NumOfGenomes | |
|---|---|---|---|---|---|---|
| 0 | Pectobacterium | 4260.71 | 216.78 | 2.64 | 0.15 | 432.00 |
| 1 | Dickeya | 4176.86 | 155.23 | 2.66 | 0.11 | 206.00 |
| 2 | Musicola | 3992.00 | 55.76 | 2.31 | 0.05 | 4.00 |
| 3 | Brenneria | 4270.24 | 478.85 | 2.07 | 0.15 | 33.00 |
| 4 | Lonsdalea | 3142.28 | 132.55 | 2.45 | 0.09 | 39.00 |
| 5 | Acerihabitans | 4969.00 | 0.00 | 2.13 | 0.00 | 1.00 |
| 6 | Affinibrenneria | 5064.00 | 0.00 | 2.13 | 0.00 | 1.00 |
| 7 | Samsonia | 3489.00 | 0.00 | 2.32 | 0.00 | 1.00 |
| 8 | Pectobacteriaceae | 4170.51 | 129.90 | 2.34 | 0.07 | 89.62 |
For easier comparison and presentation, combine the dataframes made above into a single dataframe, with each row representing a different genus.
all_df = pd.concat([proteome_perc_df, cazome_sizes_df, cazome_fams_df, cazome_ratio_df], axis=1, join='inner')
make_output_directory(Path('../results/cazome_size'), force=True, nodelete=True)
all_df.to_csv('../results/cazome_size/cazome_sizes.csv')
all_df
Output directory ../results/cazome_size exists, nodelete is True. Adding output to output directory.
| Genus | MeanProteomeSize | SdProteomeSize | MeanProteomePerc | SdProteomePerc | NumOfGenomes | Genus | MeanProteins | SdProteins | NumOfGenomes | Genus | MeanFamilys | SdFamilys | NumOfGenomes | Genus | MeanCAZymeToFamRatio | SdCAZymeToFamRatio | NumOfGenomes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Pectobacterium | 4260.71 | 216.78 | 2.64 | 0.15 | 432 | Pectobacterium | 112.65 | 8.02 | 432 | Pectobacterium | 62.08 | 3.46 | 432 | Pectobacterium | 1.81 | 0.09 | 432 |
| 1 | Dickeya | 4176.86 | 155.23 | 2.66 | 0.11 | 206 | Dickeya | 111.16 | 6.60 | 206 | Dickeya | 59.05 | 3.04 | 206 | Dickeya | 1.88 | 0.07 | 206 |
| 2 | Musicola | 3992.00 | 55.76 | 2.31 | 0.05 | 4 | Musicola | 92.25 | 2.28 | 4 | Musicola | 50.25 | 0.43 | 4 | Musicola | 1.84 | 0.04 | 4 |
| 3 | Brenneria | 4270.24 | 478.85 | 2.07 | 0.15 | 33 | Brenneria | 87.79 | 7.46 | 33 | Brenneria | 53.67 | 3.71 | 33 | Brenneria | 1.64 | 0.07 | 33 |
| 4 | Lonsdalea | 3142.28 | 132.55 | 2.45 | 0.09 | 39 | Lonsdalea | 77.15 | 4.70 | 39 | Lonsdalea | 42.62 | 2.14 | 39 | Lonsdalea | 1.81 | 0.07 | 39 |
| 5 | Acerihabitans | 4969.00 | 0.00 | 2.13 | 0.00 | 1 | Acerihabitans | 106.00 | 0.00 | 1 | Acerihabitans | 48.00 | 0.00 | 1 | Acerihabitans | 2.21 | 0.00 | 1 |
| 6 | Affinibrenneria | 5064.00 | 0.00 | 2.13 | 0.00 | 1 | Affinibrenneria | 108.00 | 0.00 | 1 | Affinibrenneria | 48.00 | 0.00 | 1 | Affinibrenneria | 2.25 | 0.00 | 1 |
| 7 | Samsonia | 3489.00 | 0.00 | 2.32 | 0.00 | 1 | Samsonia | 81.00 | 0.00 | 1 | Samsonia | 49.00 | 0.00 | 1 | Samsonia | 1.65 | 0.00 | 1 |
# calculate means for Pectobacteriaceae
for col in all_df:
if col == 'Genus' or col == 'NumOfGenomes':
continue
print(col, '--', np.mean(list(all_df[col])).round(2))
MeanProteomeSize -- 4170.51 SdProteomeSize -- 129.9 MeanProteomePerc -- 2.34 SdProteomePerc -- 0.07 MeanProteins -- 97.0 SdProteins -- 3.63 MeanFamilys -- 51.58 SdFamilys -- 1.6 MeanCAZymeToFamRatio -- 1.89 SdCAZymeToFamRatio -- 0.04
For each parameter:
Build a box and whisker plot over laid by a one dimensional scatter plot, with one data point per genome.
Test if there any statitically signficant differences in the means between the genera using a one-way ANOVA test (P<0.05).
If a statistically significant differnce is detected, using a post hoc Turkey HDS test (<0.05) to determine between which genera the means are statistically significantly different and in which direction.
To test if there is any statitically signficant difference in the means between soft plant tissue targeting genera:
and the hard plant tissue targeting genera:
and Musicola (which presently does not have a well defined preference for soft or hard plant tissue, use a one-way ANOVA test (P<0.05).
If a statistically significant differnce is detected, using a post hoc Turkey HDS test (<0.05) to determine between which groups the means are statistically significantly different and in which direction.
# go down to one Genus column
genus = []
for i in range(len(all_df)):
genus.append(
all_df.iloc[i]['Genus'].values[0]
)
try:
all_df = all_df.drop('Genus', axis=1)
except KeyError:
pass
all_df['Genus'] = genus
# Define the soft and hard plant tissue targeting genera that are represented by multiple genomes
soft = ['Pectobacterium','Dickeya']
hard = ['Lonsdalea', 'Brenneria']
First test between the genera
# build df with each genome represented as a different row / observation
proteome_data = []
for genus in proteome_dict:
for genome in proteome_dict[genus]:
proteome_data.append([genus, proteome_dict[genus][genome]['numOfProteins']])
proteome_size_df = pd.DataFrame(proteome_data, columns=['Genus','ProteomeSize'])
# One way Anova using scipy
f_oneway(
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Pectobacterium'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Dickeya'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Musicola'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Lonsdalea'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Brenneria'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Affinibrenneria'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Acerihabitans'],
proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=142.25683602203867, pvalue=1.5932408551086004e-130)
Post hoc test to find out which genera are differing.
proteome_size_turkey = pairwise_tukeyhsd(
endog=proteome_size_df['ProteomeSize'],
groups=proteome_size_df['Genus'],
alpha=0.05, # cut off
)
prot_size_turkey_df = pd.DataFrame(
proteome_size_turkey._results_table[1:],
columns=proteome_size_turkey._results_table.data[0]
)
prot_size_turkey_df.to_csv("../results/cazome_size/proteome_size_turkey_results.csv")
prot_size_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 95.0 | 1.0 | -839.9267 | 1029.9267 | False |
| 1 | Acerihabitans | Brenneria | -698.7576 | 0.0344 | -1369.7924 | -27.7227 | True |
| 2 | Acerihabitans | Dickeya | -792.1408 | 0.0072 | -1454.8365 | -129.4451 | True |
| 3 | Acerihabitans | Lonsdalea | -1826.7179 | 0.0 | -2496.2329 | -1157.203 | True |
| 4 | Acerihabitans | Musicola | -977.0 | 0.0017 | -1716.1245 | -237.8755 | True |
| 5 | Acerihabitans | Pectobacterium | -708.287 | 0.0262 | -1370.1448 | -46.4293 | True |
| 6 | Acerihabitans | Samsonia | -1480.0 | 0.0 | -2414.9267 | -545.0733 | True |
| 7 | Affinibrenneria | Brenneria | -793.7576 | 0.0083 | -1464.7924 | -122.7227 | True |
| 8 | Affinibrenneria | Dickeya | -887.1408 | 0.0014 | -1549.8365 | -224.4451 | True |
| 9 | Affinibrenneria | Lonsdalea | -1921.7179 | 0.0 | -2591.2329 | -1252.203 | True |
| 10 | Affinibrenneria | Musicola | -1072.0 | 0.0003 | -1811.1245 | -332.8755 | True |
| 11 | Affinibrenneria | Pectobacterium | -803.287 | 0.0059 | -1465.1448 | -141.4293 | True |
| 12 | Affinibrenneria | Samsonia | -1575.0 | 0.0 | -2509.9267 | -640.0733 | True |
| 13 | Brenneria | Dickeya | -93.3832 | 0.3004 | -217.3402 | 30.5738 | False |
| 14 | Brenneria | Lonsdalea | -1127.9604 | 0.0 | -1284.3254 | -971.5954 | True |
| 15 | Brenneria | Musicola | -278.2424 | 0.2348 | -628.2492 | 71.7644 | False |
| 16 | Brenneria | Pectobacterium | -9.5295 | 1.0 | -128.9256 | 109.8667 | False |
| 17 | Brenneria | Samsonia | -781.2424 | 0.0101 | -1452.2773 | -110.2076 | True |
| 18 | Dickeya | Lonsdalea | -1034.5772 | 0.0 | -1150.0234 | -919.131 | True |
| 19 | Dickeya | Musicola | -184.8592 | 0.6978 | -518.5995 | 148.881 | False |
| 20 | Dickeya | Pectobacterium | 83.8537 | 0.0002 | 27.8783 | 139.8292 | True |
| 21 | Dickeya | Samsonia | -687.8592 | 0.0355 | -1350.5549 | -25.1635 | True |
| 22 | Lonsdalea | Musicola | 849.7179 | 0.0 | 502.634 | 1196.8019 | True |
| 23 | Lonsdalea | Pectobacterium | 1118.4309 | 0.0 | 1007.8962 | 1228.9657 | True |
| 24 | Lonsdalea | Samsonia | 346.7179 | 0.7657 | -322.797 | 1016.2329 | False |
| 25 | Musicola | Pectobacterium | 268.713 | 0.2146 | -63.3603 | 600.7863 | False |
| 26 | Musicola | Samsonia | -503.0 | 0.4363 | -1242.1245 | 236.1245 | False |
| 27 | Pectobacterium | Samsonia | -771.713 | 0.0099 | -1433.5707 | -109.8552 | True |
g = sns.stripplot(
x=proteome_size_df['Genus'],
y=proteome_size_df['ProteomeSize'],
color='black',
alpha=0.75,
size=3,
);
g = sns.boxplot(
x=proteome_size_df['Genus'],
y=proteome_size_df['ProteomeSize'],
fliersize=0,
);
g.set_ylabel(ylabel='Number of proteins in the CAZome')
plt.xticks(rotation=90)
plt.savefig(
"../results/cazome_size/proteome_size_boxplot.png",
bbox_inches='tight',
format='png'
)
Test between soft and hard plant tissue targeting genera.
soft_hard_dict = {
'Pectobacterium': 'Soft',
'Dickeya': 'Soft',
'Musicola': 'Unknown',
'Brenneria': 'Hard',
'Lonsdalea': 'Hard',
'Acerihabitans': 'Hard',
'Affinibrenneria': 'Hard',
'Samsonia': 'Hard',
}
pheno_col = []
for i in range(len(proteome_size_df)):
pheno_col.append(soft_hard_dict[proteome_size_df.iloc[i]['Genus']])
proteome_size_df['Phenotype'] = pheno_col
f_oneway(
proteome_size_df['ProteomeSize'][proteome_size_df['Phenotype'] == 'Soft'],
proteome_size_df['ProteomeSize'][proteome_size_df['Phenotype'] == 'Hard'],
proteome_size_df['ProteomeSize'][proteome_size_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=115.73644297310237, pvalue=2.907639003391272e-44)
proteome_size_turkey = pairwise_tukeyhsd(
endog=proteome_size_df['ProteomeSize'],
groups=proteome_size_df['Phenotype'],
alpha=0.05, # cut off
)
prot_size_turkey_pheno_df = pd.DataFrame(
proteome_size_turkey._results_table[1:],
columns=proteome_size_turkey._results_table.data[0]
)
prot_size_turkey_pheno_df.to_csv("../results/cazome_size/proteome_size_turkey_phenotype_results.csv")
prot_size_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 540.4513 | 0.0 | 456.7338 | 624.1688 | True |
| 1 | Hard | Unknown | 298.8133 | 0.1144 | -53.124 | 650.7507 | False |
| 2 | Soft | Unknown | -241.6379 | 0.2255 | -585.623 | 102.3472 | False |
First test for differences between the genera.
# build df with each genome represented as a different row / observation
proteome_perc_data = []
for genus in proteome_dict:
for genome in proteome_dict[genus]:
perc = (
cazome_sizes_dict[genus][genome]['numOfProteins'] /
proteome_dict[genus][genome]['numOfProteins']
) * 100
proteome_perc_data.append([genus, perc])
proteome_perc_df = pd.DataFrame(proteome_perc_data, columns=['Genus','ProteomePerc'])
# One way Anova using scipy
f_oneway(
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Pectobacterium'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Dickeya'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Musicola'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Lonsdalea'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Brenneria'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Affinibrenneria'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Acerihabitans'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=100.22498879010466, pvalue=1.569439830366814e-101)
# follow up with a post hoc test to see which genera differ
proteome_perc_turkey = pairwise_tukeyhsd(
endog=proteome_perc_df['ProteomePerc'],
groups=proteome_perc_df['Genus'],
alpha=0.05, # cut off
)
prot_per_turkey_df = pd.DataFrame(
proteome_perc_turkey._results_table[1:],
columns=proteome_perc_turkey._results_table.data[0]
)
prot_per_turkey_df.to_csv("../results/cazome_size/proteome_perc_turkey_results.csv")
prot_per_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | -0.0005 | 1.0 | -0.5762 | 0.5751 | False |
| 1 | Acerihabitans | Brenneria | -0.0662 | 0.9997 | -0.4793 | 0.347 | False |
| 2 | Acerihabitans | Dickeya | 0.5273 | 0.0024 | 0.1192 | 0.9353 | True |
| 3 | Acerihabitans | Lonsdalea | 0.3215 | 0.2572 | -0.0907 | 0.7337 | False |
| 4 | Acerihabitans | Musicola | 0.1777 | 0.9356 | -0.2774 | 0.6327 | False |
| 5 | Acerihabitans | Pectobacterium | 0.5105 | 0.0038 | 0.103 | 0.918 | True |
| 6 | Acerihabitans | Samsonia | 0.1884 | 0.9752 | -0.3873 | 0.764 | False |
| 7 | Affinibrenneria | Brenneria | -0.0656 | 0.9997 | -0.4788 | 0.3475 | False |
| 8 | Affinibrenneria | Dickeya | 0.5278 | 0.0023 | 0.1198 | 0.9358 | True |
| 9 | Affinibrenneria | Lonsdalea | 0.322 | 0.2553 | -0.0902 | 0.7343 | False |
| 10 | Affinibrenneria | Musicola | 0.1782 | 0.9347 | -0.2769 | 0.6333 | False |
| 11 | Affinibrenneria | Pectobacterium | 0.511 | 0.0037 | 0.1035 | 0.9185 | True |
| 12 | Affinibrenneria | Samsonia | 0.1889 | 0.9748 | -0.3868 | 0.7645 | False |
| 13 | Brenneria | Dickeya | 0.5934 | 0.0 | 0.5171 | 0.6697 | True |
| 14 | Brenneria | Lonsdalea | 0.3877 | 0.0 | 0.2914 | 0.484 | True |
| 15 | Brenneria | Musicola | 0.2438 | 0.0142 | 0.0283 | 0.4593 | True |
| 16 | Brenneria | Pectobacterium | 0.5767 | 0.0 | 0.5031 | 0.6502 | True |
| 17 | Brenneria | Samsonia | 0.2545 | 0.5702 | -0.1586 | 0.6677 | False |
| 18 | Dickeya | Lonsdalea | -0.2057 | 0.0 | -0.2768 | -0.1347 | True |
| 19 | Dickeya | Musicola | -0.3496 | 0.0 | -0.5551 | -0.1441 | True |
| 20 | Dickeya | Pectobacterium | -0.0168 | 0.8188 | -0.0512 | 0.0177 | False |
| 21 | Dickeya | Samsonia | -0.3389 | 0.1866 | -0.7469 | 0.0691 | False |
| 22 | Lonsdalea | Musicola | -0.1439 | 0.4512 | -0.3576 | 0.0698 | False |
| 23 | Lonsdalea | Pectobacterium | 0.189 | 0.0 | 0.1209 | 0.257 | True |
| 24 | Lonsdalea | Samsonia | -0.1332 | 0.9769 | -0.5454 | 0.2791 | False |
| 25 | Musicola | Pectobacterium | 0.3328 | 0.0 | 0.1284 | 0.5373 | True |
| 26 | Musicola | Samsonia | 0.0107 | 1.0 | -0.4444 | 0.4658 | False |
| 27 | Pectobacterium | Samsonia | -0.3221 | 0.2413 | -0.7296 | 0.0854 | False |
g = sns.stripplot(
x=proteome_perc_df['Genus'],
y=proteome_perc_df['ProteomePerc'],
color='black',
alpha=0.75,
size=3,
);
g = sns.boxplot(
x=proteome_perc_df['Genus'],
y=proteome_perc_df['ProteomePerc'],
fliersize=0,
);
g.set_ylabel(ylabel='Percentage of proteome in the CAZome')
plt.xticks(rotation=90)
plt.savefig(
"../results/cazome_size/proteome_perc_boxplot.png",
bbox_inches='tight',
format='png'
)
Next test for differences between soft and hard plant tissue targeting genomes.
pheno_col = []
for i in range(len(proteome_perc_df)):
pheno_col.append(soft_hard_dict[proteome_perc_df.iloc[i]['Genus']])
proteome_perc_df['Phenotype'] = pheno_col
f_oneway(
proteome_perc_df['ProteomePerc'][proteome_perc_df['Phenotype'] == 'Soft'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Phenotype'] == 'Hard'],
proteome_perc_df['ProteomePerc'][proteome_perc_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=226.22200801781082, pvalue=7.940529938618575e-77)
# follow up with a post hoc test to see which genera differ
proteome_perc_pheno_turkey = pairwise_tukeyhsd(
endog=proteome_perc_df['ProteomePerc'],
groups=proteome_perc_df['Phenotype'],
alpha=0.05, # cut off
)
prot_per_pheno_turkey_df = pd.DataFrame(
proteome_perc_pheno_turkey._results_table[1:],
columns=proteome_perc_pheno_turkey._results_table.data[0]
)
prot_per_pheno_turkey_df.to_csv("../results/cazome_size/proteome_perc_turkey_phenotype_results.csv")
prot_per_pheno_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 0.3753 | 0.0 | 0.3331 | 0.4175 | True |
| 1 | Hard | Unknown | 0.0371 | 0.8757 | -0.1404 | 0.2145 | False |
| 2 | Soft | Unknown | -0.3382 | 0.0 | -0.5117 | -0.1648 | True |
First test between the genera
# build df with each genome represented as a different row / observation
cazome_s_data = []
for genus in cazome_sizes_dict:
for genome in cazome_sizes_dict[genus]:
cazome_s_data.append([genus, cazome_sizes_dict[genus][genome]['numOfProteins']])
cazome_s_df = pd.DataFrame(cazome_s_data, columns=['Genus','CAZomeSize'])
# One way Anova using scipy
f_oneway(
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Pectobacterium'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Dickeya'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Musicola'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Lonsdalea'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Brenneria'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Affinibrenneria'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Acerihabitans'],
cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=161.55655161375054, pvalue=3.282343275946478e-142)
# follow up with a post hoc test to see which genera differ
cazome_size_turkey = pairwise_tukeyhsd(
endog=cazome_s_df['CAZomeSize'],
groups=cazome_s_df['Genus'],
alpha=0.05, # cut off
)
cazome_size_turkey_df = pd.DataFrame(
cazome_size_turkey._results_table[1:],
columns=cazome_size_turkey._results_table.data[0]
)
cazome_size_turkey_df.to_csv("../results/cazome_size/cazome_size_turkey_results.csv")
cazome_size_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 2.0 | 1.0 | -30.0935 | 34.0935 | False |
| 1 | Acerihabitans | Brenneria | -18.2121 | 0.2411 | -41.2469 | 4.8227 | False |
| 2 | Acerihabitans | Dickeya | 5.1602 | 0.9973 | -17.5883 | 27.9087 | False |
| 3 | Acerihabitans | Lonsdalea | -28.8462 | 0.0037 | -51.8288 | -5.8635 | True |
| 4 | Acerihabitans | Musicola | -13.75 | 0.7211 | -39.1221 | 11.6221 | False |
| 5 | Acerihabitans | Pectobacterium | 6.6458 | 0.987 | -16.0739 | 29.3656 | False |
| 6 | Acerihabitans | Samsonia | -25.0 | 0.2588 | -57.0935 | 7.0935 | False |
| 7 | Affinibrenneria | Brenneria | -20.2121 | 0.1344 | -43.2469 | 2.8227 | False |
| 8 | Affinibrenneria | Dickeya | 3.1602 | 0.9999 | -19.5883 | 25.9087 | False |
| 9 | Affinibrenneria | Lonsdalea | -30.8462 | 0.0013 | -53.8288 | -7.8635 | True |
| 10 | Affinibrenneria | Musicola | -15.75 | 0.5603 | -41.1221 | 9.6221 | False |
| 11 | Affinibrenneria | Pectobacterium | 4.6458 | 0.9986 | -18.0739 | 27.3656 | False |
| 12 | Affinibrenneria | Samsonia | -27.0 | 0.1735 | -59.0935 | 5.0935 | False |
| 13 | Brenneria | Dickeya | 23.3723 | 0.0 | 19.1172 | 27.6274 | True |
| 14 | Brenneria | Lonsdalea | -10.634 | 0.0 | -16.0016 | -5.2664 | True |
| 15 | Brenneria | Musicola | 4.4621 | 0.9504 | -7.5527 | 16.4769 | False |
| 16 | Brenneria | Pectobacterium | 24.858 | 0.0 | 20.7594 | 28.9565 | True |
| 17 | Brenneria | Samsonia | -6.7879 | 0.9864 | -29.8227 | 16.2469 | False |
| 18 | Dickeya | Lonsdalea | -34.0063 | 0.0 | -37.9693 | -30.0434 | True |
| 19 | Dickeya | Musicola | -18.9102 | 0.0 | -30.3666 | -7.4538 | True |
| 20 | Dickeya | Pectobacterium | 1.4856 | 0.2679 | -0.4358 | 3.4071 | False |
| 21 | Dickeya | Samsonia | -30.1602 | 0.0016 | -52.9087 | -7.4117 | True |
| 22 | Lonsdalea | Musicola | 15.0962 | 0.0032 | 3.1817 | 27.0106 | True |
| 23 | Lonsdalea | Pectobacterium | 35.492 | 0.0 | 31.6976 | 39.2863 | True |
| 24 | Lonsdalea | Samsonia | 3.8462 | 0.9996 | -19.1365 | 26.8288 | False |
| 25 | Musicola | Pectobacterium | 20.3958 | 0.0 | 8.9967 | 31.795 | True |
| 26 | Musicola | Samsonia | -11.25 | 0.88 | -36.6221 | 14.1221 | False |
| 27 | Pectobacterium | Samsonia | -31.6458 | 0.0007 | -54.3656 | -8.9261 | True |
g = sns.stripplot(
x=cazome_s_df['Genus'],
y=cazome_s_df['CAZomeSize'],
color='black',
alpha=0.75,
size=3,
);
g = sns.boxplot(
x=cazome_s_df['Genus'],
y=cazome_s_df['CAZomeSize'],
fliersize=0,
);
g.set_ylabel(ylabel='Number of proteins in the CAZome')
plt.xticks(rotation=90)
plt.savefig(
"../results/cazome_size/cazome_size_boxplot.png",
bbox_inches='tight',
format='png'
)
Now test between soft and hard plant tissue targeting genomes.
pheno_col = []
for i in range(len(cazome_s_df)):
pheno_col.append(soft_hard_dict[cazome_s_df.iloc[i]['Genus']])
cazome_s_df['Phenotype'] = pheno_col
f_oneway(
cazome_s_df['CAZomeSize'][cazome_s_df['Phenotype'] == 'Soft'],
cazome_s_df['CAZomeSize'][cazome_s_df['Phenotype'] == 'Hard'],
cazome_s_df['CAZomeSize'][cazome_s_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=493.10468604523317, pvalue=3.0262355426671687e-135)
# follow up with a post hoc test to see which genera differ
cazome_s_pheno_turkey = pairwise_tukeyhsd(
endog=cazome_s_df['CAZomeSize'],
groups=cazome_s_df['Phenotype'],
alpha=0.05, # cut off
)
cazome_s_pheno_turkey_df = pd.DataFrame(
cazome_s_pheno_turkey._results_table[1:],
columns=cazome_s_pheno_turkey._results_table.data[0]
)
cazome_s_pheno_turkey_df.to_csv("../results/cazome_size/cazome_size_turkey_phenotype_results.csv")
cazome_s_pheno_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 29.4861 | 0.0 | 27.2598 | 31.7125 | True |
| 1 | Hard | Unknown | 9.57 | 0.0437 | 0.2108 | 18.9292 | True |
| 2 | Soft | Unknown | -19.9161 | 0.0 | -29.0639 | -10.7684 | True |
First test between the genera.
# build df with each genome represented as a different row / observation
cazy_fam_data = []
for genus in cazome_fam_dict:
for genome in cazome_fam_dict[genus]:
cazy_fam_data.append([genus, cazome_fam_dict[genus][genome]['numOfFamilys']])
cazy_f_df = pd.DataFrame(cazy_fam_data, columns=['Genus','CAZyFamilies'])
# One way Anova using scipy
f_oneway(
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Pectobacterium'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Dickeya'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Musicola'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Lonsdalea'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Brenneria'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Affinibrenneria'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Acerihabitans'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=208.2841856410692, pvalue=2.6418322599463134e-167)
# follow up with a post hoc test to see which genera differ
cazome_fam_turkey = pairwise_tukeyhsd(
endog=cazy_f_df['CAZyFamilies'],
groups=cazy_f_df['Genus'],
alpha=0.05, # cut off
)
cazome_fam_turkey_df = pd.DataFrame(
cazome_fam_turkey._results_table[1:],
columns=cazome_fam_turkey._results_table.data[0]
)
cazome_fam_turkey_df.to_csv("../results/cazome_size/cazome_fam_turkey_results.csv")
cazome_fam_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 0.0 | 1.0 | -14.1792 | 14.1792 | False |
| 1 | Acerihabitans | Brenneria | 5.6667 | 0.6921 | -4.5103 | 15.8437 | False |
| 2 | Acerihabitans | Dickeya | 11.0485 | 0.0197 | 0.998 | 21.0991 | True |
| 3 | Acerihabitans | Lonsdalea | -5.3846 | 0.743 | -15.5386 | 4.7693 | False |
| 4 | Acerihabitans | Musicola | 2.25 | 0.9987 | -8.9597 | 13.4597 | False |
| 5 | Acerihabitans | Pectobacterium | 14.0787 | 0.0006 | 4.0409 | 24.1165 | True |
| 6 | Acerihabitans | Samsonia | 1.0 | 1.0 | -13.1792 | 15.1792 | False |
| 7 | Affinibrenneria | Brenneria | 5.6667 | 0.6921 | -4.5103 | 15.8437 | False |
| 8 | Affinibrenneria | Dickeya | 11.0485 | 0.0197 | 0.998 | 21.0991 | True |
| 9 | Affinibrenneria | Lonsdalea | -5.3846 | 0.743 | -15.5386 | 4.7693 | False |
| 10 | Affinibrenneria | Musicola | 2.25 | 0.9987 | -8.9597 | 13.4597 | False |
| 11 | Affinibrenneria | Pectobacterium | 14.0787 | 0.0006 | 4.0409 | 24.1165 | True |
| 12 | Affinibrenneria | Samsonia | 1.0 | 1.0 | -13.1792 | 15.1792 | False |
| 13 | Brenneria | Dickeya | 5.3819 | 0.0 | 3.5019 | 7.2618 | True |
| 14 | Brenneria | Lonsdalea | -11.0513 | 0.0 | -13.4227 | -8.6798 | True |
| 15 | Brenneria | Musicola | -3.4167 | 0.5122 | -8.7249 | 1.8916 | False |
| 16 | Brenneria | Pectobacterium | 8.412 | 0.0 | 6.6013 | 10.2228 | True |
| 17 | Brenneria | Samsonia | -4.6667 | 0.86 | -14.8437 | 5.5103 | False |
| 18 | Dickeya | Lonsdalea | -16.4332 | 0.0 | -18.184 | -14.6823 | True |
| 19 | Dickeya | Musicola | -8.7985 | 0.0 | -13.8601 | -3.737 | True |
| 20 | Dickeya | Pectobacterium | 3.0302 | 0.0 | 2.1812 | 3.8791 | True |
| 21 | Dickeya | Samsonia | -10.0485 | 0.0501 | -20.0991 | 0.002 | False |
| 22 | Lonsdalea | Musicola | 7.6346 | 0.0003 | 2.3707 | 12.8985 | True |
| 23 | Lonsdalea | Pectobacterium | 19.4633 | 0.0 | 17.7869 | 21.1397 | True |
| 24 | Lonsdalea | Samsonia | 6.3846 | 0.5434 | -3.7693 | 16.5386 | False |
| 25 | Musicola | Pectobacterium | 11.8287 | 0.0 | 6.7924 | 16.865 | True |
| 26 | Musicola | Samsonia | -1.25 | 1.0 | -12.4597 | 9.9597 | False |
| 27 | Pectobacterium | Samsonia | -13.0787 | 0.0021 | -23.1165 | -3.0409 | True |
Scatter plot and box plot of the number of CAZy families per genome.
g = sns.stripplot(
x=cazy_f_df['Genus'],
y=cazy_f_df['CAZyFamilies'],
color='black',
alpha=0.75,
size=3,
);
g = sns.boxplot(
x=cazy_f_df['Genus'],
y=cazy_f_df['CAZyFamilies'],
fliersize=0,
);
g.set_ylabel(ylabel='Number of CAZyme families')
plt.xticks(rotation=90)
plt.savefig(
"../results/cazome_size/cazy_fams_boxplot.png",
bbox_inches='tight',
format='png'
)
Now test between soft and hard plant tissue targeting genomes.
pheno_col = []
for i in range(len(cazy_f_df)):
pheno_col.append(soft_hard_dict[cazy_f_df.iloc[i]['Genus']])
cazy_f_df['Phenotype'] = pheno_col
f_oneway(
cazy_f_df['CAZyFamilies'][cazy_f_df['Phenotype'] == 'Soft'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Phenotype'] == 'Hard'],
cazy_f_df['CAZyFamilies'][cazy_f_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=395.8267787310159, pvalue=2.1009567376244338e-116)
# follow up with a post hoc test to see which genera differ
cazome_f_pheno_turkey = pairwise_tukeyhsd(
endog=cazy_f_df['CAZyFamilies'],
groups=cazy_f_df['Phenotype'],
alpha=0.05, # cut off
)
cazome_f_pheno_turkey_df = pd.DataFrame(
cazome_f_pheno_turkey._results_table[1:],
columns=cazome_f_pheno_turkey._results_table.data[0]
)
cazome_f_pheno_turkey_df.to_csv("../results/cazome_size/cazome_fam_turkey_phenotype_results.csv")
cazome_f_pheno_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 13.3936 | 0.0 | 12.2593 | 14.528 | True |
| 1 | Hard | Unknown | 2.5433 | 0.4226 | -2.2253 | 7.312 | False |
| 2 | Soft | Unknown | -10.8503 | 0.0 | -15.5112 | -6.1894 | True |
Calculate the number of CAZymes (identified as the number of unique protein accessions) per CAZy class. Also, calculate the mean size of CAZy classes (i.e. the mean number of unique protein accessions per CAZy class in each genome) per genus.
The results are added to a dataframe, which is written to results/pecto_dic/cazy_class_sizes.csv, and was used to generate a proportiona area plot using RawGraphs.
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/cazy_classes/'), force=True, nodelete=True)
Output directory ../results/cazy_classes exists, nodelete is True. Adding output to output directory.
class_df, class_size_dict = calculate_class_sizes(fgp_df, 'Genus', round_by=2)
Getting CAZy class sizes: 100%|█████████████████████████████████████████████████████████████| 83143/83143 [00:19<00:00, 4309.82it/s] Calculating CAZy class sizes: 100%|██████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 120.67it/s]
# add values with means across all genera to represent pectobacteriaceae
pectobact_class_means = []
for cazy_class in set(class_df['CAZyClass']):
df = class_df[class_df['CAZyClass'] == cazy_class]
new_row = [cazy_class, 'Pectobacteriaceae']
for col in class_df.columns[2:]:
mean = np.mean(df[col])
new_row.append(mean)
new_row[-1] = 660
pectobact_class_means.append(new_row)
df = pd.DataFrame(pectobact_class_means, columns = class_df.columns)
all_class_df = pd.concat([class_df, df], axis=0, ignore_index=True)
all_class_df = all_class_df.round(2)
# replace nan with 0
all_class_df = all_class_df.fillna(0)
filtered_class_df = all_class_df[all_class_df['Genus'] != 'Haf']
all_class_df.to_csv('../results/cazy_classes/cazy_class_sizes.csv')
all_class_df
| CAZyClass | Genus | MeanCazyClass | SdCazyClass | MeanClassPerc | SdClassPerc | NumOfGenomes | |
|---|---|---|---|---|---|---|---|
| 0 | GH | Lonsdalea | 30.85 | 2.28 | 39.96 | 1.31 | 39 |
| 1 | GH | Samsonia | 34.00 | 0.00 | 41.98 | 0.00 | 1 |
| 2 | GH | Pectobacterium | 50.11 | 3.91 | 44.50 | 1.87 | 432 |
| 3 | GH | Musicola | 37.00 | 0.00 | 40.13 | 0.99 | 4 |
| 4 | GH | Brenneria | 42.48 | 6.60 | 48.14 | 4.19 | 33 |
| 5 | GH | Affinibrenneria | 59.00 | 0.00 | 54.63 | 0.00 | 1 |
| 6 | GH | Dickeya | 42.53 | 3.55 | 38.24 | 1.85 | 206 |
| 7 | GH | Acerihabitans | 51.00 | 0.00 | 48.11 | 0.00 | 1 |
| 8 | GT | Lonsdalea | 32.46 | 2.30 | 42.07 | 1.24 | 39 |
| 9 | GT | Samsonia | 25.00 | 0.00 | 30.86 | 0.00 | 1 |
| 10 | GT | Pectobacterium | 31.76 | 3.86 | 28.15 | 2.23 | 432 |
| 11 | GT | Musicola | 31.00 | 3.00 | 33.55 | 2.44 | 4 |
| 12 | GT | Brenneria | 32.30 | 2.55 | 36.90 | 2.61 | 33 |
| 13 | GT | Affinibrenneria | 35.00 | 0.00 | 32.41 | 0.00 | 1 |
| 14 | GT | Dickeya | 37.21 | 3.12 | 33.52 | 2.62 | 206 |
| 15 | GT | Acerihabitans | 44.00 | 0.00 | 41.51 | 0.00 | 1 |
| 16 | PL | Lonsdalea | 3.79 | 0.56 | 4.92 | 0.72 | 39 |
| 17 | PL | Samsonia | 8.00 | 0.00 | 9.88 | 0.00 | 1 |
| 18 | PL | Pectobacterium | 14.78 | 1.36 | 13.14 | 0.97 | 432 |
| 19 | PL | Musicola | 11.25 | 0.43 | 12.19 | 0.33 | 4 |
| 20 | PL | Brenneria | 4.24 | 1.23 | 4.81 | 1.29 | 33 |
| 21 | PL | Affinibrenneria | 1.00 | 0.00 | 0.93 | 0.00 | 1 |
| 22 | PL | Dickeya | 16.29 | 1.72 | 14.63 | 1.19 | 206 |
| 23 | PL | Acerihabitans | 1.00 | 0.00 | 0.94 | 0.00 | 1 |
| 24 | CE | Lonsdalea | 3.15 | 0.48 | 4.09 | 0.57 | 39 |
| 25 | CE | Samsonia | 8.00 | 0.00 | 9.88 | 0.00 | 1 |
| 26 | CE | Pectobacterium | 7.12 | 0.83 | 6.33 | 0.68 | 432 |
| 27 | CE | Musicola | 6.00 | 0.00 | 6.51 | 0.16 | 4 |
| 28 | CE | Brenneria | 4.30 | 1.06 | 4.93 | 1.24 | 33 |
| 29 | CE | Affinibrenneria | 7.00 | 0.00 | 6.48 | 0.00 | 1 |
| 30 | CE | Dickeya | 7.16 | 0.80 | 6.44 | 0.60 | 206 |
| 31 | CE | Acerihabitans | 5.00 | 0.00 | 4.72 | 0.00 | 1 |
| 32 | AA | Lonsdalea | 0.00 | 0.00 | 0.00 | 0.00 | 39 |
| 33 | AA | Samsonia | 1.00 | 0.00 | 1.23 | 0.00 | 1 |
| 34 | AA | Pectobacterium | 1.03 | 0.16 | 0.91 | 0.17 | 371 |
| 35 | AA | Musicola | 0.00 | 0.00 | 0.00 | 0.00 | 4 |
| 36 | AA | Brenneria | 1.00 | 0.00 | 1.27 | 0.06 | 8 |
| 37 | AA | Affinibrenneria | 0.00 | 0.00 | 0.00 | 0.00 | 1 |
| 38 | AA | Dickeya | 1.00 | 0.00 | 0.90 | 0.06 | 85 |
| 39 | AA | Acerihabitans | 0.00 | 0.00 | 0.00 | 0.00 | 1 |
| 40 | CBM | Lonsdalea | 8.74 | 0.54 | 11.35 | 0.63 | 39 |
| 41 | CBM | Samsonia | 10.00 | 0.00 | 12.35 | 0.00 | 1 |
| 42 | CBM | Pectobacterium | 13.98 | 1.49 | 12.41 | 1.02 | 432 |
| 43 | CBM | Musicola | 11.00 | 1.00 | 11.96 | 1.38 | 4 |
| 44 | CBM | Brenneria | 9.36 | 0.64 | 10.74 | 1.17 | 33 |
| 45 | CBM | Affinibrenneria | 10.00 | 0.00 | 9.26 | 0.00 | 1 |
| 46 | CBM | Dickeya | 12.27 | 1.25 | 11.03 | 0.83 | 206 |
| 47 | CBM | Acerihabitans | 11.00 | 0.00 | 10.38 | 0.00 | 1 |
| 48 | GH | Pectobacteriaceae | 43.37 | 2.04 | 44.46 | 1.28 | 660 |
| 49 | AA | Pectobacteriaceae | 0.50 | 0.02 | 0.54 | 0.04 | 660 |
| 50 | CE | Pectobacteriaceae | 5.97 | 0.40 | 6.17 | 0.41 | 660 |
| 51 | CBM | Pectobacteriaceae | 10.79 | 0.62 | 11.18 | 0.63 | 660 |
| 52 | PL | Pectobacteriaceae | 7.54 | 0.66 | 7.68 | 0.56 | 660 |
| 53 | GT | Pectobacteriaceae | 33.59 | 1.85 | 34.87 | 1.39 | 660 |
First run a two-way ANOVA across the CAZy class frequencies, evaluating the influence of the CAZy class and genus.
# provide each observation (genome) separately , not just the mean
class_freq_obs = [] # observations, one obs per genome - class pair
for genome in tqdm(set(fgp_df['Genome']), desc='Getting freq per class per genome'):
class_freqs = {
'GH': 0,
'GT': 0,
'PL': 0,
'CE': 0,
'AA': 0,
'CBM': 0,
} # {class: count}
g_rows = fgp_df[fgp_df['Genome'] == genome]
for i in range(len(g_rows)):
if g_rows.iloc[i]['Family'].startswith('GH'):
class_freqs['GH'] += 1
elif g_rows.iloc[i]['Family'].startswith('GT'):
class_freqs['GT'] += 1
elif g_rows.iloc[i]['Family'].startswith('PL'):
class_freqs['PL'] += 1
elif g_rows.iloc[i]['Family'].startswith('CE'):
class_freqs['CE'] += 1
elif g_rows.iloc[i]['Family'].startswith('AA'):
class_freqs['AA'] += 1
else:
class_freqs['CBM'] += 1
for cazy_class in class_freqs:
class_freq_obs.append(
[
g_rows.iloc[0]['Genus'],
cazy_class,
class_freqs[cazy_class],
soft_hard_dict[g_rows.iloc[0]['Genus']],
]
)
class_freq_obs_df = pd.DataFrame(
class_freq_obs,
columns=['Genus','CAZyClass','Frequency', 'Phenotype'],
)
class_freq_obs_df.head(2)
Getting freq per class per genome: 0%| | 0/717 [00:00<?, ?it/s]
| Genus | CAZyClass | Frequency | Phenotype | |
|---|---|---|---|---|
| 0 | Pectobacterium | GH | 55 | Soft |
| 1 | Pectobacterium | GT | 34 | Soft |
# Test for differences between the genera and soft/hard plant tissue targeting phenotype
# perform two-way ANOVA
model = ols(
'Frequency ~ + C(Genus) + C(CAZyClass) + C(Genus):C(CAZyClass)',
data=class_freq_obs_df,
).fit()
sm.stats.anova_lm(model, typ=2)
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Genus) | 1.372953e+04 | 7.0 | 137.428160 | 4.378316e-183 |
| C(CAZyClass) | 1.113669e+06 | 5.0 | 15606.446141 | 0.000000e+00 |
| C(Genus):C(CAZyClass) | 2.400325e+04 | 35.0 | 48.052939 | 1.698934e-276 |
| Residual | 6.071269e+04 | 4254.0 | NaN | NaN |
Now break it down per CAZy class. Testing for differences between the genera, and then between the phenotypes (soft vs. hard plant tissue targeting phenotype).
For each (genera testing and phenotype testing), run a one-way ANOVA followed by a post hoc Turkey HDS test if a statistically signficant difference is detected by the ANOVA.
# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'GH']
f_oneway(
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=70.48729402779328, pvalue=3.718837585881247e-77)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gh_class_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Genus'],
alpha=0.05, # cut off
)
gh_class_turkey_df = pd.DataFrame(
gh_class_turkey._results_table[1:],
columns=gh_class_turkey._results_table.data[0]
)
gh_class_turkey_df.to_csv("../results/cazy_classes/gh_class_turkey_results.csv")
gh_class_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 8.0 | 0.9884 | -19.8655 | 35.8655 | False |
| 1 | Acerihabitans | Brenneria | -8.5152 | 0.901 | -28.5153 | 11.485 | False |
| 2 | Acerihabitans | Dickeya | -7.6699 | 0.9374 | -27.4215 | 12.0817 | False |
| 3 | Acerihabitans | Lonsdalea | -20.1538 | 0.0458 | -40.1087 | -0.199 | True |
| 4 | Acerihabitans | Musicola | -14.0 | 0.5293 | -36.0296 | 8.0296 | False |
| 5 | Acerihabitans | Pectobacterium | -0.1551 | 1.0 | -19.8818 | 19.5716 | False |
| 6 | Acerihabitans | Samsonia | -17.0 | 0.5827 | -44.8655 | 10.8655 | False |
| 7 | Affinibrenneria | Brenneria | -16.5152 | 0.1927 | -36.5153 | 3.485 | False |
| 8 | Affinibrenneria | Dickeya | -15.6699 | 0.2371 | -35.4215 | 4.0817 | False |
| 9 | Affinibrenneria | Lonsdalea | -28.1538 | 0.0005 | -48.1087 | -8.199 | True |
| 10 | Affinibrenneria | Musicola | -22.0 | 0.0506 | -44.0296 | 0.0296 | False |
| 11 | Affinibrenneria | Pectobacterium | -8.1551 | 0.9142 | -27.8818 | 11.5716 | False |
| 12 | Affinibrenneria | Samsonia | -25.0 | 0.1161 | -52.8655 | 2.8655 | False |
| 13 | Brenneria | Dickeya | 0.8452 | 0.9971 | -2.8493 | 4.5398 | False |
| 14 | Brenneria | Lonsdalea | -11.6387 | 0.0 | -16.2992 | -6.9782 | True |
| 15 | Brenneria | Musicola | -5.4848 | 0.7514 | -15.9168 | 4.9471 | False |
| 16 | Brenneria | Pectobacterium | 8.3601 | 0.0 | 4.8015 | 11.9187 | True |
| 17 | Brenneria | Samsonia | -8.4848 | 0.9027 | -28.485 | 11.5153 | False |
| 18 | Dickeya | Lonsdalea | -12.4839 | 0.0 | -15.9248 | -9.0431 | True |
| 19 | Dickeya | Musicola | -6.3301 | 0.5274 | -16.2772 | 3.617 | False |
| 20 | Dickeya | Pectobacterium | 7.5148 | 0.0 | 5.8465 | 9.1832 | True |
| 21 | Dickeya | Samsonia | -9.3301 | 0.8403 | -29.0817 | 10.4215 | False |
| 22 | Lonsdalea | Musicola | 6.1538 | 0.6146 | -4.191 | 16.4987 | False |
| 23 | Lonsdalea | Pectobacterium | 19.9988 | 0.0 | 16.7043 | 23.2932 | True |
| 24 | Lonsdalea | Samsonia | 3.1538 | 0.9997 | -16.801 | 23.1087 | False |
| 25 | Musicola | Pectobacterium | 13.8449 | 0.0006 | 3.9475 | 23.7423 | True |
| 26 | Musicola | Samsonia | -3.0 | 0.9999 | -25.0296 | 19.0296 | False |
| 27 | Pectobacterium | Samsonia | -16.8449 | 0.159 | -36.5716 | 2.8818 | False |
Now test for differences between the phenotypes
f_oneway(
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=84.86443453434632, pvalue=8.597326941700086e-34)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gh_class_phenot_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Phenotype'],
alpha=0.05, # cut off
)
gh_class_turkey_pheno_df = pd.DataFrame(
gh_class_phenot_turkey._results_table[1:],
columns=gh_class_phenot_turkey._results_table.data[0]
)
gh_class_turkey_pheno_df.to_csv("../results/cazy_classes/gh_class_turkey_phenotype_results.csv")
gh_class_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 11.7652 | 0.0 | 9.5977 | 13.9327 | True |
| 1 | Hard | Unknown | 0.3467 | 0.9956 | -8.7652 | 9.4586 | False |
| 2 | Soft | Unknown | -11.4185 | 0.0076 | -20.3245 | -2.5125 | True |
# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'GT']
f_oneway(
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=21.25128713661445, pvalue=4.428956585432569e-26)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gt_class_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Genus'],
alpha=0.05, # cut off
)
gt_class_turkey_df = pd.DataFrame(
gt_class_turkey._results_table[1:],
columns=gt_class_turkey._results_table.data[0]
)
gt_class_turkey_df.to_csv("../results/cazy_classes/gt_class_turkey_results.csv")
gt_class_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | -9.0 | 0.9555 | -33.7332 | 15.7332 | False |
| 1 | Acerihabitans | Brenneria | -11.6667 | 0.484 | -29.4187 | 6.0853 | False |
| 2 | Acerihabitans | Dickeya | -6.0437 | 0.9668 | -23.5751 | 11.4877 | False |
| 3 | Acerihabitans | Lonsdalea | -11.5385 | 0.4959 | -29.2502 | 6.1733 | False |
| 4 | Acerihabitans | Musicola | -13.0 | 0.4682 | -32.5533 | 6.5533 | False |
| 5 | Acerihabitans | Pectobacterium | -11.7361 | 0.4571 | -29.2453 | 5.7731 | False |
| 6 | Acerihabitans | Samsonia | -19.0 | 0.2758 | -43.7332 | 5.7332 | False |
| 7 | Affinibrenneria | Brenneria | -2.6667 | 0.9998 | -20.4187 | 15.0853 | False |
| 8 | Affinibrenneria | Dickeya | 2.9563 | 0.9996 | -14.5751 | 20.4877 | False |
| 9 | Affinibrenneria | Lonsdalea | -2.5385 | 0.9999 | -20.2502 | 15.1733 | False |
| 10 | Affinibrenneria | Musicola | -4.0 | 0.9986 | -23.5533 | 15.5533 | False |
| 11 | Affinibrenneria | Pectobacterium | -2.7361 | 0.9998 | -20.2453 | 14.7731 | False |
| 12 | Affinibrenneria | Samsonia | -10.0 | 0.9232 | -34.7332 | 14.7332 | False |
| 13 | Brenneria | Dickeya | 5.623 | 0.0 | 2.3437 | 8.9022 | True |
| 14 | Brenneria | Lonsdalea | 0.1282 | 1.0 | -4.0084 | 4.2648 | False |
| 15 | Brenneria | Musicola | -1.3333 | 0.9999 | -10.5926 | 7.926 | False |
| 16 | Brenneria | Pectobacterium | -0.0694 | 1.0 | -3.228 | 3.0891 | False |
| 17 | Brenneria | Samsonia | -7.3333 | 0.9145 | -25.0853 | 10.4187 | False |
| 18 | Dickeya | Lonsdalea | -5.4948 | 0.0 | -8.5489 | -2.4407 | True |
| 19 | Dickeya | Musicola | -6.9563 | 0.2452 | -15.7853 | 1.8727 | False |
| 20 | Dickeya | Pectobacterium | -5.6924 | 0.0 | -7.1732 | -4.2116 | True |
| 21 | Dickeya | Samsonia | -12.9563 | 0.3252 | -30.4877 | 4.5751 | False |
| 22 | Lonsdalea | Musicola | -1.4615 | 0.9997 | -10.6435 | 7.7204 | False |
| 23 | Lonsdalea | Pectobacterium | -0.1976 | 1.0 | -3.1218 | 2.7265 | False |
| 24 | Lonsdalea | Samsonia | -7.4615 | 0.9059 | -25.1733 | 10.2502 | False |
| 25 | Musicola | Pectobacterium | 1.2639 | 0.9999 | -7.521 | 10.0488 | False |
| 26 | Musicola | Samsonia | -6.0 | 0.9828 | -25.5533 | 13.5533 | False |
| 27 | Pectobacterium | Samsonia | -7.2639 | 0.9127 | -24.7731 | 10.2453 | False |
# one-way ANOVA the phenotypes
f_oneway(
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=2.6331141394580087, pvalue=0.07255204343497902)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gt_class_phenot_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Phenotype'],
alpha=0.05, # cut off
)
gt_class_turkey_pheno_df = pd.DataFrame(
gt_class_phenot_turkey._results_table[1:],
columns=gt_class_phenot_turkey._results_table.data[0]
)
gt_class_turkey_pheno_df.to_csv("../results/cazy_classes/gt_class_turkey_phenotype_results.csv")
gt_class_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 1.6085 | 0.0911 | -0.1926 | 3.4097 | False |
| 1 | Hard | Unknown | -1.4933 | 0.8885 | -9.065 | 6.0784 | False |
| 2 | Soft | Unknown | -3.1019 | 0.587 | -10.5025 | 4.2987 | False |
# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'PL']
f_oneway(
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=222.45853144424663, pvalue=3.5647020997539887e-174)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
pl_class_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Genus'],
alpha=0.05, # cut off
)
pl_class_turkey_df = pd.DataFrame(
pl_class_turkey._results_table[1:],
columns=pl_class_turkey._results_table.data[0]
)
pl_class_turkey_df.to_csv("../results/cazy_classes/pl_class_turkey_results.csv")
pl_class_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 0.0 | 1.0 | -10.5548 | 10.5548 | False |
| 1 | Acerihabitans | Brenneria | 3.2424 | 0.8984 | -4.3332 | 10.8181 | False |
| 2 | Acerihabitans | Dickeya | 15.6019 | 0.0 | 8.1204 | 23.0834 | True |
| 3 | Acerihabitans | Lonsdalea | 2.7949 | 0.9515 | -4.7636 | 10.3534 | False |
| 4 | Acerihabitans | Musicola | 10.25 | 0.005 | 1.9057 | 18.5943 | True |
| 5 | Acerihabitans | Pectobacterium | 14.0162 | 0.0 | 6.5442 | 21.4882 | True |
| 6 | Acerihabitans | Samsonia | 7.0 | 0.4716 | -3.5548 | 17.5548 | False |
| 7 | Affinibrenneria | Brenneria | 3.2424 | 0.8984 | -4.3332 | 10.8181 | False |
| 8 | Affinibrenneria | Dickeya | 15.6019 | 0.0 | 8.1204 | 23.0834 | True |
| 9 | Affinibrenneria | Lonsdalea | 2.7949 | 0.9515 | -4.7636 | 10.3534 | False |
| 10 | Affinibrenneria | Musicola | 10.25 | 0.005 | 1.9057 | 18.5943 | True |
| 11 | Affinibrenneria | Pectobacterium | 14.0162 | 0.0 | 6.5442 | 21.4882 | True |
| 12 | Affinibrenneria | Samsonia | 7.0 | 0.4716 | -3.5548 | 17.5548 | False |
| 13 | Brenneria | Dickeya | 12.3595 | 0.0 | 10.9601 | 13.7589 | True |
| 14 | Brenneria | Lonsdalea | -0.4476 | 0.9945 | -2.2128 | 1.3177 | False |
| 15 | Brenneria | Musicola | 7.0076 | 0.0 | 3.0562 | 10.959 | True |
| 16 | Brenneria | Pectobacterium | 10.7738 | 0.0 | 9.4259 | 12.1217 | True |
| 17 | Brenneria | Samsonia | 3.7576 | 0.8033 | -3.8181 | 11.3332 | False |
| 18 | Dickeya | Lonsdalea | -12.8071 | 0.0 | -14.1104 | -11.5037 | True |
| 19 | Dickeya | Musicola | -5.3519 | 0.0005 | -9.1197 | -1.5842 | True |
| 20 | Dickeya | Pectobacterium | -1.5857 | 0.0 | -2.2177 | -0.9538 | True |
| 21 | Dickeya | Samsonia | -8.6019 | 0.0118 | -16.0834 | -1.1204 | True |
| 22 | Lonsdalea | Musicola | 7.4551 | 0.0 | 3.5367 | 11.3735 | True |
| 23 | Lonsdalea | Pectobacterium | 11.2213 | 0.0 | 9.9735 | 12.4692 | True |
| 24 | Lonsdalea | Samsonia | 4.2051 | 0.693 | -3.3534 | 11.7636 | False |
| 25 | Musicola | Pectobacterium | 3.7662 | 0.048 | 0.0173 | 7.5151 | True |
| 26 | Musicola | Samsonia | -3.25 | 0.9364 | -11.5943 | 5.0943 | False |
| 27 | Pectobacterium | Samsonia | -7.0162 | 0.0837 | -14.4882 | 0.4558 | False |
# one-way ANOVA the phenotypes
f_oneway(
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=689.0535233630009, pvalue=2.0980145132039776e-167)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
pl_class_phenot_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Phenotype'],
alpha=0.05, # cut off
)
pl_class_turkey_pheno_df = pd.DataFrame(
pl_class_phenot_turkey._results_table[1:],
columns=pl_class_phenot_turkey._results_table.data[0]
)
pl_class_turkey_pheno_df.to_csv("../results/cazy_classes/pl_class_turkey_phenotype_results.csv")
pl_class_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 11.5549 | 0.0 | 10.8223 | 12.2874 | True |
| 1 | Hard | Unknown | 7.2767 | 0.0 | 4.1971 | 10.3562 | True |
| 2 | Soft | Unknown | -4.2782 | 0.0026 | -7.2882 | -1.2683 | True |
# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'CE']
f_oneway(
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=103.88807201350583, pvalue=2.7639451642084564e-104)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
ce_class_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Genus'],
alpha=0.05, # cut off
)
ce_class_turkey_df = pd.DataFrame(
ce_class_turkey._results_table[1:],
columns=ce_class_turkey._results_table.data[0]
)
ce_class_turkey_df.to_csv("../results/cazy_classes/ce_class_turkey_results.csv")
ce_class_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 2.0 | 0.8959 | -2.6483 | 6.6483 | False |
| 1 | Acerihabitans | Brenneria | -0.697 | 0.9984 | -4.0332 | 2.6393 | False |
| 2 | Acerihabitans | Dickeya | 2.2767 | 0.4155 | -1.0181 | 5.5715 | False |
| 3 | Acerihabitans | Lonsdalea | -1.8462 | 0.6964 | -5.1749 | 1.4826 | False |
| 4 | Acerihabitans | Musicola | 1.0 | 0.9916 | -2.6748 | 4.6748 | False |
| 5 | Acerihabitans | Pectobacterium | 2.2176 | 0.4497 | -1.0731 | 5.5082 | False |
| 6 | Acerihabitans | Samsonia | 3.0 | 0.5085 | -1.6483 | 7.6483 | False |
| 7 | Affinibrenneria | Brenneria | -2.697 | 0.2157 | -6.0332 | 0.6393 | False |
| 8 | Affinibrenneria | Dickeya | 0.2767 | 1.0 | -3.0181 | 3.5715 | False |
| 9 | Affinibrenneria | Lonsdalea | -3.8462 | 0.0111 | -7.1749 | -0.5174 | True |
| 10 | Affinibrenneria | Musicola | -1.0 | 0.9916 | -4.6748 | 2.6748 | False |
| 11 | Affinibrenneria | Pectobacterium | 0.2176 | 1.0 | -3.0731 | 3.5082 | False |
| 12 | Affinibrenneria | Samsonia | 1.0 | 0.998 | -3.6483 | 5.6483 | False |
| 13 | Brenneria | Dickeya | 2.9737 | 0.0 | 2.3574 | 3.59 | True |
| 14 | Brenneria | Lonsdalea | -1.1492 | 0.0002 | -1.9266 | -0.3718 | True |
| 15 | Brenneria | Musicola | 1.697 | 0.062 | -0.0432 | 3.4371 | False |
| 16 | Brenneria | Pectobacterium | 2.9146 | 0.0 | 2.3209 | 3.5082 | True |
| 17 | Brenneria | Samsonia | 3.697 | 0.018 | 0.3607 | 7.0332 | True |
| 18 | Dickeya | Lonsdalea | -4.1229 | 0.0 | -4.6968 | -3.5489 | True |
| 19 | Dickeya | Musicola | -1.2767 | 0.2739 | -2.936 | 0.3826 | False |
| 20 | Dickeya | Pectobacterium | -0.0591 | 0.9982 | -0.3374 | 0.2192 | False |
| 21 | Dickeya | Samsonia | 0.7233 | 0.9978 | -2.5715 | 4.0181 | False |
| 22 | Lonsdalea | Musicola | 2.8462 | 0.0 | 1.1205 | 4.5718 | True |
| 23 | Lonsdalea | Pectobacterium | 4.0637 | 0.0 | 3.5142 | 4.6133 | True |
| 24 | Lonsdalea | Samsonia | 4.8462 | 0.0003 | 1.5174 | 8.1749 | True |
| 25 | Musicola | Pectobacterium | 1.2176 | 0.3279 | -0.4334 | 2.8686 | False |
| 26 | Musicola | Samsonia | 2.0 | 0.7166 | -1.6748 | 5.6748 | False |
| 27 | Pectobacterium | Samsonia | 0.7824 | 0.9963 | -2.5082 | 4.0731 | False |
# one-way ANOVA the phenotypes
f_oneway(
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=321.59905675699997, pvalue=2.6098906666833877e-100)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
ce_class_phenot_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Phenotype'],
alpha=0.05, # cut off
)
ce_class_turkey_pheno_df = pd.DataFrame(
ce_class_phenot_turkey._results_table[1:],
columns=ce_class_phenot_turkey._results_table.data[0]
)
ce_class_turkey_pheno_df.to_csv("../results/cazy_classes/ce_class_turkey_phenotype_results.csv")
ce_class_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 3.4367 | 0.0 | 3.1178 | 3.7555 | True |
| 1 | Hard | Unknown | 2.2 | 0.0004 | 0.8595 | 3.5405 | True |
| 2 | Soft | Unknown | -1.2367 | 0.069 | -2.5469 | 0.0735 | False |
# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'AA']
f_oneway(
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=48.76173592829987, pvalue=1.3727065409992368e-56)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
aa_class_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Genus'],
alpha=0.05, # cut off
)
aa_class_turkey_df = pd.DataFrame(
aa_class_turkey._results_table[1:],
columns=aa_class_turkey._results_table.data[0]
)
aa_class_turkey_df.to_csv("../results/cazy_classes/aa_class_turkey_results.csv")
aa_class_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | 0.0 | 1.0 | -1.7994 | 1.7994 | False |
| 1 | Acerihabitans | Brenneria | 0.2424 | 0.9992 | -1.0491 | 1.5339 | False |
| 2 | Acerihabitans | Dickeya | 0.4126 | 0.9767 | -0.8628 | 1.6881 | False |
| 3 | Acerihabitans | Lonsdalea | 0.0 | 1.0 | -1.2886 | 1.2886 | False |
| 4 | Acerihabitans | Musicola | 0.0 | 1.0 | -1.4225 | 1.4225 | False |
| 5 | Acerihabitans | Pectobacterium | 0.8843 | 0.4092 | -0.3896 | 2.1581 | False |
| 6 | Acerihabitans | Samsonia | 1.0 | 0.6942 | -0.7994 | 2.7994 | False |
| 7 | Affinibrenneria | Brenneria | 0.2424 | 0.9992 | -1.0491 | 1.5339 | False |
| 8 | Affinibrenneria | Dickeya | 0.4126 | 0.9767 | -0.8628 | 1.6881 | False |
| 9 | Affinibrenneria | Lonsdalea | 0.0 | 1.0 | -1.2886 | 1.2886 | False |
| 10 | Affinibrenneria | Musicola | 0.0 | 1.0 | -1.4225 | 1.4225 | False |
| 11 | Affinibrenneria | Pectobacterium | 0.8843 | 0.4092 | -0.3896 | 2.1581 | False |
| 12 | Affinibrenneria | Samsonia | 1.0 | 0.6942 | -0.7994 | 2.7994 | False |
| 13 | Brenneria | Dickeya | 0.1702 | 0.3721 | -0.0684 | 0.4088 | False |
| 14 | Brenneria | Lonsdalea | -0.2424 | 0.2197 | -0.5434 | 0.0585 | False |
| 15 | Brenneria | Musicola | -0.2424 | 0.958 | -0.9161 | 0.4312 | False |
| 16 | Brenneria | Pectobacterium | 0.6418 | 0.0 | 0.412 | 0.8716 | True |
| 17 | Brenneria | Samsonia | 0.7576 | 0.6317 | -0.5339 | 2.0491 | False |
| 18 | Dickeya | Lonsdalea | -0.4126 | 0.0 | -0.6348 | -0.1904 | True |
| 19 | Dickeya | Musicola | -0.4126 | 0.5148 | -1.055 | 0.2297 | False |
| 20 | Dickeya | Pectobacterium | 0.4716 | 0.0 | 0.3639 | 0.5794 | True |
| 21 | Dickeya | Samsonia | 0.5874 | 0.8573 | -0.6881 | 1.8628 | False |
| 22 | Lonsdalea | Musicola | 0.0 | 1.0 | -0.668 | 0.668 | False |
| 23 | Lonsdalea | Pectobacterium | 0.8843 | 0.0 | 0.6715 | 1.097 | True |
| 24 | Lonsdalea | Samsonia | 1.0 | 0.2634 | -0.2886 | 2.2886 | False |
| 25 | Musicola | Pectobacterium | 0.8843 | 0.0008 | 0.2451 | 1.5234 | True |
| 26 | Musicola | Samsonia | 1.0 | 0.392 | -0.4225 | 2.4225 | False |
| 27 | Pectobacterium | Samsonia | 0.1157 | 1.0 | -1.1581 | 1.3896 | False |
# one-way ANOVA the phenotypes
f_oneway(
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=61.148610732738945, pvalue=3.072615168467825e-25)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
aa_class_phenot_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Phenotype'],
alpha=0.05, # cut off
)
aa_class_turkey_pheno_df = pd.DataFrame(
aa_class_phenot_turkey._results_table[1:],
columns=aa_class_phenot_turkey._results_table.data[0]
)
aa_class_turkey_pheno_df.to_csv("../results/cazy_classes/aa_class_turkey_phenotype_results.csv")
aa_class_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 0.612 | 0.0 | 0.4775 | 0.7464 | True |
| 1 | Hard | Unknown | -0.12 | 0.872 | -0.6853 | 0.4453 | False |
| 2 | Soft | Unknown | -0.732 | 0.0055 | -1.2845 | -0.1794 | True |
# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'CBM']
f_oneway(
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],
)
F_onewayResult(statistic=80.88764296424834, pvalue=4.027926882977604e-86)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
cbm_class_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Genus'],
alpha=0.05, # cut off
)
cbm_class_turkey_df = pd.DataFrame(
cbm_class_turkey._results_table[1:],
columns=cbm_class_turkey._results_table.data[0]
)
cbm_class_turkey_df.to_csv("../results/cazy_classes/cbm_class_turkey_results.csv")
cbm_class_turkey_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Acerihabitans | Affinibrenneria | -1.0 | 0.9999 | -8.6335 | 6.6335 | False |
| 1 | Acerihabitans | Brenneria | -1.6364 | 0.9853 | -7.1152 | 3.8425 | False |
| 2 | Acerihabitans | Dickeya | 1.4272 | 0.993 | -3.9836 | 6.838 | False |
| 3 | Acerihabitans | Lonsdalea | -2.2564 | 0.9148 | -7.7229 | 3.2101 | False |
| 4 | Acerihabitans | Musicola | 0.0 | 1.0 | -6.0348 | 6.0348 | False |
| 5 | Acerihabitans | Pectobacterium | 3.1204 | 0.6506 | -2.2836 | 8.5243 | False |
| 6 | Acerihabitans | Samsonia | -1.0 | 0.9999 | -8.6335 | 6.6335 | False |
| 7 | Affinibrenneria | Brenneria | -0.6364 | 1.0 | -6.1152 | 4.8425 | False |
| 8 | Affinibrenneria | Dickeya | 2.4272 | 0.8734 | -2.9836 | 7.838 | False |
| 9 | Affinibrenneria | Lonsdalea | -1.2564 | 0.997 | -6.7229 | 4.2101 | False |
| 10 | Affinibrenneria | Musicola | 1.0 | 0.9996 | -5.0348 | 7.0348 | False |
| 11 | Affinibrenneria | Pectobacterium | 4.1204 | 0.2852 | -1.2836 | 9.5243 | False |
| 12 | Affinibrenneria | Samsonia | 0.0 | 1.0 | -7.6335 | 7.6335 | False |
| 13 | Brenneria | Dickeya | 3.0635 | 0.0 | 2.0515 | 4.0756 | True |
| 14 | Brenneria | Lonsdalea | -0.62 | 0.8199 | -1.8967 | 0.6566 | False |
| 15 | Brenneria | Musicola | 1.6364 | 0.6604 | -1.2214 | 4.4941 | False |
| 16 | Brenneria | Pectobacterium | 4.7567 | 0.0 | 3.7819 | 5.7316 | True |
| 17 | Brenneria | Samsonia | 0.6364 | 1.0 | -4.8425 | 6.1152 | False |
| 18 | Dickeya | Lonsdalea | -3.6836 | 0.0 | -4.6262 | -2.741 | True |
| 19 | Dickeya | Musicola | -1.4272 | 0.755 | -4.1521 | 1.2977 | False |
| 20 | Dickeya | Pectobacterium | 1.6932 | 0.0 | 1.2362 | 2.1502 | True |
| 21 | Dickeya | Samsonia | -2.4272 | 0.8734 | -7.838 | 2.9836 | False |
| 22 | Lonsdalea | Musicola | 2.2564 | 0.2329 | -0.5775 | 5.0903 | False |
| 23 | Lonsdalea | Pectobacterium | 5.3768 | 0.0 | 4.4743 | 6.2793 | True |
| 24 | Lonsdalea | Samsonia | 1.2564 | 0.997 | -4.2101 | 6.7229 | False |
| 25 | Musicola | Pectobacterium | 3.1204 | 0.0116 | 0.4091 | 5.8317 | True |
| 26 | Musicola | Samsonia | -1.0 | 0.9996 | -7.0348 | 5.0348 | False |
| 27 | Pectobacterium | Samsonia | -4.1204 | 0.2852 | -9.5243 | 1.2836 | False |
# one-way ANOVA the phenotypes
f_oneway(
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)
F_onewayResult(statistic=185.10059849921967, pvalue=1.7220017140436788e-65)
# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
cbm_class_phenot_turkey = pairwise_tukeyhsd(
endog=sub_class_df['Frequency'],
groups=sub_class_df['Phenotype'],
alpha=0.05, # cut off
)
cbm_class_turkey_pheno_df = pd.DataFrame(
cbm_class_phenot_turkey._results_table[1:],
columns=cbm_class_phenot_turkey._results_table.data[0]
)
cbm_class_turkey_pheno_df.to_csv("../results/cazy_classes/cbm_class_turkey_phenotype_results.csv")
cbm_class_turkey_pheno_df
| group1 | group2 | meandiff | p-adj | lower | upper | reject | |
|---|---|---|---|---|---|---|---|
| 0 | Hard | Soft | 4.4937 | 0.0 | 3.9416 | 5.0457 | True |
| 1 | Hard | Unknown | 1.92 | 0.1275 | -0.4008 | 4.2408 | False |
| 2 | Soft | Unknown | -2.5737 | 0.0215 | -4.842 | -0.3053 | True |
Very few genomes contained any AA CAZymes.
Identify the number of genomes were no AA CAZymes were found, additionally, find the maximum, minimum and mode number of AA CAZymes found across all 660 genomes.
# calc genomes with no AAs
no_aa_genomes = 0
for genus in class_size_dict['AA']:
for genome in class_size_dict['AA'][genus]:
no_aa_genomes+=1
print(f"{no_aa_genomes} genomes have no AAs")
aa_counts = [0] * no_aa_genomes
for genus in class_size_dict['AA']:
for genome in class_size_dict['AA'][genus]:
aa_counts.append(len(class_size_dict['AA'][genus][genome]['proteins']))
print(f"Max: {max(aa_counts)}\nMin: {min(aa_counts)}\nMode: {statistics.mode(aa_counts)}")
465 genomes have no AAs Max: 2 Min: 0 Mode: 0
Count the number of genomes were 1 or 2 AA CAZymes were found.
# find genomes with 2 AAs
two_aa_genomes = {}
one_aa_genomes = {}
for genus in class_size_dict['AA']:
for genome in class_size_dict['AA'][genus]:
if len(class_size_dict['AA'][genus][genome]['proteins']) == 2:
try:
two_aa_genomes[genus].add(genome)
except KeyError:
two_aa_genomes[genus] = {genome}
elif len(class_size_dict['AA'][genus][genome]['proteins']) == 1:
try:
one_aa_genomes[genus].add(genome)
except KeyError:
one_aa_genomes[genus] = {genome}
two_aa_genomes
{'Pectobacterium': {'GCA_000738125.1',
'GCA_000749915.1',
'GCA_011378985.1',
'GCA_011379045.1',
'GCA_020971565.1',
'GCA_024343355.1',
'GCA_024722495.1',
'GCA_028335745.1',
'GCA_900195285.2',
'GCA_900195295.2'}}
for genus in one_aa_genomes:
print(f"{genus}: {len(one_aa_genomes[genus])}")
Pectobacterium: 361 Dickeya: 85 Brenneria: 8 Samsonia: 1
Calculate the number of CAZymes per CAZy family presented in each genome, where the number of CAZymes is the number of unqiue protein accessions. This value may be greater than the number of CAZymes in the genome because a CAZyme may be annotated with multiple CAZy families.
# make output directory
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/cazy_families/'), force=True, nodelete=True)
Output directory ../results/cazy_families exists, nodelete is True. Adding output to output directory.
fam_freq_df = build_fam_freq_df(fgp_df, ['Genus', 'Species'])
fam_freq_df
The dataset contains 117 CAZy families
Counting fam frequencies: 100%|███████████████████████████████████████████████████████████████████| 717/717 [00:45<00:00, 15.80it/s]
| Genome | Genus | Species | AA10 | AA3 | CBM0 | CBM13 | CBM18 | CBM3 | CBM32 | ... | PL11 | PL17 | PL2 | PL22 | PL26 | PL3 | PL35 | PL38 | PL4 | PL9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GCA_009931295.1 | Pectobacterium | odoriferum | 0 | 1 | 0 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 2 | 1 | 1 | 2 | 0 | 1 | 1 | 2 |
| 1 | GCA_009874305.1 | Dickeya | solani | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 1 | 3 |
| 2 | GCA_016944275.1 | Pectobacterium | brasiliense | 0 | 1 | 0 | 1 | 0 | 1 | 3 | ... | 0 | 0 | 2 | 1 | 0 | 2 | 0 | 1 | 1 | 2 |
| 3 | GCA_003403135.1 | Dickeya | dianthicola | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 1 | 2 | 3 |
| 4 | GCA_922011735.1 | Pectobacterium | versatile | 0 | 1 | 0 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 712 | GCA_022747695.1 | Dickeya | dianthicola | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 2 | 3 |
| 713 | GCA_003096475.1 | Pectobacterium | versatile | 0 | 1 | 0 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 2 | 1 | 1 | 2 | 0 | 0 | 1 | 2 |
| 714 | GCA_016415585.1 | Pectobacterium | carotovorum | 1 | 0 | 0 | 0 | 0 | 1 | 2 | ... | 0 | 0 | 2 | 1 | 0 | 2 | 0 | 1 | 1 | 2 |
| 715 | GCA_023347555.1 | Brenneria | tiliae | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 1 | 0 |
| 716 | GCA_000406085.2 | Dickeya | sp. CSL RW240 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 2 | 3 |
717 rows × 120 columns
fam_freq_df.to_csv("../results/cazy_families/cazy_fam_freqs.csv")
Index fam_freq_df so that each row name contains the genome, Genus and Species, so that the genomic accession, genus and species is included in the clustermap.
# index the taxonomy data and genome (ggs=genome_genus_species)
fam_freq_df_ggs = copy(fam_freq_df) # so does not alter fam_freq_df
fam_freq_df_ggs = fam_freq_df_ggs.set_index(['Genome','Genus','Species'])
fam_freq_df_ggs.head(1)
| AA10 | AA3 | CBM0 | CBM13 | CBM18 | CBM3 | CBM32 | CBM4 | CBM48 | CBM5 | ... | PL11 | PL17 | PL2 | PL22 | PL26 | PL3 | PL35 | PL38 | PL4 | PL9 | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Genome | Genus | Species | |||||||||||||||||||||
| GCA_009931295.1 | Pectobacterium | odoriferum | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 2 | 1 | ... | 0 | 0 | 2 | 1 | 1 | 2 | 0 | 1 | 1 | 2 |
1 rows × 117 columns
Define a colour scheme to colour code the rows by, in this case by the genus of the species.
To do this, add a column containing the data to be used to colour code each row, e.g. a genus. This extra column is removed by build_row_colours().
# define a colour scheme to colour code rows by genus
fam_freq_df_ggs['Genus'] = list(fam_freq_df['Genus']) # add column to use for colour scheme, is removed
fam_freq_genus_row_colours, fam_g_lut = build_row_colours(fam_freq_df_ggs, 'Genus', 'Set2')
Use the function build_family_clustermap() from cazomevolve to build clustermaps of the CAZy family frequencies, with different combinations of additional row colours. For example, the row colours could list the genus and/or species classification of each genome.
# make a figure that is full size, and all data is legible
print("""
A large version of a cluster map of CAZy family frequencies with each row representing a unique
genome (and colour coded by the genus classification of the genome), and each column representing
a unique CAZy family.
This full sized figure is generated for human readability. However, this figure is extremely large, and
potentially too large for publication. Therefore, a smaller figure is generated below.
""")
large_fam_clustermap = build_family_clustermap(
fam_freq_df_ggs,
row_colours=fam_freq_genus_row_colours,
fig_size=(40,120),
file_path="../results/cazy_families/fam_freq_clustermap.svg",
file_format='svg',
lut=fam_g_lut,
legend_title='Genus',
dendrogram_ratio=(0.2,0.05),
title_fontsize=28,
legend_fontsize=24,
cbar_pos=(0, 0.95, 0.05, 0.05),
)
A large version of a cluster map of CAZy family frequencies with each row representing a unique genome (and colour coded by the genus classification of the genome), and each column representing a unique CAZy family. This full sized figure is generated for human readability. However, this figure is extremely large, and potentially too large for publication. Therefore, a smaller figure is generated below.
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
# make a figure the optimal size to fit in a paper
build_family_clustermap(
fam_freq_df_ggs,
row_colours=fam_freq_genus_row_colours,
fig_size=(20,35),
file_path="../results/cazy_families/paper_fam_freq_clustermap.png",
file_format='png',
font_scale=0.5,
lut=fam_g_lut,
legend_title='Genus',
dendrogram_ratio=(0.1,0.05),
title_fontsize=18,
legend_fontsize=16,
cbar_pos=(0, 0.95, 0.05, 0.05),
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7f9d2c493190>
Looking at the species names in the clustermap, there appears to be clustering of the genomes in a manner that correlates not only with their genus classificaiton but also their species classification. Therefore, add an additional row of row-colours, marking the species classification of each genome.
# define a colour scheme to colour code rows by SPECIES
fam_freq_df_ggs['Species'] = list(fam_freq_df['Species']) # add column to use for colour scheme, is removed
fam_freq_species_row_colours, fam_s_lut = build_row_colours(fam_freq_df_ggs, 'Species', 'rainbow')
# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
df=fam_freq_df_ggs,
row_colours=[fam_freq_genus_row_colours,fam_freq_species_row_colours],
luts=[fam_g_lut, fam_s_lut],
legend_titles=['Genus', 'Species'],
bbox_to_anchors=[(0.2,1.045), (0.63,1.05)],
legend_cols=[1,5],
fig_size=(20,40),
file_path="../results/cazy_families/paper_genus_species_fam_freq_clustermap.png",
file_format='png',
font_scale=1,
dendrogram_ratio=(0.1,0.05),
title_fontsize=18,
legend_fontsize=16,
cbar_pos=(0.01, 0.96, 0.1, 0.1), #left, bottom, width, height
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7f9d2d64c710>
# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
df=fam_freq_df_ggs,
row_colours=[fam_freq_genus_row_colours,fam_freq_species_row_colours],
luts=[fam_g_lut, fam_s_lut],
legend_titles=['Genus', 'Species'],
bbox_to_anchors=[(0.2,1.04), (0.63,1.05)],
legend_cols=[1,5],
fig_size=(45,150),
file_path="../results/cazy_families/paper_genus_species_fam_freq_clustermap.pdf",
file_format='pdf',
font_scale=1,
dendrogram_ratio=(0.1,0.05),
title_fontsize=14,
legend_fontsize=12,
cbar_pos=(0.01, 0.96, 0.1, 0.1), #left, bottom, width, height
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7f9cff7e13d0>
The clustering of the genomes into their respective genera appears to also correlate with the soft versus hard plant tissue targeting phenotypes. Therefore, add an additional row colour that colour codes each row by the genomes preference for soft or hard plant tissues.
# define a colour scheme to colour code SOFT vs HARD plant tissue targeting genomes
phenotype_col = []
for ri in range(len(fam_freq_df_ggs)):
if list(fam_freq_df['Genus'])[ri] in ['Pectobacterium', 'Dickeya', 'Musicola']:
phenotype_col.append('Soft tissue targeting')
else:
phenotype_col.append('Hard tissue targeting')
fam_freq_df_ggs['Phenotype'] = phenotype_col
fam_freq_pheno_row_colours, fam_p_lut = build_row_colours(fam_freq_df_ggs, 'Phenotype', "Set1")
build_family_clustermap_multi_legend(
df=fam_freq_df_ggs,
row_colours=[fam_freq_pheno_row_colours, fam_freq_genus_row_colours],
luts=[fam_p_lut, fam_g_lut],
legend_titles=['Phenotype', 'Genus'],
bbox_to_anchors=[(0.225,1.045), (0.63,1.04)],
legend_cols=[1,5],
fig_size=(27,41),
file_path="../results/cazy_families/paper_pheno_genus_fam_freq_clustermap.png",
file_format='png',
font_scale=0.7,
dendrogram_ratio=(0.1,0.05),
title_fontsize=18,
legend_fontsize=16,
cbar_pos=(0.01, 0.96, 0.1, 0.1), #left, bottom, width, height
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7f9d14438e90>
In the clustermaps the genomes GCA_029023745.1 (Pectobacterium colocasium), GCA_000749925.1 and GCA_000749945.1 (Pectobacterium betavasulorum) contained under estimated representations of their respective CAZomes.
Six Pectobacterium genomes were not included within the main Pectobacterium subtree (dendrogram on the RHS of clustermap):
Extracted from the paper:
The genomes appeared to contain fewer total CAZymes (inferred from the lower CAZy family frequencies) than other Pectobacterium genomes, inferring a potential underestimation of their CAZyme features. Genomes GCA_000749925.1, GCA_000749845.1, and GCA_000803215.1 were were listed with the assembly status 'contig' in NCBI (June 2021). Genomic assemblies with the assembly status of 'contig' may contain incomplete genomic sequences. Indeed, the reported CheckM (Parks et al 2015, Genome Res) analysis listed the GCA_000749925.1 and GCA_000749845.1 as missing 5% (100th percentile) of their genomes with 2.25-2.5% contamination, and GCA_000803215.1 as missing 10% (100th percentile). Furthermore, although listed with the assembly status 'complete genome', assembly GCA_025946765.1 was listed as missing 19% (100th percentile) of its genome by CheckM, and the scaffold GCA_004137815.1 was listed as missing 11% (33rd percentile) with 9% contamination. Therefore, the annotated proteomes potentially underestimates the number of features (including CAZymes) in the genomes, and were excluded from the downstream analyses. The genome GCA_029023745.1 was listed with the assembly status 'complete genome', but the NCBI Prokaryotic Genome Annotation Pipeline (PGAGP) output contained a suspiciously high number of frameshifted proteins (greater than 30%), inferring a potentially poor annotation of the genome that may have resulted in an underestimation of its CAZyme features. Therefore, this genome was also excluded from downstream analyses.
genomes_to_remove = [
'GCA_000749925.1',
'GCA_000749845.1',
'GCA_000803215.1',
'GCA_025946765.1',
'GCA_004137815.1',
'GCA_029023745.1',
]
fam_freq_filtered_df = fam_freq_df[~fam_freq_df['Genome'].isin(genomes_to_remove)]
print(f"Original df length: {len(fam_freq_df)}\nLength after removing genome: {len(fam_freq_filtered_df)}")
Original df length: 717 Length after removing genome: 711
Replot the clustermap, exlucding the removed genomes.
fam_freq_filtered_df_ggs = fam_freq_filtered_df.set_index(['Genome', 'Genus', 'Species'])
# make a figure the optimal size to fit in a paper
build_family_clustermap(
fam_freq_df_ggs,
row_colours=fam_freq_genus_row_colours,
fig_size=(20,70),
file_path="../results/cazy_families/paper_fam_freq_clustermap_FILTERED.svg",
file_format='svg',
font_scale=0.5,
lut=fam_g_lut,
legend_title='Genus',
dendrogram_ratio=(0.1,0.05),
title_fontsize=18,
legend_fontsize=16,
cbar_pos=(0, 0.95, 0.05, 0.05),
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7f9cf9127990>
Identify CAZy families that are only present in one group, e.g. one Genus, using the function get_group_specific_fams from cazomevolve.
Specifically, get_group_specific_fams returns two dicts:
{group: {only unique fams}}{group: {all fams}}all_families = list(fam_freq_df.columns)[3:]
# dict {group: {only unique fams}} and dict {group: {all fams}}
unique_grp_fams, group_fams = get_group_specific_fams(fam_freq_filtered_df, 'Genus', all_families)
unique_grp_fams
Identifying fams in each Genus: 100%|█████████████████████████████████████████████████████████████| 711/711 [00:09<00:00, 72.41it/s] Identifying Genus specific fams: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 12363.46it/s]
{'Pectobacterium': {'AA10',
'CBM13',
'GH121',
'GH146',
'GH18',
'GT101',
'GT102',
'GT11',
'GT111',
'GT14',
'GT24',
'GT52',
'PL11'},
'Dickeya': {'CBM4', 'CE2', 'GH113', 'GH148', 'GH25', 'GH91', 'GT97', 'PL10'},
'Brenneria': {'GH106', 'GT21', 'PL17'},
'Acerihabitans': {'GH127', 'GH15'}}
Identify families that are only found in hard plant tissue targeting genomes, and those families only found in soft plant tissue targeting species.
hard_soft_fams_dict = {'hard': set(), 'soft': set(), 'unknown': set()}
for ri in tqdm(range(len(fam_freq_filtered_df)), desc="Identifying Soft and Hard plant tissue targeting families"):
genus = fam_freq_filtered_df.iloc[ri]['Genus']
if genus in ['Pectobacterium','Dickeya']:
grp = 'soft'
elif genus == 'Musicola':
grp = 'unknown'
else:
grp = 'hard'
for fam in fam_freq_filtered_df.columns[3:]:
if fam_freq_filtered_df.iloc[ri][fam] >= 1:
hard_soft_fams_dict[grp].add(fam)
unique_hard_fams = hard_soft_fams_dict['hard'].difference(hard_soft_fams_dict['soft'])
unique_soft_fams = hard_soft_fams_dict['soft'].difference(hard_soft_fams_dict['hard'])
print("Hard plant tissue targeting specific families:")
for fam in unique_hard_fams:
print(fam)
print("Soft plant tissue targeting specific families:")
for fam in unique_soft_fams:
print(fam)
Identifying Soft and Hard plant tissue targeting families: 0%| | 0/711 [00:00<?, ?it/s]
Hard plant tissue targeting specific families: GH140 GH15 PL17 GH127 GT21 GH37 GH51 GH39 GH106 Soft plant tissue targeting specific families: GT24 GH146 CBM4 GH18 GT25 CBM0 GT14 GT111 PL11 GH113 GH91 CBM91 PL10 PL35 GT101 GT52 CBM13 GH16 GH148 GT11 CE2 AA10 GH25 GH121 GT97 GT102
# find the number of soft plant tissue targeting genomes that contain GH68
# find unique counts - if all 1 or 0 then can use sum to get genome count
print(set(fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(['Pectobacterium','Dickeya'])]['GH68']))
# get num of genomes
sum(fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(['Pectobacterium','Dickeya'])]['GH68'])
{0, 1}
4
Compile all this data on genus and phenotype specific CAZy families into a single dataframe that will be similar to one presented in a paper/report.
# convert into df
unique_grp_data = []
unique_grp_fams['Hard tissue'] = unique_hard_fams
unique_grp_fams['Soft tissue'] = unique_soft_fams
for genus in unique_grp_fams:
new_data = [genus]
for cazy_class in ['GH', 'GT', 'CE', 'PL', 'AA', 'CBM']:
added = False
class_data = []
for fam in unique_grp_fams[genus]:
if fam.startswith(cazy_class):
class_data.append(fam)
added = True
if added is False:
class_data.append("")
class_data.sort()
new_data.append(", ".join(class_data))
unique_grp_data.append(new_data)
unique_grp_df = pd.DataFrame(unique_grp_data, columns=['Genus', 'GH', 'GT', 'CE', 'PL', 'AA', 'CBM'])
unique_grp_df.to_csv("../results/cazy_families/unique_grp_fams.tsv", sep='\t')
unique_grp_df
| Genus | GH | GT | CE | PL | AA | CBM | |
|---|---|---|---|---|---|---|---|
| 0 | Pectobacterium | GH121, GH146, GH18 | GT101, GT102, GT11, GT111, GT14, GT24, GT52 | PL11 | AA10 | CBM13 | |
| 1 | Dickeya | GH113, GH148, GH25, GH91 | GT97 | CE2 | PL10 | CBM4 | |
| 2 | Brenneria | GH106 | GT21 | PL17 | |||
| 3 | Acerihabitans | GH127, GH15 | |||||
| 4 | Hard tissue | GH106, GH127, GH140, GH15, GH37, GH39, GH51 | GT21 | PL17 | |||
| 5 | Soft tissue | GH113, GH121, GH146, GH148, GH16, GH18, GH25, ... | GT101, GT102, GT11, GT111, GT14, GT24, GT25, G... | CE2 | PL10, PL11, PL35 | AA10 | CBM0, CBM13, CBM4, CBM91 |
Pectobacterium and Dickeya share a similar plant host range but show notable diversity in the compositions of the CAZyme-complements. drop all genomes not from Pectobacterium and Dickeya from the fam_freq_df dataframe, and repeat the analysis to identify genus specific CAZy families.
all_families = list(fam_freq_filtered_df.columns)[3:]
pd_fam_freq_df_filtered = fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(
['Pectobacterium', 'Dickeya']
)]
# dict {group: {only unique fams}} and dict {group: {all fams}}
pd_unique_grp_fams, pd_group_fams = get_group_specific_fams(pd_fam_freq_df_filtered, 'Genus', all_families)
pd_unique_grp_fams
Identifying fams in each Genus: 100%|█████████████████████████████████████████████████████████████| 632/632 [00:08<00:00, 72.62it/s] Identifying Genus specific fams: 100%|█████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11244.78it/s]
{'Pectobacterium': {'AA10',
'CBM13',
'CBM18',
'CBM3',
'CBM67',
'GH108',
'GH12',
'GH121',
'GH146',
'GH153',
'GH154',
'GH18',
'GH38',
'GH65',
'GH68',
'GT101',
'GT102',
'GT11',
'GT111',
'GT14',
'GT20',
'GT24',
'GT32',
'GT52',
'GT73',
'PL11'},
'Dickeya': {'CBM4',
'CE2',
'GH113',
'GH148',
'GH25',
'GH26',
'GH91',
'GT97',
'PL0',
'PL10'}}
To estimate if the CAZome has been exhausted, rare faction curves where generated for all Pectobacteriaceae genoems, and for each genus represented by multiple genomes:
def plot_rarefaction_curve(
df,
all_families,
num_of_runs=100,
):
"""Build rare faction curve
:param df: pandas df, cazy fam freq df
:param all_families: list of CAZy families to be analysed
:param num_of_runs: number of times to resample dataset
Return
* rare faction plot
* df of the number of families added with new additional genome
* df of number of families in total fam pool with each additional genome
"""
genomes = list(df['Genome'])
run_outputs_addings = {} # num of run: [num of new fams found ADDED a new genome to the pool]
run_outputs_counts = {} # num of run: [num of unique fams in total fam pool after adding a new genome to the pool]
for run in tqdm(range(num_of_runs), desc="Counting added families"):
random.shuffle(genomes)
fams_to_parse = set(copy(all_families))
num_added_families = [0] # one item for each genome, int, num of new families added
family_pool_size = [0] # number of unqiue fams seen after adding each genome to the pool
families = set() # to track which families have been seen
for genome in genomes:
fams_added = 0 # num of fams added to total pool for this genome
genome_row = df[df['Genome'] == genome]
for fam in fams_to_parse:
if genome_row[fam].values[0] > 0: # fam in genome
if fam not in families:
fams_added += 1
families.add(fam)
# parsed all families
# record number of new families found in this genome
num_added_families.append(fams_added)
# record number of unique fams seen after analysing genomes so far
family_pool_size.append(len(families))
# find which families are in fams_to_parse but are not in families
# do not reparse fams that have already been found
fams_to_parse = fams_to_parse.difference(families)
# parsed all genomes
run_outputs_addings[run] = num_added_families
run_outputs_counts[run] = family_pool_size
run_outputs_df_addings = pd.DataFrame(run_outputs_addings)
run_outputs_df_counts = pd.DataFrame(run_outputs_counts)
# calculate the averages
run_count_avg = [] # avg num of fams after adding each genome
genomes = list(range(1, len(run_outputs_df_counts)+1))
for i in range(len(run_outputs_df_counts)):
run_count_avg.append(np.mean(run_outputs_df_counts.iloc[i]))
genomes.append
run_avg_df = pd.DataFrame(
{
'Number of CAZy families': run_count_avg,
'Number of genomes': genomes,
}
)
# Generate plot
# subplot 1 = all runs
# subplot 2 = averages of all runs
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7.5))
#add DataFrames to subplots
run_outputs_df_counts.plot(
ax=axes[0],
xlabel='Number of genomes',
ylabel='Number of CAZy families',
legend=False,
);
run_avg_df.plot(
ax=axes[1],
x='Number of genomes',
y='Number of CAZy families',
legend=False,
);
# raref_curve = sns.lineplot(
# data=run_outputs_df_counts,
# legend=False,
# ); # linestyle='solid',
# plt.xlabel('Number of genomes');
# plt.ylabel('Number of CAZy families');
return fig, run_outputs_df_addings, run_outputs_df_counts
all_families = list(fam_freq_df.columns)[3:]
# calculate rare factions for all of pectobacteriaceae
pecto_rarefact_plt, pecto_runs_addings, pecto_runs_counts = plot_rarefaction_curve(
fam_freq_filtered_df,
all_families,
)
Counting added families: 0%| | 0/100 [00:00<?, ?it/s]
# calculate rare factions for Pectobacterium
pectobact_rarefact_plt, pectobact_runs_addings, pectobact_runs_counts = plot_rarefaction_curve(
fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Pectobacterium'],
all_families,
)
Counting added families: 0%| | 0/100 [00:00<?, ?it/s]
# calculate rare factions for Dickeya
dic_rarefact_plt, dic_runs_addings, dic_runs_counts = plot_rarefaction_curve(
fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Dickeya'],
all_families,
)
Counting added families: 0%| | 0/100 [00:00<?, ?it/s]
# calculate rare factions for Musicola
mus_rarefact_plt, mus_runs_addings, mus_runs_counts = plot_rarefaction_curve(
fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Musicola'],
all_families,
)
Counting added families: 0%| | 0/100 [00:00<?, ?it/s]
# calculate rare factions for Brenneria
ben_rarefact_plt, ben_runs_addings, ben_runs_counts = plot_rarefaction_curve(
fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Brenneria'],
all_families,
)
Counting added families: 0%| | 0/100 [00:00<?, ?it/s]
# calculate rare factions for Lonsdalea
lon_rarefact_plt, lon_runs_addings, lon_runs_counts = plot_rarefaction_curve(
fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Lonsdalea'],
all_families,
)
Counting added families: 0%| | 0/100 [00:00<?, ?it/s]
Plot all the averages (i.e. curves representing the average number of families in total family pool after randomly adding a new genome to the dataset) on a single plot.
def calc_run_avg(run_outputs_df_counts):
"""Calc average num of families in total fam pool after adding a genome to the dataset"""
run_count_avg = [] # avg num of fams after adding each genome
genomes = list(range(1, len(run_outputs_df_counts)+1))
for i in range(len(run_outputs_df_counts)):
run_count_avg.append(np.mean(run_outputs_df_counts.iloc[i]))
genomes.append
run_avg_df = pd.DataFrame(
{
'Number of CAZy families': run_count_avg,
'Number of genomes': genomes,
}
)
return run_avg_df
# calculate average num of fams in total fam pool after adding a new genome to the data set
pecto_run_avg = calc_run_avg(pecto_runs_counts)
pectobact_run_avg = calc_run_avg(pectobact_runs_counts)
dic_run_avg = calc_run_avg(dic_runs_counts)
mus_run_avg = calc_run_avg(mus_runs_counts)
ben_run_avg = calc_run_avg(ben_runs_counts)
lon_run_avg = calc_run_avg(lon_runs_counts)
# build into a single df
df_len = len(pecto_run_avg) # len all cols need to be
run_avg_data = {
'Pectobacteriaceae': pecto_run_avg['Number of CAZy families'],
'Pectobacterium': list(pectobact_run_avg['Number of CAZy families']) + (
[None] * (df_len - len(pectobact_run_avg['Number of CAZy families']))
),
'Dickeya': list(dic_run_avg['Number of CAZy families']) + (
[None] * (df_len - len(dic_run_avg['Number of CAZy families']))
),
'Musicola': list(mus_run_avg['Number of CAZy families']) + (
[None] * (df_len - len(mus_run_avg['Number of CAZy families']))
),
'Brenneria': list(ben_run_avg['Number of CAZy families']) + (
[None] * (df_len - len(ben_run_avg['Number of CAZy families']))
),
'Lonsdalea': list(lon_run_avg['Number of CAZy families']) + (
[None] * (df_len - len(lon_run_avg['Number of CAZy families']))
),
}
run_avg_df = pd.DataFrame(run_avg_data)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
run_avg_df.plot(ax=axes);
Identify CAzy families that are present in every genome in the dataset using the identify_core_cazome() function from cazomevolve. These families form the 'core CAZome'.
The function takes one positional argument, a dataframe of CAZy family frequencies (with only CAZy families included in the columns, i.e no taxonomy columns).
# make output directory
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/core_cazome/'), force=True, nodelete=True)
Output directory ../results/core_cazome exists, nodelete is True. Adding output to output directory.
fam_freq_filtered_df_ggs = fam_freq_filtered_df.set_index(['Genome', 'Genus', 'Species'])
core_cazome = identify_core_cazome(fam_freq_filtered_df_ggs)
core_cazome = list(core_cazome)
core_cazome.sort()
print(f"Total families: {len(all_families)}")
print("The core CAZy families are:")
for fam in core_cazome:
print('-', fam)
Identifying core CAZome: 100%|██████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 9004.62it/s]
Total families: 117 The core CAZy families are: - CBM5 - CBM50 - GH23 - GH3 - GT2 - GT51 - GT9
The boxplot shows the frequency of each CAZy family across all genomes in the dataframe. We can also break down this data by genus, and build a dataframe of Family, Genus (or tax rank of choice), genome, and frequency.
This dataframe can then be used to build a second dataframe of:
# filter the famil freq df to include only those families in the core CAZome
core_cazome_df = fam_freq_filtered_df_ggs[core_cazome]
plot_fam_boxplot(core_cazome_df, font_scale=0.8, fig_size=(12,6))
The boxplot shows the frequency of each core CAZy family across all Pectobacteriaceae. To break down the frequency by genus, build a dataframe with the mean (and SD) of frequency of each family in the core CAZome per genus. This dataframe can then be used to plot a proportional area plot of the mean frequency of each CAZy family per genus, for exampling using RawGraphs.
core_cazome_df_genus = copy(core_cazome_df) # to ensure core_cazome_df is not altereted
core_cazome_df_genus = add_tax_column_from_row_index(core_cazome_df_genus, 'Genus', 1)
core_cazome_df_genus.head()
| CBM5 | CBM50 | GH23 | GH3 | GT2 | GT51 | GT9 | Genus | |||
|---|---|---|---|---|---|---|---|---|---|---|
| Genome | Genus | Species | ||||||||
| GCA_009931295.1 | Pectobacterium | odoriferum | 1 | 5 | 10 | 3 | 7 | 3 | 3 | Pectobacterium |
| GCA_009874305.1 | Dickeya | solani | 2 | 6 | 5 | 3 | 11 | 3 | 4 | Dickeya |
| GCA_016944275.1 | Pectobacterium | brasiliense | 1 | 6 | 10 | 2 | 9 | 3 | 4 | Pectobacterium |
| GCA_003403135.1 | Dickeya | dianthicola | 2 | 6 | 9 | 3 | 11 | 3 | 4 | Dickeya |
| GCA_922011735.1 | Pectobacterium | versatile | 1 | 6 | 5 | 2 | 8 | 3 | 3 | Pectobacterium |
core_cazome_fggf_df, core_cazome_mean_freq_df = build_fam_mean_freq_df(
core_cazome_df_genus,
'Genus',
round_by=2,
)
# add rows showing the means across all pectobacteriaceae
all_pecto_core_fam_data = []
for fam in core_cazome_df_genus.columns:
try:
mean_freq = np.mean(core_cazome_df_genus[fam]).round(2)
sd_freq = np.std(core_cazome_df_genus[fam]).round(2)
all_pecto_core_fam_data.append([fam, 'Pectobacteriaceae', mean_freq, sd_freq])
except TypeError: # tax column
continue
temp_df = pd.DataFrame(all_pecto_core_fam_data, columns=['Family','Genus','MeanFreq','SdFreq'])
core_cazome_mean_freq_df = pd.concat([core_cazome_mean_freq_df, temp_df])
core_cazome_mean_freq_df.to_csv("../results/core_cazome/core_cazome_freqs.csv")
core_cazome_mean_freq_df
Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████| 711/711 [00:00<00:00, 5870.25it/s] Building [Fam, grp, mean freq, sd freq] df: 100%|████████████████████████████████████████████████████| 8/8 [00:00<00:00, 118.05it/s]
| Family | Genus | MeanFreq | SdFreq | |
|---|---|---|---|---|
| 0 | CBM5 | Lonsdalea | 1.00 | 0.00 |
| 1 | CBM50 | Lonsdalea | 5.79 | 0.52 |
| 2 | GH23 | Lonsdalea | 5.21 | 1.04 |
| 3 | GH3 | Lonsdalea | 1.97 | 0.28 |
| 4 | GT2 | Lonsdalea | 7.03 | 0.86 |
| ... | ... | ... | ... | ... |
| 2 | GH23 | Pectobacteriaceae | 6.49 | 1.55 |
| 3 | GH3 | Pectobacteriaceae | 2.49 | 0.60 |
| 4 | GT2 | Pectobacteriaceae | 8.15 | 2.10 |
| 5 | GT51 | Pectobacteriaceae | 3.08 | 0.37 |
| 6 | GT9 | Pectobacteriaceae | 3.70 | 0.56 |
63 rows × 4 columns
As well as looking at the core CAZome across all Pectobacteriaceae, we can identify the core CAZome of each genus. This is still done using the identify_core_cazome() function from cazomevolve, however, we filter the dataframe each time to retain only rows with data for the genera of interest.
These data (listing the genus specific core CAZomes) is used to aupsetplot to highlight the differences and similarities between the core CAZomes.
genera_of_interest = ['Pectobacterium', 'Dickeya', 'Musicola', 'Brenneria', 'Lonsdalea']
all_families = fam_freq_filtered_df_ggs.columns
core_cazomes = {} # {genus: {fams}}
for genus in genera_of_interest:
filtered_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'] == genus]
temp_core_cazome = identify_core_cazome(filtered_df[all_families])
temp_core_cazome = list(temp_core_cazome)
temp_core_cazome.sort()
core_cazomes[genus] = {'fams': temp_core_cazome, 'freqs': {len(filtered_df)}}
core_cazomes
Identifying core CAZome: 100%|██████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 7996.18it/s] Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 17363.72it/s] Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 29897.26it/s] Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 25178.74it/s] Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 29642.62it/s]
{'Pectobacterium': {'fams': ['CBM5',
'CBM50',
'GH1',
'GH103',
'GH23',
'GH28',
'GH3',
'GH43',
'GT2',
'GT51',
'GT9',
'PL1',
'PL2',
'PL22',
'PL3',
'PL9'],
'freqs': {426}},
'Dickeya': {'fams': ['CBM48',
'CBM5',
'CBM50',
'CE4',
'CE8',
'GH1',
'GH103',
'GH105',
'GH13',
'GH23',
'GH28',
'GH3',
'GH33',
'GH73',
'GH77',
'GH8',
'GT1',
'GT19',
'GT2',
'GT28',
'GT35',
'GT4',
'GT5',
'GT51',
'GT9',
'PL1',
'PL9'],
'freqs': {206}},
'Musicola': {'fams': ['CBM48',
'CBM5',
'CBM50',
'CE1',
'CE11',
'CE12',
'CE4',
'CE8',
'CE9',
'GH1',
'GH102',
'GH103',
'GH104',
'GH105',
'GH13',
'GH19',
'GH2',
'GH23',
'GH28',
'GH3',
'GH30',
'GH31',
'GH32',
'GH33',
'GH38',
'GH5',
'GH73',
'GH77',
'GH8',
'GT0',
'GT1',
'GT19',
'GT2',
'GT26',
'GT28',
'GT30',
'GT35',
'GT4',
'GT5',
'GT51',
'GT56',
'GT81',
'GT83',
'GT9',
'PL1',
'PL2',
'PL22',
'PL9'],
'freqs': {4}},
'Brenneria': {'fams': ['CBM5',
'CBM50',
'CE11',
'CE12',
'CE9',
'GH1',
'GH102',
'GH103',
'GH13',
'GH23',
'GH28',
'GH3',
'GH32',
'GH4',
'GH68',
'GH73',
'GH94',
'GT0',
'GT19',
'GT2',
'GT26',
'GT28',
'GT30',
'GT35',
'GT4',
'GT5',
'GT51',
'GT56',
'GT8',
'GT81',
'GT84',
'GT9'],
'freqs': {33}},
'Lonsdalea': {'fams': ['CBM32',
'CBM5',
'CBM50',
'CE11',
'CE4',
'GH19',
'GH23',
'GH3',
'GH32',
'GH37',
'GH68',
'GH77',
'GH8',
'GT19',
'GT2',
'GT20',
'GT26',
'GT28',
'GT4',
'GT51',
'GT56',
'GT9'],
'freqs': {39}}}
core_cazome_upsetplot_membership = []
core_cazome_upsetplot_membership = add_to_upsetplot_membership(
core_cazome_upsetplot_membership,
core_cazomes,
)
len(core_cazome_upsetplot_membership)
708
core_cazome_upsetplot = build_upsetplot(
core_cazome_upsetplot_membership,
sort_by='input',
file_path='../results/core_cazome/genera_core_cazome.svg',
)
Again by filtering the rows in the dataframe of CAZyme family frequencies, we can identify a core CAZome in any custom subset of genomes. This includes identifing the core CAZomes in soft and hard plant tissue targeting Pectobacteriaceae CAZomes.
soft_genera = ['Pectobacterium', 'Dickeya', 'Musicola']
hard_genera = ['Brenneria', 'Lonsdalea', 'Samsonia', 'Affinibrenneria', 'Acerihabitans']
grps = [[soft_genera, 'Soft tissue targeting'], [hard_genera, 'Hard tissue targeting']]
all_families = fam_freq_filtered_df_ggs.columns
soft_hard_core_cazomes = {} # {grp: {fams}}
for grp in tqdm(grps):
# gather all rows containing the genera of interest
filtered_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(grp[0])]
temp_core_cazome = identify_core_cazome(filtered_df[all_families])
temp_core_cazome = list(temp_core_cazome)
temp_core_cazome.sort()
try:
soft_hard_core_cazomes[grp[1]]
except KeyError:
soft_hard_core_cazomes[grp[1]] = {'fams': set(), 'freqs': [0]}
soft_hard_core_cazomes[grp[1]]['fams'] = soft_hard_core_cazomes[grp[1]]['fams'].union(
set(temp_core_cazome)
)
soft_hard_core_cazomes[grp[1]]['freqs'][0] += len(filtered_df)
soft_hard_core_cazomes
0%| | 0/2 [00:00<?, ?it/s]
Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 10814.83it/s] Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 27753.28it/s]
{'Soft tissue targeting': {'fams': {'CBM5',
'CBM50',
'GH1',
'GH103',
'GH23',
'GH28',
'GH3',
'GT2',
'GT51',
'GT9',
'PL1',
'PL9'},
'freqs': [636]},
'Hard tissue targeting': {'fams': {'CBM5',
'CBM50',
'CE11',
'GH23',
'GH3',
'GT19',
'GT2',
'GT26',
'GT28',
'GT4',
'GT51',
'GT56',
'GT9'},
'freqs': [75]}}
soft_hard_core_cazomes.update(core_cazomes)
core_cazome_upsetplot_membership = []
core_cazome_upsetplot_membership = add_to_upsetplot_membership(
core_cazome_upsetplot_membership,
soft_hard_core_cazomes,
)
len(core_cazome_upsetplot_membership)
1419
core_cazome_upsetplot = build_upsetplot(
core_cazome_upsetplot_membership,
file_path='../results/core_cazome/genera_soft_hard_core_cazome.svg',
)
We can use cazomevolve to identify CAZy families that are always present in a genome together, although each group of always co-occurring CAZy families may not be present in every genomes.
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/cooccurring_families/'), force=True, nodelete=True)
# reminder of the structure of the df
fam_freq_filtered_df.head(1)
Output directory ../results/cooccurring_families exists, nodelete is True. Adding output to output directory.
| Genome | Genus | Species | AA10 | AA3 | CBM0 | CBM13 | CBM18 | CBM3 | CBM32 | ... | PL11 | PL17 | PL2 | PL22 | PL26 | PL3 | PL35 | PL38 | PL4 | PL9 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | GCA_009931295.1 | Pectobacterium | odoriferum | 0 | 1 | 0 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 2 | 1 | 1 | 2 | 0 | 1 | 1 | 2 |
1 rows × 120 columns
There are two approaches are implemented by cazomevolve to idenitfy CAZy families that always co-occur together. The first generates a correlation matrix.
CAZy families that always appear together can be identified by generating a correlation matrix using the Python package pandas, CAZy families that are always present together will have a correlation matrix of 1.
This can be done using the identify_cooccurring_fams_corrM() function. CAZy families that are always present in the genome (i.e. the core CAZome), or are absent from all genomes will be calulcated to have a correlation score of nan.
identify_cooccurring_fams_corrM() returns:
all_families = list(fam_freq_filtered_df.columns[3:])
cooccurring_families, fam_corr_M_filled = identify_cooccurring_fams_corrM(
fam_freq_filtered_df,
all_families,
core_cazome=[],
corrM_path="../results/cooccurring_families/fam_corr_M_filled.csv",
fill_value=2,
)
Building binary fam freq df: 100%|██████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 2514.93it/s] Delete absent families: 100%|███████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 7081.50it/s] Identifying always co-occurring families: 100%|█████████████████████████████████████████████████| 117/117 [00:00<00:00, 3121.93it/s]
cooccurring_families
{('CBM4', 'GH148'), ('GH121', 'GH146'), ('GH127', 'GH15'), ('GH94', 'GT84')}
Generate a clustermap of the correlation matrix.
sns.clustermap(
fam_corr_M_filled,
cmap=sns.cubehelix_palette(rot=0, dark=2, light=0, reverse=True, as_cmap=True),
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg) /home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance. warnings.warn(msg)
<seaborn.matrix.ClusterGrid at 0x7f9cf91ffad0>
The second method implemented by cazomevolve to identify CAZy families that always co-occur together is an iterative approach:
Iterate through the dataframe of CAZy family frequencies in Pectobacteriaceae (fam_freq_df_filtered) and identify the groups of always co-occurring CAZy families (i.e. those families that are always present together) and count the number of genomes in which the families are present together.
This is done using the cazomevolve function calc_cooccuring_fam_freqs(), which returns a dictionary of groups of co-occurring CAZy families. The function takes as input:
cooccurring_fams_dict = calc_cooccuring_fam_freqs(
fam_freq_filtered_df,
list(all_families),
exclude_core_cazome=False,
)
cooccurring_fams_dict
Identifying pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 117/117 [00:01<00:00, 93.38it/s] Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 25/25 [00:00<00:00, 108323.97it/s]
{0: {'fams': {'CBM4', 'GH148'}, 'freqs': {8}},
1: {'fams': {'CBM5', 'CBM50', 'GH23', 'GH3', 'GT2', 'GT51', 'GT9'},
'freqs': {711}},
2: {'fams': {'GH121', 'GH146'}, 'freqs': {1}},
3: {'fams': {'GH127', 'GH15'}, 'freqs': {1}},
4: {'fams': {'GH94', 'GT84'}, 'freqs': {309}}}
Similar to above when exploring core CAZomes, we can filiter the dataframe of CAZy family frequencies and run the analysis to identify always co-occurring CAZy families on custom subsets of genomes.
In this instance we reran the analysis for each genus in Pectobacteriaceae to identify genus specific groups of always co-occurring CAZy families.
genera_cooccuring_fams = {} # {genus: cooccurring_fams_dict}
for genus in tqdm(
['Pectobacterium', 'Dickeya', 'Musicola', 'Lonsdalea', 'Brenneria'],
desc="Identifying genus specific co-occurring fams",
):
genus_fam_freq_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'] == genus]
genus_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
genus_fam_freq_df,
list(all_families),
exclude_core_cazome=False,
)
genera_cooccuring_fams[genus] = genus_cooccurring_fams_dict
genera_cooccuring_fams
Identifying genus specific co-occurring fams: 0%| | 0/5 [00:00<?, ?it/s]
Identifying pairs of co-occurring families: 0%| | 0/117 [00:00<?, ?it/s] Identifying pairs of co-occurring families: 11%|█████▍ | 13/117 [00:00<00:00, 126.59it/s] Identifying pairs of co-occurring families: 22%|██████████▉ | 26/117 [00:00<00:00, 111.33it/s] Identifying pairs of co-occurring families: 39%|███████████████████▎ | 46/117 [00:00<00:00, 145.54it/s] Identifying pairs of co-occurring families: 52%|█████████████████████████▌ | 61/117 [00:00<00:00, 138.33it/s] Identifying pairs of co-occurring families: 65%|███████████████████████████████▊ | 76/117 [00:00<00:00, 139.37it/s] Identifying pairs of co-occurring families: 78%|██████████████████████████████████████ | 91/117 [00:00<00:00, 129.62it/s] Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 131.17it/s] Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 135/135 [00:00<00:00, 322638.77it/s] Identifying pairs of co-occurring families: 0%| | 0/117 [00:00<?, ?it/s] Identifying pairs of co-occurring families: 21%|██████████ | 24/117 [00:00<00:00, 239.91it/s] Identifying pairs of co-occurring families: 41%|████████████████████ | 48/117 [00:00<00:00, 205.63it/s] Identifying pairs of co-occurring families: 69%|█████████████████████████████████▉ | 81/117 [00:00<00:00, 253.84it/s] Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 242.16it/s] Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 361/361 [00:00<00:00, 558106.80it/s] Identifying pairs of co-occurring families: 100%|███████████████████████████████████████████████| 117/117 [00:00<00:00, 1857.02it/s] Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████| 1130/1130 [00:00<00:00, 412135.96it/s] Identifying pairs of co-occurring families: 100%|███████████████████████████████████████████████| 117/117 [00:00<00:00, 1456.48it/s] Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 255/255 [00:00<00:00, 694962.65it/s] Identifying pairs of co-occurring families: 0%| | 0/117 [00:00<?, ?it/s] Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 716.76it/s] Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 500/500 [00:00<00:00, 835851.73it/s]
{'Pectobacterium': {0: {'fams': {'CBM3', 'GH5'}, 'freqs': {425}},
1: {'fams': {'CBM48', 'CE8', 'CE9', 'GH13'}, 'freqs': {425}},
2: {'fams': {'CBM5',
'CBM50',
'GH1',
'GH103',
'GH23',
'GH28',
'GH3',
'GH43',
'GT2',
'GT51',
'GT9',
'PL1',
'PL2',
'PL22',
'PL3',
'PL9'},
'freqs': {426}},
3: {'fams': {'CE11', 'GH102', 'GH32'}, 'freqs': {425}},
4: {'fams': {'GH105', 'GT56'}, 'freqs': {425}},
5: {'fams': {'GH121', 'GH146', 'GH154'}, 'freqs': {1}},
6: {'fams': {'GH94', 'GT84'}, 'freqs': {152}}},
'Dickeya': {0: {'fams': {'CBM4', 'GH148'}, 'freqs': {8}},
1: {'fams': {'CBM48',
'CBM5',
'CBM50',
'CE4',
'CE8',
'GH1',
'GH103',
'GH105',
'GH13',
'GH23',
'GH28',
'GH3',
'GH33',
'GH73',
'GH77',
'GH8',
'GT1',
'GT19',
'GT2',
'GT28',
'GT35',
'GT4',
'GT5',
'GT51',
'GT9',
'PL1',
'PL9'},
'freqs': {206}},
2: {'fams': {'CE11', 'GT83'}, 'freqs': {204}},
3: {'fams': {'GH16', 'GT25'}, 'freqs': {1}},
4: {'fams': {'GH19', 'GH5', 'PL4'}, 'freqs': {203}},
5: {'fams': {'GH88', 'PL35'}, 'freqs': {3}},
6: {'fams': {'GH94', 'GT84'}, 'freqs': {89}},
7: {'fams': {'GT30', 'PL3'}, 'freqs': {205}},
8: {'fams': {'PL2', 'PL22'}, 'freqs': {205}}},
'Musicola': {0: {'fams': {'CBM32', 'CBM63'}, 'freqs': {2}},
1: {'fams': {'CBM48',
'CBM5',
'CBM50',
'CE1',
'CE11',
'CE12',
'CE4',
'CE8',
'CE9',
'GH1',
'GH102',
'GH103',
'GH104',
'GH105',
'GH13',
'GH19',
'GH2',
'GH23',
'GH28',
'GH3',
'GH30',
'GH31',
'GH32',
'GH33',
'GH38',
'GH5',
'GH73',
'GH77',
'GH8',
'GT0',
'GT1',
'GT19',
'GT2',
'GT26',
'GT28',
'GT30',
'GT35',
'GT4',
'GT5',
'GT51',
'GT56',
'GT81',
'GT83',
'GT9',
'PL1',
'PL2',
'PL22',
'PL9'},
'freqs': {4}},
2: {'fams': {'GH24', 'GH36'}, 'freqs': {2}}},
'Lonsdalea': {0: {'fams': {'CBM32',
'CBM5',
'CBM50',
'CE11',
'CE4',
'GH19',
'GH23',
'GH3',
'GH32',
'GH37',
'GH68',
'GH77',
'GH8',
'GT19',
'GT2',
'GT20',
'GT26',
'GT28',
'GT4',
'GT51',
'GT56',
'GT9'},
'freqs': {39}},
1: {'fams': {'GH1', 'GH28', 'GH4', 'GH73', 'GT0'}, 'freqs': {38}},
2: {'fams': {'GH13', 'GH39', 'GT30', 'PL1', 'PL3'}, 'freqs': {38}},
3: {'fams': {'GH26', 'GH51'}, 'freqs': {9}},
4: {'fams': {'GH31', 'GT81'}, 'freqs': {38}},
5: {'fams': {'GH78', 'GT1'}, 'freqs': {10}},
6: {'fams': {'GH94', 'GT84'}, 'freqs': {33}}},
'Brenneria': {0: {'fams': {'CBM3', 'GH5'}, 'freqs': {25}},
1: {'fams': {'CBM5',
'CBM50',
'CE11',
'CE12',
'CE9',
'GH1',
'GH102',
'GH103',
'GH13',
'GH23',
'GH28',
'GH3',
'GH32',
'GH4',
'GH68',
'GH73',
'GH94',
'GT0',
'GT19',
'GT2',
'GT26',
'GT28',
'GT30',
'GT35',
'GT4',
'GT5',
'GT51',
'GT56',
'GT8',
'GT81',
'GT84',
'GT9'},
'freqs': {33}},
2: {'fams': {'GH106', 'PL38'}, 'freqs': {1}},
3: {'fams': {'GH8', 'GT83'}, 'freqs': {15}},
4: {'fams': {'GT73', 'PL17'}, 'freqs': {1}}}}
Identify families that always co-occurring in soft and hard plant tissue genera.
soft_genera = ['Pectobacterium', 'Dickeya', 'Musicola']
hard_genera = ['Brenneria', 'Lonsdalea', 'Samsonia', 'Affinibrenneria', 'Acerihabitans']
# hard_genera = ['Brenneria', 'Lonsdalea']
grps = [[soft_genera, 'Soft tissue targeting'], [hard_genera, 'Hard tissue targeting']]
for grp in tqdm(grps):
# gather all rows containing the genera of interest
grp_fam_freq_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(grp[0])]
grp_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
grp_fam_freq_df,
list(all_families),
exclude_core_cazome=False,
)
genera_cooccuring_fams[grp[1]] = grp_cooccurring_fams_dict
0%| | 0/2 [00:00<?, ?it/s]
Identifying pairs of co-occurring families: 0%| | 0/117 [00:00<?, ?it/s] Identifying pairs of co-occurring families: 4%|██▏ | 5/117 [00:00<00:02, 40.72it/s] Identifying pairs of co-occurring families: 10%|█████▏ | 12/117 [00:00<00:02, 52.20it/s] Identifying pairs of co-occurring families: 16%|████████ | 19/117 [00:00<00:01, 56.27it/s] Identifying pairs of co-occurring families: 25%|████████████▍ | 29/117 [00:00<00:01, 70.28it/s] Identifying pairs of co-occurring families: 36%|█████████████████▉ | 42/117 [00:00<00:00, 88.73it/s] Identifying pairs of co-occurring families: 44%|██████████████████████▏ | 52/117 [00:00<00:00, 89.34it/s] Identifying pairs of co-occurring families: 53%|██████████████████████████▍ | 62/117 [00:00<00:00, 88.79it/s] Identifying pairs of co-occurring families: 62%|███████████████████████████████▏ | 73/117 [00:00<00:00, 94.14it/s] Identifying pairs of co-occurring families: 71%|███████████████████████████████████▍ | 83/117 [00:00<00:00, 94.53it/s] Identifying pairs of co-occurring families: 80%|████████████████████████████████████████▏ | 94/117 [00:01<00:00, 97.29it/s] Identifying pairs of co-occurring families: 89%|███████████████████████████████████████████▌ | 104/117 [00:01<00:00, 97.81it/s] Identifying pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 117/117 [00:01<00:00, 86.75it/s] Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 75/75 [00:00<00:00, 125178.19it/s] Identifying pairs of co-occurring families: 0%| | 0/117 [00:00<?, ?it/s] Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 604.14it/s] Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 91/91 [00:00<00:00, 415775.23it/s]
One of the best ways to visualise the groups of always co-occurring CAZy families is to generate an upset plot.
Build an upsetplot (using the Python package upsetplot) to visulise the groups of always co-occurring CAZy families, additionally it will plot the number of genomes in which each group of co-occurring CAZy families were present.
First compile the data/membership for the upset plot by:
add_to_upsetplot_membership() functionupsetplot_membership = []
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, cooccurring_fams_dict)
for genus in genera_cooccuring_fams:
upsetplot_membership = add_to_upsetplot_membership(
upsetplot_membership,
genera_cooccuring_fams[genus],
)
len(upsetplot_membership)
7233
Building the upset plot will include the core CAZomes across Pectobacteriaceae, per genus, and per all soft plant tissue targeting genera and all hard plant tissue targeting genera.
pectobact_upsetplot = build_upsetplot(
upsetplot_membership,
file_path='../results/cooccurring_families/pecto-cooccurring-families.svg',
)
The upset plot generates a bar chart showing the number of genomes that each group of co-occuring CAZy families appeared in. However, this plots the total number across each of the groups (i.e. Pectobacterium, Dickeya, etc.).
To break down the indidence (i.e. the number of genomes that each group of co-occurring CAZy families were present in) per group, a dataframe listing each group of co-occurring CAZy families, the group (i.e. genus), and the respective frequency must be generated. This dataframe can then be used to generate a proportional area plot (or matrix plot), breaking down the incidence per group (i.e. genus).
The groups of co-occurring CAZy families must be listed in the same order as they are presented in the upset plot.
upset_plot_groups = get_upsetplot_grps(upsetplot_membership)
100%|███████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:01<00:00, 29.69it/s]
Compiling the data of the incidence of each grp of co-occurring CAZy families per group of interest (e.g. per genus), into a single dataframe.
Create an empty list to store all data for the dataframe, then use add_upsetplot_grp_freqs to add data of the incidence per group of co-occurring CAZy families to the list. build_upsetplot_matrix is then used to build the dataframe.
cooccurring_grp_freq_data = [] # empty list to store data for the df
# add pectobacteriaceae data
genera_cooccuring_fams['Pectobacteriaceae'] = cooccurring_fams_dict
# add data for each genus, all soft plant targeting and hard plant tissue targeting
cooccurring_grp_freq_data = add_upsetplot_grp_freqs(
upset_plot_groups,
cooccurring_grp_freq_data,
genera_cooccuring_fams,
genus,
grp_sep=True,
grp_order=[
'Pectobacteriaceae',
'Pectobacterium', 'Dickeya', 'Musicola', 'Soft tissue targeting',
'Brenneria', 'Lonsdalea', 'Hard tissue targeting',
],
include_none=True,
)
Compiling co-occurring families incidence data: 100%|████████████████████████████████████████████| 38/38 [00:00<00:00, 24853.20it/s]
Build a single dataframe of co-occurring families, freq and group (e.g. genus).
But also list the information for each group in the same order the groups of CAZy families are listed in the upset plot. This allows a proportional area plot to be generated (for example, by using RawGraphs), which can then be combined with the upset plot (for example, using inkscape).
# build the dataframe
cooccurring_fams_freq_df = build_upsetplot_matrix(
cooccurring_grp_freq_data,
'Genus',
file_path='../results/cooccurring_families/cooccurring_fams_freqs.csv',
)
cooccurring_fams_freq_df
| Families | Genus | Incidence | |
|---|---|---|---|
| 0 | PL2+PL22 | Pectobacteriaceae | NaN |
| 1 | PL2+PL22 | Pectobacterium | NaN |
| 2 | PL2+PL22 | Dickeya | 205.0 |
| 3 | PL2+PL22 | Musicola | NaN |
| 4 | PL2+PL22 | Soft tissue targeting | 635.0 |
| ... | ... | ... | ... |
| 299 | GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28... | Musicola | 4.0 |
| 300 | GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28... | Soft tissue targeting | NaN |
| 301 | GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28... | Brenneria | NaN |
| 302 | GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28... | Lonsdalea | NaN |
| 303 | GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28... | Hard tissue targeting | NaN |
304 rows × 3 columns
After analysing the data, mannually group of the soft and hard tissue targeting specific groups of CAZy families together and mannually define the order the groups are presented in the final upset plot (by setting the param sort_by to 'input').
grp_order = {
'soft_grps': [ # grps only found in soft plant tissue targeting genomes
'GH13+CBM48+CE8', # S
'PL2+PL22', # # S D
'GH148+CBM4', # S D
'PL3+GT30', # D
'PL35+GH88', # D
'CE11+GT83', # D
'GT25+GH16', # D
'GH5+GH19+PL4', # D
'GH121+GH146+GH154', # S P
'GH121+GH146',
'GH105+GT56', # P
'GH13+CBM48+CE8+CE9', # P
'CE11+GH32+GH102', # P
],
'musicola': [ # grps found only in musicola
'CBM32+CBM63',
'GH24+GH36',
],
'both_grps': [ # grps found in soft and hard plant tissue targeting genomes
'GT84+GH94', #
'GH5+CBM3', #
],
'hard_musicola_grps': [ # grps only found in hard plant tissue targeting genomes and Musicola
'GH13+GT30',
'GH1+GH73+GT0',
'GT5+GT35+GT8',
'GH15+GH127',
'GT81+GH31', # L
'GH8+GT83', # B
],
'hard_grps': [ # grps only found in hard plant tissue targeting genomes
'CBM67+GH65', # H
'PL17+GT73', # L B
'PL38+GH106', # L B
'GT1+GH78', # L
'GH26+GH51', # L
'GH1+GH28+GH73+GT0+GH4',
'GH13+PL1+PL3+GT30+GH39',
],
'all_core_cazomes': [ # then core cazomes at the end
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9', # pectobacteriaceae
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH28+PL1+GH103+PL9', # soft plant tissue targeting
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28+PL1+GH103+PL9+CBM48+CE8+GH105+GT4+GT28+GT19+GH73+GT5+GT35+GH8+CE4+GH77+GT1+GH33', # dickeya
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH28+PL1+GH103+PL9+PL2+PL22+PL3+GH43', # pectobacter
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28+PL1+GH103+PL9+PL2+PL22+CBM48+CE8+CE11+GH5+GH105+GT56+GH32+GH102+CE9+GT4+GT28+GT19+GH73+GT30+GT5+GT35+GH8+CE4+GH77+GH19+GT83+GT1+GH33+GT26+GT0+GT81+GH31+CE12+GH38+GH30+CE1+GH2+GH104', # musicola
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+CE11+GT56+GT4+GT28+GT19+GT26', # hard
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+CE11+GT56+GH32+GT4+GT28+GT19+GH8+CE4+GH77+GH19+GT26+GH68+CBM32+GT20+GH37', # lonsdalea
'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28+GH103+GT84+GH94+CE11+GT56+GH32+GH102+CE9+GT4+GT28+GT19+GH73+GT30+GT5+GT35+GT26+GT0+GT81+GH68+GH4+GT8+CE12', # bren
],
}
for grp in grp_order:
grps = []
for fams_str in grp_order[grp]:
fams_list = fams_str.split("+")
fams_list.sort()
fams = "+".join(fams_list)
grps.append(fams)
grp_order[grp] = grps
paper_cooccurring_fams = {} # {grp_num: {'fams': {fams}, 'freqs': {int}}}
num_of_grp = 0
for pheno_grp in grp_order:
for fam_grp in grp_order[pheno_grp]:
fams = fam_grp.split("+")
fams.sort()
for genus in ['Pectobacterium', 'Dickeya', 'Musicola', 'Soft tissue targeting', 'Hard tissue targeting', 'Brenneria', 'Lonsdalea']:
# {grp num: {'fams': {fams}, 'freqs': {int}}}
for grp_num in genera_cooccuring_fams[genus]:
grp_fams = list(genera_cooccuring_fams[genus][grp_num]['fams'])
grp_fams.sort()
if grp_fams == fams:
this_grp_num = None
for co_grp_num in paper_cooccurring_fams:
if paper_cooccurring_fams[co_grp_num]['fams'] == genera_cooccuring_fams[genus][grp_num]['fams']:
this_grp_num = co_grp_num
if this_grp_num is None:
this_grp_num = copy(num_of_grp)
paper_cooccurring_fams[this_grp_num] = {
'fams': genera_cooccuring_fams[genus][grp_num]['fams'],
'freqs': genera_cooccuring_fams[genus][grp_num]['freqs']
}
num_of_grp += 1
upsetplot_membership = []
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, paper_cooccurring_fams)
len(upsetplot_membership)
5076
pectobact_upsetplot = build_upsetplot(
upsetplot_membership,
file_path='../results/cooccurring_families/paper-pecto-cooccurring-families.svg',
sort_by='input',
)
Calculate the frequency of each group per genus to then build a matrix plot (or proportional area plot).
paper_cooccurring_freqs = [] # [fams, genus/grp, incidence/freq]
num_of_grp = 0
for grp_name in grp_order:
for fams in grp_order[grp_name]:
fams = fams.split("+")
fams.sort()
for genus in ['Soft tissue targeting', 'Pectobacterium', 'Dickeya', 'Musicola', 'Hard tissue targeting', 'Brenneria', 'Lonsdalea']:
# {grp num: {'fams': {fams}, 'freqs': {int}}}
for grp_num in genera_cooccuring_fams[genus]:
grp_fams = list(genera_cooccuring_fams[genus][grp_num]['fams'])
grp_fams.sort()
if grp_fams == fams:
# found fams in genus
paper_cooccurring_freqs.append(
[
genera_cooccuring_fams[genus][grp_num]['fams'],
genus,
list(genera_cooccuring_fams[genus][grp_num]['freqs'])[0],
]
)
# build the dataframe
cooccurring_fams_freq_df = build_upsetplot_matrix(
paper_cooccurring_freqs,
'Genus',
file_path='../results/cooccurring_families/paper-cooccurring_fams_freqs.csv',
)
cooccurring_fams_freq_df
| Families | Genus | Incidence | |
|---|---|---|---|
| 0 | {CBM48, CE8, GH13} | Soft tissue targeting | 635 |
| 1 | {PL22, PL2} | Soft tissue targeting | 635 |
| 2 | {PL22, PL2} | Dickeya | 205 |
| 3 | {GH148, CBM4} | Soft tissue targeting | 8 |
| 4 | {GH148, CBM4} | Dickeya | 8 |
| 5 | {GT30, PL3} | Dickeya | 205 |
| 6 | {GH88, PL35} | Dickeya | 3 |
| 7 | {GT83, CE11} | Dickeya | 204 |
| 8 | {GH16, GT25} | Dickeya | 1 |
| 9 | {GH5, GH19, PL4} | Dickeya | 203 |
| 10 | {GH154, GH121, GH146} | Soft tissue targeting | 1 |
| 11 | {GH154, GH121, GH146} | Pectobacterium | 1 |
| 12 | {GT56, GH105} | Pectobacterium | 425 |
| 13 | {CBM48, CE8, CE9, GH13} | Pectobacterium | 425 |
| 14 | {GH32, GH102, CE11} | Pectobacterium | 425 |
| 15 | {CBM63, CBM32} | Musicola | 2 |
| 16 | {GH36, GH24} | Musicola | 2 |
| 17 | {GH94, GT84} | Soft tissue targeting | 241 |
| 18 | {GH94, GT84} | Pectobacterium | 152 |
| 19 | {GH94, GT84} | Dickeya | 89 |
| 20 | {GH94, GT84} | Hard tissue targeting | 68 |
| 21 | {GH94, GT84} | Lonsdalea | 33 |
| 22 | {CBM3, GH5} | Pectobacterium | 425 |
| 23 | {CBM3, GH5} | Hard tissue targeting | 25 |
| 24 | {CBM3, GH5} | Brenneria | 25 |
| 25 | {GT30, GH13} | Hard tissue targeting | 74 |
| 26 | {GH73, GT0, GH1} | Hard tissue targeting | 74 |
| 27 | {GT35, GT8, GT5} | Hard tissue targeting | 36 |
| 28 | {GH127, GH15} | Hard tissue targeting | 1 |
| 29 | {GH31, GT81} | Lonsdalea | 38 |
| 30 | {GT83, GH8} | Brenneria | 15 |
| 31 | {CBM67, GH65} | Hard tissue targeting | 1 |
| 32 | {PL17, GT73} | Hard tissue targeting | 1 |
| 33 | {PL17, GT73} | Brenneria | 1 |
| 34 | {PL38, GH106} | Hard tissue targeting | 1 |
| 35 | {PL38, GH106} | Brenneria | 1 |
| 36 | {GH78, GT1} | Lonsdalea | 10 |
| 37 | {GH26, GH51} | Lonsdalea | 9 |
| 38 | {GH73, GH4, GH28, GH1, GT0} | Lonsdalea | 38 |
| 39 | {PL1, GH39, GT30, GH13, PL3} | Lonsdalea | 38 |
| 40 | {GT2, GH23, GT9, GT51, GH28, GH103, PL1, GH3, ... | Soft tissue targeting | 636 |
| 41 | {GT35, GH23, GH103, GT1, GT2, GT9, CE4, GT5, G... | Dickeya | 206 |
| 42 | {GH43, GT2, GH23, GT9, GT51, GH28, GH103, PL1,... | Pectobacterium | 426 |
| 43 | {GT35, GH23, GH103, GT1, GH102, GH5, GT0, GT2,... | Musicola | 4 |
| 44 | {GT19, GH23, GT2, GT26, GT51, GT9, GH3, GT56, ... | Hard tissue targeting | 75 |
| 45 | {GH23, CBM32, GT2, GT26, GT9, CE4, GH77, GT28,... | Lonsdalea | 39 |
| 46 | {GT35, GH23, GH103, GT84, GH102, GT0, GT2, GT2... | Brenneria | 33 |
Use principal component analysis to identify individual and groups of CAZy families that are strongly associated with divergence between the Pectobacteriaceae genera CAZomes in terms of CAZy family frequencies.
What is PCA?:
Principal component analysis (PCA) is a statistical method that transforms a high dimensional data set into a low dimensional data set, while retaining as much information as possible. This dimensional reduction is achieved through the generation of principal components (PCs). Each PC is a direction along which variation in the data set is maximal. The first PC (PC1) captures the greatest diversity in the data set, the second PC (PC2) the second greatest direversity, and so on. Each PC represents a group of variables in the original data set, with each variable contributing a different weighting to the PC. A single variable can be associated with more than one PC. The relationships between variables and PCs can be visualised by plotting the loadings (or weightings) for each varaible along each PC.
Use the cazomevolve function perform_pca() to perform a PCA on a dataframe where each row is a genome, and each column the frequency of a unique CAZy family - the columns in the dataframe must only contain numerical data (i.e. no taxonomic data).
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/pca/'), force=True, nodelete=True)
Output directory ../results/pca exists, nodelete is True. Adding output to output directory.
num_of_components = len(fam_freq_filtered_df_ggs.columns)
pectobact_pca, X_scaled = perform_pca(fam_freq_filtered_df_ggs, num_of_components)
pectobact_pca
PCA(n_components=117)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=117)
Explore the amount of variance in the dataset that is captured by the dimensional reduction performed by the PCA.
print(
f"{round(pectobact_pca.explained_variance_ratio_.sum() * 100, 2)}% "
"of the variance in the data set was catpured by the PCA"
)
cumExpVar = plot_explained_variance(
pectobact_pca,
num_of_components,
file_path="../results/pca/pca_explained_variance.png",
)
100.0% of the variance in the data set was catpured by the PCA Number of features needed to explain 0.95 fraction of total variance is 59.
Explore the variance in the data that is captured by each PC.
plot_scree(pectobact_pca, nComp=10, file_path="../results/pca/pectobact_pca_scree.png")
Explained variance for 1PC: 0.1578167077631129 Explained variance for 2PC: 0.11714531745192669 Explained variance for 3PC: 0.05535353923303329 Explained variance for 4PC: 0.04804711615760438 Explained variance for 5PC: 0.04031600741870654 Explained variance for 6PC: 0.02931399017337997 Explained variance for 7PC: 0.02766383466136205 Explained variance for 8PC: 0.021653599705441385 Explained variance for 9PC: 0.02118856319918563 Explained variance for 10PC: 0.019638464318788077
PC1 (15%) and PC2 (11%) capture a signficantly greater degree of the varaince in the data set than all other PCs.
PC3 (6%) and PC4 (5%) capture comparable degrees of the variance
To explore the variance captured by each PC, plot different combinations of PCs onto a scatter plot where each axis represents a different PC and each point on the plot is a genome in the data set, using the plot_pca() function.
plot_pca() takes 6 positional argumets:
peform_pca()perform_pca()Owing to the majoirty of the variance captured by the PCA being captured by PCs 1-4, all possible combinations of these PCs were explored.
The PCs 1-4 capture more diversity in the data set than the other PCs, therefore, plot all combinations of these PCs against each other, projecting the genomes onto these PCs.
A pairplot is generated using Seaborn, plotting each potential pairs between PCs 1-4. A KDE plot (a special type of density or histogram plot) is generated on the diagonal.
First colour code and style each point (where each point represents a genome) by its genus classification.
fam_freq_filtered_df_ggs['Genus'] = list(fam_freq_filtered_df['Genus'])
fam_freq_filtered_df_ggs['Species'] = list(fam_freq_filtered_df['Species'])
X_pca = pectobact_pca.transform(X_scaled)
fam_freq_df_ggs_pc = copy(fam_freq_filtered_df_ggs)
colnames = []
for i in range(4):
fam_freq_df_ggs_pc[f'PC{i+1} ({round(pectobact_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
colnames.append(f'PC{i+1} ({round(pectobact_pca.explained_variance_ratio_[i] * 100, 2)}%)')
g = sns.pairplot(
fam_freq_df_ggs_pc,
vars=colnames,
hue="Genus",
diag_kind="kde",
markers=['o','X', '^', 'P', 'v', 'D', '<', 's'],
height=3,
);
i = 0
for ax in g.axes.ravel():
if ax is None:
continue
if i not in [0,5,10,15]:
ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
else:
ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
i += 1
plt.savefig(
'../results/pca/pca_pc_screen_genus.svg',
bbox_inches='tight',
format='svg'
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
Colour code each point by its species classification.
g = sns.pairplot(
fam_freq_df_ggs_pc,
vars=colnames,
hue="Species",
diag_kind="kde",
height=3,
);
i = 0
for ax in g.axes.ravel():
if ax is None:
continue
if i not in [0,5,10,15]:
ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
else:
ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
i += 1
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=6, title='Species', frameon=False);
plt.savefig(
'../results/pca/pca_pc_screen_species.svg',
bbox_inches='tight',
format='svg'
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
There are so many species that it is difficult to tell the colours apart. Therefore, pick out some stories of interest, colour code these genomes and leave the rest grey.
# pick out some stories and colour code the species plot accordingly
# pick out some stories and colour code the species plot accordingly
species_of_interest = [
'aquatica',
'dianthicola',
'oryzae',
'zeae',
'britannica',
'populi',
'aquaticum',
'carotovorum',
'parmentieri',
'quasiaquaticum',
]
# Already have geus and species columns added to fam_freq_filtered_df_ggs
markers_dict = {
'Acerihabitans': 's',
'Affinibrenneria': '>',
'Brenneria': '^',
'Dickeya': 'X',
'Lonsdalea': 'P',
'Musicola': 'v',
'Pectobacterium': 'o',
'Samsonia': 'D',
}
# build lists for species and hue colouring and markers
new_sp_col = []
for i in range(len(fam_freq_filtered_df_ggs['Species'])):
sp = fam_freq_filtered_df_ggs.iloc[i]['Species'].split(" ")[0]
if sp in species_of_interest:
new_sp_col.append(f"{fam_freq_filtered_df_ggs.iloc[i]['Genus']} {sp}")
else:
new_sp_col.append(f'{fam_freq_filtered_df_ggs.iloc[i]["Genus"]} sp.')
fam_freq_df_ggs_pc['Selected_Species'] = new_sp_col
colours = [
"#808080", "#D3D3D3", "#4ce0e6", "#e391be",
"#13add4",
"#550fa6",
"#E5E4E2",
"#c99c14",
"#708090",
"#f5d018", # dark green PC 1a4a07
"#0c1669",
"#1a4a07",
"#d41320",
"#899499",
"#71797E",
"#636263",
"#262626",
"#000000"
]
# Set custom color palette
sns.set_palette(sns.color_palette(colours))
g = sns.pairplot(
fam_freq_df_ggs_pc,
vars=colnames,
hue="Selected_Species",
diag_kind="kde",
height=3,
markers = [
'X', 'o', '^',
'X', 'X', 'o',
'X', 'o', 'o',
'v', 'X', 'D',
'P', 'P', 'P',
'o', 's', '>',
],
);
i = 0
for ax in g.axes.ravel():
if ax is None:
continue
if i not in [0,5,10,15]:
ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
else:
ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
i += 1
plt.savefig(
'../results/pca/pca_pc_screen_species_selected.svg',
bbox_inches='tight',
format='svg'
)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
PC1 separates out the genomes in a manner that correlates with their genus classification: Pectobacterium genomes are locataed in the negative PC1 axis, and Dickeya genomes are located in the positive PC1 axis.
PCs 2-4 do not correlate with the genus classification. In plots (from PC1-PC4) show the genome clustering correlating with species classification.
Having made a pairwise plot. Generate a scatter and loadings plot for each pair of PCs from PC1-PC4. This will make it easier to look at the details in each plot, and will make it easier to tell species apart.
Additional pauses are placed into the code to allow time for the notebook to render to figure before generating the next. This does not impact the figures that are saved to disk, but if the pauses (time.sleep()) are excluded, data from one figure may appear in another.
pc_pair = (1,2)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)
for job in ['genus', 'species', 'loadings']:
if job == 'genus':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Genus',
style='Genus',
file_path=out,
);
time.sleep(2)
elif job == 'species':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Species',
style='Species',
file_path=out,
);
time.sleep(2)
elif job == 'loadings': # loadings plot
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
g = plot_loadings(
pectobact_pca,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
threshold=0.3,
fig_size=(10,10),
file_path=out,
font_size=11,
);
Output directory ../results/pca/PC1-vs-PC2 exists, nodelete is True. Adding output to output directory.
PC1 vs PC2 - plotting Genera Not applying hue order Applying style Not applying style order PC1 vs PC2 - plotting Species Not applying hue order Applying style Not applying style order PC1 vs PC2 - Loadings plot
Regenerate the scatter plot of PC1 vs PC2, labelling the Dickeya genomes that are clustered with Musicola, and the Pectobacterium genomes that are on the PC1 +ve axis.
X_pca = pectobact_pca.transform(X_scaled)
plt.figure(figsize=(15,15))
sns.set(font_scale=1.15)
g = sns.scatterplot(
x=X_pca[:,0],
y=X_pca[:,1],
data=fam_freq_filtered_df_ggs,
hue='Genus',
style='Genus',
s=100,
markers=True,
)
g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);
plt.ylabel(f"PC2 {100 * pectobact_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pectobact_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
genome_lbls = ["-".join(_) for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]
texts = [
plt.text(
xval,
yval,
lbl,
ha='center',
va='center',
fontsize=12,
) for (xval, yval, lbl) in zip(
x_vals, y_vals, genome_lbls
) if ((xval > 2) and (yval < 3.5) and (yval > 0) and (xval < 4)) or ((xval > 0.1) and (xval < 2.5) and (yval < 0))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));
plt.savefig('../results/pca/pca_pc1_vs_pc2_musicola_annotated.png', bbox_inches='tight', format='png')
pc_pair = (1,3)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)
for job in ['genus', 'species', 'loadings']:
if job == 'genus':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Genus',
style='Genus',
file_path=out,
figsize=(10,8),
);
time.sleep(2)
elif job == 'species':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Species',
style='Species',
file_path=out,
figsize=(10,8),
);
time.sleep(2)
elif job == 'loadings': # loadings plot
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
g = plot_loadings(
pectobact_pca,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
threshold=0.3,
fig_size=(10,10),
file_path=out,
font_size=11,
);
Output directory ../results/pca/PC1-vs-PC3 exists, nodelete is True. Adding output to output directory.
PC1 vs PC3 - plotting Genera Not applying hue order Applying style Not applying style order PC1 vs PC3 - plotting Species Not applying hue order Applying style Not applying style order PC1 vs PC3 - Loadings plot
pc_pair = (1,4)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)
for job in ['genus', 'species', 'loadings']:
if job == 'genus':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Genus',
style='Genus',
file_path=out,
figsize=(10,8),
);
time.sleep(2)
elif job == 'species':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Species',
style='Species',
file_path=out,
figsize=(10,8),
);
time.sleep(2)
elif job == 'loadings': # loadings plot
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
g = plot_loadings(
pectobact_pca,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
threshold=0.3,
fig_size=(10,10),
file_path=out,
font_size=11,
);
Output directory ../results/pca/PC1-vs-PC4 exists, nodelete is True. Adding output to output directory.
PC1 vs PC4 - plotting Genera Not applying hue order Applying style Not applying style order PC1 vs PC4 - plotting Species Not applying hue order Applying style Not applying style order PC1 vs PC4 - Loadings plot
pc_pair = (2,3)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)
for job in ['genus', 'species', 'loadings']:
if job == 'genus':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Genus',
style='Genus',
file_path=out,
figsize=(8,10),
);
time.sleep(2)
elif job == 'species':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Species',
style='Species',
file_path=out,
figsize=(8,10),
);
time.sleep(2)
elif job == 'loadings': # loadings plot
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
g = plot_loadings(
pectobact_pca,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
threshold=0.3,
fig_size=(10,10),
file_path=out,
font_size=11,
);
Output directory ../results/pca/PC2-vs-PC3 exists, nodelete is True. Adding output to output directory.
PC2 vs PC3 - plotting Genera Not applying hue order Applying style Not applying style order PC2 vs PC3 - plotting Species Not applying hue order Applying style Not applying style order PC2 vs PC3 - Loadings plot
pc_pair = (2,4)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)
for job in ['genus', 'species', 'loadings']:
if job == 'genus':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Genus',
style='Genus',
file_path=out,
figsize=(10,10),
);
time.sleep(2)
elif job == 'species':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Species',
style='Species',
file_path=out,
figsize=(10,10),
);
time.sleep(2)
elif job == 'loadings': # loadings plot
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
g = plot_loadings(
pectobact_pca,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
threshold=0.3,
fig_size=(10,10),
file_path=out,
font_size=11,
);
Output directory ../results/pca/PC2-vs-PC4 exists, nodelete is True. Adding output to output directory.
PC2 vs PC4 - plotting Genera Not applying hue order Applying style Not applying style order PC2 vs PC4 - plotting Species Not applying hue order Applying style Not applying style order PC2 vs PC4 - Loadings plot
pc_pair = (3,4)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)
for job in ['genus', 'species', 'loadings']:
if job == 'genus':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Genus',
style='Genus',
file_path=out,
);
time.sleep(2)
elif job == 'species':
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
g = plot_pca(
pectobact_pca,
X_scaled,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
'Species',
style='Species',
file_path=out,
);
time.sleep(2)
elif job == 'loadings': # loadings plot
print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
g = plot_loadings(
pectobact_pca,
fam_freq_filtered_df_ggs,
pc_pair[0],
pc_pair[1],
threshold=0.3,
fig_size=(10,10),
file_path=out,
font_size=11,
);
Output directory ../results/pca/PC3-vs-PC4 exists, nodelete is True. Adding output to output directory.
PC3 vs PC4 - plotting Genera Not applying hue order Applying style Not applying style order PC3 vs PC4 - plotting Species Not applying hue order Applying style Not applying style order PC3 vs PC4 - Loadings plot
Generate the scatter plot of PC3 vs PC4 again with only Pectobacterium genomes to identify the species that are diverging from the centre of the plot.
X_pca = pectobact_pca.transform(X_scaled)
plt.figure(figsize=(10,7.5))
sns.set(font_scale=1.15)
temp_d_fam_freq_df_ggs = fam_freq_filtered_df_ggs[fam_freq_filtered_df_ggs['Genus'] == 'Pectobacterium']
g = sns.scatterplot(
x=X_pca[:,2][fam_freq_filtered_df_ggs['Genus'] == 'Pectobacterium'],
y=X_pca[:,3][fam_freq_filtered_df_ggs['Genus'] == 'Pectobacterium'],
data=temp_d_fam_freq_df_ggs,
hue='Species',
style='Species',
s=100,
markers=True,
)
g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);
plt.ylabel(f"PC4 {100 * pectobact_pca.explained_variance_ratio_[3]:.2f}%");
plt.xlabel(f"PC3 {100 * pectobact_pca.explained_variance_ratio_[2]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=4, title='Species', frameon=False);
plt.savefig(
'../results/pca/p_pc3_pc4_pectobacterium_sp.svg',
bbox_inches='tight',
format='svg',
)