import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import statistics
import re
import time

from copy import copy
from matplotlib.patches import Patch
from pathlib import Path

import adjustText
import statsmodels.api as sm
import upsetplot

from Bio import SeqIO
from saintBioutils.utilities.file_io.get_paths import get_file_paths
from saintBioutils.utilities.file_io import make_output_directory
from scipy.stats import ttest_ind, f_oneway
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from tqdm.notebook import tqdm

%matplotlib inline


# loading and parsing data
from cazomevolve.cazome.explore.parse_data import (
    load_fgp_data,
    load_tax_data,
    add_tax_data_from_tax_df,
    add_tax_column_from_row_index,
)

# functions for exploring the sizes of CAZomes
from cazomevolve.cazome.explore.cazome_sizes import (
    calc_proteome_representation,
    count_items_in_cazome,
    get_proteome_sizes,
    count_cazyme_fam_ratio,
)

# explore the frequency of CAZymes per CAZy class
from cazomevolve.cazome.explore.cazy_classes import calculate_class_sizes

# explore the frequencies of CAZy families and identify the co-cazome
from cazomevolve.cazome.explore.cazy_families import (
    build_fam_freq_df,
    build_row_colours,
    build_family_clustermap,
    identify_core_cazome,
    plot_fam_boxplot,
    build_fam_mean_freq_df,
    get_group_specific_fams,
    build_family_clustermap_multi_legend,
)

# functions to identify and explore CAZy families that are always present together
from cazomevolve.cazome.explore.cooccurring_families import (
    identify_cooccurring_fams_corrM,
    calc_cooccuring_fam_freqs,
    identify_cooccurring_fam_pairs,
    add_to_upsetplot_membership,
    build_upsetplot,
    get_upsetplot_grps,
    add_upsetplot_grp_freqs,
    build_upsetplot_matrix,
)

# functions to perform PCA
from cazomevolve.cazome.explore.pca import (
    perform_pca,
    plot_explained_variance,
    plot_scree,
    plot_pca,
    plot_loadings,
)


# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results'), force=True, nodelete=True)

Output directory ../results exists, nodelete is True. Adding output to output directory.


fgp_file = "../data/cazomes/pecto_fam_genomes_proteins"
fgp_df = load_fgp_data(fgp_file)
fgp_df.head(3)


print(f"Total CAZymes (i.e. the number of unique protein IDs): {len(set(fgp_df['Protein']))}")

Total CAZymes (i.e. the number of unique protein IDs): 78132


tax_csv_path = "../data/cazomes/fg_genome_taxs.csv"
tax_df = load_tax_data(tax_csv_path, genus=True, species=True)
tax_df.head(3)


fgp_df = add_tax_data_from_tax_df(
    fgp_df,
    tax_df,
    genus=True,
    species=True,
)
fgp_df.head(3)

Collecting Genus data: 100%|████████████████████████████████████████████████████████████████| 83143/83143 [00:40<00:00, 2073.90it/s]
Collecting Species data: 100%|██████████████████████████████████████████████████████████████| 83143/83143 [00:40<00:00, 2033.06it/s]


print(f"Total CAZymes: {len(set(fgp_df['Protein']))}")

Total CAZymes: 78132


# check all genomes are represented in the fgp_df
f"Examining {len(set(fgp_df['Genome']))} genomes"

'Examining 717 genomes'


print(f"Examining {len(set(fgp_df['Genus']))} genera:")
for genus in set(fgp_df['Genus']):
    print(f'- {genus}')

Examining 8 genera:
- Lonsdalea
- Samsonia
- Pectobacterium
- Musicola
- Brenneria
- Affinibrenneria
- Dickeya
- Acerihabitans


# Calculate CAZymes per genome
cazome_sizes_dict, cazome_sizes_df = count_items_in_cazome(fgp_df, 'Protein', 'Genus', round_by=2)
cazome_sizes_df

Gathering CAZy families per genome: 100%|██████████████████████████████████████████████████| 83143/83143 [00:05<00:00, 15652.38it/s]
Calculating num of Protein per genome and per Genus: 100%|██████████████████████████████████████████| 8/8 [00:00<00:00, 3528.70it/s]


# calculate mean across pectobacteriaceae
pectobact_cazome_sizes = []
for genus in cazome_sizes_dict:
    for genome in cazome_sizes_dict[genus]:
        pectobact_cazome_sizes.append(cazome_sizes_dict[genus][genome]['numOfProteins'])

pd.concat(
    [
        cazome_sizes_df, 
        pd.DataFrame(
            [[
                'Pectobacteriaceae',
                np.mean(pectobact_cazome_sizes),
                np.std(pectobact_cazome_sizes),
                len(set(fgp_df['Genome'])),
            ]], 
            columns=cazome_sizes_df.columns
        ),
    ],
    axis=0,
)


print(f"The total number of CAZymes is {len(set(fgp_df['Protein']))}")

for genus in set(fgp_df['Genus']):
    genus_df = fgp_df[fgp_df['Genus'] == genus]
    print(f"The total number of {genus} CAZymes is {len(set(genus_df['Protein']))}")

The total number of CAZymes is 78132
The total number of Lonsdalea CAZymes is 3009
The total number of Samsonia CAZymes is 81
The total number of Pectobacterium CAZymes is 48663
The total number of Musicola CAZymes is 369
The total number of Brenneria CAZymes is 2897
The total number of Affinibrenneria CAZymes is 108
The total number of Dickeya CAZymes is 22899
The total number of Acerihabitans CAZymes is 106


# Calculate CAZy families per genome
cazome_fam_dict, cazome_fams_df = count_items_in_cazome(fgp_df, 'Family', 'Genus', round_by=2)
cazome_fams_df

Gathering CAZy families per genome: 100%|██████████████████████████████████████████████████| 83143/83143 [00:05<00:00, 15878.20it/s]
Calculating num of Family per genome and per Genus: 100%|███████████████████████████████████████████| 8/8 [00:00<00:00, 3446.43it/s]


# calculate mean across pectobacteriaceae
pectobact_fam_nums = []
for genus in cazome_fam_dict:
    for genome in cazome_fam_dict[genus]:
        pectobact_fam_nums.append(cazome_fam_dict[genus][genome]['numOfFamilys'])

pd.concat(
    [
        cazome_fams_df, 
        pd.DataFrame(
            [[
                'Pectobacteriaceae',
                np.mean(pectobact_fam_nums),
                np.std(pectobact_fam_nums),
                len(set(fgp_df['Genome'])),
            ]], 
            columns=cazome_fams_df.columns
        ),
    ],
    axis=0,
)


cazome_ratio_dict, cazome_ratio_df = count_cazyme_fam_ratio(fgp_df, 'Genus', round_by=2)
cazome_ratio_df

Gathering CAZymes and CAZy families per genome: 100%|██████████████████████████████████████| 83143/83143 [00:05<00:00, 15141.59it/s]
Calculating CAZyme/CAZy family ratio: 100%|█████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3781.63it/s]


pecto_ratios = []
for genus in cazome_sizes_dict:
    for genome in cazome_sizes_dict[genus]:
        ratio = (
            cazome_sizes_dict[genus][genome]['numOfProteins'] / cazome_fam_dict[genus][genome]['numOfFamilys']
        )
        pecto_ratios.append(ratio)

pd.concat(
    [
        cazome_ratio_df,
        pd.DataFrame(
            [[
                'Pectobacteriaceae',
                np.mean(pecto_ratios),
                np.std(pecto_ratios),
                len(set(fgp_df['Genome'])),
            ]],
            columns=cazome_ratio_df.columns
        )
    ],
    axis=0,
)


# Get the size of the proteome (the number of protein acc) per genome
grp = 'Genus'
proteome_dir = "../data/proteomes"
proteome_dict = get_proteome_sizes(proteome_dir, fgp_df, grp)

Getting proteome sizes: 100%|█████████████████████████████████████████████████████████████████████| 717/717 [00:38<00:00, 18.52it/s]


# get total number of proteins across all proteomes
total_proteins = 0
for genus in proteome_dict:
    for genome in proteome_dict[genus]:
        total_proteins += proteome_dict[genus][genome]['numOfProteins']
print(f"Total number of proteins across all genomes: {total_proteins}")

Total number of proteins across all genomes: 2994018


# Calculate the mean proteome size by genus and the proportion of the proteome represented by the CAZome
proteome_perc_df = calc_proteome_representation(proteome_dict, cazome_sizes_dict, grp, round_by=2)
proteome_perc_df

Getting proteome size: 100%|███████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 12069.94it/s]
Calc proteome perc: 100%|███████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 3333.77it/s]


pectobact_average = ['Pectobacteriaceae']
for col in proteome_perc_df.columns[1:]:
    pectobact_average.append(np.mean(list(proteome_perc_df[col])))
pectobact_average[-1] == 660

df = pd.DataFrame([pectobact_average], columns=proteome_perc_df.columns)
pd.concat([proteome_perc_df, df], ignore_index=True, axis=0).round(2)


all_df = pd.concat([proteome_perc_df, cazome_sizes_df, cazome_fams_df, cazome_ratio_df], axis=1, join='inner')
make_output_directory(Path('../results/cazome_size'), force=True, nodelete=True)
all_df.to_csv('../results/cazome_size/cazome_sizes.csv')
all_df

Output directory ../results/cazome_size exists, nodelete is True. Adding output to output directory.


# calculate means for Pectobacteriaceae
for col in all_df:
    if col == 'Genus' or col == 'NumOfGenomes':
        continue
    print(col, '--', np.mean(list(all_df[col])).round(2))

MeanProteomeSize -- 4170.51
SdProteomeSize -- 129.9
MeanProteomePerc -- 2.34
SdProteomePerc -- 0.07
MeanProteins -- 97.0
SdProteins -- 3.63
MeanFamilys -- 51.58
SdFamilys -- 1.6
MeanCAZymeToFamRatio -- 1.89
SdCAZymeToFamRatio -- 0.04


# go down to one Genus column
genus = []
for i in range(len(all_df)):
    genus.append(
        all_df.iloc[i]['Genus'].values[0]
    )

try:
    all_df = all_df.drop('Genus', axis=1)
except KeyError:
    pass

all_df['Genus'] = genus

# Define the soft and hard plant tissue targeting genera that are represented by multiple genomes
soft = ['Pectobacterium','Dickeya']
hard = ['Lonsdalea', 'Brenneria']


# build df with each genome represented as a different row / observation
proteome_data = []
for genus in proteome_dict:
    for genome in proteome_dict[genus]:
        proteome_data.append([genus, proteome_dict[genus][genome]['numOfProteins']])
proteome_size_df = pd.DataFrame(proteome_data, columns=['Genus','ProteomeSize'])

# One way Anova using scipy
f_oneway(
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Pectobacterium'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Dickeya'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Musicola'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Lonsdalea'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Brenneria'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Affinibrenneria'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Acerihabitans'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=142.25683602203867, pvalue=1.5932408551086004e-130)


proteome_size_turkey = pairwise_tukeyhsd(
    endog=proteome_size_df['ProteomeSize'],
    groups=proteome_size_df['Genus'],
    alpha=0.05,   # cut off
)
prot_size_turkey_df = pd.DataFrame(
    proteome_size_turkey._results_table[1:],
    columns=proteome_size_turkey._results_table.data[0]
)
prot_size_turkey_df.to_csv("../results/cazome_size/proteome_size_turkey_results.csv")
prot_size_turkey_df


g = sns.stripplot(
    x=proteome_size_df['Genus'],
    y=proteome_size_df['ProteomeSize'],
    color='black',
    alpha=0.75,
    size=3,
);
g = sns.boxplot(
    x=proteome_size_df['Genus'],
    y=proteome_size_df['ProteomeSize'],
    fliersize=0,
);
g.set_ylabel(ylabel='Number of proteins in the CAZome')
plt.xticks(rotation=90)
plt.savefig(
    "../results/cazome_size/proteome_size_boxplot.png",
    bbox_inches='tight',
    format='png'
)


soft_hard_dict = {
    'Pectobacterium': 'Soft',
    'Dickeya': 'Soft',
    'Musicola': 'Unknown',
    'Brenneria': 'Hard',
    'Lonsdalea': 'Hard',
    'Acerihabitans': 'Hard',
    'Affinibrenneria': 'Hard',
    'Samsonia': 'Hard',
}
pheno_col = []
for i in range(len(proteome_size_df)):
    pheno_col.append(soft_hard_dict[proteome_size_df.iloc[i]['Genus']])
proteome_size_df['Phenotype'] = pheno_col

f_oneway(
    proteome_size_df['ProteomeSize'][proteome_size_df['Phenotype'] == 'Soft'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Phenotype'] == 'Hard'],
    proteome_size_df['ProteomeSize'][proteome_size_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=115.73644297310237, pvalue=2.907639003391272e-44)


proteome_size_turkey = pairwise_tukeyhsd(
    endog=proteome_size_df['ProteomeSize'],
    groups=proteome_size_df['Phenotype'],
    alpha=0.05,   # cut off
)
prot_size_turkey_pheno_df = pd.DataFrame(
    proteome_size_turkey._results_table[1:],
    columns=proteome_size_turkey._results_table.data[0]
)
prot_size_turkey_pheno_df.to_csv("../results/cazome_size/proteome_size_turkey_phenotype_results.csv")
prot_size_turkey_pheno_df


# build df with each genome represented as a different row / observation
proteome_perc_data = []
for genus in proteome_dict:
    for genome in proteome_dict[genus]:
        perc = (
            cazome_sizes_dict[genus][genome]['numOfProteins'] / 
            proteome_dict[genus][genome]['numOfProteins']
        ) * 100
        proteome_perc_data.append([genus, perc])
proteome_perc_df = pd.DataFrame(proteome_perc_data, columns=['Genus','ProteomePerc'])

# One way Anova using scipy
f_oneway(
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Pectobacterium'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Dickeya'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Musicola'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Lonsdalea'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Brenneria'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Affinibrenneria'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Acerihabitans'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=100.22498879010466, pvalue=1.569439830366814e-101)


# follow up with a post hoc test to see which genera differ 
proteome_perc_turkey = pairwise_tukeyhsd(
    endog=proteome_perc_df['ProteomePerc'],
    groups=proteome_perc_df['Genus'],
    alpha=0.05,   # cut off
)
prot_per_turkey_df = pd.DataFrame(
    proteome_perc_turkey._results_table[1:],
    columns=proteome_perc_turkey._results_table.data[0]
)
prot_per_turkey_df.to_csv("../results/cazome_size/proteome_perc_turkey_results.csv")
prot_per_turkey_df


g = sns.stripplot(
    x=proteome_perc_df['Genus'],
    y=proteome_perc_df['ProteomePerc'],
    color='black',
    alpha=0.75,
    size=3,
);
g = sns.boxplot(
    x=proteome_perc_df['Genus'],
    y=proteome_perc_df['ProteomePerc'],
    fliersize=0,
);
g.set_ylabel(ylabel='Percentage of proteome in the CAZome')
plt.xticks(rotation=90)
plt.savefig(
    "../results/cazome_size/proteome_perc_boxplot.png",
    bbox_inches='tight',
    format='png'
)


pheno_col = []
for i in range(len(proteome_perc_df)):
    pheno_col.append(soft_hard_dict[proteome_perc_df.iloc[i]['Genus']])
proteome_perc_df['Phenotype'] = pheno_col

f_oneway(
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Phenotype'] == 'Soft'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Phenotype'] == 'Hard'],
    proteome_perc_df['ProteomePerc'][proteome_perc_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=226.22200801781082, pvalue=7.940529938618575e-77)


# follow up with a post hoc test to see which genera differ 
proteome_perc_pheno_turkey = pairwise_tukeyhsd(
    endog=proteome_perc_df['ProteomePerc'],
    groups=proteome_perc_df['Phenotype'],
    alpha=0.05,   # cut off
)
prot_per_pheno_turkey_df = pd.DataFrame(
    proteome_perc_pheno_turkey._results_table[1:],
    columns=proteome_perc_pheno_turkey._results_table.data[0]
)
prot_per_pheno_turkey_df.to_csv("../results/cazome_size/proteome_perc_turkey_phenotype_results.csv")
prot_per_pheno_turkey_df


# build df with each genome represented as a different row / observation
cazome_s_data = []
for genus in cazome_sizes_dict:
    for genome in cazome_sizes_dict[genus]:
        cazome_s_data.append([genus, cazome_sizes_dict[genus][genome]['numOfProteins']])
cazome_s_df = pd.DataFrame(cazome_s_data, columns=['Genus','CAZomeSize'])

# One way Anova using scipy
f_oneway(
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Pectobacterium'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Dickeya'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Musicola'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Lonsdalea'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Brenneria'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Affinibrenneria'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Acerihabitans'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=161.55655161375054, pvalue=3.282343275946478e-142)


# follow up with a post hoc test to see which genera differ 
cazome_size_turkey = pairwise_tukeyhsd(
    endog=cazome_s_df['CAZomeSize'],
    groups=cazome_s_df['Genus'],
    alpha=0.05,   # cut off
)
cazome_size_turkey_df = pd.DataFrame(
    cazome_size_turkey._results_table[1:],
    columns=cazome_size_turkey._results_table.data[0]
)
cazome_size_turkey_df.to_csv("../results/cazome_size/cazome_size_turkey_results.csv")
cazome_size_turkey_df


g = sns.stripplot(
    x=cazome_s_df['Genus'],
    y=cazome_s_df['CAZomeSize'],
    color='black',
    alpha=0.75,
    size=3,
);
g = sns.boxplot(
    x=cazome_s_df['Genus'],
    y=cazome_s_df['CAZomeSize'],
    fliersize=0,
);
g.set_ylabel(ylabel='Number of proteins in the CAZome')
plt.xticks(rotation=90)
plt.savefig(
    "../results/cazome_size/cazome_size_boxplot.png",
    bbox_inches='tight',
    format='png'
)


pheno_col = []
for i in range(len(cazome_s_df)):
    pheno_col.append(soft_hard_dict[cazome_s_df.iloc[i]['Genus']])
cazome_s_df['Phenotype'] = pheno_col

f_oneway(
    cazome_s_df['CAZomeSize'][cazome_s_df['Phenotype'] == 'Soft'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Phenotype'] == 'Hard'],
    cazome_s_df['CAZomeSize'][cazome_s_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=493.10468604523317, pvalue=3.0262355426671687e-135)


# follow up with a post hoc test to see which genera differ 
cazome_s_pheno_turkey = pairwise_tukeyhsd(
    endog=cazome_s_df['CAZomeSize'],
    groups=cazome_s_df['Phenotype'],
    alpha=0.05,   # cut off
)
cazome_s_pheno_turkey_df = pd.DataFrame(
    cazome_s_pheno_turkey._results_table[1:],
    columns=cazome_s_pheno_turkey._results_table.data[0]
)
cazome_s_pheno_turkey_df.to_csv("../results/cazome_size/cazome_size_turkey_phenotype_results.csv")
cazome_s_pheno_turkey_df


# build df with each genome represented as a different row / observation
cazy_fam_data = []
for genus in cazome_fam_dict:
    for genome in cazome_fam_dict[genus]:
        cazy_fam_data.append([genus, cazome_fam_dict[genus][genome]['numOfFamilys']])
cazy_f_df = pd.DataFrame(cazy_fam_data, columns=['Genus','CAZyFamilies'])

# One way Anova using scipy
f_oneway(
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Pectobacterium'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Dickeya'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Musicola'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Lonsdalea'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Brenneria'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Affinibrenneria'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Acerihabitans'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=208.2841856410692, pvalue=2.6418322599463134e-167)


# follow up with a post hoc test to see which genera differ 
cazome_fam_turkey = pairwise_tukeyhsd(
    endog=cazy_f_df['CAZyFamilies'],
    groups=cazy_f_df['Genus'],
    alpha=0.05,   # cut off
)
cazome_fam_turkey_df = pd.DataFrame(
    cazome_fam_turkey._results_table[1:],
    columns=cazome_fam_turkey._results_table.data[0]
)
cazome_fam_turkey_df.to_csv("../results/cazome_size/cazome_fam_turkey_results.csv")
cazome_fam_turkey_df


g = sns.stripplot(
    x=cazy_f_df['Genus'],
    y=cazy_f_df['CAZyFamilies'],
    color='black',
    alpha=0.75,
    size=3,
);
g = sns.boxplot(
    x=cazy_f_df['Genus'],
    y=cazy_f_df['CAZyFamilies'],
    fliersize=0,
);
g.set_ylabel(ylabel='Number of CAZyme families')
plt.xticks(rotation=90)
plt.savefig(
    "../results/cazome_size/cazy_fams_boxplot.png",
    bbox_inches='tight',
    format='png'
)


pheno_col = []
for i in range(len(cazy_f_df)):
    pheno_col.append(soft_hard_dict[cazy_f_df.iloc[i]['Genus']])
cazy_f_df['Phenotype'] = pheno_col

f_oneway(
    cazy_f_df['CAZyFamilies'][cazy_f_df['Phenotype'] == 'Soft'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Phenotype'] == 'Hard'],
    cazy_f_df['CAZyFamilies'][cazy_f_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=395.8267787310159, pvalue=2.1009567376244338e-116)


# follow up with a post hoc test to see which genera differ 
cazome_f_pheno_turkey = pairwise_tukeyhsd(
    endog=cazy_f_df['CAZyFamilies'],
    groups=cazy_f_df['Phenotype'],
    alpha=0.05,   # cut off
)
cazome_f_pheno_turkey_df = pd.DataFrame(
    cazome_f_pheno_turkey._results_table[1:],
    columns=cazome_f_pheno_turkey._results_table.data[0]
)
cazome_f_pheno_turkey_df.to_csv("../results/cazome_size/cazome_fam_turkey_phenotype_results.csv")
cazome_f_pheno_turkey_df


# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/cazy_classes/'), force=True, nodelete=True)

Output directory ../results/cazy_classes exists, nodelete is True. Adding output to output directory.


class_df, class_size_dict = calculate_class_sizes(fgp_df, 'Genus', round_by=2)

Getting CAZy class sizes: 100%|█████████████████████████████████████████████████████████████| 83143/83143 [00:19<00:00, 4309.82it/s]
Calculating CAZy class sizes: 100%|██████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 120.67it/s]


# add values with means across all genera to represent pectobacteriaceae
pectobact_class_means = []

for cazy_class in set(class_df['CAZyClass']):
    df = class_df[class_df['CAZyClass'] == cazy_class]
    new_row = [cazy_class, 'Pectobacteriaceae']
    
    for col in class_df.columns[2:]:
        mean = np.mean(df[col])
        new_row.append(mean)
    
    new_row[-1] = 660
    
    pectobact_class_means.append(new_row)

df = pd.DataFrame(pectobact_class_means, columns = class_df.columns)
all_class_df = pd.concat([class_df, df], axis=0, ignore_index=True)
all_class_df = all_class_df.round(2)
# replace nan with 0
all_class_df = all_class_df.fillna(0)

filtered_class_df = all_class_df[all_class_df['Genus'] != 'Haf']
all_class_df.to_csv('../results/cazy_classes/cazy_class_sizes.csv')

all_class_df


# provide each observation (genome) separately , not just the mean
class_freq_obs = []  # observations, one obs per genome - class pair
for genome in tqdm(set(fgp_df['Genome']), desc='Getting freq per class per genome'):
    class_freqs = {
        'GH': 0,
        'GT': 0,
        'PL': 0,
        'CE': 0,
        'AA': 0,
        'CBM': 0,
    }  # {class: count}
    g_rows = fgp_df[fgp_df['Genome'] == genome]
    for i in range(len(g_rows)):
        if g_rows.iloc[i]['Family'].startswith('GH'):
            class_freqs['GH'] += 1
        elif g_rows.iloc[i]['Family'].startswith('GT'):
            class_freqs['GT'] += 1
        elif g_rows.iloc[i]['Family'].startswith('PL'):
            class_freqs['PL'] += 1
        elif g_rows.iloc[i]['Family'].startswith('CE'):
            class_freqs['CE'] += 1
        elif g_rows.iloc[i]['Family'].startswith('AA'):
            class_freqs['AA'] += 1
        else:
            class_freqs['CBM'] += 1
            
    for cazy_class in class_freqs:
        class_freq_obs.append(
            [
                g_rows.iloc[0]['Genus'],
                cazy_class,
                class_freqs[cazy_class],
                soft_hard_dict[g_rows.iloc[0]['Genus']],
            ]
        )
        
class_freq_obs_df = pd.DataFrame(
    class_freq_obs,
    columns=['Genus','CAZyClass','Frequency', 'Phenotype'],
)
class_freq_obs_df.head(2)

Getting freq per class per genome:   0%|          | 0/717 [00:00<?, ?it/s]


# Test for differences between the genera and soft/hard plant tissue targeting phenotype

# perform two-way ANOVA
model = ols(
    'Frequency ~ + C(Genus) + C(CAZyClass) + C(Genus):C(CAZyClass)',
    data=class_freq_obs_df,
).fit()
sm.stats.anova_lm(model, typ=2)


# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'GH']
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=70.48729402779328, pvalue=3.718837585881247e-77)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gh_class_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Genus'],
    alpha=0.05,   # cut off
)
gh_class_turkey_df = pd.DataFrame(
    gh_class_turkey._results_table[1:],
    columns=gh_class_turkey._results_table.data[0]
)
gh_class_turkey_df.to_csv("../results/cazy_classes/gh_class_turkey_results.csv")
gh_class_turkey_df


f_oneway(
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=84.86443453434632, pvalue=8.597326941700086e-34)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gh_class_phenot_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Phenotype'],
    alpha=0.05,   # cut off
)
gh_class_turkey_pheno_df = pd.DataFrame(
    gh_class_phenot_turkey._results_table[1:],
    columns=gh_class_phenot_turkey._results_table.data[0]
)
gh_class_turkey_pheno_df.to_csv("../results/cazy_classes/gh_class_turkey_phenotype_results.csv")
gh_class_turkey_pheno_df


# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'GT']
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=21.25128713661445, pvalue=4.428956585432569e-26)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gt_class_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Genus'],
    alpha=0.05,   # cut off
)
gt_class_turkey_df = pd.DataFrame(
    gt_class_turkey._results_table[1:],
    columns=gt_class_turkey._results_table.data[0]
)
gt_class_turkey_df.to_csv("../results/cazy_classes/gt_class_turkey_results.csv")
gt_class_turkey_df


# one-way ANOVA the phenotypes
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=2.6331141394580087, pvalue=0.07255204343497902)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
gt_class_phenot_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Phenotype'],
    alpha=0.05,   # cut off
)
gt_class_turkey_pheno_df = pd.DataFrame(
    gt_class_phenot_turkey._results_table[1:],
    columns=gt_class_phenot_turkey._results_table.data[0]
)
gt_class_turkey_pheno_df.to_csv("../results/cazy_classes/gt_class_turkey_phenotype_results.csv")
gt_class_turkey_pheno_df


# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'PL']
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=222.45853144424663, pvalue=3.5647020997539887e-174)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
pl_class_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Genus'],
    alpha=0.05,   # cut off
)
pl_class_turkey_df = pd.DataFrame(
    pl_class_turkey._results_table[1:],
    columns=pl_class_turkey._results_table.data[0]
)
pl_class_turkey_df.to_csv("../results/cazy_classes/pl_class_turkey_results.csv")
pl_class_turkey_df


# one-way ANOVA the phenotypes
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=689.0535233630009, pvalue=2.0980145132039776e-167)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
pl_class_phenot_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Phenotype'],
    alpha=0.05,   # cut off
)
pl_class_turkey_pheno_df = pd.DataFrame(
    pl_class_phenot_turkey._results_table[1:],
    columns=pl_class_phenot_turkey._results_table.data[0]
)
pl_class_turkey_pheno_df.to_csv("../results/cazy_classes/pl_class_turkey_phenotype_results.csv")
pl_class_turkey_pheno_df


# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'CE']
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=103.88807201350583, pvalue=2.7639451642084564e-104)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
ce_class_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Genus'],
    alpha=0.05,   # cut off
)
ce_class_turkey_df = pd.DataFrame(
    ce_class_turkey._results_table[1:],
    columns=ce_class_turkey._results_table.data[0]
)
ce_class_turkey_df.to_csv("../results/cazy_classes/ce_class_turkey_results.csv")
ce_class_turkey_df


# one-way ANOVA the phenotypes
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=321.59905675699997, pvalue=2.6098906666833877e-100)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
ce_class_phenot_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Phenotype'],
    alpha=0.05,   # cut off
)
ce_class_turkey_pheno_df = pd.DataFrame(
    ce_class_phenot_turkey._results_table[1:],
    columns=ce_class_phenot_turkey._results_table.data[0]
)
ce_class_turkey_pheno_df.to_csv("../results/cazy_classes/ce_class_turkey_phenotype_results.csv")
ce_class_turkey_pheno_df


# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'AA']
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=48.76173592829987, pvalue=1.3727065409992368e-56)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
aa_class_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Genus'],
    alpha=0.05,   # cut off
)
aa_class_turkey_df = pd.DataFrame(
    aa_class_turkey._results_table[1:],
    columns=aa_class_turkey._results_table.data[0]
)
aa_class_turkey_df.to_csv("../results/cazy_classes/aa_class_turkey_results.csv")
aa_class_turkey_df


# one-way ANOVA the phenotypes
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=61.148610732738945, pvalue=3.072615168467825e-25)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
aa_class_phenot_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Phenotype'],
    alpha=0.05,   # cut off
)
aa_class_turkey_pheno_df = pd.DataFrame(
    aa_class_phenot_turkey._results_table[1:],
    columns=aa_class_phenot_turkey._results_table.data[0]
)
aa_class_turkey_pheno_df.to_csv("../results/cazy_classes/aa_class_turkey_phenotype_results.csv")
aa_class_turkey_pheno_df


# One way Anova using scipy
sub_class_df = class_freq_obs_df[class_freq_obs_df['CAZyClass'] == 'CBM']
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Pectobacterium'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Dickeya'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Musicola'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Lonsdalea'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Brenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Affinibrenneria'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Acerihabitans'],
    sub_class_df['Frequency'][sub_class_df['Genus'] == 'Samsonia'],    
)

F_onewayResult(statistic=80.88764296424834, pvalue=4.027926882977604e-86)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
cbm_class_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Genus'],
    alpha=0.05,   # cut off
)
cbm_class_turkey_df = pd.DataFrame(
    cbm_class_turkey._results_table[1:],
    columns=cbm_class_turkey._results_table.data[0]
)
cbm_class_turkey_df.to_csv("../results/cazy_classes/cbm_class_turkey_results.csv")
cbm_class_turkey_df


# one-way ANOVA the phenotypes
f_oneway(
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Soft'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Hard'],
    sub_class_df['Frequency'][sub_class_df['Phenotype'] == 'Unknown'],
)

F_onewayResult(statistic=185.10059849921967, pvalue=1.7220017140436788e-65)


# post hoc Turkey HDS test to determine between which genera there is a stat sig diff
cbm_class_phenot_turkey = pairwise_tukeyhsd(
    endog=sub_class_df['Frequency'],
    groups=sub_class_df['Phenotype'],
    alpha=0.05,   # cut off
)
cbm_class_turkey_pheno_df = pd.DataFrame(
    cbm_class_phenot_turkey._results_table[1:],
    columns=cbm_class_phenot_turkey._results_table.data[0]
)
cbm_class_turkey_pheno_df.to_csv("../results/cazy_classes/cbm_class_turkey_phenotype_results.csv")
cbm_class_turkey_pheno_df


# calc genomes with no AAs
no_aa_genomes = 0
for genus in class_size_dict['AA']:
    for genome in class_size_dict['AA'][genus]:
        no_aa_genomes+=1
print(f"{no_aa_genomes} genomes have no AAs")

aa_counts = [0] * no_aa_genomes
for genus in class_size_dict['AA']:
    for genome in class_size_dict['AA'][genus]:
        aa_counts.append(len(class_size_dict['AA'][genus][genome]['proteins']))
print(f"Max: {max(aa_counts)}\nMin: {min(aa_counts)}\nMode: {statistics.mode(aa_counts)}")

465 genomes have no AAs
Max: 2
Min: 0
Mode: 0


# find genomes with 2 AAs
two_aa_genomes = {}
one_aa_genomes = {}

for genus in class_size_dict['AA']:
    for genome in class_size_dict['AA'][genus]:
        if len(class_size_dict['AA'][genus][genome]['proteins']) == 2:
            try:
                two_aa_genomes[genus].add(genome)
            except KeyError:
                two_aa_genomes[genus] = {genome}
                
        elif len(class_size_dict['AA'][genus][genome]['proteins']) == 1:
            try:
                one_aa_genomes[genus].add(genome)
            except KeyError:
                one_aa_genomes[genus] = {genome}

two_aa_genomes

{'Pectobacterium': {'GCA_000738125.1',
  'GCA_000749915.1',
  'GCA_011378985.1',
  'GCA_011379045.1',
  'GCA_020971565.1',
  'GCA_024343355.1',
  'GCA_024722495.1',
  'GCA_028335745.1',
  'GCA_900195285.2',
  'GCA_900195295.2'}}


for genus in one_aa_genomes:
    print(f"{genus}: {len(one_aa_genomes[genus])}")

Pectobacterium: 361
Dickeya: 85
Brenneria: 8
Samsonia: 1


# make output directory
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/cazy_families/'), force=True, nodelete=True)

Output directory ../results/cazy_families exists, nodelete is True. Adding output to output directory.


fam_freq_df = build_fam_freq_df(fgp_df, ['Genus', 'Species'])
fam_freq_df

The dataset contains 117 CAZy families

Counting fam frequencies: 100%|███████████████████████████████████████████████████████████████████| 717/717 [00:45<00:00, 15.80it/s]


fam_freq_df.to_csv("../results/cazy_families/cazy_fam_freqs.csv")


# index the taxonomy data and genome (ggs=genome_genus_species)
fam_freq_df_ggs = copy(fam_freq_df)  # so does not alter fam_freq_df
fam_freq_df_ggs = fam_freq_df_ggs.set_index(['Genome','Genus','Species'])
fam_freq_df_ggs.head(1)


# define a colour scheme to colour code rows by genus
fam_freq_df_ggs['Genus'] = list(fam_freq_df['Genus'])  # add column to use for colour scheme, is removed
fam_freq_genus_row_colours, fam_g_lut = build_row_colours(fam_freq_df_ggs, 'Genus', 'Set2')


# make a figure that is full size, and all data is legible
print("""
A large version of a cluster map of CAZy family frequencies with each row representing a unique 
genome (and colour coded by the genus classification of the genome), and each column representing 
a unique CAZy family.

This full sized figure is generated for human readability. However, this figure is extremely large, and 
potentially too large for publication. Therefore, a smaller figure is generated below.
""")
large_fam_clustermap = build_family_clustermap(
    fam_freq_df_ggs,
    row_colours=fam_freq_genus_row_colours,
    fig_size=(40,120),
    file_path="../results/cazy_families/fam_freq_clustermap.svg",
    file_format='svg',
    lut=fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.2,0.05),
    title_fontsize=28,
    legend_fontsize=24,
    cbar_pos=(0, 0.95, 0.05, 0.05),
)

A large version of a cluster map of CAZy family frequencies with each row representing a unique 
genome (and colour coded by the genus classification of the genome), and each column representing 
a unique CAZy family.

This full sized figure is generated for human readability. However, this figure is extremely large, and 
potentially too large for publication. Therefore, a smaller figure is generated below.

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)


# make a figure the optimal size to fit in a paper
build_family_clustermap(
    fam_freq_df_ggs,
    row_colours=fam_freq_genus_row_colours,
    fig_size=(20,35),
    file_path="../results/cazy_families/paper_fam_freq_clustermap.png",
    file_format='png',
    font_scale=0.5,
    lut=fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0, 0.95, 0.05, 0.05),
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f9d2c493190>


# define a colour scheme to colour code rows by SPECIES
fam_freq_df_ggs['Species'] = list(fam_freq_df['Species'])  # add column to use for colour scheme, is removed
fam_freq_species_row_colours, fam_s_lut = build_row_colours(fam_freq_df_ggs, 'Species', 'rainbow')


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=fam_freq_df_ggs,
    row_colours=[fam_freq_genus_row_colours,fam_freq_species_row_colours],
    luts=[fam_g_lut, fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.045), (0.63,1.05)],
    legend_cols=[1,5],
    fig_size=(20,40),
    file_path="../results/cazy_families/paper_genus_species_fam_freq_clustermap.png",
    file_format='png',
    font_scale=1,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.96, 0.1, 0.1),  #left, bottom, width, height
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f9d2d64c710>


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=fam_freq_df_ggs,
    row_colours=[fam_freq_genus_row_colours,fam_freq_species_row_colours],
    luts=[fam_g_lut, fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.04), (0.63,1.05)],
    legend_cols=[1,5],
    fig_size=(45,150),
    file_path="../results/cazy_families/paper_genus_species_fam_freq_clustermap.pdf",
    file_format='pdf',
    font_scale=1,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=14,
    legend_fontsize=12,
    cbar_pos=(0.01, 0.96, 0.1, 0.1),  #left, bottom, width, height
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f9cff7e13d0>


# define a colour scheme to colour code SOFT vs HARD plant tissue targeting genomes
phenotype_col = []
for ri in range(len(fam_freq_df_ggs)):
    if list(fam_freq_df['Genus'])[ri] in ['Pectobacterium', 'Dickeya', 'Musicola']:
        phenotype_col.append('Soft tissue targeting')
    else:
        phenotype_col.append('Hard tissue targeting')
fam_freq_df_ggs['Phenotype'] = phenotype_col
fam_freq_pheno_row_colours, fam_p_lut = build_row_colours(fam_freq_df_ggs, 'Phenotype', "Set1")


build_family_clustermap_multi_legend(
    df=fam_freq_df_ggs,
    row_colours=[fam_freq_pheno_row_colours, fam_freq_genus_row_colours],
    luts=[fam_p_lut, fam_g_lut],
    legend_titles=['Phenotype', 'Genus'],
    bbox_to_anchors=[(0.225,1.045), (0.63,1.04)],
    legend_cols=[1,5],
    fig_size=(27,41),
    file_path="../results/cazy_families/paper_pheno_genus_fam_freq_clustermap.png",
    file_format='png',
    font_scale=0.7,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.96, 0.1, 0.1),  #left, bottom, width, height
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f9d14438e90>


genomes_to_remove = [
    'GCA_000749925.1',
    'GCA_000749845.1',
    'GCA_000803215.1',
    'GCA_025946765.1',
    'GCA_004137815.1',
    'GCA_029023745.1',
]
fam_freq_filtered_df = fam_freq_df[~fam_freq_df['Genome'].isin(genomes_to_remove)]
print(f"Original df length: {len(fam_freq_df)}\nLength after removing genome: {len(fam_freq_filtered_df)}")

Original df length: 717
Length after removing genome: 711


fam_freq_filtered_df_ggs = fam_freq_filtered_df.set_index(['Genome', 'Genus', 'Species'])


# make a figure the optimal size to fit in a paper
build_family_clustermap(
    fam_freq_df_ggs,
    row_colours=fam_freq_genus_row_colours,
    fig_size=(20,70),
    file_path="../results/cazy_families/paper_fam_freq_clustermap_FILTERED.svg",
    file_format='svg',
    font_scale=0.5,
    lut=fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0, 0.95, 0.05, 0.05),
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f9cf9127990>


all_families = list(fam_freq_df.columns)[3:]
# dict {group: {only unique fams}} and dict {group: {all fams}}
unique_grp_fams, group_fams = get_group_specific_fams(fam_freq_filtered_df, 'Genus', all_families)
unique_grp_fams

Identifying fams in each Genus: 100%|█████████████████████████████████████████████████████████████| 711/711 [00:09<00:00, 72.41it/s]
Identifying Genus specific fams: 100%|█████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 12363.46it/s]

{'Pectobacterium': {'AA10',
  'CBM13',
  'GH121',
  'GH146',
  'GH18',
  'GT101',
  'GT102',
  'GT11',
  'GT111',
  'GT14',
  'GT24',
  'GT52',
  'PL11'},
 'Dickeya': {'CBM4', 'CE2', 'GH113', 'GH148', 'GH25', 'GH91', 'GT97', 'PL10'},
 'Brenneria': {'GH106', 'GT21', 'PL17'},
 'Acerihabitans': {'GH127', 'GH15'}}


hard_soft_fams_dict = {'hard': set(), 'soft': set(), 'unknown': set()}

for ri in tqdm(range(len(fam_freq_filtered_df)), desc="Identifying Soft and Hard plant tissue targeting families"):
    genus = fam_freq_filtered_df.iloc[ri]['Genus']
    
    if genus in ['Pectobacterium','Dickeya']:
        grp = 'soft'
    elif genus == 'Musicola':
        grp = 'unknown'
    else:
        grp = 'hard'
    
    for fam in fam_freq_filtered_df.columns[3:]:
        if fam_freq_filtered_df.iloc[ri][fam] >= 1:
            hard_soft_fams_dict[grp].add(fam)

unique_hard_fams = hard_soft_fams_dict['hard'].difference(hard_soft_fams_dict['soft'])
unique_soft_fams = hard_soft_fams_dict['soft'].difference(hard_soft_fams_dict['hard'])

print("Hard plant tissue targeting specific families:")
for fam in unique_hard_fams:
    print(fam)
print("Soft plant tissue targeting specific families:")
for fam in unique_soft_fams:
    print(fam)

Identifying Soft and Hard plant tissue targeting families:   0%|          | 0/711 [00:00<?, ?it/s]

Hard plant tissue targeting specific families:
GH140
GH15
PL17
GH127
GT21
GH37
GH51
GH39
GH106
Soft plant tissue targeting specific families:
GT24
GH146
CBM4
GH18
GT25
CBM0
GT14
GT111
PL11
GH113
GH91
CBM91
PL10
PL35
GT101
GT52
CBM13
GH16
GH148
GT11
CE2
AA10
GH25
GH121
GT97
GT102


# find the number of soft plant tissue targeting genomes that contain GH68
# find unique counts - if all 1 or 0 then can use sum to get genome count
print(set(fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(['Pectobacterium','Dickeya'])]['GH68']))
# get num of genomes
sum(fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(['Pectobacterium','Dickeya'])]['GH68'])

{0, 1}

4


# convert into df
unique_grp_data = []

unique_grp_fams['Hard tissue'] = unique_hard_fams
unique_grp_fams['Soft tissue'] = unique_soft_fams

for genus in unique_grp_fams:
    new_data = [genus]
    for cazy_class in ['GH', 'GT', 'CE', 'PL', 'AA', 'CBM']:
        added = False
        class_data = []
        for fam in unique_grp_fams[genus]:
            if fam.startswith(cazy_class):
                class_data.append(fam)
                added = True
        if added is False:
            class_data.append("")
        class_data.sort()
        new_data.append(", ".join(class_data))
            
    unique_grp_data.append(new_data)
    
unique_grp_df = pd.DataFrame(unique_grp_data, columns=['Genus', 'GH', 'GT', 'CE', 'PL', 'AA', 'CBM'])
unique_grp_df.to_csv("../results/cazy_families/unique_grp_fams.tsv", sep='\t')
unique_grp_df


all_families = list(fam_freq_filtered_df.columns)[3:]
pd_fam_freq_df_filtered = fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(
    ['Pectobacterium', 'Dickeya']
)]
# dict {group: {only unique fams}} and dict {group: {all fams}}
pd_unique_grp_fams, pd_group_fams = get_group_specific_fams(pd_fam_freq_df_filtered, 'Genus', all_families)
pd_unique_grp_fams

Identifying fams in each Genus: 100%|█████████████████████████████████████████████████████████████| 632/632 [00:08<00:00, 72.62it/s]
Identifying Genus specific fams: 100%|█████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 11244.78it/s]

{'Pectobacterium': {'AA10',
  'CBM13',
  'CBM18',
  'CBM3',
  'CBM67',
  'GH108',
  'GH12',
  'GH121',
  'GH146',
  'GH153',
  'GH154',
  'GH18',
  'GH38',
  'GH65',
  'GH68',
  'GT101',
  'GT102',
  'GT11',
  'GT111',
  'GT14',
  'GT20',
  'GT24',
  'GT32',
  'GT52',
  'GT73',
  'PL11'},
 'Dickeya': {'CBM4',
  'CE2',
  'GH113',
  'GH148',
  'GH25',
  'GH26',
  'GH91',
  'GT97',
  'PL0',
  'PL10'}}


def plot_rarefaction_curve(
    df,
    all_families,
    num_of_runs=100,
):
    """Build rare faction curve
    
    :param df: pandas df, cazy fam freq df
    :param all_families: list of CAZy families to be analysed
    :param num_of_runs: number of times to resample dataset
    
    Return 
    * rare faction plot
    * df of the number of families added with new additional genome
    * df of number of families in total fam pool with each additional genome
    """
    genomes = list(df['Genome'])
    
    run_outputs_addings = {}  # num of run: [num of new fams found ADDED a new genome to the pool]
    run_outputs_counts = {}  # num of run: [num of unique fams in total fam pool after adding a new genome to the pool]

    for run in tqdm(range(num_of_runs), desc="Counting added families"):
        random.shuffle(genomes)
        fams_to_parse = set(copy(all_families))

        num_added_families = [0]  # one item for each genome, int, num of new families added
        family_pool_size = [0]  # number of unqiue fams seen after adding each genome to the pool
        families = set()  # to track which families have been seen

        for genome in genomes:
            fams_added = 0  # num of fams added to total pool for this genome

            genome_row = df[df['Genome'] == genome]
            for fam in fams_to_parse:
                if genome_row[fam].values[0] > 0:  # fam in genome
                    if fam not in families:
                        fams_added += 1
                        families.add(fam)

            # parsed all families
            
            # record number of new families found in this genome
            num_added_families.append(fams_added)
            
            # record number of unique fams seen after analysing genomes so far
            family_pool_size.append(len(families))
            
            # find which families are in fams_to_parse but are not in families
            # do not reparse fams that have already been found
            fams_to_parse = fams_to_parse.difference(families)

        # parsed all genomes
        run_outputs_addings[run] = num_added_families
        run_outputs_counts[run] = family_pool_size

    run_outputs_df_addings = pd.DataFrame(run_outputs_addings)    
    run_outputs_df_counts = pd.DataFrame(run_outputs_counts)
    
    # calculate the averages
    run_count_avg = []  # avg num of fams after adding each genome
    genomes = list(range(1, len(run_outputs_df_counts)+1))
    for i in range(len(run_outputs_df_counts)):
        run_count_avg.append(np.mean(run_outputs_df_counts.iloc[i]))
        genomes.append
    run_avg_df = pd.DataFrame(
        {
            'Number of CAZy families': run_count_avg,
            'Number of genomes': genomes,
        }
    )
    
    # Generate plot
    # subplot 1 = all runs
    # subplot 2 = averages of all runs
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7.5))

    #add DataFrames to subplots
    run_outputs_df_counts.plot(
        ax=axes[0],
        xlabel='Number of genomes',
        ylabel='Number of CAZy families',
        legend=False,
    );
    
    run_avg_df.plot(
        ax=axes[1],
        x='Number of genomes',
        y='Number of CAZy families',
        legend=False,
    );
    
    # raref_curve = sns.lineplot(
    #     data=run_outputs_df_counts,
    #     legend=False,
    # );    #     linestyle='solid',

    # plt.xlabel('Number of genomes');
    # plt.ylabel('Number of CAZy families');
    
    return fig, run_outputs_df_addings, run_outputs_df_counts


all_families = list(fam_freq_df.columns)[3:]
# calculate rare factions for all of pectobacteriaceae
pecto_rarefact_plt, pecto_runs_addings, pecto_runs_counts = plot_rarefaction_curve(
    fam_freq_filtered_df,
    all_families,
)

Counting added families:   0%|          | 0/100 [00:00<?, ?it/s]


# calculate rare factions for Pectobacterium
pectobact_rarefact_plt, pectobact_runs_addings, pectobact_runs_counts = plot_rarefaction_curve(
    fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Pectobacterium'],
    all_families,
)

Counting added families:   0%|          | 0/100 [00:00<?, ?it/s]


# calculate rare factions for Dickeya
dic_rarefact_plt, dic_runs_addings, dic_runs_counts = plot_rarefaction_curve(
    fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Dickeya'],
    all_families,
)

Counting added families:   0%|          | 0/100 [00:00<?, ?it/s]


# calculate rare factions for Musicola
mus_rarefact_plt, mus_runs_addings, mus_runs_counts = plot_rarefaction_curve(
    fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Musicola'],
    all_families,
)

Counting added families:   0%|          | 0/100 [00:00<?, ?it/s]


# calculate rare factions for Brenneria
ben_rarefact_plt, ben_runs_addings, ben_runs_counts = plot_rarefaction_curve(
    fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Brenneria'],
    all_families,
)

Counting added families:   0%|          | 0/100 [00:00<?, ?it/s]


# calculate rare factions for Lonsdalea
lon_rarefact_plt, lon_runs_addings, lon_runs_counts = plot_rarefaction_curve(
    fam_freq_filtered_df[fam_freq_filtered_df['Genus']=='Lonsdalea'],
    all_families,
)

Counting added families:   0%|          | 0/100 [00:00<?, ?it/s]


def calc_run_avg(run_outputs_df_counts):
    """Calc average num of families in total fam pool after adding a genome to the dataset"""
    run_count_avg = []  # avg num of fams after adding each genome
    genomes = list(range(1, len(run_outputs_df_counts)+1))
    
    for i in range(len(run_outputs_df_counts)):
        run_count_avg.append(np.mean(run_outputs_df_counts.iloc[i]))
        genomes.append
    run_avg_df = pd.DataFrame(
        {
            'Number of CAZy families': run_count_avg,
            'Number of genomes': genomes,
        }
    )
    
    return run_avg_df


# calculate average num of fams in total fam pool after adding a new genome to the data set
pecto_run_avg = calc_run_avg(pecto_runs_counts)
pectobact_run_avg = calc_run_avg(pectobact_runs_counts)
dic_run_avg = calc_run_avg(dic_runs_counts)
mus_run_avg = calc_run_avg(mus_runs_counts)
ben_run_avg = calc_run_avg(ben_runs_counts)
lon_run_avg = calc_run_avg(lon_runs_counts)

# build into a single df
df_len = len(pecto_run_avg)  # len all cols need to be

run_avg_data = {
    'Pectobacteriaceae': pecto_run_avg['Number of CAZy families'],
    'Pectobacterium': list(pectobact_run_avg['Number of CAZy families']) + (
        [None] * (df_len - len(pectobact_run_avg['Number of CAZy families']))
    ),
    'Dickeya': list(dic_run_avg['Number of CAZy families']) + (
        [None] * (df_len - len(dic_run_avg['Number of CAZy families']))
    ),
    'Musicola': list(mus_run_avg['Number of CAZy families']) + (
        [None] * (df_len - len(mus_run_avg['Number of CAZy families']))
    ),
    'Brenneria': list(ben_run_avg['Number of CAZy families']) + (
        [None] * (df_len - len(ben_run_avg['Number of CAZy families']))
    ),
    'Lonsdalea': list(lon_run_avg['Number of CAZy families']) + (
        [None] * (df_len - len(lon_run_avg['Number of CAZy families']))
    ),
}

run_avg_df = pd.DataFrame(run_avg_data)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
run_avg_df.plot(ax=axes);


# make output directory
# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/core_cazome/'), force=True, nodelete=True)

Output directory ../results/core_cazome exists, nodelete is True. Adding output to output directory.


fam_freq_filtered_df_ggs = fam_freq_filtered_df.set_index(['Genome', 'Genus', 'Species'])


core_cazome = identify_core_cazome(fam_freq_filtered_df_ggs)

core_cazome = list(core_cazome)
core_cazome.sort()

print(f"Total families: {len(all_families)}")
print("The core CAZy families are:")
for fam in core_cazome:
    print('-', fam)

Identifying core CAZome: 100%|██████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 9004.62it/s]

Total families: 117
The core CAZy families are:
- CBM5
- CBM50
- GH23
- GH3
- GT2
- GT51
- GT9


# filter the famil freq df to include only those families in the core CAZome
core_cazome_df = fam_freq_filtered_df_ggs[core_cazome]
plot_fam_boxplot(core_cazome_df, font_scale=0.8, fig_size=(12,6))


core_cazome_df_genus = copy(core_cazome_df)  # to ensure core_cazome_df is not altereted
core_cazome_df_genus = add_tax_column_from_row_index(core_cazome_df_genus, 'Genus', 1)
core_cazome_df_genus.head()


core_cazome_fggf_df, core_cazome_mean_freq_df = build_fam_mean_freq_df(
    core_cazome_df_genus,
    'Genus',
    round_by=2,
)

# add rows showing the means across all pectobacteriaceae
all_pecto_core_fam_data = []
for fam in core_cazome_df_genus.columns:
    try:
        mean_freq = np.mean(core_cazome_df_genus[fam]).round(2)
        sd_freq = np.std(core_cazome_df_genus[fam]).round(2)
        all_pecto_core_fam_data.append([fam, 'Pectobacteriaceae', mean_freq, sd_freq])
    except TypeError: # tax column
        continue
    
temp_df = pd.DataFrame(all_pecto_core_fam_data, columns=['Family','Genus','MeanFreq','SdFreq'])
core_cazome_mean_freq_df = pd.concat([core_cazome_mean_freq_df, temp_df])

core_cazome_mean_freq_df.to_csv("../results/core_cazome/core_cazome_freqs.csv")

core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████| 711/711 [00:00<00:00, 5870.25it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|████████████████████████████████████████████████████| 8/8 [00:00<00:00, 118.05it/s]


genera_of_interest = ['Pectobacterium', 'Dickeya', 'Musicola', 'Brenneria', 'Lonsdalea']
all_families = fam_freq_filtered_df_ggs.columns

core_cazomes = {}  # {genus: {fams}}
for genus in genera_of_interest:
    filtered_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'] == genus]
    temp_core_cazome = identify_core_cazome(filtered_df[all_families])
    temp_core_cazome = list(temp_core_cazome)
    temp_core_cazome.sort()
    core_cazomes[genus] = {'fams': temp_core_cazome, 'freqs': {len(filtered_df)}}
    
core_cazomes

Identifying core CAZome: 100%|██████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 7996.18it/s]
Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 17363.72it/s]
Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 29897.26it/s]
Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 25178.74it/s]
Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 29642.62it/s]

{'Pectobacterium': {'fams': ['CBM5',
   'CBM50',
   'GH1',
   'GH103',
   'GH23',
   'GH28',
   'GH3',
   'GH43',
   'GT2',
   'GT51',
   'GT9',
   'PL1',
   'PL2',
   'PL22',
   'PL3',
   'PL9'],
  'freqs': {426}},
 'Dickeya': {'fams': ['CBM48',
   'CBM5',
   'CBM50',
   'CE4',
   'CE8',
   'GH1',
   'GH103',
   'GH105',
   'GH13',
   'GH23',
   'GH28',
   'GH3',
   'GH33',
   'GH73',
   'GH77',
   'GH8',
   'GT1',
   'GT19',
   'GT2',
   'GT28',
   'GT35',
   'GT4',
   'GT5',
   'GT51',
   'GT9',
   'PL1',
   'PL9'],
  'freqs': {206}},
 'Musicola': {'fams': ['CBM48',
   'CBM5',
   'CBM50',
   'CE1',
   'CE11',
   'CE12',
   'CE4',
   'CE8',
   'CE9',
   'GH1',
   'GH102',
   'GH103',
   'GH104',
   'GH105',
   'GH13',
   'GH19',
   'GH2',
   'GH23',
   'GH28',
   'GH3',
   'GH30',
   'GH31',
   'GH32',
   'GH33',
   'GH38',
   'GH5',
   'GH73',
   'GH77',
   'GH8',
   'GT0',
   'GT1',
   'GT19',
   'GT2',
   'GT26',
   'GT28',
   'GT30',
   'GT35',
   'GT4',
   'GT5',
   'GT51',
   'GT56',
   'GT81',
   'GT83',
   'GT9',
   'PL1',
   'PL2',
   'PL22',
   'PL9'],
  'freqs': {4}},
 'Brenneria': {'fams': ['CBM5',
   'CBM50',
   'CE11',
   'CE12',
   'CE9',
   'GH1',
   'GH102',
   'GH103',
   'GH13',
   'GH23',
   'GH28',
   'GH3',
   'GH32',
   'GH4',
   'GH68',
   'GH73',
   'GH94',
   'GT0',
   'GT19',
   'GT2',
   'GT26',
   'GT28',
   'GT30',
   'GT35',
   'GT4',
   'GT5',
   'GT51',
   'GT56',
   'GT8',
   'GT81',
   'GT84',
   'GT9'],
  'freqs': {33}},
 'Lonsdalea': {'fams': ['CBM32',
   'CBM5',
   'CBM50',
   'CE11',
   'CE4',
   'GH19',
   'GH23',
   'GH3',
   'GH32',
   'GH37',
   'GH68',
   'GH77',
   'GH8',
   'GT19',
   'GT2',
   'GT20',
   'GT26',
   'GT28',
   'GT4',
   'GT51',
   'GT56',
   'GT9'],
  'freqs': {39}}}


core_cazome_upsetplot_membership = []
core_cazome_upsetplot_membership = add_to_upsetplot_membership(
    core_cazome_upsetplot_membership,
    core_cazomes,
)
len(core_cazome_upsetplot_membership)

708


core_cazome_upsetplot = build_upsetplot(
    core_cazome_upsetplot_membership,
    sort_by='input',
    file_path='../results/core_cazome/genera_core_cazome.svg',
)


soft_genera = ['Pectobacterium', 'Dickeya', 'Musicola']
hard_genera = ['Brenneria', 'Lonsdalea', 'Samsonia', 'Affinibrenneria', 'Acerihabitans']
grps = [[soft_genera, 'Soft tissue targeting'], [hard_genera, 'Hard tissue targeting']]

all_families = fam_freq_filtered_df_ggs.columns

soft_hard_core_cazomes = {}  # {grp: {fams}}
for grp in tqdm(grps):
    # gather all rows containing the genera of interest
    filtered_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(grp[0])]
    temp_core_cazome = identify_core_cazome(filtered_df[all_families])
    temp_core_cazome = list(temp_core_cazome)
    temp_core_cazome.sort()
    try:
        soft_hard_core_cazomes[grp[1]]
    except KeyError:
        soft_hard_core_cazomes[grp[1]] = {'fams': set(), 'freqs': [0]}

    soft_hard_core_cazomes[grp[1]]['fams'] = soft_hard_core_cazomes[grp[1]]['fams'].union(
        set(temp_core_cazome)
    )
    soft_hard_core_cazomes[grp[1]]['freqs'][0] += len(filtered_df)
    
soft_hard_core_cazomes

  0%|          | 0/2 [00:00<?, ?it/s]

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 10814.83it/s]

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 27753.28it/s]

{'Soft tissue targeting': {'fams': {'CBM5',
   'CBM50',
   'GH1',
   'GH103',
   'GH23',
   'GH28',
   'GH3',
   'GT2',
   'GT51',
   'GT9',
   'PL1',
   'PL9'},
  'freqs': [636]},
 'Hard tissue targeting': {'fams': {'CBM5',
   'CBM50',
   'CE11',
   'GH23',
   'GH3',
   'GT19',
   'GT2',
   'GT26',
   'GT28',
   'GT4',
   'GT51',
   'GT56',
   'GT9'},
  'freqs': [75]}}


soft_hard_core_cazomes.update(core_cazomes)


core_cazome_upsetplot_membership = []
core_cazome_upsetplot_membership = add_to_upsetplot_membership(
    core_cazome_upsetplot_membership,
    soft_hard_core_cazomes,
)
len(core_cazome_upsetplot_membership)

1419


core_cazome_upsetplot = build_upsetplot(
    core_cazome_upsetplot_membership,
    file_path='../results/core_cazome/genera_soft_hard_core_cazome.svg',
)


# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/cooccurring_families/'), force=True, nodelete=True)

# reminder of the structure of the df
fam_freq_filtered_df.head(1)

Output directory ../results/cooccurring_families exists, nodelete is True. Adding output to output directory.


all_families = list(fam_freq_filtered_df.columns[3:])

cooccurring_families, fam_corr_M_filled = identify_cooccurring_fams_corrM(
    fam_freq_filtered_df,
    all_families,
    core_cazome=[],
    corrM_path="../results/cooccurring_families/fam_corr_M_filled.csv",
    fill_value=2,
)

Building binary fam freq df: 100%|██████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 2514.93it/s]
Delete absent families: 100%|███████████████████████████████████████████████████████████████████| 117/117 [00:00<00:00, 7081.50it/s]
Identifying always co-occurring families: 100%|█████████████████████████████████████████████████| 117/117 [00:00<00:00, 3121.93it/s]


cooccurring_families

{('CBM4', 'GH148'), ('GH121', 'GH146'), ('GH127', 'GH15'), ('GH94', 'GT84')}


sns.clustermap(
    fam_corr_M_filled,
    cmap=sns.cubehelix_palette(rot=0, dark=2, light=0, reverse=True, as_cmap=True),
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f9cf91ffad0>


cooccurring_fams_dict = calc_cooccuring_fam_freqs(
    fam_freq_filtered_df,
    list(all_families),
    exclude_core_cazome=False,
)
cooccurring_fams_dict

Identifying pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 117/117 [00:01<00:00, 93.38it/s]
Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 25/25 [00:00<00:00, 108323.97it/s]

{0: {'fams': {'CBM4', 'GH148'}, 'freqs': {8}},
 1: {'fams': {'CBM5', 'CBM50', 'GH23', 'GH3', 'GT2', 'GT51', 'GT9'},
  'freqs': {711}},
 2: {'fams': {'GH121', 'GH146'}, 'freqs': {1}},
 3: {'fams': {'GH127', 'GH15'}, 'freqs': {1}},
 4: {'fams': {'GH94', 'GT84'}, 'freqs': {309}}}


genera_cooccuring_fams = {}  # {genus: cooccurring_fams_dict}

for genus in tqdm(
    ['Pectobacterium', 'Dickeya', 'Musicola', 'Lonsdalea', 'Brenneria'],
    desc="Identifying genus specific co-occurring fams",
):
    genus_fam_freq_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'] == genus]
    genus_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
        genus_fam_freq_df,
        list(all_families),
        exclude_core_cazome=False,
    )
    genera_cooccuring_fams[genus] = genus_cooccurring_fams_dict
genera_cooccuring_fams

Identifying genus specific co-occurring fams:   0%|          | 0/5 [00:00<?, ?it/s]

Identifying pairs of co-occurring families:   0%|                                                           | 0/117 [00:00<?, ?it/s]
Identifying pairs of co-occurring families:  11%|█████▍                                           | 13/117 [00:00<00:00, 126.59it/s]
Identifying pairs of co-occurring families:  22%|██████████▉                                      | 26/117 [00:00<00:00, 111.33it/s]
Identifying pairs of co-occurring families:  39%|███████████████████▎                             | 46/117 [00:00<00:00, 145.54it/s]
Identifying pairs of co-occurring families:  52%|█████████████████████████▌                       | 61/117 [00:00<00:00, 138.33it/s]
Identifying pairs of co-occurring families:  65%|███████████████████████████████▊                 | 76/117 [00:00<00:00, 139.37it/s]
Identifying pairs of co-occurring families:  78%|██████████████████████████████████████           | 91/117 [00:00<00:00, 129.62it/s]
Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 131.17it/s]

Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 135/135 [00:00<00:00, 322638.77it/s]

Identifying pairs of co-occurring families:   0%|                                                           | 0/117 [00:00<?, ?it/s]
Identifying pairs of co-occurring families:  21%|██████████                                       | 24/117 [00:00<00:00, 239.91it/s]
Identifying pairs of co-occurring families:  41%|████████████████████                             | 48/117 [00:00<00:00, 205.63it/s]
Identifying pairs of co-occurring families:  69%|█████████████████████████████████▉               | 81/117 [00:00<00:00, 253.84it/s]
Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 242.16it/s]

Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 361/361 [00:00<00:00, 558106.80it/s]

Identifying pairs of co-occurring families: 100%|███████████████████████████████████████████████| 117/117 [00:00<00:00, 1857.02it/s]

Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████| 1130/1130 [00:00<00:00, 412135.96it/s]

Identifying pairs of co-occurring families: 100%|███████████████████████████████████████████████| 117/117 [00:00<00:00, 1456.48it/s]

Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 255/255 [00:00<00:00, 694962.65it/s]

Identifying pairs of co-occurring families:   0%|                                                           | 0/117 [00:00<?, ?it/s]
Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 716.76it/s]

Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████| 500/500 [00:00<00:00, 835851.73it/s]

{'Pectobacterium': {0: {'fams': {'CBM3', 'GH5'}, 'freqs': {425}},
  1: {'fams': {'CBM48', 'CE8', 'CE9', 'GH13'}, 'freqs': {425}},
  2: {'fams': {'CBM5',
    'CBM50',
    'GH1',
    'GH103',
    'GH23',
    'GH28',
    'GH3',
    'GH43',
    'GT2',
    'GT51',
    'GT9',
    'PL1',
    'PL2',
    'PL22',
    'PL3',
    'PL9'},
   'freqs': {426}},
  3: {'fams': {'CE11', 'GH102', 'GH32'}, 'freqs': {425}},
  4: {'fams': {'GH105', 'GT56'}, 'freqs': {425}},
  5: {'fams': {'GH121', 'GH146', 'GH154'}, 'freqs': {1}},
  6: {'fams': {'GH94', 'GT84'}, 'freqs': {152}}},
 'Dickeya': {0: {'fams': {'CBM4', 'GH148'}, 'freqs': {8}},
  1: {'fams': {'CBM48',
    'CBM5',
    'CBM50',
    'CE4',
    'CE8',
    'GH1',
    'GH103',
    'GH105',
    'GH13',
    'GH23',
    'GH28',
    'GH3',
    'GH33',
    'GH73',
    'GH77',
    'GH8',
    'GT1',
    'GT19',
    'GT2',
    'GT28',
    'GT35',
    'GT4',
    'GT5',
    'GT51',
    'GT9',
    'PL1',
    'PL9'},
   'freqs': {206}},
  2: {'fams': {'CE11', 'GT83'}, 'freqs': {204}},
  3: {'fams': {'GH16', 'GT25'}, 'freqs': {1}},
  4: {'fams': {'GH19', 'GH5', 'PL4'}, 'freqs': {203}},
  5: {'fams': {'GH88', 'PL35'}, 'freqs': {3}},
  6: {'fams': {'GH94', 'GT84'}, 'freqs': {89}},
  7: {'fams': {'GT30', 'PL3'}, 'freqs': {205}},
  8: {'fams': {'PL2', 'PL22'}, 'freqs': {205}}},
 'Musicola': {0: {'fams': {'CBM32', 'CBM63'}, 'freqs': {2}},
  1: {'fams': {'CBM48',
    'CBM5',
    'CBM50',
    'CE1',
    'CE11',
    'CE12',
    'CE4',
    'CE8',
    'CE9',
    'GH1',
    'GH102',
    'GH103',
    'GH104',
    'GH105',
    'GH13',
    'GH19',
    'GH2',
    'GH23',
    'GH28',
    'GH3',
    'GH30',
    'GH31',
    'GH32',
    'GH33',
    'GH38',
    'GH5',
    'GH73',
    'GH77',
    'GH8',
    'GT0',
    'GT1',
    'GT19',
    'GT2',
    'GT26',
    'GT28',
    'GT30',
    'GT35',
    'GT4',
    'GT5',
    'GT51',
    'GT56',
    'GT81',
    'GT83',
    'GT9',
    'PL1',
    'PL2',
    'PL22',
    'PL9'},
   'freqs': {4}},
  2: {'fams': {'GH24', 'GH36'}, 'freqs': {2}}},
 'Lonsdalea': {0: {'fams': {'CBM32',
    'CBM5',
    'CBM50',
    'CE11',
    'CE4',
    'GH19',
    'GH23',
    'GH3',
    'GH32',
    'GH37',
    'GH68',
    'GH77',
    'GH8',
    'GT19',
    'GT2',
    'GT20',
    'GT26',
    'GT28',
    'GT4',
    'GT51',
    'GT56',
    'GT9'},
   'freqs': {39}},
  1: {'fams': {'GH1', 'GH28', 'GH4', 'GH73', 'GT0'}, 'freqs': {38}},
  2: {'fams': {'GH13', 'GH39', 'GT30', 'PL1', 'PL3'}, 'freqs': {38}},
  3: {'fams': {'GH26', 'GH51'}, 'freqs': {9}},
  4: {'fams': {'GH31', 'GT81'}, 'freqs': {38}},
  5: {'fams': {'GH78', 'GT1'}, 'freqs': {10}},
  6: {'fams': {'GH94', 'GT84'}, 'freqs': {33}}},
 'Brenneria': {0: {'fams': {'CBM3', 'GH5'}, 'freqs': {25}},
  1: {'fams': {'CBM5',
    'CBM50',
    'CE11',
    'CE12',
    'CE9',
    'GH1',
    'GH102',
    'GH103',
    'GH13',
    'GH23',
    'GH28',
    'GH3',
    'GH32',
    'GH4',
    'GH68',
    'GH73',
    'GH94',
    'GT0',
    'GT19',
    'GT2',
    'GT26',
    'GT28',
    'GT30',
    'GT35',
    'GT4',
    'GT5',
    'GT51',
    'GT56',
    'GT8',
    'GT81',
    'GT84',
    'GT9'},
   'freqs': {33}},
  2: {'fams': {'GH106', 'PL38'}, 'freqs': {1}},
  3: {'fams': {'GH8', 'GT83'}, 'freqs': {15}},
  4: {'fams': {'GT73', 'PL17'}, 'freqs': {1}}}}


soft_genera = ['Pectobacterium', 'Dickeya', 'Musicola']
hard_genera = ['Brenneria', 'Lonsdalea', 'Samsonia', 'Affinibrenneria', 'Acerihabitans']
# hard_genera = ['Brenneria', 'Lonsdalea']
grps = [[soft_genera, 'Soft tissue targeting'], [hard_genera, 'Hard tissue targeting']]

for grp in tqdm(grps):
    # gather all rows containing the genera of interest
    grp_fam_freq_df = fam_freq_filtered_df[fam_freq_filtered_df['Genus'].isin(grp[0])]
    
    grp_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
        grp_fam_freq_df,
        list(all_families),
        exclude_core_cazome=False,
    )
    genera_cooccuring_fams[grp[1]] = grp_cooccurring_fams_dict

  0%|          | 0/2 [00:00<?, ?it/s]

Identifying pairs of co-occurring families:   0%|                                                           | 0/117 [00:00<?, ?it/s]
Identifying pairs of co-occurring families:   4%|██▏                                                | 5/117 [00:00<00:02, 40.72it/s]
Identifying pairs of co-occurring families:  10%|█████▏                                            | 12/117 [00:00<00:02, 52.20it/s]
Identifying pairs of co-occurring families:  16%|████████                                          | 19/117 [00:00<00:01, 56.27it/s]
Identifying pairs of co-occurring families:  25%|████████████▍                                     | 29/117 [00:00<00:01, 70.28it/s]
Identifying pairs of co-occurring families:  36%|█████████████████▉                                | 42/117 [00:00<00:00, 88.73it/s]
Identifying pairs of co-occurring families:  44%|██████████████████████▏                           | 52/117 [00:00<00:00, 89.34it/s]
Identifying pairs of co-occurring families:  53%|██████████████████████████▍                       | 62/117 [00:00<00:00, 88.79it/s]
Identifying pairs of co-occurring families:  62%|███████████████████████████████▏                  | 73/117 [00:00<00:00, 94.14it/s]
Identifying pairs of co-occurring families:  71%|███████████████████████████████████▍              | 83/117 [00:00<00:00, 94.53it/s]
Identifying pairs of co-occurring families:  80%|████████████████████████████████████████▏         | 94/117 [00:01<00:00, 97.29it/s]
Identifying pairs of co-occurring families:  89%|███████████████████████████████████████████▌     | 104/117 [00:01<00:00, 97.81it/s]
Identifying pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 117/117 [00:01<00:00, 86.75it/s]

Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 75/75 [00:00<00:00, 125178.19it/s]

Identifying pairs of co-occurring families:   0%|                                                           | 0/117 [00:00<?, ?it/s]
Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████| 117/117 [00:00<00:00, 604.14it/s]

Combining pairs of co-occurring families: 100%|█████████████████████████████████████████████████| 91/91 [00:00<00:00, 415775.23it/s]


upsetplot_membership = []
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, cooccurring_fams_dict)

for genus in genera_cooccuring_fams:
    upsetplot_membership = add_to_upsetplot_membership(
        upsetplot_membership,
        genera_cooccuring_fams[genus],
    )

len(upsetplot_membership)

7233


pectobact_upsetplot = build_upsetplot(
    upsetplot_membership,
    file_path='../results/cooccurring_families/pecto-cooccurring-families.svg',
)


upset_plot_groups = get_upsetplot_grps(upsetplot_membership)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:01<00:00, 29.69it/s]


cooccurring_grp_freq_data = []  # empty list to store data for the df

# add pectobacteriaceae data
genera_cooccuring_fams['Pectobacteriaceae'] = cooccurring_fams_dict

# add data for each genus, all soft plant targeting and hard plant tissue targeting
cooccurring_grp_freq_data = add_upsetplot_grp_freqs(
    upset_plot_groups,
    cooccurring_grp_freq_data,
    genera_cooccuring_fams,
    genus,
    grp_sep=True,
    grp_order=[
        'Pectobacteriaceae', 
        'Pectobacterium', 'Dickeya', 'Musicola', 'Soft tissue targeting',
        'Brenneria', 'Lonsdalea', 'Hard tissue targeting',
    ],
    include_none=True,
)

Compiling co-occurring families incidence data: 100%|████████████████████████████████████████████| 38/38 [00:00<00:00, 24853.20it/s]


# build the dataframe
cooccurring_fams_freq_df = build_upsetplot_matrix(
    cooccurring_grp_freq_data,
    'Genus',
    file_path='../results/cooccurring_families/cooccurring_fams_freqs.csv',
)
cooccurring_fams_freq_df


grp_order = {
    'soft_grps': [  # grps only found in soft plant tissue targeting genomes
        'GH13+CBM48+CE8', # S
        'PL2+PL22', # # S D
        'GH148+CBM4', # S D
        'PL3+GT30', # D
        'PL35+GH88', # D
        'CE11+GT83', # D
        'GT25+GH16', # D
        'GH5+GH19+PL4', # D
        'GH121+GH146+GH154', # S P   
        'GH121+GH146',
        'GH105+GT56', # P
        'GH13+CBM48+CE8+CE9',  # P
        'CE11+GH32+GH102', # P
        
    ],
    'musicola': [  # grps found only in musicola
        'CBM32+CBM63',
        'GH24+GH36',
    ],
    'both_grps': [  # grps found in soft and hard plant tissue targeting genomes
        'GT84+GH94', # 
        'GH5+CBM3', #
    ],
    'hard_musicola_grps': [  # grps only found in hard plant tissue targeting genomes and Musicola
        'GH13+GT30',
        'GH1+GH73+GT0',
        'GT5+GT35+GT8',
        'GH15+GH127',
        'GT81+GH31', # L
        'GH8+GT83', # B
    ],
    'hard_grps': [ # grps only found in hard plant tissue targeting genomes
        'CBM67+GH65', # H
        'PL17+GT73', # L B
        'PL38+GH106', # L B
        'GT1+GH78', # L
        'GH26+GH51', # L
        'GH1+GH28+GH73+GT0+GH4',
        'GH13+PL1+PL3+GT30+GH39',
    ],
    'all_core_cazomes': [ # then core cazomes at the end
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9',  # pectobacteriaceae
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH28+PL1+GH103+PL9',  # soft plant tissue targeting
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28+PL1+GH103+PL9+CBM48+CE8+GH105+GT4+GT28+GT19+GH73+GT5+GT35+GH8+CE4+GH77+GT1+GH33',   # dickeya
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH28+PL1+GH103+PL9+PL2+PL22+PL3+GH43', # pectobacter 
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28+PL1+GH103+PL9+PL2+PL22+CBM48+CE8+CE11+GH5+GH105+GT56+GH32+GH102+CE9+GT4+GT28+GT19+GH73+GT30+GT5+GT35+GH8+CE4+GH77+GH19+GT83+GT1+GH33+GT26+GT0+GT81+GH31+CE12+GH38+GH30+CE1+GH2+GH104',  # musicola
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+CE11+GT56+GT4+GT28+GT19+GT26',  # hard
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+CE11+GT56+GH32+GT4+GT28+GT19+GH8+CE4+GH77+GH19+GT26+GH68+CBM32+GT20+GH37',   # lonsdalea
        'GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28+GH103+GT84+GH94+CE11+GT56+GH32+GH102+CE9+GT4+GT28+GT19+GH73+GT30+GT5+GT35+GT26+GT0+GT81+GH68+GH4+GT8+CE12', # bren
    ],
}
for grp in grp_order:
    grps = []
    for fams_str in grp_order[grp]:
        fams_list = fams_str.split("+")
        fams_list.sort()
        fams = "+".join(fams_list)
        grps.append(fams)
    grp_order[grp] = grps


paper_cooccurring_fams = {}  # {grp_num: {'fams': {fams}, 'freqs': {int}}}
num_of_grp = 0

for pheno_grp in grp_order:
    for fam_grp in grp_order[pheno_grp]:
        fams = fam_grp.split("+")
        fams.sort()
        
        for genus in ['Pectobacterium', 'Dickeya', 'Musicola', 'Soft tissue targeting', 'Hard tissue targeting', 'Brenneria', 'Lonsdalea']:
            # {grp num: {'fams': {fams}, 'freqs': {int}}}
            
            for grp_num in genera_cooccuring_fams[genus]:
                grp_fams = list(genera_cooccuring_fams[genus][grp_num]['fams'])
                grp_fams.sort()
                
                if grp_fams == fams:
                    
                    this_grp_num = None
                    
                    for co_grp_num in paper_cooccurring_fams:
                        if paper_cooccurring_fams[co_grp_num]['fams'] == genera_cooccuring_fams[genus][grp_num]['fams']:
                            this_grp_num = co_grp_num
                            
                    if this_grp_num is None:
                        this_grp_num = copy(num_of_grp)
                    
                        paper_cooccurring_fams[this_grp_num] = {
                            'fams': genera_cooccuring_fams[genus][grp_num]['fams'],
                            'freqs': genera_cooccuring_fams[genus][grp_num]['freqs']
                        }
                        
                        num_of_grp += 1


upsetplot_membership = []
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, paper_cooccurring_fams)
len(upsetplot_membership)

5076


pectobact_upsetplot = build_upsetplot(
    upsetplot_membership,
    file_path='../results/cooccurring_families/paper-pecto-cooccurring-families.svg',
    sort_by='input',
)


paper_cooccurring_freqs = []  # [fams, genus/grp, incidence/freq]
num_of_grp = 0

for grp_name in grp_order:
    for fams in grp_order[grp_name]:
        fams = fams.split("+")
        fams.sort()

        for genus in ['Soft tissue targeting', 'Pectobacterium', 'Dickeya', 'Musicola', 'Hard tissue targeting', 'Brenneria', 'Lonsdalea']:
            # {grp num: {'fams': {fams}, 'freqs': {int}}}

            for grp_num in genera_cooccuring_fams[genus]:
                grp_fams = list(genera_cooccuring_fams[genus][grp_num]['fams'])
                grp_fams.sort()

                if grp_fams == fams:
                    # found fams in genus

                    paper_cooccurring_freqs.append(
                        [
                            genera_cooccuring_fams[genus][grp_num]['fams'],
                            genus,
                            list(genera_cooccuring_fams[genus][grp_num]['freqs'])[0],
                        ]
                    )


# build the dataframe
cooccurring_fams_freq_df = build_upsetplot_matrix(
    paper_cooccurring_freqs,
    'Genus',
    file_path='../results/cooccurring_families/paper-cooccurring_fams_freqs.csv',
)
cooccurring_fams_freq_df


# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/pca/'), force=True, nodelete=True)

Output directory ../results/pca exists, nodelete is True. Adding output to output directory.


num_of_components = len(fam_freq_filtered_df_ggs.columns)
pectobact_pca, X_scaled = perform_pca(fam_freq_filtered_df_ggs, num_of_components)
pectobact_pca

PCA(n_components=117)

PCA(n_components=117)


print(
    f"{round(pectobact_pca.explained_variance_ratio_.sum() * 100, 2)}% "
    "of the variance in the data set was catpured by the PCA"
)

cumExpVar = plot_explained_variance(
    pectobact_pca,
    num_of_components,
    file_path="../results/pca/pca_explained_variance.png",
)

100.0% of the variance in the data set was catpured by the PCA
Number of features needed to explain 0.95 fraction of total variance is 59.


plot_scree(pectobact_pca, nComp=10, file_path="../results/pca/pectobact_pca_scree.png")

Explained variance for 1PC: 0.1578167077631129
Explained variance for 2PC: 0.11714531745192669
Explained variance for 3PC: 0.05535353923303329
Explained variance for 4PC: 0.04804711615760438
Explained variance for 5PC: 0.04031600741870654
Explained variance for 6PC: 0.02931399017337997
Explained variance for 7PC: 0.02766383466136205
Explained variance for 8PC: 0.021653599705441385
Explained variance for 9PC: 0.02118856319918563
Explained variance for 10PC: 0.019638464318788077


fam_freq_filtered_df_ggs['Genus'] = list(fam_freq_filtered_df['Genus'])
fam_freq_filtered_df_ggs['Species'] = list(fam_freq_filtered_df['Species'])

X_pca = pectobact_pca.transform(X_scaled)

fam_freq_df_ggs_pc = copy(fam_freq_filtered_df_ggs)
colnames = []
for i in range(4):
    fam_freq_df_ggs_pc[f'PC{i+1} ({round(pectobact_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(pectobact_pca.explained_variance_ratio_[i] * 100, 2)}%)')

g = sns.pairplot(
    fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Genus",
    diag_kind="kde",
    markers=['o','X', '^', 'P', 'v', 'D', '<', 's'],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/pca/pca_pc_screen_genus.svg',
    bbox_inches='tight',
    format='svg'
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)


g = sns.pairplot(
    fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Species",
    diag_kind="kde",
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=6, title='Species', frameon=False);
    
plt.savefig(
    '../results/pca/pca_pc_screen_species.svg',
    bbox_inches='tight',
    format='svg'
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)


# pick out some stories and colour code the species plot accordingly
# pick out some stories and colour code the species plot accordingly
species_of_interest = [
    'aquatica',
    'dianthicola',
    'oryzae',
    'zeae',
    'britannica',
    'populi',
    'aquaticum',
    'carotovorum',
    'parmentieri',
    'quasiaquaticum',
]

# Already have geus and species columns added to fam_freq_filtered_df_ggs
markers_dict = {
    'Acerihabitans': 's',
    'Affinibrenneria': '>',
    'Brenneria': '^',
    'Dickeya': 'X',
    'Lonsdalea': 'P',
    'Musicola': 'v',
    'Pectobacterium': 'o',
    'Samsonia': 'D',
}

# build lists for species and hue colouring and markers
new_sp_col = []

for i in range(len(fam_freq_filtered_df_ggs['Species'])):
    sp = fam_freq_filtered_df_ggs.iloc[i]['Species'].split(" ")[0]
    if sp in species_of_interest:
        new_sp_col.append(f"{fam_freq_filtered_df_ggs.iloc[i]['Genus']} {sp}")
    else:
        new_sp_col.append(f'{fam_freq_filtered_df_ggs.iloc[i]["Genus"]} sp.')

fam_freq_df_ggs_pc['Selected_Species'] = new_sp_col

colours = [
    "#808080", "#D3D3D3", "#4ce0e6", "#e391be",
    "#13add4",
    "#550fa6",
    "#E5E4E2",
    "#c99c14",
    "#708090",
    "#f5d018",  # dark green PC 1a4a07
    "#0c1669",
    "#1a4a07",
    "#d41320",
    "#899499",
    "#71797E",
    "#636263",
    "#262626",
    "#000000"
]
# Set custom color palette
sns.set_palette(sns.color_palette(colours))

g = sns.pairplot(
    fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Selected_Species",
    diag_kind="kde",
    height=3,
    markers = [
        'X', 'o', '^',
        'X', 'X', 'o',
        'X', 'o', 'o',
        'v', 'X', 'D',
        'P', 'P', 'P',
        'o', 's', '>',
    ],
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/pca/pca_pc_screen_species_selected.svg',
    bbox_inches='tight',
    format='svg'
)

/home/em/anaconda3/envs/cazomevolve/lib/python3.11/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)


pc_pair = (1,2)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)

for job in ['genus', 'species', 'loadings']:
    if job == 'genus':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Genus',
            style='Genus',
            file_path=out,
        );
        time.sleep(2)

    elif job == 'species':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Species',
            style='Species',
            file_path=out,
        );
        time.sleep(2)

    elif job == 'loadings':  # loadings plot
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
        g = plot_loadings(
            pectobact_pca,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            threshold=0.3,
            fig_size=(10,10),
            file_path=out,
            font_size=11,
        );

Output directory ../results/pca/PC1-vs-PC2 exists, nodelete is True. Adding output to output directory.

PC1 vs PC2 - plotting Genera
Not applying hue order
Applying style
Not applying style order
PC1 vs PC2 - plotting Species
Not applying hue order
Applying style
Not applying style order
PC1 vs PC2 - Loadings plot


X_pca = pectobact_pca.transform(X_scaled)
plt.figure(figsize=(15,15))
sns.set(font_scale=1.15)
g = sns.scatterplot(
    x=X_pca[:,0],
    y=X_pca[:,1],
    data=fam_freq_filtered_df_ggs,
    hue='Genus',
    style='Genus',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC2 {100 * pectobact_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pectobact_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);

genome_lbls = ["-".join(_) for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((xval > 2) and (yval < 3.5) and (yval > 0) and (xval < 4)) or ((xval > 0.1) and (xval < 2.5) and (yval < 0))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pca_pc1_vs_pc2_musicola_annotated.png', bbox_inches='tight', format='png')


pc_pair = (1,3)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)

for job in ['genus', 'species', 'loadings']:
    if job == 'genus':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Genus',
            style='Genus',
            file_path=out,
            figsize=(10,8),
        );
        time.sleep(2)

    elif job == 'species':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Species',
            style='Species',
            file_path=out,
            figsize=(10,8),
        );
        time.sleep(2)

    elif job == 'loadings':  # loadings plot
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
        g = plot_loadings(
            pectobact_pca,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            threshold=0.3,
            fig_size=(10,10),
            file_path=out,
            font_size=11,
        );

Output directory ../results/pca/PC1-vs-PC3 exists, nodelete is True. Adding output to output directory.

PC1 vs PC3 - plotting Genera
Not applying hue order
Applying style
Not applying style order
PC1 vs PC3 - plotting Species
Not applying hue order
Applying style
Not applying style order
PC1 vs PC3 - Loadings plot


pc_pair = (1,4)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)

for job in ['genus', 'species', 'loadings']:
    if job == 'genus':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Genus',
            style='Genus',
            file_path=out,
            figsize=(10,8),
        );
        time.sleep(2)

    elif job == 'species':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Species',
            style='Species',
            file_path=out,
            figsize=(10,8),
        );
        time.sleep(2)

    elif job == 'loadings':  # loadings plot
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
        g = plot_loadings(
            pectobact_pca,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            threshold=0.3,
            fig_size=(10,10),
            file_path=out,
            font_size=11,
        );

Output directory ../results/pca/PC1-vs-PC4 exists, nodelete is True. Adding output to output directory.

PC1 vs PC4 - plotting Genera
Not applying hue order
Applying style
Not applying style order
PC1 vs PC4 - plotting Species
Not applying hue order
Applying style
Not applying style order
PC1 vs PC4 - Loadings plot


pc_pair = (2,3)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)

for job in ['genus', 'species', 'loadings']:
    if job == 'genus':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Genus',
            style='Genus',
            file_path=out,
            figsize=(8,10),
        );
        time.sleep(2)

    elif job == 'species':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Species',
            style='Species',
            file_path=out,
            figsize=(8,10),
        );
        time.sleep(2)

    elif job == 'loadings':  # loadings plot
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
        g = plot_loadings(
            pectobact_pca,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            threshold=0.3,
            fig_size=(10,10),
            file_path=out,
            font_size=11,
        );

Output directory ../results/pca/PC2-vs-PC3 exists, nodelete is True. Adding output to output directory.

PC2 vs PC3 - plotting Genera
Not applying hue order
Applying style
Not applying style order
PC2 vs PC3 - plotting Species
Not applying hue order
Applying style
Not applying style order
PC2 vs PC3 - Loadings plot


pc_pair = (2,4)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)

for job in ['genus', 'species', 'loadings']:
    if job == 'genus':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Genus',
            style='Genus',
            file_path=out,
            figsize=(10,10),
        );
        time.sleep(2)

    elif job == 'species':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Species',
            style='Species',
            file_path=out,
            figsize=(10,10),
        );
        time.sleep(2)

    elif job == 'loadings':  # loadings plot
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
        g = plot_loadings(
            pectobact_pca,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            threshold=0.3,
            fig_size=(10,10),
            file_path=out,
            font_size=11,
        );

Output directory ../results/pca/PC2-vs-PC4 exists, nodelete is True. Adding output to output directory.

PC2 vs PC4 - plotting Genera
Not applying hue order
Applying style
Not applying style order
PC2 vs PC4 - plotting Species
Not applying hue order
Applying style
Not applying style order
PC2 vs PC4 - Loadings plot


pc_pair = (3,4)
output_dir = Path(f'../results/pca/PC{pc_pair[0]}-vs-PC{pc_pair[1]}')
make_output_directory(output_dir, force=True, nodelete=True)

for job in ['genus', 'species', 'loadings']:
    if job == 'genus':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Genera")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-genus.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Genus',
            style='Genus',
            file_path=out,
        );
        time.sleep(2)

    elif job == 'species':
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - plotting Species")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-species.png'
        g = plot_pca(
            pectobact_pca,
            X_scaled,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            'Species',
            style='Species',
            file_path=out,
        );
        time.sleep(2)

    elif job == 'loadings':  # loadings plot
        print(f"PC{pc_pair[0]} vs PC{pc_pair[1]} - Loadings plot")
        out = output_dir/f'pca_pc{pc_pair[0]}_vs_pc{pc_pair[1]}-loadings_plot.png'
        g = plot_loadings(
            pectobact_pca,
            fam_freq_filtered_df_ggs,
            pc_pair[0],
            pc_pair[1],
            threshold=0.3,
            fig_size=(10,10),
            file_path=out,
            font_size=11,
        );

Output directory ../results/pca/PC3-vs-PC4 exists, nodelete is True. Adding output to output directory.

PC3 vs PC4 - plotting Genera
Not applying hue order
Applying style
Not applying style order
PC3 vs PC4 - plotting Species
Not applying hue order
Applying style
Not applying style order
PC3 vs PC4 - Loadings plot


X_pca = pectobact_pca.transform(X_scaled)

plt.figure(figsize=(10,7.5))
sns.set(font_scale=1.15)

temp_d_fam_freq_df_ggs = fam_freq_filtered_df_ggs[fam_freq_filtered_df_ggs['Genus'] == 'Pectobacterium']

g = sns.scatterplot(
    x=X_pca[:,2][fam_freq_filtered_df_ggs['Genus'] == 'Pectobacterium'],
    y=X_pca[:,3][fam_freq_filtered_df_ggs['Genus'] == 'Pectobacterium'],
    data=temp_d_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC4 {100 * pectobact_pca.explained_variance_ratio_[3]:.2f}%");
plt.xlabel(f"PC3 {100 * pectobact_pca.explained_variance_ratio_[2]:.2f}%");

plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=4, title='Species', frameon=False);

plt.savefig(
    '../results/pca/p_pc3_pc4_pectobacterium_sp.svg',
    bbox_inches='tight',
    format='svg',
)

	Genus	MeanProteomeSize	SdProteomeSize	MeanProteomePerc	SdProteomePerc	NumOfGenomes
0	Pectobacterium	4260.71	216.78	2.64	0.15	432
1	Dickeya	4176.86	155.23	2.66	0.11	206
2	Musicola	3992.00	55.76	2.31	0.05	4
3	Brenneria	4270.24	478.85	2.07	0.15	33
4	Lonsdalea	3142.28	132.55	2.45	0.09	39
5	Acerihabitans	4969.00	0.00	2.13	0.00	1
6	Affinibrenneria	5064.00	0.00	2.13	0.00	1
7	Samsonia	3489.00	0.00	2.32	0.00	1

	Genus	MeanProteomeSize	SdProteomeSize	MeanProteomePerc	SdProteomePerc	NumOfGenomes
0	Pectobacterium	4260.71	216.78	2.64	0.15	432.00
1	Dickeya	4176.86	155.23	2.66	0.11	206.00
2	Musicola	3992.00	55.76	2.31	0.05	4.00
3	Brenneria	4270.24	478.85	2.07	0.15	33.00
4	Lonsdalea	3142.28	132.55	2.45	0.09	39.00
5	Acerihabitans	4969.00	0.00	2.13	0.00	1.00
6	Affinibrenneria	5064.00	0.00	2.13	0.00	1.00
7	Samsonia	3489.00	0.00	2.32	0.00	1.00
8	Pectobacteriaceae	4170.51	129.90	2.34	0.07	89.62

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	540.4513	0.0	456.7338	624.1688	True
1	Hard	Unknown	298.8133	0.1144	-53.124	650.7507	False
2	Soft	Unknown	-241.6379	0.2255	-585.623	102.3472	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	0.3753	0.0	0.3331	0.4175	True
1	Hard	Unknown	0.0371	0.8757	-0.1404	0.2145	False
2	Soft	Unknown	-0.3382	0.0	-0.5117	-0.1648	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	29.4861	0.0	27.2598	31.7125	True
1	Hard	Unknown	9.57	0.0437	0.2108	18.9292	True
2	Soft	Unknown	-19.9161	0.0	-29.0639	-10.7684	True

	Family	Genome	Protein
0	CBM50	GCA_003382565.3	UEM40323.1
1	GT35	GCA_003382565.3	UEM39157.1
2	GH5	GCA_003382565.3	UEM41238.1

	Genome	Genus	Species
0	GCA_922021645.1	Pectobacterium	versatile
1	GCA_004296685.1	Pectobacterium	versatile
2	GCA_018094705.1	Pectobacterium	versatile

	Genus	MeanProteins	SdProteins	NumOfGenomes
0	Pectobacterium	112.65	8.02	432
1	Dickeya	111.16	6.60	206
2	Musicola	92.25	2.28	4
3	Brenneria	87.79	7.46	33
4	Lonsdalea	77.15	4.70	39
5	Acerihabitans	106.00	0.00	1
6	Affinibrenneria	108.00	0.00	1
7	Samsonia	81.00	0.00	1

	Genus	MeanProteins	SdProteins	NumOfGenomes
0	Pectobacterium	112.650000	8.020000	432
1	Dickeya	111.160000	6.600000	206
2	Musicola	92.250000	2.280000	4
3	Brenneria	87.790000	7.460000	33
4	Lonsdalea	77.150000	4.700000	39
5	Acerihabitans	106.000000	0.000000	1
6	Affinibrenneria	108.000000	0.000000	1
7	Samsonia	81.000000	0.000000	1
0	Pectobacteriaceae	108.970711	11.958225	717

	Genus	MeanFamilys	SdFamilys	NumOfGenomes
0	Pectobacterium	62.08	3.46	432
1	Dickeya	59.05	3.04	206
2	Musicola	50.25	0.43	4
3	Brenneria	53.67	3.71	33
4	Lonsdalea	42.62	2.14	39
5	Acerihabitans	48.00	0.00	1
6	Affinibrenneria	48.00	0.00	1
7	Samsonia	49.00	0.00	1

	Genus	MeanFamilys	SdFamilys	NumOfGenomes
0	Pectobacterium	62.080000	3.460000	432
1	Dickeya	59.050000	3.040000	206
2	Musicola	50.250000	0.430000	4
3	Brenneria	53.670000	3.710000	33
4	Lonsdalea	42.620000	2.140000	39
5	Acerihabitans	48.000000	0.000000	1
6	Affinibrenneria	48.000000	0.000000	1
7	Samsonia	49.000000	0.000000	1
0	Pectobacteriaceae	59.638773	5.733681	717

	Genus	MeanCAZymeToFamRatio	SdCAZymeToFamRatio	NumOfGenomes
0	Pectobacterium	1.81	0.09	432
1	Dickeya	1.88	0.07	206
2	Musicola	1.84	0.04	4
3	Brenneria	1.64	0.07	33
4	Lonsdalea	1.81	0.07	39
5	Acerihabitans	2.21	0.00	1
6	Affinibrenneria	2.25	0.00	1
7	Samsonia	1.65	0.00	1

	Genus	MeanCAZymeToFamRatio	SdCAZymeToFamRatio	NumOfGenomes
0	Pectobacterium	1.810000	0.09000	432
1	Dickeya	1.880000	0.07000	206
2	Musicola	1.840000	0.04000	4
3	Brenneria	1.640000	0.07000	33
4	Lonsdalea	1.810000	0.07000	39
5	Acerihabitans	2.210000	0.00000	1
6	Affinibrenneria	2.250000	0.00000	1
7	Samsonia	1.650000	0.00000	1
0	Pectobacteriaceae	1.826642	0.09796	717

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	95.0	1.0	-839.9267	1029.9267	False
1	Acerihabitans	Brenneria	-698.7576	0.0344	-1369.7924	-27.7227	True
2	Acerihabitans	Dickeya	-792.1408	0.0072	-1454.8365	-129.4451	True
3	Acerihabitans	Lonsdalea	-1826.7179	0.0	-2496.2329	-1157.203	True
4	Acerihabitans	Musicola	-977.0	0.0017	-1716.1245	-237.8755	True
5	Acerihabitans	Pectobacterium	-708.287	0.0262	-1370.1448	-46.4293	True
6	Acerihabitans	Samsonia	-1480.0	0.0	-2414.9267	-545.0733	True
7	Affinibrenneria	Brenneria	-793.7576	0.0083	-1464.7924	-122.7227	True
8	Affinibrenneria	Dickeya	-887.1408	0.0014	-1549.8365	-224.4451	True
9	Affinibrenneria	Lonsdalea	-1921.7179	0.0	-2591.2329	-1252.203	True
10	Affinibrenneria	Musicola	-1072.0	0.0003	-1811.1245	-332.8755	True
11	Affinibrenneria	Pectobacterium	-803.287	0.0059	-1465.1448	-141.4293	True
12	Affinibrenneria	Samsonia	-1575.0	0.0	-2509.9267	-640.0733	True
13	Brenneria	Dickeya	-93.3832	0.3004	-217.3402	30.5738	False
14	Brenneria	Lonsdalea	-1127.9604	0.0	-1284.3254	-971.5954	True
15	Brenneria	Musicola	-278.2424	0.2348	-628.2492	71.7644	False
16	Brenneria	Pectobacterium	-9.5295	1.0	-128.9256	109.8667	False
17	Brenneria	Samsonia	-781.2424	0.0101	-1452.2773	-110.2076	True
18	Dickeya	Lonsdalea	-1034.5772	0.0	-1150.0234	-919.131	True
19	Dickeya	Musicola	-184.8592	0.6978	-518.5995	148.881	False
20	Dickeya	Pectobacterium	83.8537	0.0002	27.8783	139.8292	True
21	Dickeya	Samsonia	-687.8592	0.0355	-1350.5549	-25.1635	True
22	Lonsdalea	Musicola	849.7179	0.0	502.634	1196.8019	True
23	Lonsdalea	Pectobacterium	1118.4309	0.0	1007.8962	1228.9657	True
24	Lonsdalea	Samsonia	346.7179	0.7657	-322.797	1016.2329	False
25	Musicola	Pectobacterium	268.713	0.2146	-63.3603	600.7863	False
26	Musicola	Samsonia	-503.0	0.4363	-1242.1245	236.1245	False
27	Pectobacterium	Samsonia	-771.713	0.0099	-1433.5707	-109.8552	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	-0.0005	1.0	-0.5762	0.5751	False
1	Acerihabitans	Brenneria	-0.0662	0.9997	-0.4793	0.347	False
2	Acerihabitans	Dickeya	0.5273	0.0024	0.1192	0.9353	True
3	Acerihabitans	Lonsdalea	0.3215	0.2572	-0.0907	0.7337	False
4	Acerihabitans	Musicola	0.1777	0.9356	-0.2774	0.6327	False
5	Acerihabitans	Pectobacterium	0.5105	0.0038	0.103	0.918	True
6	Acerihabitans	Samsonia	0.1884	0.9752	-0.3873	0.764	False
7	Affinibrenneria	Brenneria	-0.0656	0.9997	-0.4788	0.3475	False
8	Affinibrenneria	Dickeya	0.5278	0.0023	0.1198	0.9358	True
9	Affinibrenneria	Lonsdalea	0.322	0.2553	-0.0902	0.7343	False
10	Affinibrenneria	Musicola	0.1782	0.9347	-0.2769	0.6333	False
11	Affinibrenneria	Pectobacterium	0.511	0.0037	0.1035	0.9185	True
12	Affinibrenneria	Samsonia	0.1889	0.9748	-0.3868	0.7645	False
13	Brenneria	Dickeya	0.5934	0.0	0.5171	0.6697	True
14	Brenneria	Lonsdalea	0.3877	0.0	0.2914	0.484	True
15	Brenneria	Musicola	0.2438	0.0142	0.0283	0.4593	True
16	Brenneria	Pectobacterium	0.5767	0.0	0.5031	0.6502	True
17	Brenneria	Samsonia	0.2545	0.5702	-0.1586	0.6677	False
18	Dickeya	Lonsdalea	-0.2057	0.0	-0.2768	-0.1347	True
19	Dickeya	Musicola	-0.3496	0.0	-0.5551	-0.1441	True
20	Dickeya	Pectobacterium	-0.0168	0.8188	-0.0512	0.0177	False
21	Dickeya	Samsonia	-0.3389	0.1866	-0.7469	0.0691	False
22	Lonsdalea	Musicola	-0.1439	0.4512	-0.3576	0.0698	False
23	Lonsdalea	Pectobacterium	0.189	0.0	0.1209	0.257	True
24	Lonsdalea	Samsonia	-0.1332	0.9769	-0.5454	0.2791	False
25	Musicola	Pectobacterium	0.3328	0.0	0.1284	0.5373	True
26	Musicola	Samsonia	0.0107	1.0	-0.4444	0.4658	False
27	Pectobacterium	Samsonia	-0.3221	0.2413	-0.7296	0.0854	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	2.0	1.0	-30.0935	34.0935	False
1	Acerihabitans	Brenneria	-18.2121	0.2411	-41.2469	4.8227	False
2	Acerihabitans	Dickeya	5.1602	0.9973	-17.5883	27.9087	False
3	Acerihabitans	Lonsdalea	-28.8462	0.0037	-51.8288	-5.8635	True
4	Acerihabitans	Musicola	-13.75	0.7211	-39.1221	11.6221	False
5	Acerihabitans	Pectobacterium	6.6458	0.987	-16.0739	29.3656	False
6	Acerihabitans	Samsonia	-25.0	0.2588	-57.0935	7.0935	False
7	Affinibrenneria	Brenneria	-20.2121	0.1344	-43.2469	2.8227	False
8	Affinibrenneria	Dickeya	3.1602	0.9999	-19.5883	25.9087	False
9	Affinibrenneria	Lonsdalea	-30.8462	0.0013	-53.8288	-7.8635	True
10	Affinibrenneria	Musicola	-15.75	0.5603	-41.1221	9.6221	False
11	Affinibrenneria	Pectobacterium	4.6458	0.9986	-18.0739	27.3656	False
12	Affinibrenneria	Samsonia	-27.0	0.1735	-59.0935	5.0935	False
13	Brenneria	Dickeya	23.3723	0.0	19.1172	27.6274	True
14	Brenneria	Lonsdalea	-10.634	0.0	-16.0016	-5.2664	True
15	Brenneria	Musicola	4.4621	0.9504	-7.5527	16.4769	False
16	Brenneria	Pectobacterium	24.858	0.0	20.7594	28.9565	True
17	Brenneria	Samsonia	-6.7879	0.9864	-29.8227	16.2469	False
18	Dickeya	Lonsdalea	-34.0063	0.0	-37.9693	-30.0434	True
19	Dickeya	Musicola	-18.9102	0.0	-30.3666	-7.4538	True
20	Dickeya	Pectobacterium	1.4856	0.2679	-0.4358	3.4071	False
21	Dickeya	Samsonia	-30.1602	0.0016	-52.9087	-7.4117	True
22	Lonsdalea	Musicola	15.0962	0.0032	3.1817	27.0106	True
23	Lonsdalea	Pectobacterium	35.492	0.0	31.6976	39.2863	True
24	Lonsdalea	Samsonia	3.8462	0.9996	-19.1365	26.8288	False
25	Musicola	Pectobacterium	20.3958	0.0	8.9967	31.795	True
26	Musicola	Samsonia	-11.25	0.88	-36.6221	14.1221	False
27	Pectobacterium	Samsonia	-31.6458	0.0007	-54.3656	-8.9261	True

	CAZyClass	Genus	MeanCazyClass	SdCazyClass	MeanClassPerc	SdClassPerc	NumOfGenomes
0	GH	Lonsdalea	30.85	2.28	39.96	1.31	39
1	GH	Samsonia	34.00	0.00	41.98	0.00	1
2	GH	Pectobacterium	50.11	3.91	44.50	1.87	432
3	GH	Musicola	37.00	0.00	40.13	0.99	4
4	GH	Brenneria	42.48	6.60	48.14	4.19	33
5	GH	Affinibrenneria	59.00	0.00	54.63	0.00	1
6	GH	Dickeya	42.53	3.55	38.24	1.85	206
7	GH	Acerihabitans	51.00	0.00	48.11	0.00	1
8	GT	Lonsdalea	32.46	2.30	42.07	1.24	39
9	GT	Samsonia	25.00	0.00	30.86	0.00	1
10	GT	Pectobacterium	31.76	3.86	28.15	2.23	432
11	GT	Musicola	31.00	3.00	33.55	2.44	4
12	GT	Brenneria	32.30	2.55	36.90	2.61	33
13	GT	Affinibrenneria	35.00	0.00	32.41	0.00	1
14	GT	Dickeya	37.21	3.12	33.52	2.62	206
15	GT	Acerihabitans	44.00	0.00	41.51	0.00	1
16	PL	Lonsdalea	3.79	0.56	4.92	0.72	39
17	PL	Samsonia	8.00	0.00	9.88	0.00	1
18	PL	Pectobacterium	14.78	1.36	13.14	0.97	432
19	PL	Musicola	11.25	0.43	12.19	0.33	4
20	PL	Brenneria	4.24	1.23	4.81	1.29	33
21	PL	Affinibrenneria	1.00	0.00	0.93	0.00	1
22	PL	Dickeya	16.29	1.72	14.63	1.19	206
23	PL	Acerihabitans	1.00	0.00	0.94	0.00	1
24	CE	Lonsdalea	3.15	0.48	4.09	0.57	39
25	CE	Samsonia	8.00	0.00	9.88	0.00	1
26	CE	Pectobacterium	7.12	0.83	6.33	0.68	432
27	CE	Musicola	6.00	0.00	6.51	0.16	4
28	CE	Brenneria	4.30	1.06	4.93	1.24	33
29	CE	Affinibrenneria	7.00	0.00	6.48	0.00	1
30	CE	Dickeya	7.16	0.80	6.44	0.60	206
31	CE	Acerihabitans	5.00	0.00	4.72	0.00	1
32	AA	Lonsdalea	0.00	0.00	0.00	0.00	39
33	AA	Samsonia	1.00	0.00	1.23	0.00	1
34	AA	Pectobacterium	1.03	0.16	0.91	0.17	371
35	AA	Musicola	0.00	0.00	0.00	0.00	4
36	AA	Brenneria	1.00	0.00	1.27	0.06	8
37	AA	Affinibrenneria	0.00	0.00	0.00	0.00	1
38	AA	Dickeya	1.00	0.00	0.90	0.06	85
39	AA	Acerihabitans	0.00	0.00	0.00	0.00	1

	sum_sq	df	F	PR(>F)
C(Genus)	1.372953e+04	7.0	137.428160	4.378316e-183
C(CAZyClass)	1.113669e+06	5.0	15606.446141	0.000000e+00
C(Genus):C(CAZyClass)	2.400325e+04	35.0	48.052939	1.698934e-276
Residual	6.071269e+04	4254.0	NaN	NaN

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	8.0	0.9884	-19.8655	35.8655	False
1	Acerihabitans	Brenneria	-8.5152	0.901	-28.5153	11.485	False
2	Acerihabitans	Dickeya	-7.6699	0.9374	-27.4215	12.0817	False
3	Acerihabitans	Lonsdalea	-20.1538	0.0458	-40.1087	-0.199	True
4	Acerihabitans	Musicola	-14.0	0.5293	-36.0296	8.0296	False
5	Acerihabitans	Pectobacterium	-0.1551	1.0	-19.8818	19.5716	False
6	Acerihabitans	Samsonia	-17.0	0.5827	-44.8655	10.8655	False
7	Affinibrenneria	Brenneria	-16.5152	0.1927	-36.5153	3.485	False
8	Affinibrenneria	Dickeya	-15.6699	0.2371	-35.4215	4.0817	False
9	Affinibrenneria	Lonsdalea	-28.1538	0.0005	-48.1087	-8.199	True
10	Affinibrenneria	Musicola	-22.0	0.0506	-44.0296	0.0296	False
11	Affinibrenneria	Pectobacterium	-8.1551	0.9142	-27.8818	11.5716	False
12	Affinibrenneria	Samsonia	-25.0	0.1161	-52.8655	2.8655	False
13	Brenneria	Dickeya	0.8452	0.9971	-2.8493	4.5398	False
14	Brenneria	Lonsdalea	-11.6387	0.0	-16.2992	-6.9782	True
15	Brenneria	Musicola	-5.4848	0.7514	-15.9168	4.9471	False
16	Brenneria	Pectobacterium	8.3601	0.0	4.8015	11.9187	True
17	Brenneria	Samsonia	-8.4848	0.9027	-28.485	11.5153	False
18	Dickeya	Lonsdalea	-12.4839	0.0	-15.9248	-9.0431	True
19	Dickeya	Musicola	-6.3301	0.5274	-16.2772	3.617	False
20	Dickeya	Pectobacterium	7.5148	0.0	5.8465	9.1832	True
21	Dickeya	Samsonia	-9.3301	0.8403	-29.0817	10.4215	False
22	Lonsdalea	Musicola	6.1538	0.6146	-4.191	16.4987	False
23	Lonsdalea	Pectobacterium	19.9988	0.0	16.7043	23.2932	True
24	Lonsdalea	Samsonia	3.1538	0.9997	-16.801	23.1087	False
25	Musicola	Pectobacterium	13.8449	0.0006	3.9475	23.7423	True
26	Musicola	Samsonia	-3.0	0.9999	-25.0296	19.0296	False
27	Pectobacterium	Samsonia	-16.8449	0.159	-36.5716	2.8818	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	0.0	1.0	-14.1792	14.1792	False
1	Acerihabitans	Brenneria	5.6667	0.6921	-4.5103	15.8437	False
2	Acerihabitans	Dickeya	11.0485	0.0197	0.998	21.0991	True
3	Acerihabitans	Lonsdalea	-5.3846	0.743	-15.5386	4.7693	False
4	Acerihabitans	Musicola	2.25	0.9987	-8.9597	13.4597	False
5	Acerihabitans	Pectobacterium	14.0787	0.0006	4.0409	24.1165	True
6	Acerihabitans	Samsonia	1.0	1.0	-13.1792	15.1792	False
7	Affinibrenneria	Brenneria	5.6667	0.6921	-4.5103	15.8437	False
8	Affinibrenneria	Dickeya	11.0485	0.0197	0.998	21.0991	True
9	Affinibrenneria	Lonsdalea	-5.3846	0.743	-15.5386	4.7693	False
10	Affinibrenneria	Musicola	2.25	0.9987	-8.9597	13.4597	False
11	Affinibrenneria	Pectobacterium	14.0787	0.0006	4.0409	24.1165	True
12	Affinibrenneria	Samsonia	1.0	1.0	-13.1792	15.1792	False
13	Brenneria	Dickeya	5.3819	0.0	3.5019	7.2618	True
14	Brenneria	Lonsdalea	-11.0513	0.0	-13.4227	-8.6798	True
15	Brenneria	Musicola	-3.4167	0.5122	-8.7249	1.8916	False
16	Brenneria	Pectobacterium	8.412	0.0	6.6013	10.2228	True
17	Brenneria	Samsonia	-4.6667	0.86	-14.8437	5.5103	False
18	Dickeya	Lonsdalea	-16.4332	0.0	-18.184	-14.6823	True
19	Dickeya	Musicola	-8.7985	0.0	-13.8601	-3.737	True
20	Dickeya	Pectobacterium	3.0302	0.0	2.1812	3.8791	True
21	Dickeya	Samsonia	-10.0485	0.0501	-20.0991	0.002	False
22	Lonsdalea	Musicola	7.6346	0.0003	2.3707	12.8985	True
23	Lonsdalea	Pectobacterium	19.4633	0.0	17.7869	21.1397	True
24	Lonsdalea	Samsonia	6.3846	0.5434	-3.7693	16.5386	False
25	Musicola	Pectobacterium	11.8287	0.0	6.7924	16.865	True
26	Musicola	Samsonia	-1.25	1.0	-12.4597	9.9597	False
27	Pectobacterium	Samsonia	-13.0787	0.0021	-23.1165	-3.0409	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	13.3936	0.0	12.2593	14.528	True
1	Hard	Unknown	2.5433	0.4226	-2.2253	7.312	False
2	Soft	Unknown	-10.8503	0.0	-15.5112	-6.1894	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	11.7652	0.0	9.5977	13.9327	True
1	Hard	Unknown	0.3467	0.9956	-8.7652	9.4586	False
2	Soft	Unknown	-11.4185	0.0076	-20.3245	-2.5125	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	-9.0	0.9555	-33.7332	15.7332	False
1	Acerihabitans	Brenneria	-11.6667	0.484	-29.4187	6.0853	False
2	Acerihabitans	Dickeya	-6.0437	0.9668	-23.5751	11.4877	False
3	Acerihabitans	Lonsdalea	-11.5385	0.4959	-29.2502	6.1733	False
4	Acerihabitans	Musicola	-13.0	0.4682	-32.5533	6.5533	False
5	Acerihabitans	Pectobacterium	-11.7361	0.4571	-29.2453	5.7731	False
6	Acerihabitans	Samsonia	-19.0	0.2758	-43.7332	5.7332	False
7	Affinibrenneria	Brenneria	-2.6667	0.9998	-20.4187	15.0853	False
8	Affinibrenneria	Dickeya	2.9563	0.9996	-14.5751	20.4877	False
9	Affinibrenneria	Lonsdalea	-2.5385	0.9999	-20.2502	15.1733	False
10	Affinibrenneria	Musicola	-4.0	0.9986	-23.5533	15.5533	False
11	Affinibrenneria	Pectobacterium	-2.7361	0.9998	-20.2453	14.7731	False
12	Affinibrenneria	Samsonia	-10.0	0.9232	-34.7332	14.7332	False
13	Brenneria	Dickeya	5.623	0.0	2.3437	8.9022	True
14	Brenneria	Lonsdalea	0.1282	1.0	-4.0084	4.2648	False
15	Brenneria	Musicola	-1.3333	0.9999	-10.5926	7.926	False
16	Brenneria	Pectobacterium	-0.0694	1.0	-3.228	3.0891	False
17	Brenneria	Samsonia	-7.3333	0.9145	-25.0853	10.4187	False
18	Dickeya	Lonsdalea	-5.4948	0.0	-8.5489	-2.4407	True
19	Dickeya	Musicola	-6.9563	0.2452	-15.7853	1.8727	False
20	Dickeya	Pectobacterium	-5.6924	0.0	-7.1732	-4.2116	True
21	Dickeya	Samsonia	-12.9563	0.3252	-30.4877	4.5751	False
22	Lonsdalea	Musicola	-1.4615	0.9997	-10.6435	7.7204	False
23	Lonsdalea	Pectobacterium	-0.1976	1.0	-3.1218	2.7265	False
24	Lonsdalea	Samsonia	-7.4615	0.9059	-25.1733	10.2502	False
25	Musicola	Pectobacterium	1.2639	0.9999	-7.521	10.0488	False
26	Musicola	Samsonia	-6.0	0.9828	-25.5533	13.5533	False
27	Pectobacterium	Samsonia	-7.2639	0.9127	-24.7731	10.2453	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	1.6085	0.0911	-0.1926	3.4097	False
1	Hard	Unknown	-1.4933	0.8885	-9.065	6.0784	False
2	Soft	Unknown	-3.1019	0.587	-10.5025	4.2987	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	0.0	1.0	-10.5548	10.5548	False
1	Acerihabitans	Brenneria	3.2424	0.8984	-4.3332	10.8181	False
2	Acerihabitans	Dickeya	15.6019	0.0	8.1204	23.0834	True
3	Acerihabitans	Lonsdalea	2.7949	0.9515	-4.7636	10.3534	False
4	Acerihabitans	Musicola	10.25	0.005	1.9057	18.5943	True
5	Acerihabitans	Pectobacterium	14.0162	0.0	6.5442	21.4882	True
6	Acerihabitans	Samsonia	7.0	0.4716	-3.5548	17.5548	False
7	Affinibrenneria	Brenneria	3.2424	0.8984	-4.3332	10.8181	False
8	Affinibrenneria	Dickeya	15.6019	0.0	8.1204	23.0834	True
9	Affinibrenneria	Lonsdalea	2.7949	0.9515	-4.7636	10.3534	False
10	Affinibrenneria	Musicola	10.25	0.005	1.9057	18.5943	True
11	Affinibrenneria	Pectobacterium	14.0162	0.0	6.5442	21.4882	True
12	Affinibrenneria	Samsonia	7.0	0.4716	-3.5548	17.5548	False
13	Brenneria	Dickeya	12.3595	0.0	10.9601	13.7589	True
14	Brenneria	Lonsdalea	-0.4476	0.9945	-2.2128	1.3177	False
15	Brenneria	Musicola	7.0076	0.0	3.0562	10.959	True
16	Brenneria	Pectobacterium	10.7738	0.0	9.4259	12.1217	True
17	Brenneria	Samsonia	3.7576	0.8033	-3.8181	11.3332	False
18	Dickeya	Lonsdalea	-12.8071	0.0	-14.1104	-11.5037	True
19	Dickeya	Musicola	-5.3519	0.0005	-9.1197	-1.5842	True
20	Dickeya	Pectobacterium	-1.5857	0.0	-2.2177	-0.9538	True
21	Dickeya	Samsonia	-8.6019	0.0118	-16.0834	-1.1204	True
22	Lonsdalea	Musicola	7.4551	0.0	3.5367	11.3735	True
23	Lonsdalea	Pectobacterium	11.2213	0.0	9.9735	12.4692	True
24	Lonsdalea	Samsonia	4.2051	0.693	-3.3534	11.7636	False
25	Musicola	Pectobacterium	3.7662	0.048	0.0173	7.5151	True
26	Musicola	Samsonia	-3.25	0.9364	-11.5943	5.0943	False
27	Pectobacterium	Samsonia	-7.0162	0.0837	-14.4882	0.4558	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	11.5549	0.0	10.8223	12.2874	True
1	Hard	Unknown	7.2767	0.0	4.1971	10.3562	True
2	Soft	Unknown	-4.2782	0.0026	-7.2882	-1.2683	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	2.0	0.8959	-2.6483	6.6483	False
1	Acerihabitans	Brenneria	-0.697	0.9984	-4.0332	2.6393	False
2	Acerihabitans	Dickeya	2.2767	0.4155	-1.0181	5.5715	False
3	Acerihabitans	Lonsdalea	-1.8462	0.6964	-5.1749	1.4826	False
4	Acerihabitans	Musicola	1.0	0.9916	-2.6748	4.6748	False
5	Acerihabitans	Pectobacterium	2.2176	0.4497	-1.0731	5.5082	False
6	Acerihabitans	Samsonia	3.0	0.5085	-1.6483	7.6483	False
7	Affinibrenneria	Brenneria	-2.697	0.2157	-6.0332	0.6393	False
8	Affinibrenneria	Dickeya	0.2767	1.0	-3.0181	3.5715	False
9	Affinibrenneria	Lonsdalea	-3.8462	0.0111	-7.1749	-0.5174	True
10	Affinibrenneria	Musicola	-1.0	0.9916	-4.6748	2.6748	False
11	Affinibrenneria	Pectobacterium	0.2176	1.0	-3.0731	3.5082	False
12	Affinibrenneria	Samsonia	1.0	0.998	-3.6483	5.6483	False
13	Brenneria	Dickeya	2.9737	0.0	2.3574	3.59	True
14	Brenneria	Lonsdalea	-1.1492	0.0002	-1.9266	-0.3718	True
15	Brenneria	Musicola	1.697	0.062	-0.0432	3.4371	False
16	Brenneria	Pectobacterium	2.9146	0.0	2.3209	3.5082	True
17	Brenneria	Samsonia	3.697	0.018	0.3607	7.0332	True
18	Dickeya	Lonsdalea	-4.1229	0.0	-4.6968	-3.5489	True
19	Dickeya	Musicola	-1.2767	0.2739	-2.936	0.3826	False
20	Dickeya	Pectobacterium	-0.0591	0.9982	-0.3374	0.2192	False
21	Dickeya	Samsonia	0.7233	0.9978	-2.5715	4.0181	False
22	Lonsdalea	Musicola	2.8462	0.0	1.1205	4.5718	True
23	Lonsdalea	Pectobacterium	4.0637	0.0	3.5142	4.6133	True
24	Lonsdalea	Samsonia	4.8462	0.0003	1.5174	8.1749	True
25	Musicola	Pectobacterium	1.2176	0.3279	-0.4334	2.8686	False
26	Musicola	Samsonia	2.0	0.7166	-1.6748	5.6748	False
27	Pectobacterium	Samsonia	0.7824	0.9963	-2.5082	4.0731	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	3.4367	0.0	3.1178	3.7555	True
1	Hard	Unknown	2.2	0.0004	0.8595	3.5405	True
2	Soft	Unknown	-1.2367	0.069	-2.5469	0.0735	False

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Acerihabitans	Affinibrenneria	0.0	1.0	-1.7994	1.7994	False
1	Acerihabitans	Brenneria	0.2424	0.9992	-1.0491	1.5339	False
2	Acerihabitans	Dickeya	0.4126	0.9767	-0.8628	1.6881	False
3	Acerihabitans	Lonsdalea	0.0	1.0	-1.2886	1.2886	False
4	Acerihabitans	Musicola	0.0	1.0	-1.4225	1.4225	False
5	Acerihabitans	Pectobacterium	0.8843	0.4092	-0.3896	2.1581	False
6	Acerihabitans	Samsonia	1.0	0.6942	-0.7994	2.7994	False
7	Affinibrenneria	Brenneria	0.2424	0.9992	-1.0491	1.5339	False
8	Affinibrenneria	Dickeya	0.4126	0.9767	-0.8628	1.6881	False
9	Affinibrenneria	Lonsdalea	0.0	1.0	-1.2886	1.2886	False
10	Affinibrenneria	Musicola	0.0	1.0	-1.4225	1.4225	False
11	Affinibrenneria	Pectobacterium	0.8843	0.4092	-0.3896	2.1581	False
12	Affinibrenneria	Samsonia	1.0	0.6942	-0.7994	2.7994	False
13	Brenneria	Dickeya	0.1702	0.3721	-0.0684	0.4088	False
14	Brenneria	Lonsdalea	-0.2424	0.2197	-0.5434	0.0585	False
15	Brenneria	Musicola	-0.2424	0.958	-0.9161	0.4312	False
16	Brenneria	Pectobacterium	0.6418	0.0	0.412	0.8716	True
17	Brenneria	Samsonia	0.7576	0.6317	-0.5339	2.0491	False
18	Dickeya	Lonsdalea	-0.4126	0.0	-0.6348	-0.1904	True
19	Dickeya	Musicola	-0.4126	0.5148	-1.055	0.2297	False
20	Dickeya	Pectobacterium	0.4716	0.0	0.3639	0.5794	True
21	Dickeya	Samsonia	0.5874	0.8573	-0.6881	1.8628	False
22	Lonsdalea	Musicola	0.0	1.0	-0.668	0.668	False
23	Lonsdalea	Pectobacterium	0.8843	0.0	0.6715	1.097	True
24	Lonsdalea	Samsonia	1.0	0.2634	-0.2886	2.2886	False
25	Musicola	Pectobacterium	0.8843	0.0008	0.2451	1.5234	True
26	Musicola	Samsonia	1.0	0.392	-0.4225	2.4225	False
27	Pectobacterium	Samsonia	0.1157	1.0	-1.1581	1.3896	False

Explore the CAZome of Pectobacteriaceae genomes¶

GitHub¶

Table of Contents¶

0. Imports¶

Packages¶

Output directory¶

Import Data¶

CAZy family annotations¶

Taxonomy data¶

Compile all data into a single dataframe¶

1. CAZome size¶

Count the number of CAZymes¶

Identify the total number of CAZymes¶

Count the number of CAZy families¶

Calculate ratio of CAZymes to CAZy families¶

Calculate proteome sizes¶

Calculate the percentage of the proteome represented by the CAZome¶

Combine into a single dataframe¶

Test if there are statistically signficant differences between the means¶

Test if there is a difference in proteome size¶

Test if there is a difference in percentage of the proteome in the CAZome¶

Test if there is a difference in number of proteins in the CAZome¶

Test if there is a stat sig. difference in the mean number of CAZy families¶

2. CAZy classes¶

Statistically testing if there is a difference between soft and hard plant tissue targeting species¶

GH¶

GT¶

PL¶

CE¶

AA¶

CBM¶

Explore AA CAZymes in Pectobacteriaceae¶

3. CAZy families¶

CAZy family frequency dataframe¶

Clustermaps¶

Prepare the dataframe of CAZy family frequencies¶

Colour scheme¶

Build a clustermap of CAZy family frequencies¶

Add species classifications¶

Add phenotype classification to clustermap¶

Remove genomes¶

Replot the clustermap¶

Genus specific CAZy families¶

Identify phenotype specific CAZy families¶

Pectobacterium and Dickeya genus-specific families¶

Rare faction curves¶

4. The Core CAZome¶

Genus specific core CAZomes¶

Phenotype specific core CAZomes¶

5. Families that always occur together¶

Using a correlation matrix¶

An iterative approach to identify co-occurring families¶

Genus specific groups of always co-occurring CAZy families¶

Phenotype specific groups of always co-occurring CAZy families¶

Build an upset plot of co-occurring CAZy families¶

Build the upset plot¶

Break down the incidences per genus¶

Generate figure for paper¶

6. Principal Component Analysis (PCA)¶

PCA¶

Explained cumulative variance¶

Variance captured per PC¶

Scatter and loadings plots¶

PCs 1 - 4¶

Individual plots¶

PC1 vs PC2¶

PC1 vs PC3¶

PC1 vs PC4¶

PC2 vs PC3¶

PC2 vs PC4¶

PC3 vs PC4¶

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	0.612	0.0	0.4775	0.7464	True
1	Hard	Unknown	-0.12	0.872	-0.6853	0.4453	False
2	Soft	Unknown	-0.732	0.0055	-1.2845	-0.1794	True

	group1	group2	meandiff	p-adj	lower	upper	reject
0	Hard	Soft	4.4937	0.0	3.9416	5.0457	True
1	Hard	Unknown	1.92	0.1275	-0.4008	4.2408	False
2	Soft	Unknown	-2.5737	0.0215	-4.842	-0.3053	True

	Genome	Genus	Species	AA10	AA3	CBM0	CBM13	CBM18	CBM3	CBM32	...	PL11	PL17	PL2	PL22	PL26	PL3	PL35	PL38	PL4	PL9
0	GCA_009931295.1	Pectobacterium	odoriferum	0	1	0	1	0	1	1	...	0	0	2	1	1	2	0	1	1	2
1	GCA_009874305.1	Dickeya	solani	0	1	0	0	0	0	0	...	0	0	1	1	1	2	0	0	1	3
2	GCA_016944275.1	Pectobacterium	brasiliense	0	1	0	1	0	1	3	...	0	0	2	1	0	2	0	1	1	2
3	GCA_003403135.1	Dickeya	dianthicola	0	0	0	0	0	0	1	...	0	0	1	1	1	2	0	1	2	3
4	GCA_922011735.1	Pectobacterium	versatile	0	1	0	1	0	1	1	...	0	0	2	1	1	2	0	0	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
712	GCA_022747695.1	Dickeya	dianthicola	0	0	0	0	0	0	1	...	0	0	1	1	1	2	0	0	2	3
713	GCA_003096475.1	Pectobacterium	versatile	0	1	0	1	0	1	1	...	0	0	2	1	1	2	0	0	1	2
714	GCA_016415585.1	Pectobacterium	carotovorum	1	0	0	0	0	1	2	...	0	0	2	1	0	2	0	1	1	2
715	GCA_023347555.1	Brenneria	tiliae	0	0	0	0	0	0	0	...	0	0	1	2	1	0	0	0	1	0
716	GCA_000406085.2	Dickeya	sp. CSL RW240	0	0	0	0	0	0	1	...	0	0	1	1	1	2	0	0	2	3

	Genus	GH	GT	CE	PL	AA	CBM
0	Pectobacterium	GH121, GH146, GH18	GT101, GT102, GT11, GT111, GT14, GT24, GT52		PL11	AA10	CBM13
1	Dickeya	GH113, GH148, GH25, GH91	GT97	CE2	PL10		CBM4
2	Brenneria	GH106	GT21		PL17
3	Acerihabitans	GH127, GH15
4	Hard tissue	GH106, GH127, GH140, GH15, GH37, GH39, GH51	GT21		PL17
5	Soft tissue	GH113, GH121, GH146, GH148, GH16, GH18, GH25, ...	GT101, GT102, GT11, GT111, GT14, GT24, GT25, G...	CE2	PL10, PL11, PL35	AA10	CBM0, CBM13, CBM4, CBM91

	Family	Genus	MeanFreq	SdFreq
0	CBM5	Lonsdalea	1.00	0.00
1	CBM50	Lonsdalea	5.79	0.52
2	GH23	Lonsdalea	5.21	1.04
3	GH3	Lonsdalea	1.97	0.28
4	GT2	Lonsdalea	7.03	0.86
...	...	...	...	...
2	GH23	Pectobacteriaceae	6.49	1.55
3	GH3	Pectobacteriaceae	2.49	0.60
4	GT2	Pectobacteriaceae	8.15	2.10
5	GT51	Pectobacteriaceae	3.08	0.37
6	GT9	Pectobacteriaceae	3.70	0.56

	Families	Genus	Incidence
0	PL2+PL22	Pectobacteriaceae	NaN
1	PL2+PL22	Pectobacterium	NaN
2	PL2+PL22	Dickeya	205.0
3	PL2+PL22	Musicola	NaN
4	PL2+PL22	Soft tissue targeting	635.0
...	...	...	...
299	GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28...	Musicola	4.0
300	GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28...	Soft tissue targeting	NaN
301	GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28...	Brenneria	NaN
302	GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28...	Lonsdalea	NaN
303	GH23+GH3+CBM5+CBM50+GT2+GT51+GT9+GH1+GH13+GH28...	Hard tissue targeting	NaN

	Families	Genus	Incidence
0	{CBM48, CE8, GH13}	Soft tissue targeting	635
1	{PL22, PL2}	Soft tissue targeting	635
2	{PL22, PL2}	Dickeya	205
3	{GH148, CBM4}	Soft tissue targeting	8
4	{GH148, CBM4}	Dickeya	8
5	{GT30, PL3}	Dickeya	205
6	{GH88, PL35}	Dickeya	3
7	{GT83, CE11}	Dickeya	204
8	{GH16, GT25}	Dickeya	1
9	{GH5, GH19, PL4}	Dickeya	203
10	{GH154, GH121, GH146}	Soft tissue targeting	1
11	{GH154, GH121, GH146}	Pectobacterium	1
12	{GT56, GH105}	Pectobacterium	425
13	{CBM48, CE8, CE9, GH13}	Pectobacterium	425
14	{GH32, GH102, CE11}	Pectobacterium	425
15	{CBM63, CBM32}	Musicola	2
16	{GH36, GH24}	Musicola	2
17	{GH94, GT84}	Soft tissue targeting	241
18	{GH94, GT84}	Pectobacterium	152
19	{GH94, GT84}	Dickeya	89
20	{GH94, GT84}	Hard tissue targeting	68
21	{GH94, GT84}	Lonsdalea	33
22	{CBM3, GH5}	Pectobacterium	425
23	{CBM3, GH5}	Hard tissue targeting	25
24	{CBM3, GH5}	Brenneria	25
25	{GT30, GH13}	Hard tissue targeting	74
26	{GH73, GT0, GH1}	Hard tissue targeting	74
27	{GT35, GT8, GT5}	Hard tissue targeting	36
28	{GH127, GH15}	Hard tissue targeting	1
29	{GH31, GT81}	Lonsdalea	38
30	{GT83, GH8}	Brenneria	15
31	{CBM67, GH65}	Hard tissue targeting	1
32	{PL17, GT73}	Hard tissue targeting	1
33	{PL17, GT73}	Brenneria	1
34	{PL38, GH106}	Hard tissue targeting	1
35	{PL38, GH106}	Brenneria	1
36	{GH78, GT1}	Lonsdalea	10
37	{GH26, GH51}	Lonsdalea	9
38	{GH73, GH4, GH28, GH1, GT0}	Lonsdalea	38
39	{PL1, GH39, GT30, GH13, PL3}	Lonsdalea	38
40	{GT2, GH23, GT9, GT51, GH28, GH103, PL1, GH3, ...	Soft tissue targeting	636
41	{GT35, GH23, GH103, GT1, GT2, GT9, CE4, GT5, G...	Dickeya	206
42	{GH43, GT2, GH23, GT9, GT51, GH28, GH103, PL1,...	Pectobacterium	426
43	{GT35, GH23, GH103, GT1, GH102, GH5, GT0, GT2,...	Musicola	4
44	{GT19, GH23, GT2, GT26, GT51, GT9, GH3, GT56, ...	Hard tissue targeting	75
45	{GH23, CBM32, GT2, GT26, GT9, CE4, GH77, GT28,...	Lonsdalea	39
46	{GT35, GH23, GH103, GT84, GH102, GT0, GT2, GT2...	Brenneria	33