import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re

from copy import copy
from matplotlib.patches import Patch
from pathlib import Path
import upsetplot
import adjustText
import upsetplot

from Bio import SeqIO
from saintBioutils.utilities.file_io.get_paths import get_file_paths
from saintBioutils.utilities.file_io import make_output_directory
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

%matplotlib inline


# loading and parsing data
from cazomevolve.cazome.explore.parse_data import (
    load_fgp_data,
    load_tax_data,
    add_tax_data_from_tax_df,
    add_tax_column_from_row_index,
)

# functions for exploring the sizes of CAZomes
from cazomevolve.cazome.explore.cazome_sizes import (
    calc_proteome_representation,
    count_items_in_cazome,
    get_proteome_sizes,
)

# explore the frequency of CAZymes per CAZy class
from cazomevolve.cazome.explore.cazy_classes import calculate_class_sizes

# explore the frequencies of CAZy families and identify the co-cazome
from cazomevolve.cazome.explore.cazy_families import (
    build_fam_freq_df,
    build_row_colours,
    build_family_clustermap,
    identify_core_cazome,
    plot_fam_boxplot,
    build_fam_mean_freq_df,
    get_group_specific_fams,
    build_family_clustermap_multi_legend,
)

# functions to identify and explore CAZy families that are always present together
from cazomevolve.cazome.explore.cooccurring_families import (
    identify_cooccurring_fams_corrM,
    calc_cooccuring_fam_freqs,
    identify_cooccurring_fam_pairs,
    add_to_upsetplot_membership,
    build_upsetplot,
    get_upsetplot_grps,
    add_upsetplot_grp_freqs,
    build_upsetplot_matrix,
)

# functions to perform PCA
from cazomevolve.cazome.explore.pca import (
    perform_pca,
    plot_explained_variance,
    plot_scree,
    plot_pca,
    plot_loadings,
)


fgp_file = "../data/cazomes/fam_genome_protein_list"
fgp_df = load_fgp_data(fgp_file)
fgp_df.head(3)


len(set(fgp_df['Protein']))

12487


help(load_tax_data)

Help on function load_tax_data in module cazomevolve.cazome.explore.parse_data:

load_tax_data(tax_csv_path, kingdom=False, phylum=False, tax_class=False, tax_order=False, tax_family=False, genus=False, species=False)
    Load tax data compiled by cazomevolve into a pandas df
    
    :param tax_csv_path: str/Path to csv file of genome, tax_rank, tax_rank
        e.g. 'Genome', 'Genus', 'Species'
    The remaining params are bool checks for lineage ranks included in the tax data file
    
    Return df of genome, tax_rank, tax_rank,  e.g. 'Genome', 'Genus', 'Species'


tax_csv_path = "../data/cazomes/fg_genome_taxs.csv"
tax_df = load_tax_data(tax_csv_path, genus=True, species=True)
tax_df.head(3)


fgp_df = add_tax_data_from_tax_df(
    fgp_df,
    tax_df,
    genus=True,
    species=True,
)
fgp_df.head(3)

Collecting Genus data: 100%|████████████████████████████████████████████████████████████████████████| 32975/32975 [00:11<00:00, 2793.01it/s]
Collecting Species data: 100%|██████████████████████████████████████████████████████████████████████| 32975/32975 [00:12<00:00, 2592.03it/s]


print(f"The dataset includes {len(set(fgp_df['Family']))} unique CAZy families")

The dataset includes 103 unique CAZy families


# Calculate CAZymes per genome
cazome_sizes_dict, cazome_sizes_df = count_items_in_cazome(fgp_df, 'Protein', 'Genus', round_by=2)
cazome_sizes_df

Gathering CAZy families per genome: 100%|██████████████████████████████████████████████████████████| 32975/32975 [00:02<00:00, 14936.15it/s]
Calculating num of Protein per genome and per Genus: 100%|██████████████████████████████████████████████████| 4/4 [00:00<00:00, 2957.38it/s]


# Calculate CAZy families per genome
cazome_fam_dict, cazome_fams_df = count_items_in_cazome(fgp_df, 'Family', 'Genus', round_by=2)
cazome_fams_df

Gathering CAZy families per genome: 100%|██████████████████████████████████████████████████████████| 32975/32975 [00:02<00:00, 14081.93it/s]
Calculating num of Family per genome and per Genus: 100%|███████████████████████████████████████████████████| 4/4 [00:00<00:00, 3725.79it/s]


print(f"The total number of CAZymes is {len(set(fgp_df['Protein']))}")

# exclude the Hafnia genome because it is an outgroup for the phylotree
pd_gf_df = fgp_df[fgp_df['Genus'] != 'Hafnia']
print(f"The total number of Pectobacterium and Dickeya CAZymes is {len(set(pd_gf_df['Protein']))}")

p_gf_df = fgp_df[fgp_df['Genus'] == 'Pectobacterium']
print(f"The total number of Pectobacterium CAZymes is {len(set(p_gf_df['Protein']))}")

d_gf_df = fgp_df[fgp_df['Genus'] == 'Dickeya']
print(f"The total number of Dickeya CAZymes is {len(set(d_gf_df['Protein']))}")

The total number of CAZymes is 12487
The total number of Pectobacterium and Dickeya CAZymes is 12399
The total number of Pectobacterium CAZymes is 8591
The total number of Dickeya CAZymes is 3710


# Get the size of the proteome (the number of protein acc) per genome
grp = 'Genus'
proteome_dir = "../data/proteomes"
proteome_dict = get_proteome_sizes(proteome_dir, fgp_df, grp)

Getting proteome sizes: 100%|█████████████████████████████████████████████████████████████████████████████| 281/281 [00:12<00:00, 22.42it/s]


# Calculate the mean proteome size by genus and the proportion of the proteome represented by the CAZome
proteome_perc_df = calc_proteome_representation(proteome_dict, cazome_sizes_dict, grp, round_by=2)
proteome_perc_df

Getting proteome size: 100%|████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2080.77it/s]
Calc proteome perc: 100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 2901.13it/s]


cazome_fams_df


# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/'), force=True, nodelete=True)
make_output_directory(Path('../results/cazy_classes/'), force=True, nodelete=True)

class_df, class_size_dict = calculate_class_sizes(fgp_df, 'Genus', round_by=2)
filtered_class_df = class_df[class_df['Genus'] != 'Hafnia']
filtered_class_df.to_csv('../results/cazy_classes/cazy_class_sizes.csv')
class_df

Output directory ../results exists, nodelete is True. Adding output to output directory.
Output directory ../results/cazy_classes exists, nodelete is True. Adding output to output directory.
Getting CAZy class sizes: 100%|█████████████████████████████████████████████████████████████████████| 32975/32975 [00:06<00:00, 4956.77it/s]
Calculating CAZy class sizes: 100%|██████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 402.21it/s]


fam_freq_df = build_fam_freq_df(fgp_df, ['Genus', 'Species'])
make_output_directory(Path('../results/cazy_families/'), force=True, nodelete=True)
fam_freq_df.to_csv("../results/cazy_families/cazy_fam_freqs.csv")
fam_freq_df

The dataset contains 103 CAZy families

Counting fam frequencies: 100%|███████████████████████████████████████████████████████████████████████████| 281/281 [00:11<00:00, 24.90it/s]
Output directory ../results/cazy_families exists, nodelete is True. Adding output to output directory.


aa_families = [_ for _ in fam_freq_df.columns if _.startswith('AA')]
aa_families

['AA10', 'AA3']


set(fam_freq_df['AA10'])

{0, 1}


set(fam_freq_df['AA3'])

{0, 1}


aa_dict = {}  # {genus: {AA10: int, AA3: int, 'both': int}}
for ri in range(len(fam_freq_df)):
    genus = fam_freq_df.iloc[ri]['Genus']
    try:
        aa_dict[genus]
    except KeyError:
        aa_dict[genus] = {'AA10': 0, 'AA3': 0, 'both': 0}
    for aa in aa_families:
        if fam_freq_df.iloc[ri][aa] != 0:
            aa_dict[genus][aa] += 1
    if (fam_freq_df.iloc[ri]['AA10'] != 0) and (fam_freq_df.iloc[ri]['AA3'] != 0):
        print(fam_freq_df.iloc[ri]['Genome'], fam_freq_df.iloc[ri]['Genus'], fam_freq_df.iloc[ri]['Species'])
        aa_dict[genus]['both'] += 1
aa_dict

GCF_000738125.1 Pectobacterium brasiliense

{'Dickeya': {'AA10': 0, 'AA3': 47, 'both': 0},
 'Pectobacterium': {'AA10': 2, 'AA3': 149, 'both': 1},
 'Musicola': {'AA10': 0, 'AA3': 0, 'both': 0},
 'Hafnia': {'AA10': 0, 'AA3': 0, 'both': 0}}


for cazy_class in ['GH', 'GT', 'PL', 'CE', 'AA', 'CBM']:
    class_families = aa_families = [_ for _ in fam_freq_df.columns if _.startswith(cazy_class)]
    print(f"{len(class_families)} families from CAZy class {cazy_class} overall")
    for genus in set(fam_freq_df['Genus']):
        class_families = []
        genus_df = fam_freq_df[fam_freq_df['Genus'] == genus]
        for col in genus_df.columns:
            if col.startswith(cazy_class):
                if {0} != set(genus_df[col]):
                    class_families.append(col)
        print(f"For {genus}: {len(class_families)} families from CAZy class {cazy_class} overall")

44 families from CAZy class GH overall
For Hafnia: 22 families from CAZy class GH overall
For Musicola: 22 families from CAZy class GH overall
For Dickeya: 33 families from CAZy class GH overall
For Pectobacterium: 38 families from CAZy class GH overall
28 families from CAZy class GT overall
For Hafnia: 16 families from CAZy class GT overall
For Musicola: 15 families from CAZy class GT overall
For Dickeya: 19 families from CAZy class GT overall
For Pectobacterium: 27 families from CAZy class GT overall
12 families from CAZy class PL overall
For Hafnia: 2 families from CAZy class PL overall
For Musicola: 5 families from CAZy class PL overall
For Dickeya: 10 families from CAZy class PL overall
For Pectobacterium: 10 families from CAZy class PL overall
7 families from CAZy class CE overall
For Hafnia: 3 families from CAZy class CE overall
For Musicola: 6 families from CAZy class CE overall
For Dickeya: 7 families from CAZy class CE overall
For Pectobacterium: 6 families from CAZy class CE overall
2 families from CAZy class AA overall
For Hafnia: 0 families from CAZy class AA overall
For Musicola: 0 families from CAZy class AA overall
For Dickeya: 1 families from CAZy class AA overall
For Pectobacterium: 2 families from CAZy class AA overall
10 families from CAZy class CBM overall
For Hafnia: 3 families from CAZy class CBM overall
For Musicola: 3 families from CAZy class CBM overall
For Dickeya: 8 families from CAZy class CBM overall
For Pectobacterium: 8 families from CAZy class CBM overall


# Drop Hafnia genome
fam_freq_df_pd = copy(fam_freq_df)
fam_freq_df_pd = fam_freq_df_pd[fam_freq_df_pd['Genus'] != 'Hafnia']
fam_freq_df_pd.head(1)


# index the taxonomy data and genome (ggs=genome_genus_species)
fam_freq_df_ggs = copy(fam_freq_df_pd)  # so does not alter fam_freq_df
fam_freq_df_ggs = fam_freq_df_ggs.set_index(['Genome','Genus','Species'])
fam_freq_df_ggs.head(1)


# define a colour scheme to colour code rows by genus
fam_freq_df_ggs['Genus'] = list(fam_freq_df_pd['Genus'])  # add column to use for colour scheme, is removed
fam_freq_genus_row_colours, fam_g_lut = build_row_colours(fam_freq_df_ggs, 'Genus', 'Set2')


# make a figure that is full size, and all data is legible
build_family_clustermap(
    fam_freq_df_ggs,
    row_colours=fam_freq_genus_row_colours,
    fig_size=(25,73),
    file_path="../results/cazy_families/pd_fam_freq_clustermap.svg",
    file_format='svg',
    lut=fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.2,0.05),
    title_fontsize=28,
    legend_fontsize=24,
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f659041cd90>


# make a figure the optimal size to fit in a paper
build_family_clustermap(
    fam_freq_df_ggs,
    row_colours=fam_freq_genus_row_colours,
    fig_size=(20,40),
    file_path="../results/cazy_families/paper_pd_fam_freq_clustermap.png",
    file_format='png',
    font_scale=0.5,
    lut=fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f6591355810>


# define a colour scheme to colour code rows by SPECIES
fam_freq_df_ggs['Species'] = list(fam_freq_df_pd['Species'])  # add column to use for colour scheme, is removed
fam_freq_species_row_colours, fam_s_lut = build_row_colours(fam_freq_df_ggs, 'Species', 'rainbow')


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=fam_freq_df_ggs,
    row_colours=[fam_freq_genus_row_colours,fam_freq_species_row_colours],
    luts=[fam_g_lut, fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.045), (0.63,1.04)],
    legend_cols=[1,5],
    fig_size=(20,40),
    file_path="../results/cazy_families/paper_pd_genus_species_fam_freq_clustermap.png",
    file_format='png',
    font_scale=1,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.96, 0.1, 0.1),  #left, bottom, width, height
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f658ddfc890>


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=fam_freq_df_ggs,
    row_colours=[fam_freq_genus_row_colours,fam_freq_species_row_colours],
    luts=[fam_g_lut, fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.05), (0.63,1.05)],
    legend_cols=[1,5],
    fig_size=(60,90),
    file_path="../results/cazy_families/pd_genus_species_fam_freq_clustermap.pdf",
    file_format='pdf',
    font_scale=1,
    dendrogram_ratio=(0.1,0.03),
    title_fontsize=35,
    legend_fontsize=33,
    cbar_pos=(0.01, 0.98, 0.1, 0.08),  #left, bottom, width, height
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f658d8e9110>


all_families = list(fam_freq_df_pd.columns)[3:]
# dict {group: {only unique fams}} and dict {group: {all fams}}
unique_grp_fams, group_fams = get_group_specific_fams(fam_freq_df_pd, 'Genus', all_families)
unique_grp_fams

Identifying fams in each Genus: 100%|█████████████████████████████████████████████████████████████████████| 280/280 [00:02<00:00, 95.13it/s]
Identifying Genus specific fams: 100%|█████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 15807.68it/s]

{'Dickeya': {'CBM4', 'CE2', 'GH113', 'GH148', 'GH26', 'GH91', 'GT97', 'PL10'},
 'Pectobacterium': {'AA10',
  'CBM3',
  'GH108',
  'GH12',
  'GH153',
  'GH16',
  'GH171',
  'GH18',
  'GH65',
  'GH68',
  'GT101',
  'GT11',
  'GT111',
  'GT14',
  'GT20',
  'GT25',
  'GT32',
  'GT52',
  'GT73',
  'PL11'}}


core_cazome = identify_core_cazome(fam_freq_df_ggs)

core_cazome = list(core_cazome)
core_cazome.sort()

print("The core CAZy families are:")
for fam in core_cazome:
    print('-', fam)

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 10199.34it/s]

The core CAZy families are:
- CBM50
- CE8
- GH1
- GH103
- GH105
- GH13
- GH23
- GH28
- GH3
- GT19
- GT2
- GT30
- GT4
- GT51
- GT9
- PL1
- PL9


# filter the famil freq df to include only those families in the core CAZome
core_cazome_df = fam_freq_df_ggs[core_cazome]
plot_fam_boxplot(core_cazome_df, font_scale=0.8, fig_size=(12,6))


core_cazome_df_genus = copy(core_cazome_df)  # to ensure core_cazome_df is not altereted
core_cazome_df_genus = add_tax_column_from_row_index(core_cazome_df_genus, 'Genus', 1)
core_cazome_df_genus.head(1)


core_cazome_fggf_df, core_cazome_mean_freq_df = build_fam_mean_freq_df(
    core_cazome_df_genus,
    'Genus',
    round_by=2,
)
make_output_directory(Path("../results/core_cazome/"), nodelete=True, force=True)
core_cazome_mean_freq_df.to_csv("../results/core_cazome/core_cazome_freqs.csv")

core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████████████| 280/280 [00:00<00:00, 4977.28it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|█████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 83.47it/s]
Output directory ../results/core_cazome exists, nodelete is True. Adding output to output directory.


# add a tax rank (e.g. genus) column so that the genomes can be grouped by their tax rank
fam_freq_df_ggs_genus = copy(fam_freq_df_ggs)  # to ensure core_cazome_df is not altereted
fam_freq_df_ggs_genus = add_tax_column_from_row_index(fam_freq_df_ggs_genus, 'Genus', 1)
fam_freq_df_ggs_genus.head(1)


# filter for only pectobacterium genomes
pecto_freq_df_ggs_genus = fam_freq_df_ggs_genus[fam_freq_df_ggs_genus['Genus'] == 'Pectobacterium']
print(f"Analysing {len(pecto_freq_df_ggs_genus)} Pectobacterium genomes")

# Identify the core CAZome
pecto_core_cazome = identify_core_cazome(pecto_freq_df_ggs_genus)
print("The core CAZy families in Pectobacterium are:")
pecto_core_cazome = list(pecto_core_cazome)
pecto_core_cazome.sort()
for fam in pecto_core_cazome:
    print('-', fam)

Analysing 188 Pectobacterium genomes

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 11935.52it/s]

The core CAZy families in Pectobacterium are:
- CBM32
- CBM50
- CE8
- CE9
- GH1
- GH103
- GH105
- GH13
- GH23
- GH28
- GH3
- GH43
- GT19
- GT2
- GT28
- GT30
- GT4
- GT51
- GT9
- Genus
- PL1
- PL2
- PL22
- PL3
- PL9


# filter for only dickeya genomes
dic_freq_df_ggs_genus = fam_freq_df_ggs_genus[fam_freq_df_ggs_genus['Genus'] == 'Dickeya']
print(f"Analysing {len(dic_freq_df_ggs_genus)} Dickeya genomes")

# Identify the core CAZome
dic_core_cazome = identify_core_cazome(dic_freq_df_ggs_genus)
print("The core CAZy families in Dickeya are:")
dic_core_cazome = list(dic_core_cazome)
dic_core_cazome.sort()
for fam in dic_core_cazome:
    print('-', fam)

Analysing 90 Dickeya genomes

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 14176.39it/s]

The core CAZy families in Dickeya are:
- CBM48
- CBM50
- CE11
- CE12
- CE4
- CE8
- GH1
- GH102
- GH103
- GH104
- GH105
- GH13
- GH23
- GH28
- GH3
- GH31
- GH32
- GH33
- GH8
- GT1
- GT19
- GT2
- GT30
- GT35
- GT4
- GT41
- GT5
- GT51
- GT83
- GT9
- Genus
- PL1
- PL3
- PL9


core_cazome_mean_freq_df.head(1)


dic_freq_df_ggs_genus.head(1)
dic_only_core_fams = [fam for fam in dic_core_cazome if fam not in core_cazome]
dic_freq_df_ggs_genus_core_cazome = dic_freq_df_ggs_genus[dic_only_core_fams]

d_core_cazome_fggf_df, d_core_cazome_mean_freq_df = build_fam_mean_freq_df(
    dic_freq_df_ggs_genus_core_cazome,
    'Genus',
    round_by=2,
)

pecto_freq_df_ggs_genus.head(1)
pecto_only_core_fams = [fam for fam in pecto_core_cazome if fam not in core_cazome]
pecto_freq_df_ggs_genus_core_cazome = pecto_freq_df_ggs_genus[pecto_only_core_fams]

p_core_cazome_fggf_df, p_core_cazome_mean_freq_df = build_fam_mean_freq_df(
    pecto_freq_df_ggs_genus_core_cazome,
    'Genus',
    round_by=2,
)

pd_core_cazome_mean_freq_df = copy(core_cazome_mean_freq_df)
pd_core_cazome_mean_freq_df = pd.concat(
    [pd_core_cazome_mean_freq_df, d_core_cazome_mean_freq_df],
    ignore_index=True,
)
pd_core_cazome_mean_freq_df = pd.concat(
    [pd_core_cazome_mean_freq_df, p_core_cazome_mean_freq_df],
    ignore_index=True,
)

pd_core_cazome_mean_freq_df.to_csv("../results/core_cazome/pd_all_core_cazome_freqs.csv")
pd_core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|███████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 5224.81it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 87.16it/s]
Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 7961.08it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 197.55it/s]


# reminder of the structure of the df
fam_freq_df.head(1)


# reminder of the structure of the df
fam_freq_df_pd.head(1)


cooccurring_fams_dict = calc_cooccuring_fam_freqs(
    fam_freq_df_pd,
    list(fam_freq_df_pd.columns[3:]),
    exclude_core_cazome=False,
)

Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 206.20it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 145/145 [00:00<00:00, 272479.43it/s]


pd_fam_freq_df = fam_freq_df_pd[fam_freq_df_pd['Genus'].isin(['Pectobacterium','Dickeya'])]
pd_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
    pd_fam_freq_df,
    list(fam_freq_df_pd.columns[3:]),
    exclude_core_cazome=False,
)
musicola_cooccurring_fams_dict = {}
for grpnum in cooccurring_fams_dict:
    fams = list(cooccurring_fams_dict[grpnum]['fams'])
    fams.sort()
    for num in pd_cooccurring_fams_dict:
        pd_fams = list(pd_cooccurring_fams_dict[num]['fams'])
        pd_fams.sort()
        if pd_fams == fams:
            freq = list(cooccurring_fams_dict[grpnum]['freqs'])[0] - list(pd_cooccurring_fams_dict[grpnum]['freqs'])[0]
            if freq > 0:
                musicola_cooccurring_fams_dict[grpnum] = {'fams': set(fams), 'freqs': {freq}}
musicola_cooccurring_fams_dict

Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 213.35it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 162/162 [00:00<00:00, 288158.29it/s]

{3: {'fams': {'CE11', 'GH102', 'GH32'}, 'freqs': {2}},
 6: {'fams': {'GT0', 'GT26'}, 'freqs': {2}}}


# Pectobacterium
pecto_fam_freq_df = fam_freq_df_pd[fam_freq_df_pd['Genus'] == 'Pectobacterium']
pecto_cooccurring_fams_dict = calc_cooccuring_fam_freqs(pecto_fam_freq_df, list(pecto_fam_freq_df.columns[3:]), exclude_core_cazome=False)

Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 277.13it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 292/292 [00:00<00:00, 307028.52it/s]


# Dickeya
dic_fam_freq_df = fam_freq_df_pd[fam_freq_df_pd['Genus'] == 'Dickeya']
dic_cooccurring_fams_dict = calc_cooccuring_fam_freqs(dic_fam_freq_df, list(dic_fam_freq_df.columns[3:]), exclude_core_cazome=False)

Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 476.08it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 538/538 [00:00<00:00, 270827.60it/s]


upsetplot_membership = []
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, cooccurring_fams_dict)
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, pecto_cooccurring_fams_dict)
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, dic_cooccurring_fams_dict)
len(upsetplot_membership)

2084


make_output_directory(Path("../results/cooccurring_families"), force=True, nodelete=True)
pecto_dic_upsetplot = build_upsetplot(
    upsetplot_membership,
    file_path='../results/cooccurring_families/pd_cooccurring_fams.svg',
)

Output directory ../results/cooccurring_families exists, nodelete is True. Adding output to output directory.
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/upsetplot/plotting.py:662: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '['#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' 'black' 'black' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' 'black' 'black' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' 'black' 'black' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black' 'black'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 'black' 'black' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' 'black' 'black' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black' 'black'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black'
 'black' 'black' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 'black' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' 'black' '#0000002e' 'black'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' 'black' 'black' '#0000002e' 'black' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' 'black' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black' 'black' 'black'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black' 'black' 'black'
 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black'
 'black' 'black' 'black' 'black' 'black' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black'
 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black'
 'black' 'black' 'black' 'black' 'black' 'black' 'black' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' 'black' 'black' 'black' 'black'
 'black' 'black' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black'
 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black'
 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black'
 'black' '#0000002e' '#0000002e' 'black' 'black' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' 'black' 'black' 'black'
 'black' 'black' 'black' 'black' 'black' 'black' 'black' 'black'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e' '#0000002e'
 '#0000002e']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  styles["edgecolor"].fillna(styles["facecolor"], inplace=True)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/upsetplot/plotting.py:663: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'solid' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  styles["linestyle"].fillna("solid", inplace=True)


upset_plot_groups = get_upsetplot_grps(upsetplot_membership)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 51.85it/s]


cooccurring_grp_freq_data = []  # empty list to store data for the df

cooccurring_grp_freq_data = add_upsetplot_grp_freqs(
    upset_plot_groups,
    cooccurring_grp_freq_data,
    cooccurring_fams_dict,
    'All',
)
cooccurring_grp_freq_data = add_upsetplot_grp_freqs(
    upset_plot_groups,
    cooccurring_grp_freq_data,
    musicola_cooccurring_fams_dict,
    'Musicola',
)
cooccurring_grp_freq_data= add_upsetplot_grp_freqs(
    upset_plot_groups,
    cooccurring_grp_freq_data,
    pecto_cooccurring_fams_dict,
    'Pectobacterium',
)
cooccurring_grp_freq_data= add_upsetplot_grp_freqs(
    upset_plot_groups,
    cooccurring_grp_freq_data,
    dic_cooccurring_fams_dict,
    'Dickeya',
)

Compiling co-occurring families incidence data: 100%|████████████████████████████████████████████████████| 14/14 [00:00<00:00, 61230.72it/s]
Compiling co-occurring families incidence data: 100%|████████████████████████████████████████████████████| 14/14 [00:00<00:00, 20206.56it/s]
Compiling co-occurring families incidence data: 100%|████████████████████████████████████████████████████| 14/14 [00:00<00:00, 51827.23it/s]
Compiling co-occurring families incidence data: 100%|████████████████████████████████████████████████████| 14/14 [00:00<00:00, 84733.41it/s]


# build the dataframe
cooccurring_fams_freq_df = build_upsetplot_matrix(
    cooccurring_grp_freq_data,
    'Genus',
    file_path='../results/cooccurring_families/cooccurring_fams_freqs.csv',
)
cooccurring_fams_freq_df


num_of_components = len(fam_freq_df_ggs.columns)
pd_pca, X_scaled = perform_pca(fam_freq_df_ggs, num_of_components)
pd_pca

PCA(n_components=103)

PCA(n_components=103)


make_output_directory(Path("../results/pca/pecto_dic/"), force=True, nodelete=True)
cumExpVar = plot_explained_variance(
    pd_pca,
    num_of_components,
    file_path="../results/pca/pecto_dic/pd_pca_explained_variance.png",
)

Output directory ../results/pca/pecto_dic exists, nodelete is True. Adding output to output directory.

Number of features needed to explain 0.95 fraction of total variance is 44.

<Figure size 1200x600 with 0 Axes>


print(f"{pd_pca.explained_variance_ratio_.sum() * 100}% of the variance in the data set was catpured by the PCA")

100.0% of the variance in the data set was catpured by the PCA


plot_scree(pd_pca, nComp=10, file_path="../results/pca/pecto_dic/pd_pca_scree.png")

Explained variance for 1PC: 0.16914860399915893
Explained variance for 2PC: 0.0806094695362935
Explained variance for 3PC: 0.07220174489942305
Explained variance for 4PC: 0.06139555988796338
Explained variance for 5PC: 0.04449773463984567
Explained variance for 6PC: 0.04221132762829172
Explained variance for 7PC: 0.03974058126224163
Explained variance for 8PC: 0.03230394191366608
Explained variance for 9PC: 0.02896770917858891
Explained variance for 10PC: 0.025004059065160098


X_pca = pd_pca.transform(X_scaled)

fam_freq_df_ggs['Genus'] = list(fam_freq_df_pd['Genus']) 

fam_freq_df_ggs_pc = copy(fam_freq_df_ggs)
colnames = []
for i in range(4):
    fam_freq_df_ggs_pc[f'PC{i+1} ({round(pd_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(pd_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Genus",
    diag_kind="kde",
    markers=['o','X', '^'],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/pca/pecto_dic/pd_pc_screen_genus.svg',
    bbox_inches='tight',
    format='svg'
)
plt.savefig(
    '../results/pca/pecto_dic/pd_pc_screen_genus.png',
    bbox_inches='tight',
    format='png'
)


fam_freq_df_ggs_pc['Species'] = list(fam_freq_df_pd['Species']) 

g = sns.pairplot(
    fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Species",
    diag_kind="kde",
    markers=[
        'o', 'X', 'o', 'X', 'D', 'o', 'o', 'o', 'X',
        'X', 'o', 'o', 'X', 'X', 'o', 'X', 'o', 'X',
        'o', 'o', 'o', '^', 'o', 'o', 'X', 'X', 'X', 'o',
    ],
    height=3,
);
i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/pca/pecto_dic/pd_pc_screen_species.svg',
    bbox_inches='tight',
    format='svg'
)
plt.savefig(
    '../results/pca/pecto_dic/pd_pc_screen_species.png',
    bbox_inches='tight',
    format='png'
)


fam_freq_df_ggs['Genus'] = list(fam_freq_df_pd['Genus'])  # add column to use for colour scheme

pc1_pc2_scatter_plt = plot_pca(
    pd_pca,
    X_scaled,
    fam_freq_df_ggs,
    1,
    2,
    'Genus',
    figsize=(10,10),
)

Not applying hue order
Not Applying style


X_pca = pd_pca.transform(X_scaled)

plt.figure(figsize=(10,10))
sns.set(font_scale=1.15)

g = sns.scatterplot(
    x=X_pca[:,0],
    y=X_pca[:,1],
    data=fam_freq_df_ggs,
    hue='Genus',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC2 {100 * pd_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pd_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=3, title='Genus', frameon=False);

genome_lbls = [_[0] for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((xval > 2) and (xval < 4)) or ((yval > 3) and (xval > 0))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto_dic/pd_pc1_pc2_LABELLED.png', bbox_inches='tight', format='png')


X_pca = pd_pca.transform(X_scaled)

plt.figure(figsize=(5,10))
sns.set(font_scale=1.15)

temp_d_fam_freq_df_ggs = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Dickeya']

g = sns.scatterplot(
    x=X_pca[:,0][fam_freq_df_ggs['Genus'] == 'Dickeya'],
    y=X_pca[:,1][fam_freq_df_ggs['Genus'] == 'Dickeya'],
    data=temp_d_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=150,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC2 {100 * pd_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pd_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, -.3), ncol=3, title='Dickeya sp.', frameon=False);

genome_lbls = [_[0] for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if (((yval < -1) and (yval > -3) and (xval > 0)) or ((yval > 2) and (xval > 2) and (xval < 4.1)))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto_dic/pd_pc1_pc2_dickeya_sp.png', bbox_inches='tight', format='png')


X_pca = pd_pca.transform(X_scaled)

plt.figure(figsize=(5,10))
sns.set(font_scale=1.15)

temp_d_fam_freq_df_ggs = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Pectobacterium']

g = sns.scatterplot(
    x=X_pca[:,0][fam_freq_df_ggs['Genus'] == 'Pectobacterium'],
    y=X_pca[:,1][fam_freq_df_ggs['Genus'] == 'Pectobacterium'],
    data=temp_d_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=140,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC2 {100 * pd_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pd_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, -.33), ncol=3, title='Pectobacterium sp.', frameon=False);

genome_lbls = [_[0] for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((xval > -2) and (xval < 0) and (yval > 1))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto_dic/pd_pc1_pc2_pectobacterium_sp.svg', bbox_inches='tight', format='svg')


X_pca = pd_pca.transform(X_scaled)

plt.figure(figsize=(5,5))
sns.set(font_scale=1.15)

temp_d_fam_freq_df_ggs = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Pectobacterium']

g = sns.scatterplot(
    x=X_pca[:,0][fam_freq_df_ggs['Genus'] == 'Pectobacterium'],
    y=X_pca[:,1][fam_freq_df_ggs['Genus'] == 'Pectobacterium'],
    data=temp_d_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=140,
    markers=True,
)

g.set(xlim=(-1.5,-1.3));
g.set(ylim=(1.3,1.5));

plt.ylabel(f"PC2 {100 * pd_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pd_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend([],[], frameon=False);

genome_lbls = ["\n".join(_) for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((xval > -2) and (xval < 0) and (yval > 1))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto_dic/pd_pc1_pc2_pectobacterium_sp_zoomed.svg', bbox_inches='tight', format='svg')


fam_freq_df_ggs_ncbi = copy(fam_freq_df_ggs)
fam_freq_df_ggs_ncbi['Species'] = list(fam_freq_df_pd['Species'])

# get tax data for NCBI
ncbi_tax_csv_path = "../data/pectobact/cazomes/fg_genome_taxs.csv"
ncbi_tax_df = load_tax_data(ncbi_tax_csv_path, genus=True, species=True)

species_col = []
for ri in tqdm(range(len(fam_freq_df_ggs_ncbi)), desc="Getting P.carotovorum ncbi classifications"):
    if fam_freq_df_ggs_ncbi.iloc[ri].name[-1] == 'carotovorum':
        genome = fam_freq_df_ggs_ncbi.iloc[ri].name[0].replace("GCF", "GCA")
        species = ncbi_tax_df[ncbi_tax_df['Genome'] == genome]['Species'].values[0]
        species_col.append(species)
    else:
        species_col.append(fam_freq_df_ggs_ncbi.iloc[ri]['Species'])

fam_freq_df_ggs_ncbi['Species'] = species_col
fam_freq_df_ggs_ncbi.head(5)

Getting P.carotovorum ncbi classifications:   0%|          | 0/280 [00:00<?, ?it/s]


X_pca = pd_pca.transform(X_scaled)

plt.figure(figsize=(5,10))
sns.set(font_scale=1.15)

temp_d_fam_freq_df_ggs = fam_freq_df_ggs_ncbi[fam_freq_df_ggs_ncbi['Genus'] == 'Pectobacterium']

g = sns.scatterplot(
    x=X_pca[:,0][fam_freq_df_ggs_ncbi['Genus'] == 'Pectobacterium'],
    y=X_pca[:,1][fam_freq_df_ggs_ncbi['Genus'] == 'Pectobacterium'],
    data=temp_d_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=140,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC2 {100 * pd_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pd_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);
sns.move_legend(g, "lower center", bbox_to_anchor=(.5, -.35), ncol=3, title='Pectobacterium sp.', frameon=False);

genome_lbls = [" ".join(_) for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((xval > -2) and (xval < 0) and (yval > 1))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto_dic/pd_pc1_pc2_pectobacterium_sp_ncbi.png', bbox_inches='tight', format='png')


X_pca = pd_pca.transform(X_scaled)

plt.figure(figsize=(5,5))
sns.set(font_scale=1.15)

temp_d_fam_freq_df_ggs = fam_freq_df_ggs_ncbi[fam_freq_df_ggs_ncbi['Genus'] == 'Pectobacterium']

g = sns.scatterplot(
    x=X_pca[:,0][fam_freq_df_ggs_ncbi['Genus'] == 'Pectobacterium'],
    y=X_pca[:,1][fam_freq_df_ggs_ncbi['Genus'] == 'Pectobacterium'],
    data=temp_d_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=140,
    markers=True,
)

g.set(xlim=(-1.5,-1.3));
g.set(ylim=(1.3,1.5));

plt.ylabel(f"PC2 {100 * pd_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pd_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend([],[], frameon=False);

genome_lbls = ["\n".join(_) for _ in fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((xval > -2) and (xval < 0) and (yval > 1))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto_dic/pd_pc1_pc2_pectobacterium_sp_ncbi_zoomed.png', bbox_inches='tight', format='png')


plot_loadings(
    pd_pca,
    fam_freq_df_ggs,
    1,
    2,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto_dic/pd_pc1_pc2_loadings_plot",
    font_size=11,
)


# load species data
sp_df = pd.read_csv("../data/pecto_dic/additional_data/pectobacterium_dickeya_niches-SI.csv")

fam_freq_df_ggs['Species'] = list(fam_freq_df_pd['Species'])
fam_freq_df_ggs['Genus'] = list(fam_freq_df_pd['Genus'])

distribution_col, climate_col, host_col, grp_col = [], [], [], []
for i in tqdm(range(len(fam_freq_df_ggs)), desc="Adding species data"):
    species = fam_freq_df_ggs.iloc[i]['Species']
    if species == 'versatile A' or species =='Versatile':
        species = 'versatile'
    species_row = sp_df[sp_df['Species'] == species]
    
    try:
        distribution = species_row['Distribution'].values[0]
    except IndexError:
        distribution = None
        print(f"Could not find data for {species}")
    try:
        climate = species_row['Climate'].values[0]
    except IndexError:
        climate = None
        
    try:
        host = species_row['Plant hosts'].values[0]
    except IndexError:
        host = None
        
    try:
        grp = species_row['Group'].values[0]
    except IndexError:
        grp = None
        
    distribution_col.append(distribution)
    climate_col.append(climate)
    host_col.append(host)
    grp_col.append(str(grp))
    
fam_freq_df_ggs['Distribution'] = distribution_col
fam_freq_df_ggs['Climate'] = climate_col
fam_freq_df_ggs['Host'] = host_col
fam_freq_df_ggs['Group'] = grp_col

fam_freq_df_ggs.head(3)

Adding species data:   0%|          | 0/280 [00:00<?, ?it/s]

Could not find data for quasiaquaticum
Could not find data for quasiaquaticum
Could not find data for sp013449375


c_grps = {
    '1': ['zeae', 'oryzae', 'poaceiphila', 'paradisiaca'],
    '2': ['versatile','polaris','aroidearum','aroidearum'],
    '3': ['fangzhongdai', 'dianthicola', 'solani', 'chrysanthemi'],
    '4': ['undicola'],
}

c_grp_col = []
for sp in list(fam_freq_df_pd['Species']):
    if sp in c_grps['1']:
        c_grp_col.append('1')
    elif sp in c_grps['2']:
        c_grp_col.append('2')
    elif sp in c_grps['3']:
        c_grp_col.append('3')
    elif sp in c_grps['4']:
        c_grp_col.append('4')
    else:
        c_grp_col.append('5')
        
fam_freq_df_ggs['Carbon-Metabolism'] = c_grp_col


make_output_directory(Path("../results/pca/pecto_dic/niche_mapping/"), force=True, nodelete=True)

pc_pairs = [(1,2), (1,3), (1,4), (1,5), (2,3), (2,4), (3,4)]
columns = ['Species', 'Loadings', 'Distribution','Climate','Host','Group', 'Carbon-Metabolism']
fam_freq_df_ggs['Species'] = list(fam_freq_df_pd['Species'])
fam_freq_df_ggs['Genus'] = list(fam_freq_df_pd['Genus'])

for pc_pair in pc_pairs:
    for col in columns:
        print(f"Below: PC{pc_pair[0]} vs PC{pc_pair[1]} - Colour coded by {col}")
        if col == 'Species':
            plot_pca(
                pd_pca,
                X_scaled,
                fam_freq_df_ggs,
                pc_pair[0],
                pc_pair[1],
                'Genus',
                figsize=(10,10),
                style=col,
            )
        
        elif col == 'Loadings':
            # loadings plot 
            plot_loadings(
                pd_pca,
                fam_freq_df_ggs[fam_freq_df_ggs.columns[:-7]],
                pc_pair[0],
                pc_pair[1],
                threshold=0.3,
                fig_size=(10,10),
                font_size=11,
                style=True,
            )
        
        else:
            plot_pca(
                pd_pca,
                X_scaled,
                fam_freq_df_ggs,
                pc_pair[0],
                pc_pair[1],
                col,
                figsize=(10,10),
                style=col,
                file_path=f"../results/pca/pecto_dic/niche_mapping/pd_pc{pc_pair[0]}_pc{pc_pair[1]}_{col}_plot",
            )

Output directory ../results/pca/pecto_dic/niche_mapping exists, nodelete is True. Adding output to output directory.

Below: PC1 vs PC2 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC2 - Colour coded by Loadings
Below: PC1 vs PC2 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC2 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC2 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC2 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC2 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC3 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC3 - Colour coded by Loadings
Below: PC1 vs PC3 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC3 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC3 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC3 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC3 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC4 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC4 - Colour coded by Loadings
Below: PC1 vs PC4 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC4 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC4 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC4 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC4 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/cazomevolve/cazome/explore/pca.py:210: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`.
  plt.figure(figsize=figsize)

Below: PC1 vs PC5 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC5 - Colour coded by Loadings
Below: PC1 vs PC5 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC5 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC5 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC5 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC1 vs PC5 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC3 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC3 - Colour coded by Loadings
Below: PC2 vs PC3 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC3 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC3 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC3 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC3 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC4 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC4 - Colour coded by Loadings
Below: PC2 vs PC4 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC4 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC4 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC4 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC2 vs PC4 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order
Below: PC3 vs PC4 - Colour coded by Species
Not applying hue order
Applying style
Not applying style order
Below: PC3 vs PC4 - Colour coded by Loadings
Below: PC3 vs PC4 - Colour coded by Distribution
Not applying hue order
Applying style
Not applying style order
Below: PC3 vs PC4 - Colour coded by Climate
Not applying hue order
Applying style
Not applying style order
Below: PC3 vs PC4 - Colour coded by Host
Not applying hue order
Applying style
Not applying style order
Below: PC3 vs PC4 - Colour coded by Group
Not applying hue order
Applying style
Not applying style order
Below: PC3 vs PC4 - Colour coded by Carbon-Metabolism
Not applying hue order
Applying style
Not applying style order


# add column to separate genomes into respective genera
fam_freq_df_ggs['Genus'] = list(fam_freq_df_pd['Genus'])

# create one df for each genus
fam_freq_df_pecto = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Pectobacterium']
fam_freq_df_dic = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Dickeya']

# drop the Genus column so columns only contain numerical data
fam_freq_df_pecto = fam_freq_df_pecto.drop('Genus', axis=1)
fam_freq_df_dic = fam_freq_df_dic.drop('Genus', axis=1)


# exclude species column from pca
pecto_num_of_components = len(fam_freq_df_pecto.columns[:-6])
pecto_pca, pecto_X_scaled = perform_pca(
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    pecto_num_of_components,
)
pecto_pca

PCA(n_components=103)

PCA(n_components=103)


make_output_directory(Path("../results/pca/pecto/"), force=True, nodelete=True)
cumExpVar = plot_explained_variance(
    pecto_pca,
    pecto_num_of_components,
    file_path="../results/pca/pecto/pecto_explained_variance.png",
)

Output directory ../results/pca/pecto exists, nodelete is True. Adding output to output directory.

Number of features needed to explain 0.95 fraction of total variance is 37.

<Figure size 1200x600 with 0 Axes>


print(f"{pecto_pca.explained_variance_ratio_.sum() * 100}% of the variance in the data set was catpured by the PCA")

100.0% of the variance in the data set was catpured by the PCA


plot_scree(pecto_pca, nComp=10, file_path="../results/pca/pecto/pecto_scree_plot.png")

Explained variance for 1PC: 0.1288094991239525
Explained variance for 2PC: 0.09867142062115815
Explained variance for 3PC: 0.08137220611258046
Explained variance for 4PC: 0.06666524273337604
Explained variance for 5PC: 0.05894511557943021
Explained variance for 6PC: 0.05261555241250664
Explained variance for 7PC: 0.045248770576620065
Explained variance for 8PC: 0.03724207873504617
Explained variance for 9PC: 0.031080659482280365
Explained variance for 10PC: 0.02781621702171926


fam_freq_df_ggs['Species'] = list(fam_freq_df_pd['Species'])  # add column to use for colour scheme
temp_df = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Pectobacterium']
fam_freq_df_pecto['Species'] = list(temp_df['Species'])

pc1_pc2_scatter_plt = plot_pca(
    pecto_pca,
    pecto_X_scaled,
    fam_freq_df_pecto,
    1,
    2,
    'Species',
    style='Species',
    figsize=(10,10),
    file_path="../results/pca/pecto/pecto_pc1_pc2.png",
)

plot_loadings(
    pecto_pca,
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    1,
    2,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto/pecto_pc1_pc2_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc1_pc3_scatter_plt = plot_pca(
    pecto_pca,
    pecto_X_scaled,
    fam_freq_df_pecto,
    1,
    3,
    'Species',
    style='Species',
    figsize=(10,12),
    file_path="../results/pca/pecto/pecto_pc1_pc3.png",
)

plot_loadings(
    pecto_pca,
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    1,
    3,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto/pecto_pc1_pc3_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc1_pc4_scatter_plt = plot_pca(
    pecto_pca,
    pecto_X_scaled,
    fam_freq_df_pecto,
    1,
    4,
    'Species',
    style='Species',
    figsize=(10,12),
    file_path="../results/pca/pecto/pecto_pc1_pc4.png",
)

plot_loadings(
    pecto_pca,
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    1,
    4,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto/pecto_pc1_pc24_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc2_pc3_scatter_plt = plot_pca(
    pecto_pca,
    pecto_X_scaled,
    fam_freq_df_pecto,
    2,
    3,
    'Species',
    style='Species',
    figsize=(10,12),
    file_path="../results/pca/pecto/pecto_pc2_pc3.png",
)

plot_loadings(
    pecto_pca,
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    2,
    3,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto/pecto_pc2_pc3_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc2_pc4_scatter_plt = plot_pca(
    pecto_pca,
    pecto_X_scaled,
    fam_freq_df_pecto,
    2,
    4,
    'Species',
    style='Species',
    figsize=(10,12),
    file_path="../results/pca/pecto/pecto_pc2_pc4.png",
)

plot_loadings(
    pecto_pca,
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    2,
    4,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto/pecto_pc2_pc4_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc3_pc4_scatter_plt = plot_pca(
    pecto_pca,
    pecto_X_scaled,
    fam_freq_df_pecto,
    3,
    4,
    'Species',
    style='Species',
    figsize=(10,10),
    file_path="../results/pca/pecto/pecto_pc3_pc4.png",
)

plot_loadings(
    pecto_pca,
    fam_freq_df_pecto[fam_freq_df_pecto.columns[:-6]],
    3,
    4,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/pecto/pecto_pc3_pc4_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


X_pca = pecto_pca.transform(pecto_X_scaled)

plt.figure(figsize=(10,10))
sns.set(font_scale=1.15)

g = sns.scatterplot(
    x=X_pca[:,0],
    y=X_pca[:,1],
    data=fam_freq_df_pecto,
    hue='Species',
    style='Species',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC2 {100 * pecto_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * pecto_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);

genome_lbls = ["-".join(_) for _ in fam_freq_df_pecto.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,1]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((yval > 2) and (xval < 0))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto/pecto_pc1_pc2_LABELLED.png', bbox_inches='tight', format='png')


import re
small_pc_group = [re.match(r"GCF_\d+.\d+-", _.properties()['text']).group()[:-1] for _ in texts]
with open("../data/genomic_accessions/small_p-carotovorum_grp", "w") as fh:
    for genome in small_pc_group:
        fh.write(f"{genome}\n")
        
a = ""
for g in small_pc_group:
    a += f" {g}"
a

' GCF_009931215.1 GCF_002904005.1 GCF_002904195.1 GCF_004137815.1 GCF_002904025.1 GCF_000769535.1'


large_pc_group = []
for i in range(len(fam_freq_df_pecto)):
    sp = fam_freq_df_pecto.iloc[i].name[-1]
    if sp.startswith("carotovorum"):
        gnm = fam_freq_df_pecto.iloc[i].name[0]
        if gnm not in small_pc_group:
            large_pc_group.append(gnm)
with open("../data/genomic_accessions/large_p-carotovorum_grp", "w") as fh:
    for genome in large_pc_group:
        fh.write(f"{genome}\n")


pc_group = []
for i in range(len(fam_freq_df_pecto)):
    genus = fam_freq_df_pecto.iloc[i].name[1]
    if genus == 'Pectobacterium':
        pc_group.append(gnm)
len(pc_group)

188


X_pca = pecto_pca.transform(pecto_X_scaled)

plt.figure(figsize=(10,10))
sns.set(font_scale=1.15)

g = sns.scatterplot(
    x=X_pca[:,2],
    y=X_pca[:,3],
    data=fam_freq_df_pecto,
    hue='Species',
    style='Species',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.xlabel(f"PC3 {100 * pecto_pca.explained_variance_ratio_[2]:.2f}%");
plt.ylabel(f"PC4 {100 * pecto_pca.explained_variance_ratio_[3]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);

genome_lbls = ["-".join(_) for _ in fam_freq_df_pecto.index]
x_vals = X_pca[:,2]
y_vals = X_pca[:,3]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if ((yval < -2.5) and (xval > 2.5))
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/pecto/pecto_pc3_pc4_LABELLED.png', bbox_inches='tight', format='png')


dic_num_of_components = len(fam_freq_df_dic)
dic_pca, dic_X_scaled = perform_pca(
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    dic_num_of_components,
)
dic_pca

PCA(n_components=90)

PCA(n_components=90)


make_output_directory(Path("../results/pca/dickeya/"), force=True, nodelete=True)

cumExpVar = plot_explained_variance(
    dic_pca,
    dic_num_of_components,
    file_path="../results/pca/dickeya/dic_explained_variance.png",
)

Output directory ../results/pca/dickeya exists, nodelete is True. Adding output to output directory.

Number of features needed to explain 0.95 fraction of total variance is 25.

<Figure size 1200x600 with 0 Axes>


print(f"{dic_pca.explained_variance_ratio_.sum() * 100}% of the variance in the data set was catpured by the PCA")

100.0% of the variance in the data set was catpured by the PCA


plot_scree(dic_pca, nComp=10, file_path="../results/pca/dickeya/dic_scree_plot.png")

Explained variance for 1PC: 0.16882456007544758
Explained variance for 2PC: 0.13528682331629904
Explained variance for 3PC: 0.08442204502051477
Explained variance for 4PC: 0.06560367524275842
Explained variance for 5PC: 0.05673883152920681
Explained variance for 6PC: 0.05198576365429983
Explained variance for 7PC: 0.050035678921381714
Explained variance for 8PC: 0.04088558780923047
Explained variance for 9PC: 0.032418292152083715
Explained variance for 10PC: 0.028025122685904142


fam_freq_df_ggs['Species'] = list(fam_freq_df_pd['Species'])  # add column to use for colour scheme
temp_df = fam_freq_df_ggs[fam_freq_df_ggs['Genus'] == 'Dickeya']
fam_freq_df_dic['Species'] = list(temp_df['Species'])

pc1_pc2_scatter_plt = plot_pca(
    dic_pca,
    dic_X_scaled,
    fam_freq_df_dic,
    1,
    2,
    'Species',
    style='Species',
    figsize=(10,10),
    file_path="../results/pca/dickeya/dic_pc1_pc2.png",
)

plot_loadings(
    dic_pca,
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    1,
    2,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/dickeya/dic_pc1_pc2_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc1_pc3_scatter_plt = plot_pca(
    dic_pca,
    dic_X_scaled,
    fam_freq_df_dic,
    1,
    3,
    'Species',
    style='Species',
    figsize=(10,10),
    file_path="../results/pca/dickeya/dic_pc1_pc3.png",
)

plot_loadings(
    dic_pca,
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    1,
    3,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/dickeya/dic_pc1_pc3_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc1_pc4_scatter_plt = plot_pca(
    dic_pca,
    dic_X_scaled,
    fam_freq_df_dic,
    1,
    4,
    'Species',
    style='Species',
    figsize=(10,10),
    file_path="../results/pca/dickeya/dic_pc1_pc4.png",
)

plot_loadings(
    dic_pca,
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    1,
    4,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/dickeya/dic_pc1_pc4_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc2_pc3_scatter_plt = plot_pca(
    dic_pca,
    dic_X_scaled,
    fam_freq_df_dic,
    2,
    3,
    'Species',
    style='Species',
    figsize=(10,10),
    file_path="../results/pca/dickeya/dic_pc2_pc3.png",
)

plot_loadings(
    dic_pca,
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    2,
    3,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/dickeya/dic_pc2_pc3_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc2_pc4_scatter_plt = plot_pca(
    dic_pca,
    dic_X_scaled,
    fam_freq_df_dic,
    2,
    4,
    'Species',
    style='Species',
    figsize=(10,12),
    file_path="../results/pca/dickeya/dic_pc2_pc4.png",
)

plot_loadings(
    dic_pca,
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    2,
    4,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/dickeya/dic_pc2_pc4_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


pc3_pc4_scatter_plt = plot_pca(
    dic_pca,
    dic_X_scaled,
    fam_freq_df_dic,
    3,
    4,
    'Species',
    style='Species',
    figsize=(10,12),
    file_path="../results/pca/dickeya/dic_pc3_pc4.png",
)

plot_loadings(
    dic_pca,
    fam_freq_df_dic[fam_freq_df_dic.columns[:-6]],
    3,
    4,
    threshold=0.3,
    fig_size=(10,10),
    file_path="../results/pca/dickeya/dic_pc3_pc4_loadings_plot",
    font_size=11,
    style=True,
)

Not applying hue order
Applying style
Not applying style order


X_pca = dic_pca.transform(dic_X_scaled)

plt.figure(figsize=(10,12))
sns.set(font_scale=1.15)

g = sns.scatterplot(
    x=X_pca[:,2],
    y=X_pca[:,3],
    data=fam_freq_df_dic,
    hue='Species',
    style='Species',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC4 {100 * dic_pca.explained_variance_ratio_[2]:.2f}%");
plt.xlabel(f"PC3 {100 * dic_pca.explained_variance_ratio_[3]:.2f}%");
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0);

genome_lbls = ["-".join(_) for _ in fam_freq_df_dic.index]
x_vals = X_pca[:,2]
y_vals = X_pca[:,3]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if (yval > 12)
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig('../results/pca/dickeya/dic_pc3_pc4_LABELLED.png', bbox_inches='tight', format='png')


# build output dir
output_dir = Path("../results/ie_cazymes")
make_output_directory(output_dir, force=True, nodelete=True)

Output directory ../results/ie_cazymes exists, nodelete is True. Adding output to output directory.


# load data
ie_fgp_file = "../data/pecto_dic/cazomes/pd_IE_fam_genomes_proteins"

ie_fgp_df = pd.read_table(ie_fgp_file, index_col="Unnamed: 0")

tax_csv_path = "../data/cazomes/fg_genome_taxs.csv"
tax_df = load_tax_data(tax_csv_path, genus=True, species=True)

ie_fgp_df = add_tax_data_from_tax_df(
    ie_fgp_df,
    tax_df,
    genus=True,
    species=True,
)

ie_fgp_df = ie_fgp_df[ie_fgp_df['Genus'] != 'Hafnia']
print(
    'Proteins:', len(set(ie_fgp_df['Protein'])),
    'Genera:', len(set(ie_fgp_df['Genus'])),
    'Genomes:', len(set(ie_fgp_df['Genome'])),
)

Collecting Genus data: 100%|████████████████████████████████████████████████████████████████████████| 32325/32325 [00:11<00:00, 2926.87it/s]
Collecting Species data: 100%|██████████████████████████████████████████████████████████████████████| 32325/32325 [00:12<00:00, 2692.65it/s]

Proteins: 12399 Genera: 3 Genomes: 280


ie_fgp_df.head(2)


def get_ie_fams(df):
    """Separate fams into extracellular only, intracellular only, and both intracellular and extracellular
    
    :param df: pandas df, cols = ['Family','Genome','Protein']
        Family set up as i_/e_{cazy family}
        
    Return 6 lists of protein accessions:
    1. All extracellular families
    2. All intracellular famileis
    3. extracellular only
    4. intracellular only
    5. both intracellular and extracellular
    6. all fams
    """
    fams = {}
    e_fams, i_fams = set(), set()
    e_only, i_only, both = set(), set(), set()
    
    for i in tqdm(range(len(df)), desc="Identifying extra-/intra-cellular CAZymes"):
        fam = df.iloc[i]['Fam'].split("_")[-1]
        try:
            fams[fam].add(df.iloc[i]['Fam'])
        except KeyError:
            fams[fam] = {df.iloc[i]['Fam']}
            
        if df.iloc[i]['Fam'].startswith('e'):
            e_fams.add(fam)
        else:
            i_fams.add(fam)
    
    for fam in fams:
        if len(fams[fam]) == 2:
            both.add(fam)
        
        else:
            if list(fams[fam])[0].startswith('e'):
                e_only.add(fam)
            else:
                i_only.add(fam)
    
    print(
        f"""
        Found {len(e_only)} families that are only extracellular.
        Found {len(i_only)} families that are only intracellular.
        Found {len(both)} families that are extracellular and intracellular.
        """
    )
    
    return e_fams, i_fams, e_only, i_only, both, e_fams.union(i_fams)


def cal_mean_ie_fams(df):
    """Calculate mean extracellular only, intracellular only, and both intracellular and extracellular families per genome
    +/- standard deviation (sd)
    
    :param df: pandas df, cols = ['Family','Genome','Protein']
        Family set up as i_/e_{cazy family}
        
    Return
    1. mean number of extracellular only families means per genome
    2. sd of extracellular only families means per genome
    3. mean number of intracellular only families means per genome
    4. sd of intracellular only families means per genome
    5. mean number of both intracellular and extracellular only families means per genome
    6. sd of both intracellular and extracellular only families means per genome
    """
    fams = {}  # {genome: fam{ e_fam, i_fam}}
    genome_counts = {} # {genome: {e: int, i: int: b: int}}
    
    genomes = set(df['Genome'])
    for genome in tqdm(genomes, desc='Getting genomes extra-/intra-cellular CAZymes'):
        fams[genome] = {}
        genome_rows = df[df['Genome'] == genome]
        for i in range(len(genome_rows)):
            fam = genome_rows.iloc[i]['Fam'].split("_")[-1]
            try:
                fams[genome][fam].add(genome_rows.iloc[i]['Fam'])
            except KeyError:
                fams[genome][fam] = {genome_rows.iloc[i]['Fam']}
        
        # count number of fams in each category
        e_only, i_only, both = 0, 0, 0
    
        for fam in fams[genome]:
            if len(fams[genome][fam]) == 2:
                both += 1

            else:
                if list(fams[genome][fam])[0].startswith('e'):
                    e_only += 1
                else:
                    i_only += 1
        genome_counts[genome] = {}
        genome_counts[genome]['e'] = e_only
        genome_counts[genome]['i'] = i_only
        genome_counts[genome]['b'] = both
    
    e_counts, i_counts, b_counts = [], [], []
    for genome in tqdm(genomes, desc='Calculating mena extra-/intra-cellular CAZymes'):
        e_counts.append(genome_counts[genome]['e'])
        i_counts.append(genome_counts[genome]['i'])
        b_counts.append(genome_counts[genome]['b'])
    
    return (
        np.mean(e_counts),
        np.std(e_counts),
        np.mean(i_counts),
        np.std(i_counts),
        np.mean(b_counts),
        np.std(b_counts),
    )


extra_fams, intra_fams, extra_only_fams, intra_only_fams, both_intra_extra_fams, all_fams = get_ie_fams(ie_fgp_df)
print("Extracellular only fams:")
for fam in extra_only_fams:
    print(f"- {fam}")
print("Intracellular only fams:")
for fam in intra_only_fams:
    print(f"- {fam}")
print("Both intra and extra celluar fams:")
for fam in both_intra_extra_fams:
    print(f"- {fam}")

Identifying extra-/intra-cellular CAZymes:   0%|          | 0/32232 [00:00<?, ?it/s]

        Found 9 families that are only extracellular.
        Found 53 families that are only intracellular.
        Found 35 families that are extracellular and intracellular.
        
Extracellular only fams:
- CBM5
- CBM3
- GH16
- CBM4
- PL10
- CBM63
- AA10
- GH171
- CE2
Intracellular only fams:
- GT5
- GH32
- GT2
- GH73
- GT30
- GT0
- AA3
- CE9
- GH104
- GH108
- GH19
- GT73
- GH38
- GH33
- CBM48
- GH2
- GT19
- GT52
- GT8
- PL22
- GT14
- GH1
- GH42
- GH31
- GH94
- GT81
- CE11
- GT111
- GH105
- GT26
- PL26
- GT41
- GH91
- GT20
- PL11
- GH4
- GT11
- GT35
- GH68
- GT32
- GT28
- GH65
- GH0
- GH88
- GT97
- GT9
- GH113
- GT25
- GT84
- GT56
- GT1
- GH77
- GT101
Both intra and extra celluar fams:
- GH102
- GT83
- CE1
- PL2
- GH36
- PL9
- GH13
- GT51
- GH12
- CBM13
- PL38
- CE12
- PL3
- CBM50
- GH24
- GH26
- PL35
- GH8
- GH30
- CE4
- CBM32
- GH103
- GH43
- GH53
- GH23
- GH18
- PL4
- CE8
- PL1
- GH28
- GT4
- GH78
- GH153
- GH5
- GH3


mean_e_fams, sd_e_fams, mean_i_fams, sd_i_fams, mean_b_fams, sd_b_fams = cal_mean_ie_fams(ie_fgp_df)
print(
    f"""
    Mean number of extracellular only fams per genome {round(mean_e_fams,3)} (+/- {round(sd_e_fams,3)} SD)
    Mean number of intracellular only fams per genome {round(mean_i_fams,3)} (+/- {round(sd_i_fams,3)} SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome {round(mean_b_fams,3)} (+/- {round(sd_b_fams,3)} SD)
    """
)

Getting genomes extra-/intra-cellular CAZymes:   0%|          | 0/280 [00:00<?, ?it/s]

Calculating mena extra-/intra-cellular CAZymes:   0%|          | 0/280 [00:00<?, ?it/s]

    Mean number of extracellular only fams per genome 12.018 (+/- 2.142 SD)
    Mean number of intracellular only fams per genome 39.996 (+/- 2.486 SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome 7.268 (+/- 1.176 SD)


p_extra_fams, p_intra_fams, p_extra_only_fams, p_intra_only_fams, p_both_intra_extra_fams, p_all_fams = get_ie_fams(
    ie_fgp_df[ie_fgp_df['Genus']=='Pectobacterium']
)
print("Extracellular only fams:")
for fam in p_extra_only_fams:
    print(f"- {fam}")
print("Intracellular only fams:")
for fam in p_intra_only_fams:
    print(f"- {fam}")
print("Both intra and extra celluar fams:")
for fam in p_both_intra_extra_fams:
    print(f"- {fam}")

Identifying extra-/intra-cellular CAZymes:   0%|          | 0/21680 [00:00<?, ?it/s]

        Found 11 families that are only extracellular.
        Found 51 families that are only intracellular.
        Found 28 families that are extracellular and intracellular.
        
Extracellular only fams:
- CBM5
- GH30
- PL38
- CBM3
- CBM32
- GH16
- CBM63
- GH5
- AA10
- GH171
- CBM13
Intracellular only fams:
- GT5
- GT2
- GH32
- GT30
- GH73
- GT83
- GT0
- CE9
- GH104
- GH108
- GH19
- GT73
- GH38
- GH33
- CBM48
- GH2
- GT19
- GT52
- GT8
- PL22
- GT14
- GH1
- GT81
- GH31
- GH42
- GH94
- CE11
- GT111
- GH105
- GT26
- PL26
- GT41
- GT20
- PL11
- GT11
- GH4
- GH68
- GT35
- AA3
- GT32
- GT28
- GH65
- GH0
- GH88
- GT9
- GT25
- GT56
- GT84
- GT1
- GH77
- GT101
Both intra and extra celluar fams:
- PL2
- CE1
- GH36
- PL9
- GH13
- GH12
- GT51
- CE12
- PL3
- CBM50
- GH24
- PL35
- GH8
- CE4
- GH103
- GH43
- GH53
- GH23
- GH18
- GH28
- PL1
- CE8
- PL4
- GT4
- GH78
- GH153
- GH3
- GH102


mean_e_fams, sd_e_fams, mean_i_fams, sd_i_fams, mean_b_fams, sd_b_fams = cal_mean_ie_fams(
    ie_fgp_df[ie_fgp_df['Genus'] == 'Pectobacterium']
)
print(
    f"""
    Mean number of extracellular only fams per genome {round(mean_e_fams,3)} (+/- {round(sd_e_fams,3)} SD)
    Mean number of intracellular only fams per genome {round(mean_i_fams,3)} (+/- {round(sd_i_fams,3)} SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome {round(mean_b_fams,3)} (+/- {round(sd_b_fams,3)} SD)
    """
)

Getting genomes extra-/intra-cellular CAZymes:   0%|          | 0/188 [00:00<?, ?it/s]

Calculating mena extra-/intra-cellular CAZymes:   0%|          | 0/188 [00:00<?, ?it/s]

    Mean number of extracellular only fams per genome 12.697 (+/- 2.101 SD)
    Mean number of intracellular only fams per genome 39.441 (+/- 2.428 SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome 7.441 (+/- 1.268 SD)


d_extra_fams, d_intra_fams, d_extra_only_fams, d_intra_only_fams, d_both_intra_extra_fams, d_all_fams = get_ie_fams(
    ie_fgp_df[ie_fgp_df['Genus']=='Dickeya']
)
print("Extracellular only fams:")
for fam in d_extra_only_fams:
    print(f"- {fam}")
print("Intracellular only fams:")
for fam in d_intra_only_fams:
    print(f"- {fam}")
print("Both intra and extra celluar fams:")
for fam in d_both_intra_extra_fams:
    print(f"- {fam}")

Identifying extra-/intra-cellular CAZymes:   0%|          | 0/10362 [00:00<?, ?it/s]

        Found 9 families that are only extracellular.
        Found 46 families that are only intracellular.
        Found 20 families that are extracellular and intracellular.
        
Extracellular only fams:
- GH8
- PL35
- PL38
- CBM4
- CE8
- PL10
- CBM63
- GH102
- CE2
Intracellular only fams:
- GT5
- GH32
- GT2
- GH73
- GT30
- GT0
- PL2
- CE9
- GH104
- GH19
- GH13
- GT51
- GH33
- CBM48
- GH2
- GT19
- GT8
- PL22
- GH1
- CBM32
- GH42
- GH31
- GH94
- GT81
- CE11
- CE4
- GH105
- GT26
- PL26
- GT41
- GH91
- AA3
- GH4
- GT35
- GT28
- GH0
- GT97
- GH88
- GT9
- GH113
- GT84
- GT56
- GH78
- GT1
- GH77
- GH5
Both intra and extra celluar fams:
- GT83
- CE1
- GH36
- PL9
- CBM13
- CE12
- PL3
- CBM50
- GH24
- GH26
- GH30
- GH103
- GH43
- GH53
- GH23
- PL4
- PL1
- GH28
- GT4
- GH3


mean_e_fams, sd_e_fams, mean_i_fams, sd_i_fams, mean_b_fams, sd_b_fams = cal_mean_ie_fams(
    ie_fgp_df[ie_fgp_df['Genus'] == 'Dickeya']
)
print(
    f"""
    Mean number of extracellular only fams per genome {round(mean_e_fams,3)} (+/- {round(sd_e_fams,3)} SD)
    Mean number of intracellular only fams per genome {round(mean_i_fams,3)} (+/- {round(sd_i_fams,3)} SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome {round(mean_b_fams,3)} (+/- {round(sd_b_fams,3)} SD)
    """
)

Getting genomes extra-/intra-cellular CAZymes:   0%|          | 0/90 [00:00<?, ?it/s]

Calculating mena extra-/intra-cellular CAZymes:   0%|          | 0/90 [00:00<?, ?it/s]

    Mean number of extracellular only fams per genome 10.711 (+/- 1.352 SD)
    Mean number of intracellular only fams per genome 41.244 (+/- 2.089 SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome 6.911 (+/- 0.865 SD)


m_extra_fams, m_intra_fams, m_extra_only_fams, m_intra_only_fams, m_both_intra_extra_fams, m_all_fams = get_ie_fams(
    ie_fgp_df[ie_fgp_df['Genus']=='Musicola']
)
print("Extracellular only fams:")
for fam in m_extra_only_fams:
    print(f"- {fam}")
print("Intracellular only fams:")
for fam in m_intra_only_fams:
    print(f"- {fam}")
print("Both intra and extra celluar fams:")
for fam in m_both_intra_extra_fams:
    print(f"- {fam}")

Identifying extra-/intra-cellular CAZymes:   0%|          | 0/190 [00:00<?, ?it/s]

        Found 7 families that are only extracellular.
        Found 36 families that are only intracellular.
        Found 7 families that are extracellular and intracellular.
        
Extracellular only fams:
- GH8
- GH30
- CE12
- GH103
- CE8
- CE1
- GH102
Intracellular only fams:
- GT5
- GT2
- GH32
- GH73
- GT30
- GT83
- PL2
- GT0
- CE9
- GH36
- GH104
- GH19
- GH13
- GT51
- GH38
- GH33
- CBM48
- PL38
- GH2
- GT19
- GT1
- GH24
- PL22
- GH1
- CE4
- GT81
- GH31
- CE11
- GT26
- GH105
- GT35
- GT28
- GT9
- GT56
- GH5
- GH77
Both intra and extra celluar fams:
- GH23
- CBM50
- GH28
- PL1
- GT4
- PL9
- GH3


mean_e_fams, sd_e_fams, mean_i_fams, sd_i_fams, mean_b_fams, sd_b_fams = cal_mean_ie_fams(
    ie_fgp_df[ie_fgp_df['Genus'] == 'Musicola']
)
print(
    f"""
    Mean number of extracellular only fams per genome {round(mean_e_fams,3)} (+/- {round(sd_e_fams,3)} SD)
    Mean number of intracellular only fams per genome {round(mean_i_fams,3)} (+/- {round(sd_i_fams,3)} SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome {round(mean_b_fams,3)} (+/- {round(sd_b_fams,3)} SD)
    """
)

Getting genomes extra-/intra-cellular CAZymes:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating mena extra-/intra-cellular CAZymes:   0%|          | 0/2 [00:00<?, ?it/s]

    Mean number of extracellular only fams per genome 7.0 (+/- 0.0 SD)
    Mean number of intracellular only fams per genome 36.0 (+/- 0.0 SD)
    Mean number of families that are both 
        extra- and intra-cellular fams per genome 7.0 (+/- 0.0 SD)


CAZY_CLASSES = ['GH', 'GT', 'PL', 'CE', 'AA', 'CBM']

def calculate_ie_class_sizes(df, round_by=2):
    """Calculate the mean (+- SD) of CAZymes per CAZy class per grp.

    Num of CAZymes is the number of unique protein accessions.

    :param df: pandas df, columns = ['Family', 'Genome', 'Protein', 'Genus', 'Species']
    :param grp: str, tax rank to group genomes by, e.g. 'Genus' or 'Species'
    :param round_by: int, num of dp to round the mean and sd to, if None does not round

    Return
    * df columns ['CAZyClass', grp, 'MeanCazyClass', 'SdCazyClass', 'NumOfGenomes']
    * dict cazy_class_size_dict
    """
    cazy_class_size_dict = {}  # {class: 
    # {grp: {genome: {
    # 'proteins': set(protein id), 'numOfProteins': int}, 'extraC': set(Fam), 'intraC': set(Fam), 'Fams': set(fams)
    # }}}

    # calculate total CAZymes per genome - used to calculate the percentage of the cazome
    # made up of each CAZy class
    cazome_sizes = {}  # {grp: {genome: {'cazymes':{ids/accs}}}}
    intra_extra_classifications = {}  # {protein: {i, e}}

    for ri in tqdm(range(len(df)), desc="Getting CAZy class sizes"):
        protein_acc = df.iloc[ri]['Protein']
        fam = df.iloc[ri]['Family'].split("_")[-1]
        intra_extra_cellular = df.iloc[ri]['Family'].split("_")[0]
        genus = df.iloc[ri]['Genus']
        genome = df.iloc[ri]['Genome']
        
        try:
            intra_extra_classifications[protein_acc].add(intra_extra_cellular)
        except KeyError:
            intra_extra_classifications[protein_acc] = {intra_extra_cellular}

        try:
            cazome_sizes[genus]
        except KeyError:
            cazome_sizes[genus] = {}

        try:
            cazome_sizes[genus][genome]['CAZymes'].add(protein_acc)
        except KeyError:
            cazome_sizes[genus][genome] = {'CAZymes': {protein_acc}}
        
        cazy_class = re.match('\D{2,3}', fam).group()
        
        try:
            cazy_class_size_dict[cazy_class]
        except KeyError:
            cazy_class_size_dict[cazy_class] = {}
            
        try:
            cazy_class_size_dict[cazy_class][genus]
        except KeyError:
            cazy_class_size_dict[cazy_class][genus] = {}
            
        try:
            cazy_class_size_dict[cazy_class][genus][genome]['proteins'].add(protein_acc)
            cazy_class_size_dict[cazy_class][genus][genome]['Fams'].add(fam)
        except KeyError:
            cazy_class_size_dict[cazy_class][genus][genome] = {
                'proteins': {protein_acc},
                'extraC': set(),
                'intraC': set(),
                'Fams': {fam},
            }
        
        if intra_extra_cellular == 'e':
            cazy_class_size_dict[cazy_class][genus][genome]['extraC'].add(fam)
        else:
            cazy_class_size_dict[cazy_class][genus][genome]['intraC'].add(fam)
            
    cazy_class_data = []  # ['CAZyClass', grp, 'MeanCazyClass', 'SdCazyClass', 'NumOfGenomes']
    extraC_class_data = []
    intraC_class_data = []
    bothC_class_data = []

    grps = set(df['Genus'])

    for cazy_class in tqdm(CAZY_CLASSES, desc="Calculating CAZy class sizes"):
        
        for genus in grps:
            try:
                cazy_class_size_dict[cazy_class][genus]
            except KeyError:
                # cazy class is not in any genomes from the grp_name
                num_genomes = len(set(df[df['Genus'] == genus]['Genome']))  # sample size
                cazy_class_data.append(
                    [cazy_class, genus, 0, 0, 0, 0, num_genomes]
                )
                continue
                
            # get sample size, i.e. num of genomes
            num_genomes = len(list(cazy_class_size_dict[cazy_class][genus].keys()))
            
            # calculate the number of CAZyme in the class
            cazy_class_sizes = []
            for genome in cazy_class_size_dict[cazy_class][genus]:
                num_of_cazymes = len(cazy_class_size_dict[cazy_class][genus][genome]['proteins'])
                cazy_class_size_dict[cazy_class][genus][genome]['numOfProteins'] = num_of_cazymes
                cazy_class_sizes.append(num_of_cazymes)
                
            mean_cazy_class = round(np.mean(cazy_class_sizes), round_by)
            sd_cazy_class = round(np.std(cazy_class_sizes), round_by)

            # calculate the percentage of the CAZome represented by the CAZy class
            cazy_class_percentages = []
            for genome in cazy_class_size_dict[cazy_class][genus]:
                total_cazymes = len(cazome_sizes[genus][genome]['CAZymes'])
                num_class_cazymes = len(cazy_class_size_dict[cazy_class][genus][genome]['proteins'])
                percentage = (num_class_cazymes / total_cazymes) * 100
                cazy_class_percentages.append(percentage)
                
            mean_perc_cazy_class = round(np.mean(cazy_class_percentages), round_by)
            sd_perc_cazy_class = round(np.std(cazy_class_percentages), round_by)
        
            # calculate number of extracellular only fams, intracellular only fams
            # and fams that are both intra and extracellular
            total_fams = []
            total_extrac_fams = []
            total_intrac_fams = []
            total_extrac_intrac_fams = []
            for genome in cazy_class_size_dict[cazy_class][genus]:
                total_fams.append(len(cazy_class_size_dict[cazy_class][genus][genome]['Fams']))
                total_extrac_fams.append(len(cazy_class_size_dict[cazy_class][genus][genome]['extraC']))
                total_intrac_fams.append(len(cazy_class_size_dict[cazy_class][genus][genome]['intraC']))
                total_extrac_intrac_fams.append(
                    len(
                        cazy_class_size_dict[cazy_class][genus][genome]['extraC'].intersection(
                            cazy_class_size_dict[cazy_class][genus][genome]['intraC']
                        )
                    )
                )
                                         
            mean_total_fams = round(np.mean(total_fams), 2)    
            sd_total_fams = round(np.std(total_fams), 2)  

            mean_extrac_fams = round(np.mean(total_extrac_fams), 2)    
            sd_extrac_fams = round(np.std(total_extrac_fams), 2)  

            mean_intrac_fams = round(np.mean(total_intrac_fams), 2)    
            sd_intrac_fams = round(np.std(total_intrac_fams), 2)  

            mean_both_fams = round(np.mean(total_extrac_intrac_fams), 2)    
            sd_both_fams = round(np.std(total_extrac_intrac_fams), 2)  
            
            # calculate percentages of total fams
            total_extrac_fams = []
            total_intrac_fams = []
            total_extrac_intrac_fams = []
            for genome in cazy_class_size_dict[cazy_class][genus]:
                total_fams = len(cazy_class_size_dict[cazy_class][genus][genome]['Fams'])
                
                total_extrac_fams.append(
                    len(cazy_class_size_dict[cazy_class][genus][genome]['extraC']) / total_fams * 100
                )
                total_intrac_fams.append(
                    len(cazy_class_size_dict[cazy_class][genus][genome]['intraC']) / total_fams * 100
                )
                total_extrac_intrac_fams.append(
                    len(
                        cazy_class_size_dict[cazy_class][genus][genome]['extraC'].intersection(
                            cazy_class_size_dict[cazy_class][genus][genome]['intraC']
                        )
                    )  / total_fams * 100
                )

            mean_perc_extrac_fams = round(np.mean(total_extrac_fams), 2)    
            sd_perc_extrac_fams = round(np.std(total_extrac_fams), 2)  

            mean_perc_intrac_fams = round(np.mean(total_intrac_fams), 2)    
            sd_perc_intrac_fams = round(np.std(total_intrac_fams), 2)  

            mean_perc_both_fams = round(np.mean(total_extrac_intrac_fams), 2)    
            sd_perc_both_fams = round(np.std(total_extrac_intrac_fams), 2)  
            
            cazy_class_data.append([
                cazy_class, genus,
                mean_cazy_class, sd_cazy_class,
                mean_perc_cazy_class, sd_perc_cazy_class,
                num_genomes,
            ])
            extraC_class_data.append([
                cazy_class, genus,
                mean_total_fams, sd_total_fams,
                mean_extrac_fams, sd_extrac_fams,
                mean_perc_extrac_fams, sd_perc_extrac_fams,
                num_genomes,
            ])
            intraC_class_data.append([
                cazy_class, genus,
                mean_total_fams, sd_total_fams,
                mean_intrac_fams, sd_intrac_fams,
                mean_perc_intrac_fams, sd_perc_intrac_fams,
                num_genomes,
            ])
            bothC_class_data.append([
                cazy_class, genus,
                mean_total_fams, sd_total_fams,
                mean_both_fams, sd_both_fams,
                mean_perc_both_fams, sd_perc_both_fams,
                num_genomes,
            ])

    col_names = [
        'CAZyClass', 'Genus',
        'MeanCazyClass', 'SdCazyClass',
        'MeanClassPerc', 'SdClassPerc',
        'NumOfGenomes',
    ]
    class_df = pd.DataFrame(cazy_class_data, columns=col_names)
    
    col_names = [
        'CAZyClass', 'Genus',
        'MeanTotalFams', 'SdTotalFams',
        'MeanFams', 'SdFams',
        'MeanFamsPerc', 'SdFamsPerc',
        'NumOfGenomes',
    ]
    extraC_class_df = pd.DataFrame(extraC_class_data, columns=col_names)
    
    col_names = [
        'CAZyClass', 'Genus',
        'MeanTotalFams', 'SdTotalFams',
        'MeanFams', 'SdFams',
        'MeanFamsPerc', 'SdFamsPerc',
        'NumOfGenomes',
    ]
    intraC_class_df = pd.DataFrame(intraC_class_data, columns=col_names)
    
    col_names = [
        'CAZyClass', 'Genus',
        'MeanTotalFams', 'SdTotalFams',
        'MeanFams', 'SdFams',
        'MeanFamsPerc', 'SdFamsPerc',
        'NumOfGenomes',
    ]
    bothC_class_df = pd.DataFrame(bothC_class_data, columns=col_names)
    
    return class_df, cazy_class_size_dict, extraC_class_df, intraC_class_df, bothC_class_df


# make output dir for results, will not delete data if dir already exists
make_output_directory(Path('../results/ie_cazymes/cazy_classes/'), force=True, nodelete=True)
ie_fgp_df.columns = ['Family', 'Genome', 'Protein', 'Genus', 'Species']

(
    ie_class_df, ie_class_size_dict,
    extraC_class_df, intraC_class_df, bothC_class_df
) = calculate_ie_class_sizes(ie_fgp_df)

ie_class_df.to_csv('../results/ie_cazymes/cazy_classes/cazy_class_sizes.csv')
extraC_class_df.to_csv('../results/ie_cazymes/cazy_classes/extracellular_cazy_class_sizes.csv')
intraC_class_df.to_csv('../results/ie_cazymes/cazy_classes/intracellular_cazy_class_sizes.csv')
bothC_class_df.to_csv('../results/ie_cazymes/cazy_classes/extra_intra_cellular_cazy_class_sizes.csv')

ie_class_df

Output directory ../results/ie_cazymes/cazy_classes exists, nodelete is True. Adding output to output directory.

Getting CAZy class sizes:   0%|          | 0/32232 [00:00<?, ?it/s]

Calculating CAZy class sizes:   0%|          | 0/6 [00:00<?, ?it/s]


extraC_class_df #


ie_fgp_df.columns=['Family','Genome','Protein','Genus','Species']
ie_fgp_df.head(1)


ie_fam_freq_df = build_fam_freq_df(ie_fgp_df, ['Genus', 'Species'])
make_output_directory(Path('../results/ie_cazymes/cazy_families/'), force=True, nodelete=True)
ie_fam_freq_df.to_csv(Path("../results/ie_cazymes/cazy_families/cazy_fam_freqs.csv"))
ie_fam_freq_df

The dataset contains 132 CAZy families

Counting fam frequencies: 100%|███████████████████████████████████████████████████████████████████████████| 280/280 [00:13<00:00, 20.30it/s]
Output directory ../results/ie_cazymes/cazy_families exists, nodelete is True. Adding output to output directory.


# index the taxonomy data and genome (ggs=genome_genus_species)
ie_fam_freq_df_ggs = copy(ie_fam_freq_df)  # so does not alter fam_freq_df
ie_fam_freq_df_ggs = ie_fam_freq_df_ggs.set_index(['Genome','Genus','Species'])
ie_fam_freq_df_ggs.head(1)


# define a colour scheme to colour code rows by genus
ie_fam_freq_df_ggs['Genus'] = list(ie_fam_freq_df['Genus'])  # add column to use for colour scheme, is removed
ie_fam_freq_genus_row_colours, ie_fam_g_lut = build_row_colours(ie_fam_freq_df_ggs, 'Genus', 'Set2')


# make a figure that is full size, and all data is legible
build_family_clustermap(
    ie_fam_freq_df_ggs,
    row_colours=ie_fam_freq_genus_row_colours,
    fig_size=(25,73),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_IE_fam_freq_clustermap.pdf"),
    file_format='pdf',
    lut=ie_fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.2,0.05),
    title_fontsize=28,
    legend_fontsize=24,
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f654e652550>


# make a figure the optimal size to fit in a paper
build_family_clustermap(
    ie_fam_freq_df_ggs,
    row_colours=ie_fam_freq_genus_row_colours,
    fig_size=(20,40),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_IE_fam_freq_clustermap.png"),
    file_format='png',
    font_scale=0.5,
    lut=ie_fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.95, 0.1, 0.05),  #left, bottom, width, height
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f654fee9650>


extrac_cols = [_ for _ in list(ie_fam_freq_df_ggs.columns) if _.startswith('e_')]
e_fam_freq_df_ggs = ie_fam_freq_df_ggs[extrac_cols]
e_fam_freq_df = ie_fam_freq_df[['Genome', 'Genus', 'Species'] + extrac_cols]

# define a colour scheme to colour code rows by genus
e_fam_freq_df_ggs['Genus'] = list(e_fam_freq_df['Genus'])  # add column to use for colour scheme, is removed
e_fam_freq_genus_row_colours, e_fam_g_lut = build_row_colours(e_fam_freq_df_ggs, 'Genus', 'Set2')

/tmp/ipykernel_479/1071892966.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_fam_freq_df_ggs['Genus'] = list(e_fam_freq_df['Genus'])  # add column to use for colour scheme, is removed


# make a figure that is full size, and all data is legible
build_family_clustermap(
    e_fam_freq_df_ggs,
    row_colours=e_fam_freq_genus_row_colours,
    fig_size=(25,73),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_Extracellular_fam_freq_clustermap.pdf"),
    file_format='pdf',
    lut=e_fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.2,0.05),
    title_fontsize=28,
    legend_fontsize=24,
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f6552fbe9d0>


intrac_cols = [_ for _ in list(ie_fam_freq_df_ggs.columns) if _.startswith('i_')]
i_fam_freq_df_ggs = ie_fam_freq_df_ggs[intrac_cols]
i_fam_freq_df = ie_fam_freq_df[['Genome', 'Genus', 'Species'] + intrac_cols]

# define a colour scheme to colour code rows by genus
i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme, is removed
i_fam_freq_genus_row_colours, i_fam_g_lut = build_row_colours(i_fam_freq_df_ggs, 'Genus', 'Set2')

/tmp/ipykernel_479/1362901214.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme, is removed


# make a figure that is full size, and all data is legible
build_family_clustermap(
    i_fam_freq_df_ggs,
    row_colours=i_fam_freq_genus_row_colours,
    fig_size=(25,73),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_Intracellular_fam_freq_clustermap.pdf"),
    file_format='pdf',
    lut=i_fam_g_lut,
    legend_title='Genus',
    dendrogram_ratio=(0.2,0.05),
    title_fontsize=28,
    legend_fontsize=24,
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f6551d9d690>


# define a colour scheme to colour code rows by SPECIES
ie_fam_freq_df_ggs['Species'] = list(ie_fam_freq_df['Species'])  # add column to use for colour scheme, is removed
ie_fam_freq_species_row_colours, ie_fam_s_lut = build_row_colours(ie_fam_freq_df_ggs, 'Species', 'rainbow')

e_fam_freq_df_ggs['Species'] = list(e_fam_freq_df['Species'])
e_fam_freq_species_row_colours, e_fam_s_lut = build_row_colours(e_fam_freq_df_ggs, 'Species', 'rainbow')

i_fam_freq_df_ggs['Species'] = list(i_fam_freq_df['Species'])
i_fam_freq_species_row_colours, i_fam_s_lut = build_row_colours(i_fam_freq_df_ggs, 'Species', 'rainbow')

/tmp/ipykernel_479/3653240554.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_fam_freq_df_ggs['Species'] = list(e_fam_freq_df['Species'])
/tmp/ipykernel_479/3653240554.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Species'] = list(i_fam_freq_df['Species'])


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=ie_fam_freq_df_ggs,
    row_colours=[ie_fam_freq_genus_row_colours, ie_fam_freq_species_row_colours],
    luts=[ie_fam_g_lut, ie_fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.045), (0.63,1.04)],
    legend_cols=[1,5],
    fig_size=(25,73),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_IE_genus_species_fam_freq_clustermap.pdf"),
    file_format='pdf',
    font_scale=1,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.96, 0.1, 0.08),  #left, bottom, width, height
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f6548fd9d90>


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=e_fam_freq_df_ggs,
    row_colours=[e_fam_freq_genus_row_colours, e_fam_freq_species_row_colours],
    luts=[e_fam_g_lut, e_fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.045), (0.63,1.04)],
    legend_cols=[1,5],
    fig_size=(25,73),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_Extracellular_genus_species_fam_freq_clustermap.pdf"),
    file_format='pdf',
    font_scale=1,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.96, 0.1, 0.08),  #left, bottom, width, height
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f65528a9d50>


# make a figure the optimal size to fit in a paper
build_family_clustermap_multi_legend(
    df=i_fam_freq_df_ggs,
    row_colours=[i_fam_freq_genus_row_colours, i_fam_freq_species_row_colours],
    luts=[i_fam_g_lut, i_fam_s_lut],
    legend_titles=['Genus', 'Species'],
    bbox_to_anchors=[(0.2,1.045), (0.63,1.04)],
    legend_cols=[1,5],
    fig_size=(25,73),
    file_path=Path("../results/ie_cazymes/cazy_families/pd_Intracellular_genus_species_fam_freq_clustermap.pdf"),
    file_format='pdf',
    font_scale=1,
    dendrogram_ratio=(0.1,0.05),
    title_fontsize=18,
    legend_fontsize=16,
    cbar_pos=(0.01, 0.96, 0.1, 0.08),  #left, bottom, width, height
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<seaborn.matrix.ClusterGrid at 0x7f6549340590>


all_families = list(ie_fam_freq_df.columns)[3:]
# dict {group: {only unique fams}} and dict {group: {all fams}}
unique_grp_fams, group_fams = get_group_specific_fams(ie_fam_freq_df, 'Genus', all_families)
unique_grp_fams

Identifying fams in each Genus: 100%|█████████████████████████████████████████████████████████████████████| 280/280 [00:04<00:00, 63.84it/s]
Identifying Genus specific fams: 100%|█████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24385.49it/s]

{'Dickeya': {'e_CBM4',
  'e_CE2',
  'e_GH26',
  'e_GT83',
  'e_PL10',
  'i_CBM13',
  'i_CBM32',
  'i_GH113',
  'i_GH26',
  'i_GH30',
  'i_GH91',
  'i_GT97'},
 'Pectobacterium': {'e_AA10',
  'e_CBM3',
  'e_CBM32',
  'e_CBM5',
  'e_CE4',
  'e_GH12',
  'e_GH13',
  'e_GH153',
  'e_GH16',
  'e_GH171',
  'e_GH18',
  'e_GH5',
  'e_GH78',
  'e_GT51',
  'e_PL2',
  'i_CE8',
  'i_GH102',
  'i_GH108',
  'i_GH12',
  'i_GH153',
  'i_GH18',
  'i_GH65',
  'i_GH68',
  'i_GH8',
  'i_GT101',
  'i_GT11',
  'i_GT111',
  'i_GT14',
  'i_GT20',
  'i_GT25',
  'i_GT32',
  'i_GT52',
  'i_GT73',
  'i_PL11',
  'i_PL35'},
 'Musicola': {'i_PL38'}}


ie_core_cazome = identify_core_cazome(ie_fam_freq_df_ggs)

ie_core_cazome = list(ie_core_cazome)
ie_core_cazome.sort()

print("The core CAZy families are:")
for fam in ie_core_cazome:
    print('-', fam)

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 12327.40it/s]

The core CAZy families are:
- e_CBM50
- e_GH103
- e_GH23
- e_PL9
- i_GH1
- i_GH105
- i_GH13
- i_GH23
- i_GH3
- i_GT19
- i_GT2
- i_GT30
- i_GT51
- i_GT9


# filter the family freq df to include only those families in the core CAZome
ie_core_cazome_df = ie_fam_freq_df_ggs[ie_core_cazome]

ie_core_cazome_df_genus = copy(ie_core_cazome_df)  # to ensure core_cazome_df is not altereted
ie_core_cazome_df_genus = add_tax_column_from_row_index(ie_core_cazome_df_genus, 'Genus', 1)

ie_core_cazome_fggf_df, ie_core_cazome_mean_freq_df = build_fam_mean_freq_df(
    ie_core_cazome_df_genus,
    'Genus',
    round_by=2,
)

ie_core_cazome_mean_freq_df = ie_core_cazome_mean_freq_df.sort_values("Family")
make_output_directory(Path("../results/ie_cazymes/core_cazome"), nodelete=True, force=True)
ie_core_cazome_mean_freq_df.to_csv(Path("../results/ie_cazymes/core_cazome/IE_core_cazome_freqs.csv"))

ie_core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████████████| 280/280 [00:00<00:00, 5395.57it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|█████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 94.04it/s]
Output directory ../results/ie_cazymes/core_cazome exists, nodelete is True. Adding output to output directory.


ie_fam_freq_df_ggs['Genus'] = list(ie_fam_freq_df['Genus'])
p_ie_fam_freq_df_ggs = ie_fam_freq_df_ggs[ie_fam_freq_df_ggs['Genus'] == 'Pectobacterium']
d_ie_fam_freq_df_ggs = ie_fam_freq_df_ggs[ie_fam_freq_df_ggs['Genus'] == 'Dickeya']
m_ie_fam_freq_df_ggs = ie_fam_freq_df_ggs[ie_fam_freq_df_ggs['Genus'] == 'Musicola']

ie_fam_freq_df_ggs = ie_fam_freq_df_ggs.drop('Genus', axis=1)
p_ie_fam_freq_df_ggs = p_ie_fam_freq_df_ggs.drop('Genus', axis=1)
d_ie_fam_freq_df_ggs = d_ie_fam_freq_df_ggs.drop('Genus', axis=1)
m_ie_fam_freq_df_ggs = m_ie_fam_freq_df_ggs.drop('Genus', axis=1)


set(p_ie_fam_freq_df_ggs['i_CE8'])

{0, 1, 2}


p_ie_core_cazome = identify_core_cazome(p_ie_fam_freq_df_ggs)

p_ie_core_cazome = list(p_ie_core_cazome)
p_ie_core_cazome.sort()

print("The core CAZy families are:")
for fam in p_ie_core_cazome:
    print('-', fam)

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 12743.36it/s]

The core CAZy families are:
- e_CBM32
- e_CBM50
- e_GH103
- e_GH23
- e_GH28
- e_PL3
- e_PL9
- i_CE9
- i_GH1
- i_GH105
- i_GH13
- i_GH23
- i_GH3
- i_GH43
- i_GT19
- i_GT2
- i_GT28
- i_GT30
- i_GT51
- i_GT9
- i_PL22


# filter the family freq df to include only those families in the core CAZome
p_ie_core_cazome = p_ie_fam_freq_df_ggs[p_ie_core_cazome]

p_ie_core_cazome_df_genus = copy(p_ie_core_cazome)  # to ensure core_cazome_df is not altereted
p_ie_core_cazome_df_genus = add_tax_column_from_row_index(p_ie_core_cazome_df_genus, 'Genus', 1)

p_ie_core_cazome_fggf_df, p_ie_core_cazome_mean_freq_df = build_fam_mean_freq_df(
    p_ie_core_cazome_df_genus,
    'Genus',
    round_by=2,
)

p_ie_core_cazome_mean_freq_df = p_ie_core_cazome_mean_freq_df.sort_values("Family")
p_ie_core_cazome_mean_freq_df.to_csv(Path("../results/ie_cazymes/core_cazome/pectobacterium_IE_core_cazome_freqs.csv"))

p_ie_core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████████████| 188/188 [00:00<00:00, 4492.02it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 44.75it/s]


d_ie_core_cazome = identify_core_cazome(d_ie_fam_freq_df_ggs)

d_ie_core_cazome = list(d_ie_core_cazome)
d_ie_core_cazome.sort()

print("The core CAZy families are:")
for fam in d_ie_core_cazome:
    print('-', fam)

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 14932.39it/s]

The core CAZy families are:
- e_CBM50
- e_CE8
- e_GH102
- e_GH103
- e_GH23
- e_GH8
- e_GT4
- e_PL1
- e_PL3
- e_PL9
- i_CBM48
- i_CBM50
- i_CE11
- i_CE4
- i_GH1
- i_GH104
- i_GH105
- i_GH13
- i_GH23
- i_GH28
- i_GH3
- i_GH31
- i_GH32
- i_GH33
- i_GT1
- i_GT19
- i_GT2
- i_GT30
- i_GT35
- i_GT4
- i_GT41
- i_GT5
- i_GT51
- i_GT9


# filter the family freq df to include only those families in the core CAZome
d_ie_core_cazome = d_ie_fam_freq_df_ggs[d_ie_core_cazome]

d_ie_core_cazome_df_genus = copy(d_ie_core_cazome)  # to ensure core_cazome_df is not altereted
d_ie_core_cazome_df_genus = add_tax_column_from_row_index(d_ie_core_cazome_df_genus, 'Genus', 1)

d_ie_core_cazome_fggf_df, d_ie_core_cazome_mean_freq_df = build_fam_mean_freq_df(
    d_ie_core_cazome_df_genus,
    'Genus',
    round_by=2,
)

d_ie_core_cazome_mean_freq_df = d_ie_core_cazome_mean_freq_df.sort_values("Family")
d_ie_core_cazome_mean_freq_df.to_csv(Path("../results/ie_cazymes/core_cazome/dickeya_IE_core_cazome_freqs.csv"))

d_ie_core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|███████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 3833.41it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 32.82it/s]


m_ie_core_cazome = identify_core_cazome(m_ie_fam_freq_df_ggs)

m_ie_core_cazome = list(m_ie_core_cazome)
m_ie_core_cazome.sort()

print("The core CAZy families are:")
for fam in m_ie_core_cazome:
    print('-', fam)

Identifying core CAZome: 100%|█████████████████████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 15912.17it/s]

The core CAZy families are:
- e_CBM50
- e_CE1
- e_CE12
- e_CE8
- e_GH102
- e_GH103
- e_GH23
- e_GH28
- e_GH3
- e_GH30
- e_GH8
- e_GT4
- e_PL1
- e_PL9
- i_CBM48
- i_CBM50
- i_CE11
- i_CE4
- i_CE9
- i_GH1
- i_GH104
- i_GH105
- i_GH13
- i_GH19
- i_GH2
- i_GH23
- i_GH24
- i_GH28
- i_GH3
- i_GH31
- i_GH32
- i_GH33
- i_GH36
- i_GH38
- i_GH5
- i_GH73
- i_GH77
- i_GT0
- i_GT1
- i_GT19
- i_GT2
- i_GT26
- i_GT28
- i_GT30
- i_GT35
- i_GT4
- i_GT5
- i_GT51
- i_GT56
- i_GT81
- i_GT83
- i_GT9
- i_PL1
- i_PL2
- i_PL22
- i_PL38
- i_PL9


# filter the family freq df to include only those families in the core CAZome
m_ie_core_cazome = m_ie_fam_freq_df_ggs[m_ie_core_cazome]

m_ie_core_cazome_df_genus = copy(m_ie_core_cazome)  # to ensure core_cazome_df is not altereted
m_ie_core_cazome_df_genus = add_tax_column_from_row_index(m_ie_core_cazome_df_genus, 'Genus', 1)

m_ie_core_cazome_fggf_df, m_ie_core_cazome_mean_freq_df = build_fam_mean_freq_df(
    m_ie_core_cazome_df_genus,
    'Genus',
    round_by=2,
)

m_ie_core_cazome_mean_freq_df = m_ie_core_cazome_mean_freq_df.sort_values("Family")
m_ie_core_cazome_mean_freq_df.to_csv(Path("../results/ie_cazymes/core_cazome/musicola_IE_core_cazome_freqs.csv"))

m_ie_core_cazome_mean_freq_df

Building [fam, grp, genome, freq] df: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1407.72it/s]
Building [Fam, grp, mean freq, sd freq] df: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.06it/s]


make_output_directory(Path("../results/ie_cazymes/cooccurring_families/"), force=True, nodelete=True)

ie_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
    ie_fam_freq_df,
    list(ie_fam_freq_df.columns[3:]),
    exclude_core_cazome=False,
)
ie_cooccurring_fams_dict

Output directory ../results/ie_cazymes/cooccurring_families exists, nodelete is True. Adding output to output directory.
Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 166.83it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 103/103 [00:00<00:00, 206408.65it/s]

{0: {'fams': {'e_AA10', 'e_GH171'}, 'freqs': {2}},
 1: {'fams': {'e_CBM5', 'e_GH18'}, 'freqs': {2}},
 2: {'fams': {'e_CBM50',
   'e_GH103',
   'e_GH23',
   'e_PL9',
   'i_GH1',
   'i_GH105',
   'i_GH13',
   'i_GH23',
   'i_GH3',
   'i_GT19',
   'i_GT2',
   'i_GT30',
   'i_GT51',
   'i_GT9'},
  'freqs': {280}},
 3: {'fams': {'e_CE4', 'e_GH153'}, 'freqs': {38}},
 4: {'fams': {'e_GH16', 'i_GT14'}, 'freqs': {1}},
 5: {'fams': {'e_PL1', 'i_CBM48'}, 'freqs': {279}},
 6: {'fams': {'i_CE11', 'i_GH32', 'i_GT4'}, 'freqs': {279}},
 7: {'fams': {'i_GH94', 'i_GT84'}, 'freqs': {97}},
 8: {'fams': {'i_GT0', 'i_GT26'}, 'freqs': {278}},
 9: {'fams': {'i_GT111', 'i_GT52'}, 'freqs': {9}},
 10: {'fams': {'i_GT20', 'i_PL35'}, 'freqs': {2}}}


# Pectobacterium
p_ie_fam_freq_df = ie_fam_freq_df[ie_fam_freq_df['Genus'] == 'Pectobacterium']
p_ie_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
    p_ie_fam_freq_df,
    list(p_ie_fam_freq_df.columns[3:]),
    exclude_core_cazome=False,
)
p_ie_cooccurring_fams_dict

Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 239.97it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 233/233 [00:00<00:00, 177718.28it/s]

{0: {'fams': {'e_AA10', 'e_GH171'}, 'freqs': {2}},
 1: {'fams': {'e_CBM32',
   'e_CBM50',
   'e_GH103',
   'e_GH23',
   'e_GH28',
   'e_PL3',
   'e_PL9',
   'i_CE9',
   'i_GH1',
   'i_GH105',
   'i_GH13',
   'i_GH23',
   'i_GH3',
   'i_GH43',
   'i_GT19',
   'i_GT2',
   'i_GT28',
   'i_GT30',
   'i_GT51',
   'i_GT9',
   'i_PL22'},
  'freqs': {188}},
 2: {'fams': {'e_CBM5', 'e_GH18'}, 'freqs': {2}},
 3: {'fams': {'e_CE4', 'e_GH153'}, 'freqs': {38}},
 4: {'fams': {'e_GH16', 'i_GT14'}, 'freqs': {1}},
 5: {'fams': {'e_PL1', 'i_CBM48', 'i_GT0', 'i_GT26', 'i_GT56'},
  'freqs': {187}},
 6: {'fams': {'i_CE11', 'i_GH32', 'i_GT4', 'i_PL2'}, 'freqs': {187}},
 7: {'fams': {'i_GH94', 'i_GT84'}, 'freqs': {67}},
 8: {'fams': {'i_GT111', 'i_GT52'}, 'freqs': {9}},
 9: {'fams': {'i_GT20', 'i_PL35'}, 'freqs': {2}}}


# co-occurring cazy families in dickeya
d_ie_fam_freq_df = ie_fam_freq_df[ie_fam_freq_df['Genus'] == 'Dickeya']
d_ie_cooccurring_fams_dict = calc_cooccuring_fam_freqs(
    d_ie_fam_freq_df,
    list(d_ie_fam_freq_df.columns[3:]),
    exclude_core_cazome=False,
)
d_ie_cooccurring_fams_dict

Identifying pairs of co-occurring families: 100%|████████████████████████████████████████████████████████| 132/132 [00:00<00:00, 447.44it/s]
Combining pairs of co-occurring families: 100%|███████████████████████████████████████████████████████| 566/566 [00:00<00:00, 333611.03it/s]

{0: {'fams': {'e_CBM50',
   'e_CE8',
   'e_GH102',
   'e_GH103',
   'e_GH23',
   'e_GH8',
   'e_GT4',
   'e_PL1',
   'e_PL3',
   'e_PL9',
   'i_CBM48',
   'i_CBM50',
   'i_CE11',
   'i_CE4',
   'i_GH1',
   'i_GH104',
   'i_GH105',
   'i_GH13',
   'i_GH23',
   'i_GH28',
   'i_GH3',
   'i_GH31',
   'i_GH32',
   'i_GH33',
   'i_GT1',
   'i_GT19',
   'i_GT2',
   'i_GT30',
   'i_GT35',
   'i_GT4',
   'i_GT41',
   'i_GT5',
   'i_GT51',
   'i_GT9'},
  'freqs': {90}},
 1: {'fams': {'e_CE12', 'e_GH36'}, 'freqs': {3}},
 2: {'fams': {'e_PL35', 'i_GH88'}, 'freqs': {1}},
 3: {'fams': {'i_GH19', 'i_GH5'}, 'freqs': {88}},
 4: {'fams': {'i_GH94', 'i_GT84'}, 'freqs': {30}},
 5: {'fams': {'i_GT0', 'i_GT26'}, 'freqs': {89}}}


upsetplot_membership = []
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, ie_cooccurring_fams_dict)
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, p_ie_cooccurring_fams_dict)
upsetplot_membership = add_to_upsetplot_membership(upsetplot_membership, d_ie_cooccurring_fams_dict)
len(upsetplot_membership)

2251


pecto_dic_upsetplot = build_upsetplot(
    upsetplot_membership,
    file_path='../results/ie_cazymes/cooccurring_families/pd_cooccurring_fams.svg',
)

/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/upsetplot/plotting.py:662: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '['#0000002e' '#0000002e' '#0000002e' ... '#0000002e' '#0000002e'
 '#0000002e']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  styles["edgecolor"].fillna(styles["facecolor"], inplace=True)
/home/hobnobmancer/anaconda3/envs/pectobacteriaceae/lib/python3.11/site-packages/upsetplot/plotting.py:663: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'solid' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  styles["linestyle"].fillna("solid", inplace=True)


upset_plot_groups = get_upsetplot_grps(upsetplot_membership)

ie_cooccurring_grp_freq_data = []  # empty list to store data for the df

for fam_grp in upset_plot_groups:
    added = False
    
    for grp_num in ie_cooccurring_fams_dict:
        fams = list(ie_cooccurring_fams_dict[grp_num]['fams'])
        fams.sort()
        fam_grp = list(fam_grp)
        fam_grp.sort()
        if fams == fam_grp:
            freq = list(ie_cooccurring_fams_dict[grp_num]['freqs'])[0]
            ie_cooccurring_grp_freq_data.append(["+".join(fam_grp), 'Both', freq])
            added = True
    
    if added:
        continue
        
    # pecto_cooccurring_fams_dict
    for grp_num in p_ie_cooccurring_fams_dict:
        fams = list(p_ie_cooccurring_fams_dict[grp_num]['fams'])
        fams.sort()
        fam_grp = list(fam_grp)
        fam_grp.sort()
        if fams == fam_grp:
            freq = list(p_ie_cooccurring_fams_dict[grp_num]['freqs'])[0]
            ie_cooccurring_grp_freq_data.append(["+".join(fam_grp), 'Pectobacterium', freq])
            added = True
    
    if added:
        continue
    
    # dic_cooccurring_fams_dict
    for grp_num in d_ie_cooccurring_fams_dict:
        fams = list(d_ie_cooccurring_fams_dict[grp_num]['fams'])
        fams.sort()
        fam_grp = list(fam_grp)
        fam_grp.sort()
        if fams == fam_grp:
            freq = list(d_ie_cooccurring_fams_dict[grp_num]['freqs'])[0]
            ie_cooccurring_grp_freq_data.append(["+".join(fam_grp), 'Dickeya', freq])
            added = True
    
    if added:
        continue

ie_cooccurring_fams_freq_df = pd.DataFrame(ie_cooccurring_grp_freq_data, columns=['Families', 'Genus', 'Incidence'])
ie_cooccurring_fams_freq_df.to_csv('../results/ie_cazymes/cooccurring_families/cooccurring_fams_freqs.csv')
ie_cooccurring_fams_freq_df

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 46.90it/s]


num_of_components = len(ie_fam_freq_df_ggs.columns)
ie_pca, ie_X_scaled = perform_pca(ie_fam_freq_df_ggs, num_of_components)
ie_pca

PCA(n_components=132)

PCA(n_components=132)


make_output_directory(Path("../results/ie_cazymes/ie_pca"), force=True, nodelete=True)

print(f"{ie_pca.explained_variance_ratio_.sum() * 100}% of the variance in the data set was catpured by the PCA")

cumExpVar = plot_explained_variance(
    ie_pca,
    num_of_components,
    file_path="../results/ie_cazymes/ie_pca/pd_pca_IE_explained_variance.png",
);

Output directory ../results/ie_cazymes/ie_pca exists, nodelete is True. Adding output to output directory.

100.0% of the variance in the data set was catpured by the PCA
Number of features needed to explain 0.95 fraction of total variance is 54.

<Figure size 1200x600 with 0 Axes>


plot_scree(ie_pca, nComp=10, file_path="../results/ie_cazymes/ie_pca/pd_pca_IE_scree.png")

Explained variance for 1PC: 0.17652879911506902
Explained variance for 2PC: 0.07549420462384307
Explained variance for 3PC: 0.053345849637646006
Explained variance for 4PC: 0.0425572938851641
Explained variance for 5PC: 0.03983585897385499
Explained variance for 6PC: 0.0348566982884042
Explained variance for 7PC: 0.032585872785582
Explained variance for 8PC: 0.028281453025477264
Explained variance for 9PC: 0.027154201346603422
Explained variance for 10PC: 0.025161493425666707


ie_fam_freq_df_ggs_pc = copy(ie_fam_freq_df_ggs)
ie_fam_freq_df_ggs_pc['Genus'] = list(ie_fam_freq_df['Genus'])
X_pca = ie_pca.transform(ie_X_scaled)

colnames = []
for i in range(4):
    ie_fam_freq_df_ggs_pc[f'PC{i+1} ({round(ie_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(ie_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    ie_fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Genus",
    diag_kind="kde",
    markers=['o','X', '^'],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/ie_cazymes/ie_pca/pd_pc_screen_genus.svg',
    bbox_inches='tight',
    format='svg'
)


try:
    ie_fam_freq_df_ggs_pc = ie_fam_freq_df_ggs_pc.drop('Genus', axis=1)
except KeyError:
    pass
ie_fam_freq_df_ggs_pc['Species'] = list(ie_fam_freq_df['Species'])

colnames = []
for i in range(4):
    ie_fam_freq_df_ggs_pc[f'PC{i+1} ({round(ie_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(ie_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    ie_fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Species",
    diag_kind="kde",
    markers=[
        'o', 'o', 'X', 'X', 'o', 'o',
        'o', 'o', 'X', 'X', 'o', 'X',
        'X', 'o', 'o', 'X', 'o', 'o',
        'X', 'X', 'o', 'o', 'o', 'X',
        '^', 'o', 'o', 'X',
    ],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/ie_cazymes/ie_pca/pd_pc_screen_specis.svg',
    bbox_inches='tight',
    format='svg'
)


def plot_ie_loadings(
    pca,
    fam_df,
    first_pc,
    second_pc,
    style=False,
    threshold=0.7,
    font_scale=1.15,
    font_size=12,
    dpi=300,
    fig_size=(16,16),
    file_path=None,
    file_format='png',
    marker_size=100,
):
    """Build loadings plot for intracellular/extracellular CAZyme data
    
    :param pca: sklearn pca object
    :param fam_df: cazy family frequncy df
    :param first_pc: int, number of the first PC, e.g. PC1 == 1
    :param second_pc: int, number of the second PC e.g. PC2 == 2
    :param style: boolean, change shape of points depending on CAZy class
    :param threshold: correlation cut off for showing labels
        Only families with a value greater than the threshold
        will be annotated
    :param font_scale: scale font
    :param font_size: font size of family labels
    :param fig_size: tuple (width, height) of final plot
    :param file_path: str, path to write out a figure.
    
    Return labels - CAZy family annotations for families >= threshold
    """
    sns.set(font_scale=font_scale)

    # calculate loading = variables x loadings, returns an array
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    # get labels of variables, i.e. cazy families
    loadings_labels = list(fam_df.columns)

    for col in ['Kingdom', 'Phylum', 'Class', 'Order', 'Tax_Family', 'Genus', 'Species', 'Genome', 'Genomes']:
        try:
            loadings_labels.remove(col)
        except (KeyError, ValueError):
            pass

    loadings_x = loadings[:,(first_pc-1)]
    loadings_y = loadings[:,(second_pc-1)]

    loadings_df = pd.DataFrame()
    loadings_df['loadings_x'] = loadings_x
    loadings_df['loadings_y'] = loadings_y

    cazy_class = []
    for lbl in loadings_labels:
        lbl = lbl.split("_")[1]
        if lbl.startswith('GH'):
            cazy_class.append('GH')
        elif lbl.startswith('GT'):
            cazy_class.append('GT')
        elif lbl.startswith('PL'):
            cazy_class.append('PL')
        elif lbl.startswith('CE'):
            cazy_class.append('CE')
        elif lbl.startswith('AA'):
            cazy_class.append('AA')
        else:
            cazy_class.append('CBM')

    loadings_df['cazy_class'] = cazy_class

    plt.figure(figsize=fig_size)
    if style:
        g = sns.scatterplot(
            x=loadings_x,
            y=loadings_y,
            data=loadings_df,
            hue=cazy_class,
            s=marker_size,
            style=cazy_class,
        );
    else:
        g = sns.scatterplot(
            x=loadings_x,
            y=loadings_y,
            data=loadings_df,
            hue=cazy_class,
            s=marker_size,
        );
    g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
    g.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    g.set(xlim=(-1,1),ylim=(-1,1));
    plt.ylabel(f"PC{second_pc}") 
    plt.xlabel(f"PC{first_pc}")

    texts = [
        plt.text(
            xval,
            yval,
            lbl,
            ha='center',
            va='center',
            fontsize=font_size,
        ) for (xval, yval, lbl) in zip(
            loadings_x, loadings_y, loadings_labels
        ) if abs(xval) > threshold or abs(yval) > threshold
    ]
    adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

    sns.move_legend(g, "lower center", bbox_to_anchor=(.5, 1), ncol=3, title=None, frameon=False);

    if file_path is not None:
        plt.savefig(file_path, dpi=dpi, bbox_inches='tight', format=file_format)
        
    return texts


ie_fam_freq_df_ggs['Genus'] = list(ie_fam_freq_df['Genus'])  # add column to use for colour scheme

pc1_pc2_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    1,
    2,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/ie_pca/ie_pc1_pc2.png",
)

pc1_pc2_labels = plot_ie_loadings(
    ie_pca,
    ie_fam_freq_df_ggs,
    1,
    2,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/ie_pca/ie_pc1_pc2_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc2_labels = [_._text for _ in pc1_pc2_labels]
pc1_pc2_fams = {}
for fam in pc1_pc2_labels:
    try:
        pc1_pc2_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc2_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc2_fams:
    if len(pc1_pc2_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order
CBM13
CBM32
CE1
CE12
CE4
CE8
GH28
GH3
GH5
GH8
PL1


pc1_pc3_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    1,
    3,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/ie_pca/ie_pc1_pc3.png",
)

pc1_pc3_labels = plot_ie_loadings(
    ie_pca,
    ie_fam_freq_df_ggs,
    1,
    3,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/ie_pca/ie_pc1_pc3_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc3_labels = [_._text for _ in pc1_pc3_labels]
pc1_pc3_fams = {}
for fam in pc1_pc3_labels:
    try:
        pc1_pc3_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc3_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc3_fams:
    if len(pc1_pc3_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order
CBM13
CBM32
CE1
CE12
CE4
CE8
GH13
GH23
GH28
GH5
GH8
PL1


pc1_pc4_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    1,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/ie_pca/ie_pc1_pc4.png",
)

pc1_pc4_labels = plot_ie_loadings(
    ie_pca,
    ie_fam_freq_df_ggs,
    1,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/ie_pca/ie_pc1_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc4_labels = [_._text for _ in pc1_pc4_labels]
pc1_pc4_fams = {}
for fam in pc1_pc4_labels:
    try:
        pc1_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc4_fams:
    if len(pc1_pc4_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order
CBM13
CBM32
CE1
CE12
CE4
CE8
GH28
GH5
GH8
GT83
PL1
PL3


pc2_pc3_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    2,
    3,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/ie_pca/ie_pc2_pc3.png",
)

pc2_pc3_labels = plot_ie_loadings(
    ie_pca,
    ie_fam_freq_df_ggs,
    2,
    3,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/ie_pca/ie_pc2_pc3_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc2_pc3_labels = [_._text for _ in pc2_pc3_labels]
pc2_pc3_fams = {}
for fam in pc2_pc3_labels:
    try:
        pc2_pc3_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc2_pc3_fams[fam.split("_")[1]] = {fam}
for fam in pc2_pc3_fams:
    if len(pc2_pc3_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order
CE1
CE12
CE4
GH36


pc2_pc4_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    2,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/ie_pca/ie_pc2_pc4.png",
)

pc2_pc4_labels = plot_ie_loadings(
    ie_pca,
    ie_fam_freq_df_ggs,
    2,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/ie_pca/ie_pc2_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc2_pc4_labels = [_._text for _ in pc2_pc4_labels]
pc2_pc4_fams = {}
for fam in pc2_pc4_labels:
    try:
        pc2_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc2_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc2_pc4_fams:
    if len(pc2_pc4_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order
CE1
CE12
CE4
GH28
PL1
PL3


pc3_pc4_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    3,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/ie_pca/ie_pc3_pc4.png",
)

pc3_pc4_labels = plot_ie_loadings(
    ie_pca,
    ie_fam_freq_df_ggs,
    3,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/ie_pca/ie_pc3_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc3_pc4_labels = [_._text for _ in pc3_pc4_labels]
pc3_pc4_fams = {}
for fam in pc3_pc4_labels:
    try:
        pc3_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc3_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc3_pc4_fams:
    if len(pc3_pc4_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order
GH13
PL3


ie_fam_freq_df_ggs['Species'] = list(ie_fam_freq_df['Species'])
pc3_pc4_scatter_plt = plot_pca(
    ie_pca,
    ie_X_scaled,
    ie_fam_freq_df_ggs,
    3,
    4,
    'Species',
    figsize=(10,10),
    style='Species',
    file_path="../results/ie_cazymes/ie_pca/pca_ie_3_vs_4_species.svg",
    file_format='svg',
)

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs = ie_fam_freq_df_ggs[intrac_cols]

num_of_components = len(i_fam_freq_df_ggs.columns)
i_pca, i_X_scaled = perform_pca(i_fam_freq_df_ggs, num_of_components)
print(i_pca)

make_output_directory(Path("../results/ie_cazymes/i_pca"), force=True, nodelete=True)

print(f"{i_pca.explained_variance_ratio_.sum() * 100}% of the variance in the data set was catpured by the PCA")

cumExpVar = plot_explained_variance(
    i_pca,
    num_of_components,
    file_path="../results/ie_cazymes/i_pca/pd_pca_IE_explained_variance.png",
)

Output directory ../results/ie_cazymes/i_pca exists, nodelete is True. Adding output to output directory.

PCA(n_components=88)
100.0% of the variance in the data set was catpured by the PCA
Number of features needed to explain 0.95 fraction of total variance is 45.

<Figure size 1200x600 with 0 Axes>


plot_scree(i_pca, nComp=10, file_path="../results/ie_cazymes/i_pca/pd_pca_IE_scree.png")

Explained variance for 1PC: 0.15454683976067185
Explained variance for 2PC: 0.0834828523639291
Explained variance for 3PC: 0.060125503307268735
Explained variance for 4PC: 0.05148887607075085
Explained variance for 5PC: 0.04248178246293622
Explained variance for 6PC: 0.038646206680417954
Explained variance for 7PC: 0.03223334516416426
Explained variance for 8PC: 0.028157400721296664
Explained variance for 9PC: 0.02629539533978042
Explained variance for 10PC: 0.025459789770284326


# multiplot
i_fam_freq_df_ggs_pc = copy(i_fam_freq_df_ggs)
i_fam_freq_df_ggs_pc['Genus'] = list(i_fam_freq_df['Genus'])
X_pca = i_pca.transform(i_X_scaled)

colnames = []
for i in range(4):
    i_fam_freq_df_ggs_pc[f'PC{i+1} ({round(i_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(i_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    i_fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Genus",
    diag_kind="kde",
    markers=['o','X', '^'],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/ie_cazymes/i_pca/pd_pc_screen_genus.svg',
    bbox_inches='tight',
    format='svg'
)

try:
    i_fam_freq_df_ggs_pc = i_fam_freq_df_ggs_pc.drop('Genus', axis=1)
except KeyError:
    pass
i_fam_freq_df_ggs_pc['Species'] = list(i_fam_freq_df['Species'])

colnames = []
for i in range(4):
    i_fam_freq_df_ggs_pc[f'PC{i+1} ({round(i_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(i_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    i_fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Species",
    diag_kind="kde",
    markers=[
        'o', 'o', 'X', 'X', 'o', 'o',
        'o', 'o', 'X', 'X', 'o', 'X',
        'X', 'o', 'o', 'X', 'o', 'o',
        'X', 'X', 'o', 'o', 'o', 'X',
        '^', 'o', 'o', 'X',
    ],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/ie_cazymes/i_pca/pd_pc_screen_species.svg',
    bbox_inches='tight',
    format='svg'
)


i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

pc1_pc2_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    1,
    2,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/i_pca/i_pc1_pc2.png",
)

pc1_pc2_labels = plot_ie_loadings(
    i_pca,
    i_fam_freq_df_ggs,
    1,
    2,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/i_pca/i_pc1_pc2_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc2_labels = [_._text for _ in pc1_pc2_labels]
pc1_pc2_fams = {}
for fam in pc1_pc2_labels:
    try:
        pc1_pc2_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc2_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc2_fams:
    if len(pc1_pc2_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/83899765.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

pc1_pc3_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    1,
    3,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/i_pca/i_pc1_pc3.png",
)

pc1_pc3_labels = plot_ie_loadings(
    i_pca,
    i_fam_freq_df_ggs,
    1,
    3,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/i_pca/i_pc1_pc3_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc3_labels = [_._text for _ in pc1_pc3_labels]
pc1_pc3_fams = {}
for fam in pc1_pc3_labels:
    try:
        pc1_pc3_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc3_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc3_fams:
    if len(pc1_pc3_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/664309407.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

pc1_pc4_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    1,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/i_pca/i_pc1_pc4.png",
)

pc1_pc4_labels = plot_ie_loadings(
    i_pca,
    i_fam_freq_df_ggs,
    1,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/i_pca/i_pc1_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc4_labels = [_._text for _ in pc1_pc4_labels]
pc1_pc4_fams = {}
for fam in pc1_pc4_labels:
    try:
        pc1_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc4_fams:
    if len(pc1_pc4_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/1551041151.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

pc2_pc3_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    2,
    3,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/i_pca/i_pc2_pc3.png",
)

pc2_pc3_labels = plot_ie_loadings(
    i_pca,
    i_fam_freq_df_ggs,
    2,
    3,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/i_pca/i_pc2_pc3_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc2_pc3_labels = [_._text for _ in pc2_pc3_labels]
pc2_pc3_fams = {}
for fam in pc2_pc3_labels:
    try:
        pc2_pc3_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc2_pc3_fams[fam.split("_")[1]] = {fam}
for fam in pc2_pc3_fams:
    if len(pc2_pc3_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/878882969.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

pc2_pc4_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    2,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/i_pca/i_pc2_pc4.png",
)

pc2_pc4_labels = plot_ie_loadings(
    i_pca,
    i_fam_freq_df_ggs,
    2,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/i_pca/i_pc2_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc2_pc4_labels = [_._text for _ in pc2_pc4_labels]
pc2_pc4_fams = {}
for fam in pc2_pc4_labels:
    try:
        pc2_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc2_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc2_pc4_fams:
    if len(pc2_pc4_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/1749631204.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

pc3_pc4_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    3,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/i_pca/i_pc3_pc4.png",
)

pc3_pc4_labels = plot_ie_loadings(
    i_pca,
    i_fam_freq_df_ggs,
    3,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/i_pca/i_pc3_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc3_pc4_labels = [_._text for _ in pc3_pc4_labels]
pc3_pc4_fams = {}
for fam in pc3_pc4_labels:
    try:
        pc3_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc3_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc3_pc4_fams:
    if len(pc3_pc4_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/40578501.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Genus'] = list(i_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


i_fam_freq_df_ggs['Species'] = list(i_fam_freq_df['Species'])  # add column to use for colour scheme

pc3_pc4_scatter_plt = plot_pca(
    i_pca,
    i_X_scaled,
    i_fam_freq_df_ggs,
    3,
    4,
    'Species',
    figsize=(10,10),
    style='Species',
    file_path="../results/ie_cazymes/i_pca/i_pc3_pc4_species.png",
)

/tmp/ipykernel_479/454184917.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i_fam_freq_df_ggs['Species'] = list(i_fam_freq_df['Species'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


e_fam_freq_df_ggs = ie_fam_freq_df_ggs[extrac_cols]

num_of_components = len(e_fam_freq_df_ggs.columns)
e_pca, e_X_scaled = perform_pca(e_fam_freq_df_ggs, num_of_components)
print(e_pca)

make_output_directory(Path("../results/ie_cazymes/e_pca"), force=True, nodelete=True)

print(f"{e_pca.explained_variance_ratio_.sum() * 100}% of the variance in the data set was catpured by the PCA")

cumExpVar = plot_explained_variance(
    e_pca,
    num_of_components,
    file_path="../results/ie_cazymes/e_pca/pd_pca_IE_explained_variance.png",
)

Output directory ../results/ie_cazymes/e_pca exists, nodelete is True. Adding output to output directory.

PCA(n_components=44)
100.0% of the variance in the data set was catpured by the PCA
Number of features needed to explain 0.95 fraction of total variance is 26.

<Figure size 1200x600 with 0 Axes>


plot_scree(e_pca, nComp=10, file_path="../results/ie_cazymes/e_pca/pd_pca_E_scree.png")

Explained variance for 1PC: 0.22852140989711353
Explained variance for 2PC: 0.07286778687197029
Explained variance for 3PC: 0.0659550653530078
Explained variance for 4PC: 0.05615341356754494
Explained variance for 5PC: 0.05490691317249101
Explained variance for 6PC: 0.04899328522245456
Explained variance for 7PC: 0.044316222567705996
Explained variance for 8PC: 0.035613504034801194
Explained variance for 9PC: 0.031087555865400594
Explained variance for 10PC: 0.03047734378659763


# multiplot
e_fam_freq_df_ggs_pc = copy(e_fam_freq_df_ggs)
e_fam_freq_df_ggs_pc['Genus'] = list(e_fam_freq_df['Genus'])
X_pca = e_pca.transform(e_X_scaled)

colnames = []
for i in range(4):
    e_fam_freq_df_ggs_pc[f'PC{i+1} ({round(e_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(e_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    e_fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Genus",
    diag_kind="kde",
    markers=['o','X', '^'],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/ie_cazymes/e_pca/pd_pc_screen_genus.svg',
    bbox_inches='tight',
    format='svg'
)

try:
    e_fam_freq_df_ggs_pc = e_fam_freq_df_ggs_pc.drop('Genus', axis=1)
except KeyError:
    pass
e_fam_freq_df_ggs_pc['Species'] = list(e_fam_freq_df['Species'])

colnames = []
for i in range(4):
    e_fam_freq_df_ggs_pc[f'PC{i+1} ({round(e_pca.explained_variance_ratio_[i] * 100, 2)}%)'] = X_pca[:,i]
    colnames.append(f'PC{i+1} ({round(e_pca.explained_variance_ratio_[i] * 100, 2)}%)')
    
g = sns.pairplot(
    e_fam_freq_df_ggs_pc,
    vars=colnames,
    hue="Species",
    diag_kind="kde",
    markers=[
        'o', 'o', 'X', 'X', 'o', 'o',
        'o', 'o', 'X', 'X', 'o', 'X',
        'X', 'o', 'o', 'X', 'o', 'o',
        'X', 'X', 'o', 'o', 'o', 'X',
        '^', 'o', 'o', 'X',
    ],
    height=3,
);

i = 0
for ax in g.axes.ravel():
    if ax is None:
        continue
    if i not in [0,5,10,15]:
        ax.axhline(0, linestyle='--', color='grey', linewidth=1.25);
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    else:
        ax.axvline(0, linestyle='--', color='grey', linewidth=1.25);
    i += 1
    
plt.savefig(
    '../results/ie_cazymes/e_pca/pd_pc_screen_species.svg',
    bbox_inches='tight',
    format='svg'
)


e_fam_freq_df_ggs['Genus'] = list(e_fam_freq_df['Genus'])  # add column to use for colour scheme

pc1_pc2_scatter_plt = plot_pca(
    e_pca,
    e_X_scaled,
    e_fam_freq_df_ggs,
    1,
    2,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/e_pca/e_pc1_pc2.png",
)

pc1_pc2_labels = plot_ie_loadings(
    e_pca,
    e_fam_freq_df_ggs,
    1,
    2,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/e_pca/e_pc1_pc2_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc2_labels = [_._text for _ in pc1_pc2_labels]
pc1_pc2_fams = {}
for fam in pc1_pc2_labels:
    try:
        pc1_pc2_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc2_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc2_fams:
    if len(pc1_pc2_fams[fam]) == 2:
        print(fam)

/tmp/ipykernel_479/1787893727.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_fam_freq_df_ggs['Genus'] = list(e_fam_freq_df['Genus'])  # add column to use for colour scheme

Not applying hue order
Applying style
Not applying style order


pc1_pc3_scatter_plt = plot_pca(
    e_pca,
    e_X_scaled,
    e_fam_freq_df_ggs,
    1,
    3,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/e_pca/e_pc1_pc3.png",
)

pc1_pc3_labels = plot_ie_loadings(
    e_pca,
    e_fam_freq_df_ggs,
    1,
    3,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/e_pca/e_pc1_pc3_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc3_labels = [_._text for _ in pc1_pc3_labels]
pc1_pc3_fams = {}
for fam in pc1_pc3_labels:
    try:
        pc1_pc3_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc3_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc3_fams:
    if len(pc1_pc3_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order


# plot pc 1 vs pc 3 - labelling pectobacterium species that are isolated from the rest of the group
X_pca = e_pca.transform(e_X_scaled)

plt.figure(figsize=(10,10))
sns.set(font_scale=1.15)

e_fam_freq_df_ggs['Species'] = list(e_fam_freq_df['Species'])

g = sns.scatterplot(
    x=X_pca[:,0],  # PC1
    y=X_pca[:,2],  # PC3
    data=e_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC3 {100 * e_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * e_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend([],[], frameon=False);

genome_lbls = ["\n".join(_) for _ in e_fam_freq_df_ggs.index]
x_vals = X_pca[:,0]
y_vals = X_pca[:,2]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if (yval > 10)
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig(
    "../results/ie_cazymes/e_pca/e_pc1_pc3_annotated_loadings.png",
    bbox_inches='tight',
    format='png',
)

/tmp/ipykernel_479/343031468.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_fam_freq_df_ggs['Species'] = list(e_fam_freq_df['Species'])


pc1_pc4_scatter_plt = plot_pca(
    e_pca,
    e_X_scaled,
    e_fam_freq_df_ggs,
    1,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/e_pca/e_pc1_pc4.png",
)

pc1_pc4_labels = plot_ie_loadings(
    e_pca,
    e_fam_freq_df_ggs,
    1,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/e_pca/e_pc1_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc1_pc4_labels = [_._text for _ in pc1_pc4_labels]
pc1_pc4_fams = {}
for fam in pc1_pc4_labels:
    try:
        pc1_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc1_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc1_pc4_fams:
    if len(pc1_pc4_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order


pc2_pc3_scatter_plt = plot_pca(
    e_pca,
    e_X_scaled,
    e_fam_freq_df_ggs,
    2,
    3,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/e_pca/e_pc2_pc3.png",
)

pc2_pc3_labels = plot_ie_loadings(
    e_pca,
    e_fam_freq_df_ggs,
    2,
    3,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/e_pca/e_pc2_pc3_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc2_pc3_labels = [_._text for _ in pc2_pc3_labels]
pc2_pc3_fams = {}
for fam in pc2_pc3_labels:
    try:
        pc2_pc3_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc2_pc3_fams[fam.split("_")[1]] = {fam}
for fam in pc2_pc3_fams:
    if len(pc2_pc3_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order


pc2_pc4_scatter_plt = plot_pca(
    e_pca,
    e_X_scaled,
    e_fam_freq_df_ggs,
    2,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/e_pca/e_pc2_pc4.png",
)

pc2_pc4_labels = plot_ie_loadings(
    e_pca,
    e_fam_freq_df_ggs,
    2,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/e_pca/e_pc2_pc4_loadings.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc2_pc4_labels = [_._text for _ in pc2_pc4_labels]
pc2_pc4_fams = {}
for fam in pc2_pc4_labels:
    try:
        pc2_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc2_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc2_pc4_fams:
    if len(pc2_pc4_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order


pc3_pc4_scatter_plt = plot_pca(
    e_pca,
    e_X_scaled,
    e_fam_freq_df_ggs,
    3,
    4,
    'Genus',
    figsize=(10,10),
    style='Genus',
    file_path="../results/ie_cazymes/e_pca/e_pc3_pc4.png",
)

pc3_pc4_labels = plot_ie_loadings(
    e_pca,
    e_fam_freq_df_ggs,
    3,
    4,
    style=True,
    threshold=0.3,
    fig_size=(10,10),
    font_size=11,
    file_path="../results/ie_cazymes/e_pca/e_pc3_pc4.png",
)

# identify families where both the intracellular and extracellular families were annotated
pc3_pc4_labels = [_._text for _ in pc3_pc4_labels]
pc3_pc4_fams = {}
for fam in pc3_pc4_labels:
    try:
        pc3_pc4_fams[fam.split("_")[1]].add(fam)
    except KeyError:
        pc3_pc4_fams[fam.split("_")[1]] = {fam}
for fam in pc3_pc4_fams:
    if len(pc3_pc4_fams[fam]) == 2:
        print(fam)

Not applying hue order
Applying style
Not applying style order


# plot pc 3 vs pc 4 - labelling pectobacterium species that are isolated from the rest of the group
X_pca = e_pca.transform(e_X_scaled)

plt.figure(figsize=(10,10))
sns.set(font_scale=1.15)

e_fam_freq_df_ggs['Species'] = list(e_fam_freq_df['Species'])

g = sns.scatterplot(
    x=X_pca[:,2],  # PC3
    y=X_pca[:,3],  # PC4
    data=e_fam_freq_df_ggs,
    hue='Species',
    style='Species',
    s=100,
    markers=True,
)

g.axhline(0, linestyle='--', color='grey', linewidth=1.25);
g.axvline(0, linestyle='--', color='grey', linewidth=1.25);

plt.ylabel(f"PC3 {100 * e_pca.explained_variance_ratio_[1]:.2f}%");
plt.xlabel(f"PC1 {100 * e_pca.explained_variance_ratio_[0]:.2f}%");
plt.legend([],[], frameon=False);

genome_lbls = ["\n".join(_) for _ in e_fam_freq_df_ggs.index]
x_vals = X_pca[:,2]
y_vals = X_pca[:,3]

texts = [
    plt.text(
        xval,
        yval,
        lbl,
        ha='center',
        va='center',
        fontsize=12,
    ) for (xval, yval, lbl) in zip(
        x_vals, y_vals, genome_lbls
    ) if (xval > 10)
]
adjustText.adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'));

plt.savefig(
   "../results/ie_cazymes/e_pca/e_pc3_pc4_species.png",
    bbox_inches='tight',
    format='png',
)

/tmp/ipykernel_479/1795624431.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_fam_freq_df_ggs['Species'] = list(e_fam_freq_df['Species'])

	Genus	MeanFamilys	SdFamilys	NumOfGenomes
0	Dickeya	60.06	2.98	90
1	Pectobacterium	60.55	2.59	188
2	Musicola	51.00	0.00	2
3	Hafnia	46.00	0.00	1

	Genus	MeanProteomeSize	SdProteomeSize	MeanProteomePerc	SdProteomePerc	NumOfGenomes
0	Dickeya	4107.53	140.33	2.76	0.08	90
1	Pectobacterium	4207.19	189.81	2.65	0.11	188
2	Musicola	3877.50	45.50	2.40	0.00	2
3	Hafnia	4297.00	0.00	2.05	0.00	1

	Genus	MeanFamilys	SdFamilys	NumOfGenomes
0	Dickeya	60.06	2.98	90
1	Pectobacterium	60.55	2.59	188
2	Musicola	51.00	0.00	2
3	Hafnia	46.00	0.00	1

	CAZyClass	Genus	MeanTotalFams	SdTotalFams	MeanFams	SdFams	MeanFamsPerc	SdFamsPerc	NumOfGenomes
0	GH	Musicola	22.00	0.00	7.00	0.00	31.82	0.00	2
1	GH	Dickeya	23.94	1.41	7.06	0.97	29.55	4.26	90
2	GH	Pectobacterium	23.84	1.39	7.61	1.03	31.94	3.98	188
3	GT	Musicola	15.00	0.00	1.00	0.00	6.67	0.00	2
4	GT	Dickeya	16.27	0.63	1.11	0.31	6.85	2.01	90
5	GT	Pectobacterium	16.22	1.57	1.01	0.13	6.24	0.85	188
6	PL	Musicola	5.00	0.00	2.00	0.00	40.00	0.00	2
7	PL	Dickeya	7.91	0.71	5.14	0.51	65.13	4.43	90
8	PL	Pectobacterium	7.82	0.71	5.63	0.55	72.38	8.08	188
9	CE	Musicola	6.00	0.00	3.00	0.00	50.00	0.00	2
10	CE	Dickeya	6.20	0.50	2.22	0.44	35.58	4.45	90
11	CE	Pectobacterium	5.79	0.42	1.77	0.70	30.36	11.46	188
12	AA	Dickeya	1.00	0.00	0.00	0.00	0.00	0.00	47
13	AA	Pectobacterium	1.01	0.08	0.01	0.11	1.00	9.07	150
14	CBM	Musicola	2.00	0.00	1.00	0.00	50.00	0.00	2
15	CBM	Dickeya	4.02	0.70	2.09	0.61	51.39	8.38	90
16	CBM	Pectobacterium	5.11	0.80	4.11	0.80	80.01	3.65	188

	Family	Genome	Protein
0	GH8	GCF_000365365.1	WP_024104087.1
1	GT51	GCF_000365365.1	WP_024104265.1
2	PL38	GCF_000365365.1	WP_024104266.1

	Genome	Genus	Species
0	GCF_003382595.2	Pectobacterium	aquaticum
1	GCF_003628695.1	Pectobacterium	parmentieri
2	GCF_000808515.1	Pectobacterium	carotovorum

	Genus	MeanProteins	SdProteins	NumOfGenomes
0	Dickeya	113.57	6.20	90
1	Pectobacterium	111.63	7.39	188
2	Musicola	93.00	1.00	2
3	Hafnia	88.00	0.00	1

	CAZyClass	Genus	MeanCazyClass	SdCazyClass	MeanClassPerc	SdClassPerc	NumOfGenomes
0	GH	Hafnia	44.00	0.00	50.00	0.00	1
1	GH	Musicola	36.00	1.00	38.70	0.66	2
2	GH	Dickeya	42.14	3.60	37.09	2.09	90
3	GH	Pectobacterium	49.24	3.38	44.15	1.94	188
4	GT	Hafnia	35.00	0.00	39.77	0.00	1
5	GT	Musicola	35.00	0.00	37.64	0.40	2
6	GT	Dickeya	40.91	3.32	36.07	2.82	90
7	GT	Pectobacterium	33.77	3.90	30.20	2.29	188
8	PL	Hafnia	2.00	0.00	2.27	0.00	1
9	PL	Musicola	12.00	0.00	12.90	0.14	2
10	PL	Dickeya	16.96	1.71	14.92	1.27	90
11	PL	Pectobacterium	15.22	1.32	13.65	0.98	188
12	CE	Hafnia	4.00	0.00	4.55	0.00	1
13	CE	Musicola	6.00	0.00	6.45	0.07	2
14	CE	Dickeya	7.02	0.68	6.19	0.56	90
15	CE	Pectobacterium	7.21	0.92	6.46	0.68	188
16	AA	Hafnia	0.00	0.00	0.00	0.00	1
17	AA	Musicola	0.00	0.00	0.00	0.00	2
18	AA	Dickeya	1.00	0.00	0.87	0.04	47
19	AA	Pectobacterium	1.01	0.08	0.89	0.09	150
20	CBM	Hafnia	8.00	0.00	9.09	0.00	1
21	CBM	Musicola	8.00	0.00	8.60	0.09	2
22	CBM	Dickeya	10.49	1.21	9.23	0.88	90
23	CBM	Pectobacterium	11.10	0.77	9.99	1.10	188

	Genome	Genus	Species	AA10	AA3	CBM13	CBM3	CBM32	CBM34	CBM4	...	PL11	PL17	PL2	PL22	PL26	PL3	PL35	PL38	PL4	PL9
0	GCF_003718335.1	Dickeya	solani	0	1	1	0	0	0	0	...	0	0	1	1	1	2	0	0	1	3
1	GCF_000260925.1	Pectobacterium	parmentieri	0	0	1	1	1	0	0	...	0	0	2	1	1	1	0	1	1	2
2	GCF_016949835.1	Pectobacterium	brasiliense	0	1	1	1	1	0	0	...	0	0	2	1	0	2	0	1	1	2
3	GCF_018361225.1	Dickeya	dianthicola	0	0	0	0	1	0	0	...	0	0	1	1	1	2	0	1	2	3
4	GCF_002250215.1	Pectobacterium	brasiliense	0	1	1	1	1	0	0	...	0	0	2	1	1	2	0	1	1	2
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
276	GCF_013449485.1	Pectobacterium	brasiliense	0	1	1	1	1	0	0	...	0	0	2	1	1	2	0	1	1	2
277	GCF_016944275.1	Pectobacterium	brasiliense	0	1	1	1	1	0	0	...	0	0	2	1	0	2	0	1	1	2
278	GCF_900129615.1	Pectobacterium	carotovorum	0	1	1	1	1	0	0	...	1	0	2	1	1	2	0	1	1	2
279	GCF_013449655.1	Pectobacterium	versatile A	0	1	1	1	1	0	0	...	0	0	2	1	1	2	0	1	1	2
280	GCF_016944295.1	Pectobacterium	brasiliense	0	1	1	1	1	0	0	...	0	0	2	1	0	2	0	1	1	2

			AA10	AA3	CBM13	CBM3	CBM32	CBM34	CBM4	CBM48	CBM5	CBM50	...	PL11	PL17	PL2	PL22	PL26	PL3	PL35	PL38	PL4	PL9
Genome	Genus	Species
GCF_003718335.1	Dickeya	solani	0	1	1	0	0	0	0	2	1	5	...	0	0	1	1	1	2	0	0	1	3

	Family	Genus	MeanFreq	SdFreq
0	CBM50	Musicola	5.00	0.00
1	CE8	Musicola	1.00	0.00
2	GH1	Musicola	6.00	1.00
3	GH103	Musicola	1.00	0.00
4	GH105	Musicola	2.00	0.00
5	GH13	Musicola	2.00	0.00
6	GH23	Musicola	4.00	0.00
7	GH28	Musicola	2.00	0.00
8	GH3	Musicola	3.00	0.00
9	GT19	Musicola	1.00	0.00
10	GT2	Musicola	8.00	0.00
11	GT30	Musicola	1.00	0.00
12	GT4	Musicola	8.00	0.00
13	GT51	Musicola	3.00	0.00
14	GT9	Musicola	4.00	0.00
15	PL1	Musicola	6.00	0.00
16	PL9	Musicola	3.00	0.00
17	CBM50	Dickeya	4.98	0.15
18	CE8	Dickeya	1.79	0.41
19	GH1	Dickeya	4.69	0.74
20	GH103	Dickeya	1.90	0.30
21	GH105	Dickeya	2.09	0.35
22	GH13	Dickeya	1.97	0.18
23	GH23	Dickeya	5.99	1.60
24	GH28	Dickeya	3.21	0.86
25	GH3	Dickeya	3.00	0.21
26	GT19	Dickeya	1.00	0.00
27	GT2	Dickeya	11.54	2.18
28	GT30	Dickeya	1.00	0.00
29	GT4	Dickeya	8.10	2.03
30	GT51	Dickeya	3.00	0.15
31	GT9	Dickeya	4.06	0.23
32	PL1	Dickeya	6.78	0.90
33	PL9	Dickeya	2.96	0.21
34	CBM50	Pectobacterium	4.96	0.35
35	CE8	Pectobacterium	1.92	0.27
36	GH1	Pectobacterium	9.35	1.13
37	GH103	Pectobacterium	1.48	0.50
38	GH105	Pectobacterium	1.99	0.07
39	GH13	Pectobacterium	3.01	0.51
40	GH23	Pectobacterium	6.86	1.44
41	GH28	Pectobacterium	3.79	0.53
42	GH3	Pectobacterium	2.20	0.46
43	GT19	Pectobacterium	1.00	0.00
44	GT2	Pectobacterium	8.87	2.16
45	GT30	Pectobacterium	1.00	0.00
46	GT4	Pectobacterium	4.56	1.53
47	GT51	Pectobacterium	3.06	0.45
48	GT9	Pectobacterium	3.55	0.59
49	PL1	Pectobacterium	5.65	0.67
50	PL9	Pectobacterium	1.98	0.14

	Families	Genus	Incidence
0	GT0+GT26	All	278
1	GH94+GT84	All	97
2	GT52+GT111	All	9
3	GH148+CBM4	All	6
4	GH171+AA10	All	2
5	GT14+GH16	All	1
6	CE11+GH102+GH32	All	279
7	PL9+CE8+GH3+GT19+GH13+GH105+GT2+GH103+GH1+GH28...	All	280
8	GT0+GT26	Musicola	2
9	CE11+GH102+GH32	Musicola	2
10	GH94+GT84	Pectobacterium	67
11	GT52+GT111	Pectobacterium	9
12	GH171+AA10	Pectobacterium	2
13	GT14+GH16	Pectobacterium	1
14	CE11+GH102+GH32	Pectobacterium	187
15	GH5+CBM3+CE1	Pectobacterium	187
16	GT0+GT26+CBM48+GT56	Pectobacterium	187
17	PL9+CE8+GH3+GT19+GH13+GH105+GT2+GH103+GH1+GH28...	Pectobacterium	188
18	GT0+GT26	Dickeya	89
19	GH94+GT84	Dickeya	30
20	GH148+CBM4	Dickeya	6
21	GH88+PL35	Dickeya	1
22	GH5+PL4+CBM5+GH19	Dickeya	88
23	PL9+CE8+GH3+GT19+GH13+GH105+GT2+GH103+GH1+GH28...	Dickeya	90

	Fam	Genome	Protein	Genus	Species
0	e_GH8	GCF_000365365.1	WP_024104087.1	Dickeya	dianthicola
1	i_GT51	GCF_000365365.1	WP_024104265.1	Dickeya	dianthicola

	Family	Genus	MeanFreq	SdFreq
0	e_CBM50	Musicola	3.00	0.00
14	e_CBM50	Dickeya	2.98	0.15
28	e_CBM50	Pectobacterium	2.98	0.13
1	e_GH103	Musicola	1.00	0.00
15	e_GH103	Dickeya	1.82	0.38
29	e_GH103	Pectobacterium	1.41	0.49
2	e_GH23	Musicola	2.00	0.00
16	e_GH23	Dickeya	2.59	0.77
30	e_GH23	Pectobacterium	3.93	1.09
31	e_PL9	Pectobacterium	1.82	0.38
3	e_PL9	Musicola	2.00	0.00
17	e_PL9	Dickeya	2.62	0.55
32	i_GH1	Pectobacterium	9.35	1.13
4	i_GH1	Musicola	6.00	1.00
18	i_GH1	Dickeya	4.69	0.74
33	i_GH105	Pectobacterium	1.99	0.07
19	i_GH105	Dickeya	2.09	0.35
5	i_GH105	Musicola	2.00	0.00
34	i_GH13	Pectobacterium	2.91	0.41
20	i_GH13	Dickeya	1.97	0.18
6	i_GH13	Musicola	2.00	0.00
21	i_GH23	Dickeya	3.40	1.36
7	i_GH23	Musicola	2.00	0.00
35	i_GH23	Pectobacterium	2.93	0.88
22	i_GH3	Dickeya	2.39	0.55
36	i_GH3	Pectobacterium	2.14	0.39
8	i_GH3	Musicola	2.00	0.00
23	i_GT19	Dickeya	1.00	0.00
37	i_GT19	Pectobacterium	1.00	0.00
9	i_GT19	Musicola	1.00	0.00
10	i_GT2	Musicola	8.00	0.00
24	i_GT2	Dickeya	11.54	2.18
38	i_GT2	Pectobacterium	8.86	2.16
25	i_GT30	Dickeya	1.00	0.00
39	i_GT30	Pectobacterium	1.00	0.00
11	i_GT30	Musicola	1.00	0.00
40	i_GT51	Pectobacterium	3.05	0.41
12	i_GT51	Musicola	3.00	0.00
26	i_GT51	Dickeya	3.00	0.15
27	i_GT9	Dickeya	4.06	0.23
13	i_GT9	Musicola	4.00	0.00
41	i_GT9	Pectobacterium	3.55	0.59

	Family	Genus	MeanFreq	SdFreq
0	e_CBM32	Pectobacterium	1.00	0.00
1	e_CBM50	Pectobacterium	2.98	0.13
2	e_GH103	Pectobacterium	1.41	0.49
3	e_GH23	Pectobacterium	3.93	1.09
4	e_GH28	Pectobacterium	2.89	0.38
5	e_PL3	Pectobacterium	1.00	0.00
6	e_PL9	Pectobacterium	1.82	0.38
7	i_CE9	Pectobacterium	1.11	0.31
8	i_GH1	Pectobacterium	9.35	1.13
9	i_GH105	Pectobacterium	1.99	0.07
10	i_GH13	Pectobacterium	2.91	0.41
11	i_GH23	Pectobacterium	2.93	0.88
12	i_GH3	Pectobacterium	2.14	0.39
13	i_GH43	Pectobacterium	2.27	0.70
14	i_GT19	Pectobacterium	1.00	0.00
15	i_GT2	Pectobacterium	8.86	2.16
16	i_GT28	Pectobacterium	1.00	0.00
17	i_GT30	Pectobacterium	1.00	0.00
18	i_GT51	Pectobacterium	3.05	0.41
19	i_GT9	Pectobacterium	3.55	0.59
20	i_PL22	Pectobacterium	1.00	0.00

	Family	Genus	MeanFreq	SdFreq
0	e_CBM50	Musicola	3.0	0.0
1	e_CE1	Musicola	1.0	0.0
2	e_CE12	Musicola	1.0	0.0
3	e_CE8	Musicola	1.0	0.0
4	e_GH102	Musicola	1.0	0.0
5	e_GH103	Musicola	1.0	0.0
6	e_GH23	Musicola	2.0	0.0
7	e_GH28	Musicola	1.0	0.0
8	e_GH3	Musicola	1.0	0.0
9	e_GH30	Musicola	1.0	0.0
10	e_GH8	Musicola	1.0	0.0
11	e_GT4	Musicola	1.0	0.0
12	e_PL1	Musicola	4.0	0.0
13	e_PL9	Musicola	2.0	0.0
14	i_CBM48	Musicola	2.0	0.0
15	i_CBM50	Musicola	1.0	0.0
16	i_CE11	Musicola	1.0	0.0
17	i_CE4	Musicola	1.0	0.0
18	i_CE9	Musicola	1.0	0.0
19	i_GH1	Musicola	6.0	1.0
20	i_GH104	Musicola	1.0	0.0
21	i_GH105	Musicola	2.0	0.0
22	i_GH13	Musicola	2.0	0.0
23	i_GH19	Musicola	1.0	0.0
24	i_GH2	Musicola	1.0	0.0
25	i_GH23	Musicola	2.0	0.0
26	i_GH24	Musicola	1.0	0.0
27	i_GH28	Musicola	1.0	0.0
28	i_GH3	Musicola	2.0	0.0
29	i_GH31	Musicola	1.0	0.0
30	i_GH32	Musicola	1.0	0.0
31	i_GH33	Musicola	2.0	0.0
32	i_GH36	Musicola	1.0	0.0
33	i_GH38	Musicola	1.0	0.0
34	i_GH5	Musicola	1.0	0.0
35	i_GH73	Musicola	1.0	0.0
36	i_GH77	Musicola	1.0	0.0
37	i_GT0	Musicola	1.0	0.0
38	i_GT1	Musicola	1.0	0.0
39	i_GT19	Musicola	1.0	0.0
40	i_GT2	Musicola	8.0	0.0
41	i_GT26	Musicola	1.0	0.0
42	i_GT28	Musicola	1.0	0.0
43	i_GT30	Musicola	1.0	0.0
44	i_GT35	Musicola	2.0	0.0
45	i_GT4	Musicola	7.0	0.0
46	i_GT5	Musicola	1.0	0.0
47	i_GT51	Musicola	3.0	0.0
48	i_GT56	Musicola	1.0	0.0
49	i_GT81	Musicola	1.0	0.0
50	i_GT83	Musicola	1.0	0.0
51	i_GT9	Musicola	4.0	0.0
52	i_PL1	Musicola	2.0	0.0
53	i_PL2	Musicola	1.0	0.0
54	i_PL22	Musicola	1.0	0.0
55	i_PL38	Musicola	1.0	0.0
56	i_PL9	Musicola	1.0	0.0

	Families	Genus	Incidence
0	e_PL1+i_CBM48	Both	279
1	i_GT0+i_GT26	Both	278
2	i_GH94+i_GT84	Both	97
3	i_GH19+i_GH5	Dickeya	88
4	e_CE4+e_GH153	Both	38
5	i_GT111+i_GT52	Both	9
6	e_AA10+e_GH171	Both	2
7	e_CBM5+e_GH18	Both	2
8	i_GT20+i_PL35	Both	2
9	e_CE12+e_GH36	Dickeya	3
10	e_GH16+i_GT14	Both	1
11	e_PL35+i_GH88	Dickeya	1
12	i_CE11+i_GH32+i_GT4	Both	279
13	i_CE11+i_GH32+i_GT4+i_PL2	Pectobacterium	187
14	e_PL1+i_CBM48+i_GT0+i_GT26+i_GT56	Pectobacterium	187
15	e_CBM50+e_GH103+e_GH23+e_PL9+i_GH1+i_GH105+i_G...	Both	280
16	e_CBM32+e_CBM50+e_GH103+e_GH23+e_GH28+e_PL3+e_...	Pectobacterium	188
17	e_CBM50+e_CE8+e_GH102+e_GH103+e_GH23+e_GH8+e_G...	Dickeya	90

Explore the CAZome of Pectobacterium and Dickeya genomes¶

Table of Contents¶

0. Imports¶

Packages¶

Data¶

1. CAZome size¶

2. CAZy classes¶

3. CAZy families¶

CAZy family frequency dataframe¶

Clustermaps¶

Add species classifications¶

Genus specific CAZy families¶

4. The Core CAZome¶

Genus specific core CAZomes¶

Pectobacterium Core CAZome¶

Dickeya Core CAZome¶

Combine the mean core CAZy family frequencies¶

5. Families that always occur together¶

Build an upset plot of co-occurring CAZy families¶

6. Principal Component Analysis (PCA)¶

Scatter and loadings plots¶

PCs 1 - 4¶

PC1 vs PC2¶

Pectobacterium and Dickeya: PC1 vs PC2¶

Label species of PC1 vs PC2: Dickeya¶

Label species of PC1 vs PC2: Pectobacterium¶

Loadings plot of PC1 vs PC2¶

Exploration of PC2-PC4 clustering of genomes.¶

¶

7. Genus specific PCA¶

Pectobacterium¶

Dickeya¶

Intracellular and Extracellular CAZymes¶

Data¶

1. CAZome size¶

Look at only Pectobacterium¶

Look at only Dickeya¶

Look at only Musicola¶

2. CAZy classes¶

3. CAZy families¶

3.1 Cluster map of all intracellular and extracellular CAZymes¶

3.2 Clustermap extracellular families¶

3.3 Clustermap intracellular only families¶

3.5 Add species classifications¶

3.6 Genus specific fams¶

4. The Core CAZome¶

4.1 Core CAZome¶

4.1.1 Pectobacterium¶

4.1.2 Dickeya¶

4.1.3 Musicola¶

5. Families that always occur together¶

Build an upset plot of co-occurring CAZy families¶

6. Principal Component Analysis (PCA)¶

6.1 PCA of all intracellular and extracellular families¶

Scatter and loadings plots¶

PC1 - PC4¶

6.2 PCA of intracellular families¶

6.3 PCA of extracellular families¶