文章SameStr（三）：图3代码

“Publication Figure 3”

百度云盘链接: https://pan.baidu.com/s/15g7caZp354zIWktpnWzWhQ

提取码: 4sh7

Libraries

Standard Import

library(tidyverse)
library(cowplot)
library(scales)
library(ggpubr)

Special

library(ggridges)
library(grid)
# library(Hmisc)
# library(ggExtra)

Paths

bin_dir = '../bin/'
data_dir = '../data/'
results_dir = '../results/'

Custom Scripts / Theme

source(paste0(bin_dir, 'theme_settings.R'))
source (paste0(bin_dir, 'utilities.R'))
source(paste0(bin_dir, 'species_competition_functions.R'))
source(paste0(bin_dir, 'distmat_utils.R'))
source(paste0(bin_dir, 'analysis_metadata.R'))
date <- format(Sys.time(), "%d%m%y")

Import Tables

Metadata

samples.metadata <- read_tsv(paste0(data_dir, 'samples.metadata.tsv'))
microbe.taxonomy <- read_tsv(paste0(data_dir, 'microbe.taxonomy.tsv'))
microbe.metadata <- microbe.taxonomy %>% 
  right_join(., read_tsv(paste0(data_dir, 'microbe.metadata.tsv')))

Taxonomy

MetaPhlAn2 tables
combine with taxonomy

mp_species.long <- microbe.taxonomy %>% 
  right_join(., read_tsv(paste0(data_dir, 'samples.mp_profiles.species.tsv')), 
             by = 'species') %>% 
  left_join(., samples.metadata, by = 'Name')

rCDI subset
annotate with metadata

mp_species.rcdi.long <- 
  mp_species.long %>%
  filter(Study %in% c(rcdi.studies))

SameStr Data

SameStr Case-Summary table

samples.sstr_data <- read_tsv(paste0(data_dir, 'samples.sstr_data_wmetadata.tsv')) %>% 
  left_join(., microbe.taxonomy, by = 'species')

Strain Finder Data

Strain Finder Tables

Sys.setenv("VROOM_CONNECTION_SIZE" = 10485760)

sf_strain_table <- read_tsv(paste0(data_dir, 'ALM.Strain_Table.tsv')) %>% 
  rename(Sample = '...1') # X1
sf_species_table <- read_tsv(paste0(data_dir, 'ALM.MG_OTU_Bacteria.tsv')) %>% 
  rename(bact = `BACT ID`) %>% 
  rename_all(.funs = funs(str_to_lower(.)))

Mock Data

Multi-Strain Species Population

mock.sstr_data <- read_tsv(paste0(data_dir, 'mock.sstr_data_wmetadata.tsv')) %>% 
  left_join(., microbe.taxonomy, by = 'species')

Fast ANI

FastAni Comparison Data

refs <- read_tsv(paste0(data_dir, 'references.mp_fastani.tsv'))

Set Figure

fig = paste0('Fig_3.')

Specificity of Shared Taxa

Comparing distributions of shared (genera, species, and) strains in related and unrelated sample pairs in order to assess the specificity with which methods call shared strains.

Strain Finder

Format

Taxa per sample matrix

# mg-OTU strain-level matrix
sf_strain_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  column_to_rownames('Sample') %>% 
  as.matrix()

colnames(sf_strain_table.matrix) <- 
  tibble(tax = colnames(sf_strain_table.matrix)) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  group_by(bact) %>% 
  mutate(strain_id = paste0(bact, '.', dense_rank(gdna))) %>% 
  pull(strain_id)

# mg-OTU species-level matrix
sf_species_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>% 
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  pivot_wider(id_cols = 'Sample', names_from = 'bact', values_from = 'rel_abund') %>% 
  column_to_rownames('Sample') %>% 
  as.matrix()

# mg-OTU genus-level matrix
sf_genus_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  left_join(., sf_species_table %>% dplyr::select(bact, genus), by = 'bact') %>% 
  group_by(Sample, genus) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  pivot_wider(id_cols = 'Sample', names_from = 'genus', values_from = 'rel_abund') %>% 
  column_to_rownames('Sample') %>% 
  as.matrix()

Sample/Sample co-occurrence matrix at mg-OTU species- and strain-level

# genus
sf_genus_table.dist <- sf_genus_table.matrix %*% t(sf_genus_table.matrix)
sf_genus_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_genus_table.dist), 
                  value_name = 'n.shared_genus',
                  rm_diag = T)
sf_genus_counts <- diag(sf_genus_table.dist) %>% 
  as.data.frame() %>% 
  rownames_to_column('Sample') %>% 
  rename(n.genus = '.')

# species
sf_species_table.dist <- sf_species_table.matrix %*% t(sf_species_table.matrix)
sf_species_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_species_table.dist), 
                  value_name = 'n.shared_species',
                  rm_diag = T)
sf_species_counts <- diag(sf_species_table.dist) %>% 
  as.data.frame() %>% 
  rownames_to_column('Sample') %>% 
  rename(n.species = '.')

# strain
sf_strain_table.dist <- sf_strain_table.matrix %*% t(sf_strain_table.matrix)
sf_strain_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_strain_table.dist), 
                  value_name = 'n.shared_strains',
                  rm_diag = T)
sf_strain_counts <- diag(sf_strain_table.dist) %>% 
  as.data.frame() %>% 
  rownames_to_column('Sample') %>% 
  rename(n.strains = '.')

join data

sf <-
  full_join(sf_species_table.long, sf_strain_table.long, by = c('row','col')) %>% 
  full_join(., sf_genus_table.long, by = c('row','col')) %>% 
  
  left_join(., sf_genus_counts, by = c('row' = 'Sample')) %>% 
  left_join(., sf_genus_counts, by = c('col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  
  left_join(., sf_species_counts, by = c('row' = 'Sample')) %>% 
  left_join(., sf_species_counts, by = c('col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  
  left_join(., sf_strain_counts, by = c('row' = 'Sample')) %>% 
  left_join(., sf_strain_counts, by = c('col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  
  rename(Sample.row = row, 
         Sample.col = col, 
         n.shared_species.sf = n.shared_species,
         n.shared_strains.sf = n.shared_strains, 
         n.shared_genus.sf = n.shared_genus) %>% 
  
  rename_at(.vars = vars(matches('n.*col'), matches('n.*row')),
            .funs = funs(gsub(gsub(., pattern = '.row', replacement = '.sf.row'), 
                              pattern = '.col', replacement = '.sf.col'))) %>%
  left_join(., samples.metadata, by = c('Sample.row' = 'Sample')) %>% 
  left_join(., samples.metadata, by = c('Sample.col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  filter(!is.na(Name.row), !is.na(Name.col)) %>% 
  rowwise() %>% 
  mutate(Name.paste = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  mutate(n.not_shared_strains.sf = (n.strains.sf.row - n.shared_strains.sf) + (n.strains.sf.col - n.shared_strains.sf), 
         n.not_shared_species.sf = (n.species.sf.row - n.shared_species.sf) + (n.species.sf.col - n.shared_species.sf), 
         n.not_shared_genus.sf = (n.genus.sf.row - n.shared_genus.sf) + (n.genus.sf.col - n.shared_genus.sf), 
         n.total_strains.sf = n.shared_strains.sf + n.not_shared_strains.sf,
         n.total_species.sf = n.shared_species.sf + n.not_shared_species.sf, 
         n.total_genus.sf = n.shared_genus.sf + n.not_shared_genus.sf, 
         f.shared_strains.sf = n.shared_strains.sf / n.total_strains.sf,
         f.shared_species.sf = n.shared_species.sf / n.total_species.sf,
         f.shared_genus.sf = n.shared_genus.sf / n.total_genus.sf)

Shared Taxa

Density Ridge Plot based on Strain Finder Data published with the Alm dataset, showing co-occurrence of genera, species, and strains in samples which are common in unrelated but more common in related samples.

sf %>% 
  mutate(related = ifelse(replace_na(Donor.Subject.row == Donor.Subject.col, F), 'Related', 'Unrelated')) %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = c('n.shared_genus.sf', 'n.shared_species.sf', 'n.shared_strains.sf')) %>% 
  mutate(metric = case_when(
         metric == 'n.shared_strains.sf' ~ 'StrainFinder/\nmg-OTUs (strain)', 
         metric == 'n.shared_species.sf' ~ 'StrainFinder/\nmg-OTUs (species)',
         metric == 'n.shared_genus.sf' ~ 'StrainFinder/\nmg-OTUs (genus)',
         T ~ metric
       )) %>% 
ggplot(aes(value, fct_relevel(metric, 
                              'StrainFinder/\nmg-OTUs (strain)', 
                              'StrainFinder/\nmg-OTUs (species)', 
                              'StrainFinder/\nmg-OTUs (genus)'), fill = related)) + 
  geom_density_ridges(panel_scaling = F, alpha = 0.75, scale = 5) + 
  scale_fill_manual(values = c('black','white')) +
  scale_x_continuous(trans = pseudo_log_trans(), breaks = c(0, 3, 10, 30, 100, 300)) +
  theme_cowplot() +
  labs(x = 'Shared Bacteria') +
  theme(legend.key.size = unit(c(.75, .5, .75, .5), "lines"),
        legend.title = element_blank(),
        axis.title.y = element_blank(), 
        )

在这里插入图片描述

SameStr

Format

Generate MetaPhlAn co-occurrence table, taxa counts

cooccurrence <- mp_cooccurrence(mp_species.long)
taxa_counts <- cooccurrence$mp_taxa_counts
taxa_cooccurrence <- cooccurrence$mp_taxa_table.long

Merge with SameStr strain-level data

sstr <-
  samples.sstr_data %>% 
  
  group_by(Study.pair, Name.row, Name.col, 
           Case_Name.pair, Case_Name.row, Case_Name.col, Same_Case.label, 
           Related.label, Related.bool, Pair_Type) %>%
  summarize(.groups = 'drop',
            n.analyzed_strains = sum(overlap > min_overlap, na.rm = T),
            n.shared_strains.mvs = sum(distance.mvs > min_similarity & overlap > min_overlap, na.rm = T),
            n.shared_strains.cvs = sum(distance.cvs > min_similarity & overlap > min_overlap, na.rm = T)) %>%

  # tag successful fmt cases
  mutate(fmt_success = !Case_Name.pair %in% cases_failed) %>% 
  mutate(Study.row = str_split_fixed(Name.row, pattern = '_', 2)[,1], 
         Study.col = str_split_fixed(Name.col, pattern = '_', 2)[,1]) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated')) %>% 

  rowwise() %>%
  mutate(Sample_Pair = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  left_join(., taxa_cooccurrence, by = 'Sample_Pair') %>% 
  left_join(., taxa_counts, by = c('Name.row' = 'Sample')) %>% 
  left_join(., taxa_counts, by = c('Name.row' = 'Sample'), suffix = c('.row','.col')) %>% 
  
  mutate(n.not_shared_species = (n.species.row - n.shared_species) + (n.species.col - n.shared_species), 
         n.not_shared_genus = (n.genus.row - n.shared_genus) + (n.genus.col - n.shared_genus), 
         n.not_shared_family = (n.family.row - n.shared_family) + (n.family.col - n.shared_family), 

         n.total_species = n.shared_species + n.not_shared_species, 
         n.total_genus = n.shared_genus + n.not_shared_genus, 
         n.total_family = n.shared_family + n.not_shared_family, 

         f.shared_strains.mvs = replace_na(n.shared_strains.mvs / n.analyzed_strains, 0), 
         f.shared_strains.cvs = replace_na(n.shared_strains.cvs / n.analyzed_strains, 0), 
         f.shared_species = n.shared_species / n.total_species,
         f.shared_genus = n.shared_genus / n.total_genus, 
         f.shared_family = n.shared_family / n.total_family)

Subset rCDI Alm data

sstr.alm <-
  sstr %>%
  # ALM only, same as above
  filter(Study.pair == 'ALM')

sstr.control <- 
  sstr %>% 
  # Control only
  filter(Study.row %in% control.studies &
         Study.col %in% control.studies) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated'))

Shared Taxa

Density Ridge Plot based on SameStr Data for the same Alm dataset shown above, showing co-occurrence of genera, species, and strains in samples. Co-occurrence is common in unrelated genera and species, but more separable compared to SF. For shared strains, distributions divide related from unrelated sample pairs which mostly don’t share any strains.

Alm

sstr.alm %>% 

  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = c('n.shared_genus', 'n.shared_species', 'n.shared_strains.mvs')) %>% 
  mutate(metric = case_when(
         metric == 'n.shared_strains.mvs' ~ 'SameStr/\nMetaPhlAn2 (strain)', 
         metric == 'n.shared_species' ~ 'SameStr/\nMetaPhlAn2 (species)', 
         metric == 'n.shared_genus' ~ 'SameStr/\nMetaPhlAn2 (genus)', 
         T ~ metric
       )) %>% 
ggplot(aes(value, fct_relevel(metric, 
                              'SameStr/\nMetaPhlAn2 (strain)',
                              'SameStr/\nMetaPhlAn2 (species)', 
                              'SameStr/\nMetaPhlAn2 (genus)',), fill = Related.bool)) + 
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  # scale = 4
  scale_x_continuous(trans = pseudo_log_trans(), breaks = c(0, 3, 10, 30, 100, 300)) +

  scale_fill_manual(values = c('black','white')) +
  theme_cowplot() +
  labs(x = 'Shared Bacteria') +
  theme(legend.key.size = unit(c(.75, .5, .75, .5), "lines"),
        legend.title = element_blank(),
        axis.title.y = element_blank(), 
        )

在这里插入图片描述

Most unrelated strains

samples.sstr_data %>% 
  filter(Study.pair == 'ALM') %>% 
  filter(overlap > min_overlap) %>%
  group_by(genus, species, Related.bool) %>% 
  summarize(count = sum(Shared_Strain.bool == T, na.rm = T),
            total = sum(!is.na(Shared_Strain.bool))) %>% 
  mutate(fraction = count / total) %>% 
  filter(total > 20) %>% 
  arrange(Related.bool, desc(fraction))

Control

plot.control <-
  sstr %>% 
  filter(Study.row %in% control.studies &
         Study.col %in% control.studies) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated')) %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = c('f.shared_strains.mvs', 'f.shared_strains.cvs')) %>% 
mutate(metric = case_when(
  metric == 'f.shared_strains.mvs' ~ 'SameStr', 
  metric == 'f.shared_strains.cvs' ~ 'Consensus', 
)) %>% 
ggplot(aes(value, Related.text, fill = metric)) + 
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  # scale = 4
  scale_x_continuous(labels = percent_format(), expand = c(0,0), breaks = c(0, 0.5, 1)) + 
  scale_fill_manual(values = c('black','white')) +
  theme_cowplot() +
  labs(x = 'as [%] of analyzed Spp.') + 
  theme(legend.title = element_blank(),
        axis.title.y = element_blank(), 
        )
plot.control + theme(legend.position = 'none')

Exporting Plot

output_name = 'SharedStrains.Control'

ggsave(plot.control + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 2.85, height = 3.25,
       filename = paste0(results_dir, fig, output_name, '.RidgePlot.pdf'))

sstr.control %>% 
  dplyr::select(Related.text, n.shared_strains.mvs) %>% 
  group_by(Related.text) %>%
  summarize_all(.funs = funs(n(), min, max, median, mean, sd))

Tool Comparison

Format

Merge Tool Data

alm_table <-
  sf %>% 
  left_join(., sstr.alm %>% 
  rename_at(.vars = vars(n.shared_strains.mvs,
                         n.shared_species,
                         n.shared_genus,
                         n.shared_family,
                         f.shared_strains.mvs,
                         f.shared_species,
                         f.shared_genus,
                         f.shared_family),
            .funs = funs(paste0(gsub(., pattern = '.mvs', replacement = ''), '.sstr'))) %>% 

  rowwise() %>% 
  mutate(Name.paste = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  dplyr::select(-Name.row, -Name.col, -Case_Name.row, -Case_Name.col)) %>% 
  
  # label study, case
  mutate(Case_Split.row = str_split(Case_Name.row, ';'), 
         Case_Split.col = str_split(Case_Name.col, ';')) %>%
  rowwise() %>%
  mutate(Study.same = Study.row == Study.col, 
         Study.pair = ifelse(Study.same, Study.row, 'Unrelated'),
         Case_Name.same = (Case_Name.col %in% Case_Split.row) | (Case_Name.row %in% Case_Split.col), 
         Case_Name.pair = ifelse(Case_Name.same, intersect(Case_Split.row, Case_Split.col), 'Unrelated')) %>% 
  dplyr::select(-starts_with('Case_Split.')) %>% 
  ungroup() %>% 
  
  # label comparison type + study, case 
  mutate(
    Pair_Type = 'No-Pair',
    Pair_Type = case_when(

      
    (Sample_Type.row == 'recipient' & Sample_Type.col == 'donor') | 
    (Sample_Type.col == 'recipient' & Sample_Type.row == 'donor') ~ 'Pre-FMT/Donor',
    
    (Sample_Type.row == 'post' & Sample_Type.col == 'recipient') | 
    (Sample_Type.col == 'post' & Sample_Type.row == 'recipient') ~ 'Pre-FMT/Post-FMT',
    
    (Sample_Type.row == 'post' & Sample_Type.col == 'donor') | 
    (Sample_Type.col == 'post' & Sample_Type.row == 'donor') ~ 'Donor/Post-FMT',
    
    Sample_Type.row == 'recipient' & Sample_Type.col == 'recipient' ~ 'Pre-FMT/Pre-FMT',
    Sample_Type.row == 'post' & Sample_Type.col == 'post' ~ 'Post-FMT/Post-FMT',
    Sample_Type.row == 'donor' & Sample_Type.col == 'donor' ~ 'Donor/Donor')) %>% 
  
  # label family relationship
  mutate(Same_Case.label = ifelse(Case_Name.same, 'Same Case', 'Distinct Case'),
         Same_Study.label = ifelse(Study.same, 'Same Study', 'Distinct Study'),
         ) %>% 
  
  dplyr::select(Name.paste, contains('n.shared'), starts_with('f.'), ends_with('.pair'), 
         Related.label, Related.bool, Pair_Type) %>% 
  mutate(Related.text = ifelse(Related.bool, 'Related', 'Unrelated'))

alm_table.n <-
  alm_table %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('n.shared')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  mutate(taxonomic_level = gsub(taxonomic_level, pattern = 'shared_', replacement = ''),
         taxonomic_level = gsub(taxonomic_level, pattern = 'strains', replacement = 'strain')) %>% 
  dplyr::select(-metric)

alm_table.f <-
  alm_table %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('f.shared')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  mutate(taxonomic_level = gsub(taxonomic_level, pattern = 'shared_', replacement = ''),
         taxonomic_level = gsub(taxonomic_level, pattern = 'strains', replacement = 'strain')) %>% 
  dplyr::select(-metric)

Shared Strains n

Number of shared strains between related and unrelated samples of the ALM rCDI cohort based on CVS, MVS of MetaPhlAn2 alignments and Strain Finder on AMPHORA markers

plot.strain_count <-

  alm_table.n %>% 
  filter(taxonomic_level == 'strain') %>% 
  
  mutate(tool = case_when(
    tool == 'sstr' ~ 'MVS',
    tool == 'sf' ~ 'SF',
    tool == 'cvs' ~ 'CVS',
    T ~ tool)) %>%

  mutate(tool = fct_relevel(tool,'SF','CVS','MVS')) %>%

  ggplot(aes(value, tool, fill = Related.text)) +
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  
  scale_x_continuous(expand = c(0,0), 
                     trans = pseudo_log_trans(1, base = 10),
                     breaks = c(0, 1, 10, 30, 100, 300)
) + 
  theme_cowplot() +
  labs(x = 'Count [n]') +  
  scale_fill_manual(values = c('black', 'white')) + 
  theme(strip.background = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = c(0.6, 0.9))


legend <- cowplot::get_legend(plot.strain_count)
plot.strain_count + theme(legend.position = 'none')
grid.newpage()
grid.draw(legend)

Whereas Strain Finder is finding a lot of co-occurring strains in unrelated samples, the consensus approach is very conservative. Compared to SameStr MVS, consensus finds slightly fewer potential false-positives (means ~.45 vs. .22 strains in unrelated samples) but also misses on a large fraction of potential true-positives (means ~15 vs. 8 in related samples)

alm_table.n %>% 
  filter(taxonomic_level == 'strain') %>% 
  ungroup() %>% 
  mutate(tool = case_when(
    tool == 'sstr' ~ 'MVS',
    tool == 'sf' ~ 'SF',
    tool == 'cvs' ~ 'CVS',
    T ~ tool)) %>% 
  dplyr::select(Name.paste, Related.text, tool, value) %>% 
  pivot_wider(names_from = 'tool', values_from = 'value') %>% 
  mutate(`SF-MVS` = SF - MVS) %>% 
  dplyr::select(-Name.paste) %>% 
  group_by(Related.text) %>% 
  summarize_all(.funs = funs(n(), min, max, median, mean, sd)) %>% 
  group_by(Related.text) %>%
  mutate_all(.funs = funs(round(., 2))) %>% 
  pivot_longer(names_to = 'names', values_to = 'value', cols = matches('.*_.*')) %>% 
  separate(names, into = c('tool','metric'), sep = '_') %>% 
  pivot_wider(names_from = 'metric', values_from = 'value')

Related
CVS 7.99±6.83
MVS 14.77±11.34
SF 78.35±73.50
DIFF 93.13±82.59

Unrelated
CVS 0.22±0.55
MVS 0.45±0.75
SF 34.71±37.52
DIFF 35.16±37.68

Shared Strains %

Fraction of shared strains betweent related and unrelated samples of the ALM rCDI cohort

plot.strain_fraction <-

  alm_table.f %>% 
  filter(taxonomic_level == 'strain') %>% 

  mutate(tool = case_when(
    tool == 'sstr' ~ 'SameStr MVS/\nMetaPhlAn2', 
    tool == 'sf' ~ 'StrainFinder/\nmg-OTUs', 
    tool == 'cvs' ~ 'Consensus/\nMetaPhlAn2', 
    T ~ tool)) %>% 
  
  mutate(tool = fct_relevel(tool,
    'StrainFinder/\nmg-OTUs', 
    'Consensus/\nMetaPhlAn2', 
    'SameStr MVS/\nMetaPhlAn2', 
  )) %>% 

  ggplot(aes(value, tool, fill = Related.text)) +
  geom_density_ridges(alpha = 0.75, scale = 4, stat = 'binline', bins = 18) +  
  scale_x_continuous(expand = c(0,0), 
                     trans = pseudo_log_trans(base = 10),
                     labels = percent_format()
) + 
  theme_cowplot() +
  labs(x = 'Taxa Shared between Samples [%]') + 
  scale_fill_manual(values = c('black', 'white')) + 
  theme(strip.background = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = c(0.6, 0.9))

plot.strain_fraction + theme(legend.position = 'none')

Exporting Plot

output_name = 'SharedStrains.Alm'

ggsave(plot.strain_count + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 3.5, height = 3.25,
       filename = paste0(results_dir, fig, output_name, '.Counts', '.RidgePlot.pdf'))
ggsave(plot.strain_fraction + theme(legend.position = 'none', axis.line.y = element_blank()), 
       device = 'pdf', dpi = 300, width = 5, height = 4,
       filename = paste0(results_dir, fig, output_name, '.Fraction', '.RidgePlot.pdf'))
ggsave(legend, 
       device = 'pdf', dpi = 300, width = 5, height = 5,
       filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))

Shared F>G>Spp. %

alm_table %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('f.shared')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  dplyr::select(-metric) %>% 
  filter(tool != 'cvs') %>% 
  mutate(taxonomic_level = str_to_title(gsub(taxonomic_level, pattern = 'shared_', replacement = ''))) %>% 
  mutate(tool = case_when(tool == 'sstr' ~ 'SameStr/\nMetaPhlAn2', tool == 'sf' ~ 'StrainFinder/\nmg-OTUs', T ~ tool)) %>% 

  ggplot(aes(value, tool, fill = Related.text)) +
  geom_density_ridges(alpha = 0.75, scale = 3) + 
  scale_x_continuous(limits = c(0, 1), labels = percent_format(), expand = c(0,0)) + 
  theme_cowplot() +
  labs(x = 'Fraction of Taxa Shared between Samples') + 
  scale_fill_manual(values = c('black', 'white')) + 
  facet_wrap(~taxonomic_level, nrow = 1, scales = 'free_x') + 
  theme(strip.background = element_blank(),
        axis.title.y = element_blank(),
        legend.title = element_blank(),
        legend.position = c(0.8, 0.9))

Sensitivity of Taxa Detection

Format

Comparison of genera found with MP2/Amphora

counts <-
  full_join(sf_genus_counts, sf_species_counts, by = 'Sample') %>%  
  mutate(Name = paste0('ALM_', Sample, '.pair')) %>% 
  full_join(., taxa_counts %>% 
              rename(Name = 'Sample') %>% dplyr::select(-n.family), 
            by = 'Name', suffix = c('.sf','.mp')) %>% 
  filter(!is.na(n.species.mp), !is.na(n.species.sf))

lm_eqn <- function(df){
    m <- lm(mp ~ sf, df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)~"="~rv, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             rv = format(sqrt(summary(m)$r.squared), digits = 2)))
    as.character(as.expression(eq));
}
counts.data <- counts %>% 
  pivot_longer(names_to = 'metric', values_to = 'n', cols = starts_with('n.')) %>% 
  separate(metric, into = c('metric','taxonomic_level','tool'), sep = '[.]') %>% 
  dplyr::select(-metric) %>% 
  pivot_wider(names_from = 'tool', values_from = 'n') %>% 
  mutate(difference = mp - sf) 

counts.data.genus <-
  counts.data %>% 
  filter(taxonomic_level == 'genus')

counts.data.species <-
  counts.data %>% 
  filter(taxonomic_level == 'species')

get_stats <- function(df, digits = 2) {
  df.mean <- mean(df, na.rm = T)
  df.sd <- sd(df, na.rm = T)
  df.format <- paste('µ=', round(df.mean, digits = digits),  '±', round(df.sd, digits = digits), sep = '')
  return(
    list(
      'mean' = df.mean, 
      'sd' = df.sd,
      'format' = df.format))
}
digits = 1
stats.g.diff <- get_stats(counts.data.genus$difference, digits = digits)
stats.g.mp <- get_stats(counts.data.genus$mp, digits = digits)
stats.g.sf <- get_stats(counts.data.genus$sf, digits = digits)
stats.s.diff <- get_stats(counts.data.species$difference, digits = digits)
stats.s.mp <- get_stats(counts.data.species$mp, digits = digits)
stats.s.sf <- get_stats(counts.data.species$sf, digits = digits)

Statistics

counts.data %>% 
  dplyr::select(-Sample, -Name) %>% 
  group_by(taxonomic_level) %>% 
  summarize_all(.funs = funs(mean, sd)) %>% 
  dplyr::select(taxonomic_level, starts_with('sf_'), starts_with('mp_'), starts_with('dif')) %>% 
  mutate_at(.vars = vars(everything(), -taxonomic_level),
            .funs = funs(round(., digits = 2)))

genus:
sf = 23.78 ± 16.67
mp = 50.54 ± 15.0

species:
sf = 48.48 ± 33.88
mp = 97.62 ± 39.6

Detected Taxa per Sample

yy.s = 80
yy.g = 30
nudge = 70

plot <-
  counts.data %>% 
  ggplot(aes(y = mp, x = sf, fill = taxonomic_level)) +
  geom_point(shape = 21, col = 'black', size = 5) +

  annotate(geom = 'text', y = yy.s, x = yy.s + nudge + 5, label = stats.s.diff$format, size = 5, parse = F) +
  annotate(geom = 'text', y = yy.g, x = yy.g + nudge, label = stats.g.diff$format, size = 5, parse = F) +
  
  annotate("segment", show.legend = F, 
           arrow = arrow(length = unit(0.3, "cm"), type = "closed"), size = 1, col = 'black',
   y=yy.s, yend=yy.s + stats.s.diff$mean, x=yy.s, xend=yy.s) + 
  annotate("segment", show.legend = F, 
           arrow = arrow(length = unit(0.3, "cm"), type = "closed"), size = 1, col = 'black',
   y=yy.g, yend=yy.g + stats.g.diff$mean, x=yy.g, xend=yy.g) + 
    
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + 
  
  theme_cowplot() +
  scale_x_continuous(limits = c(0, 200)) + 
  scale_y_continuous(limits = c(0, 200)) + 
  scale_fill_manual(values = c('grey25', 'white')) +
  scale_color_manual(values = c('black', 'black')) + 
  theme(aspect.ratio = 1,
        plot.subtitle = element_text(hjust = 0.5), 
        legend.title = element_blank(), 
        legend.position = c(0.02, 0.9)) +
  labs(y = 'SameStr / MetaPhlAn2', x = 'Strain Finder / mg-OTUs', subtitle = 'Detected Taxa per Sample')
plot

Exporting Plot

output_name = 'DetectedTaxa.Species.Alm'

ggsave(plot, 
       device = 'pdf', dpi = 300, width = 3.75, height = 3.75,
       filename = paste0(results_dir, fig, output_name, '.CorrPlot.pdf'))

Taxa Differences

Number of detected mg-OTUs:
species == 306
genus == 116

sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>% 
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(bact) %>% 
  summarize(present = as.numeric(sum(rel_abund, na.rm = T) > 0)) %>% 
  summarize(count = sum(present, na.rm = T)) 

sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0)) %>% 
  left_join(., sf_species_table %>% dplyr::select(bact, genus), by = 'bact') %>% 
  group_by(genus) %>% 
  summarize(present = as.numeric(sum(rel_abund, na.rm = T) > 0)) %>% 
  summarize(count = sum(present, na.rm = T))

Number of detected MetaPhlAn2 species:
species == 399
genus == 154

mp_species.long %>% 
  dplyr::select(species, Name, rel_abund) %>% 
  filter(grepl(Name, pattern = '^ALM'), rel_abund > 0) %>% 
  filter(!grepl(species, pattern = 'unclassified')) %>% 
  group_by(species) %>% 
  summarize(present = sum(rel_abund > 0, na.rm = T)) %>% 
  summarize(count = sum(present > 0, na.rm = T))

mp_species.long %>% 
  dplyr::select(genus, Name, rel_abund) %>% 
  filter(grepl(Name, pattern = '^ALM'), rel_abund > 0) %>% 
  filter(!grepl(genus, pattern = 'unclassified')) %>% 
  group_by(genus) %>% 
  summarize(present = sum(rel_abund > 0, na.rm = T)) %>% 
  summarize(count = sum(present > 0, na.rm = T))

Format

found_sf <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > 0))) %>%
  pivot_longer(names_to = 'tax', values_to = 'rel_abund', cols = starts_with('BACT')) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  dplyr::select(-gdna) %>% 
  group_by(Sample, bact) %>% 
  summarize(rel_abund = as.numeric(sum(rel_abund, na.rm = T) > 0), .groups = 'drop') %>% 
  left_join(., sf_species_table %>% dplyr::select(bact, genome, genus), by = 'bact') %>% 
  group_by(Sample, genus) %>% 
  summarize(species_count.sf = sum(rel_abund, na.rm = T), .groups = 'drop')
found_mp <-
  mp_species.long %>% 
  dplyr::select(genus, species, Name, rel_abund) %>% 
  filter(grepl(Name, pattern = '^ALM'), rel_abund > 0) %>% 
  mutate(genus = gsub(genus, pattern = '_noname', replacement = '')) %>% 
  group_by(genus, Name) %>% 
  summarize(species_count.mp = sum(rel_abund > 0, na.rm =T), .groups = 'drop') %>% 
  mutate(Sample = gsub(str_split_fixed(Name, pattern = '_', 2)[,2], pattern = '.pair', replacement = '')) %>% 
  dplyr::select(-Name)

taxa.genus.important <-
  full_join(found_sf, found_mp) %>% 
  mutate_at(.vars = vars(starts_with('species_count')),
            .funs = funs(ifelse(is.na(.), 0, .))) %>% 
  group_by(genus) %>% 
  summarize(species_count.sf = sum(species_count.sf, na.rm = T), 
            species_count.mp = sum(species_count.mp, na.rm = T), .groups = 'drop') %>% 
  mutate(total = species_count.mp + species_count.sf) %>% 
  arrange(desc(total)) %>% 
  top_n(30, total) %>% 
  pull(genus)

found <- 
  full_join(found_sf, found_mp) %>% 
  mutate_at(.vars = vars(starts_with('species_count')),
            .funs = funs(ifelse(is.na(.), 0, .))) %>% 
  filter(genus %in% taxa.genus.important)

Ridge Plot

plot <- 
  found %>% 
  pivot_longer(names_to = 'metric', values_to = 'count', cols = c('species_count.sf', 'species_count.mp')) %>% 
  mutate(metric = case_when(
    metric == 'species_count.sf' ~ 'mg-OTUs', 
    metric == 'species_count.mp' ~ 'MetaPhlAn2', 
  )) %>% 

  ggplot(aes(count, genus, fill = metric)) + 
  geom_density_ridges(panel_scaling = F, alpha = 0.75, scale = 2) + 
  scale_x_continuous(trans = pseudo_log_trans()) + 
  labs(x = 'Detected Species') + 
  theme_cowplot() + 
  theme(axis.title.y = element_blank()) +
  scale_fill_manual(values = c('black','white')) 

plot + theme(legend.position = 'none')

legend <- cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)

Exporting Plot

output_name = 'DetectedTaxa.Species.Alm'

ggsave(plot + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 4, height = 6,
       filename = paste0(results_dir, fig, output_name, '.RidgePlot.pdf'))
ggsave(legend, 
       device = 'pdf', dpi = 300, width = 1.25, height = 1.5,
       filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))

found %>% 
  ungroup() %>% 
  dplyr::select(-Sample) %>% 
  group_by(genus) %>% 
  summarize_all(.funs = funs(mean, sd)) %>% 
  mutate(diff = species_count.sf_mean - species_count.mp_mean) %>% 
  group_by(genus) %>% 
  mutate_all(.funs = funs(round(., 2))) %>% 
  ungroup() %>% 
  rename_at(.vars = vars(starts_with('species_')), 
            .funs = funs(gsub(., pattern = 'species_count.', replacement = ''))) %>% 
  dplyr::select(genus, starts_with('sf'), starts_with('mp'), diff) %>% 
  arrange(diff) %>% 
  head(10)

Genus SF vs. MP

Streptococcus 2.68±4.02 vs. 8.15±2.90 (-5.47)
Lachnospiraceae 0.00±0.00 vs. 4.83±2.54 (-4.83)
Bacteroides 3.87±4.70 vs. 8.44±4.70 (-4.57)
Lactobacillus 1.41±2.37 vs. 5.89±2.69 (-4.48)
Clostridium 2.43±3.25 vs. 6.18±3.57 (-3.75)

SF Majority Strain

sf_min_abund <- function(min_abund) {

  sf_strain_table.matrix <-
  sf_strain_table %>% 
  mutate_at(.vars = vars(everything(), -Sample), 
            .funs = funs(as.numeric(. > min_abund))) %>% # set > .5 filter here to compare SF majority strains
  column_to_rownames('Sample') %>% 
  as.matrix()

colnames(sf_strain_table.matrix) <- 
  tibble(tax = colnames(sf_strain_table.matrix)) %>% 
  separate(col = tax, into = c('bact', 'gdna'), sep = ';') %>% 
  group_by(bact) %>% 
  mutate(strain_id = paste0(bact, '.', dense_rank(gdna))) %>% 
  pull(strain_id)

# strain
sf_strain_table.dist <- sf_strain_table.matrix %*% t(sf_strain_table.matrix)
sf_strain_table.long <- 
  distmat_to_long(distmat = as.data.frame(sf_strain_table.dist), 
                  value_name = paste0('n.shared_strains.', 'min_abund_', min_abund),
                  rm_diag = T)
return(sf_strain_table.long)
}

df <- sf_min_abund(0)
for (m in c(5e-3, 5e-2, 5e-1)) {
  df <- full_join(df, sf_min_abund(m), by = c('row','col')) 
}


related.v <-
  alm_table.n %>% 
  filter(taxonomic_level == 'strain', tool == 'sf') %>% 
  dplyr::select(Name.paste, Related.text)

sf <-
  df %>% 
  
  rename(Sample.row = row, 
         Sample.col = col, 
         ) %>% 
  
  rename_at(.vars = vars(matches('n.*col'), matches('n.*row')),
            .funs = funs(gsub(gsub(., pattern = '.row', replacement = '.sf.row'), 
                              pattern = '.col', replacement = '.sf.col'))) %>%
  left_join(., samples.metadata, by = c('Sample.row' = 'Sample')) %>% 
  left_join(., samples.metadata, by = c('Sample.col' = 'Sample'), suffix = c('.row', '.col')) %>% 
  filter(!is.na(Name.row), !is.na(Name.col)) %>% 
  rowwise() %>% 
  mutate(Name.paste = paste0(sort(c(Name.row, Name.col)), collapse = ',')) %>% 
  ungroup() %>% 
  left_join(., related.v) %>% 
  # mutate(related = ifelse(replace_na(Donor.Subject.row == Donor.Subject.col, F), 'Related', 'Unrelated')) %>% 
  pivot_longer(names_to = 'metric', values_to = 'value', 
               cols = starts_with('n.shared_strains')) %>% 
  separate(metric, into = c('metric','min_abund'), sep = '.abund_') %>% 
  mutate(min_abund = as.numeric(min_abund))

Strain Finder shows a high number of cooccurring strains in unrelated sample pairs. Cooccurrence distributions of related / unrelated sample pairs overlap widely, even when considering only strains that were detected at >=50% strain abundance in the species population - a problem that might be due to the amphora markers.

plot <- 
  sf %>% 
  
  mutate(metric = paste0('>=', round(min_abund * 100, 5), '%')) %>% 

  ggplot(aes(value, fct_reorder(metric, min_abund, .desc = T), fill = Related.text)) + 
  geom_density_ridges(alpha = 0.75, scale = 2) +
  scale_fill_manual(values = c('black','white')) +
  scale_x_continuous(expand = c(0,0), 
                     trans = pseudo_log_trans(1, base = 10),
                     breaks = c(0, 1, 10, 30, 100, 300)
) + 
  theme_cowplot() +
  theme(aspect.ratio = 1) + 
  labs(x = 'Shared Strains', y = 'Strain Finder\nStrain Abundance') +
  theme(legend.title = element_blank(),
        legend.position = 'top')
plot

Exporting plot

output_name = 'StrainFinder.StrainCooccurrence'

ggsave(plot, 
       device = 'pdf', dpi = 300, width = 4, height = 4,
       filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))

Mock Multi-Strain Species Population

Detecting strains hidden in Mock Multi-Strain Species Population with varying numbers of strains, target coverage, noise coverage and total coverage.

Format

percent_format_signif <-
  function(x, label.precision=3, label.percent=T) {
    if (label.percent == T) {
      paste0(signif(100 * x, label.precision), "%")
    } else {
      signif(digits = label.precision)
    }
  }

Calculate metrics

df <- 
  mock.sstr_data %>% 

  pivot_longer(names_to = 'metric', 
               values_to = 'distance', 
               cols = c('distance.mvs', 'distance.cvs')) %>% 
  mutate(shared = distance > min_similarity & overlap > min_overlap, 
         should = tsc > 0,
         TP = shared & should,
         FP = shared & !should, 
         TN = !shared & !should, 
         FN = !shared & should, 
         event = case_when(
           TP ~ 'TP', 
           FP ~ 'FP', 
           TN ~ 'TN', 
           FN ~ 'FN',
           T ~ 'Unknown'
         )) %>% 
  mutate(metric = case_when(
    metric == 'distance.cvs' ~ 'Consensus (CVS)',
    metric == 'distance.mvs' ~ 'SameStr (MVS)',
    T ~ 'Unknown'
  )) %>% 
  group_by(metric, tsc, ngc, total_coverage, rel_abund.target) %>% 
  summarize(TP = sum(TP, na.rm = T), 
            FP = sum(FP, na.rm = T), 
            TN = sum(TN, na.rm = T),
            FN = sum(FN, na.rm = T), 
            .groups = 'drop') %>% 
  mutate(Precision = TP / (TP + FP), 
    Recall = TP / (TP + FN), 
    `Precision/Recall` = Precision / Recall, 
    Accuracy = (TP+TN)/(TP+FP+FN+TN),
    `Error Rate` = 1 - Accuracy)

N Mock Observations

nrow(mock.sstr_data)

Accuracy Heatmap

plot <-
  df %>% 
  mutate(`Error Rate` = 
         Hmisc::cut2(`Error Rate`, 
                   cuts = seq(0, 1, 0.2), 
                   minmax = T, 
                   formatfun = scales::percent, 
                   oneval = F) %>% 
         as.character() %>% 
         gsub(pattern = '\\[', replacement = '') %>% 
         gsub(pattern = '\\]', replacement = '') %>% 
         gsub(pattern = '\\)', replacement = '') %>% 
         gsub(pattern = ',', replacement = '-')) %>% 
  filter(ngc > 0) %>% 
  ggplot(aes(
    tsc, 
    as.factor(ngc))
    ) +  
  geom_tile(aes(fill = `Error Rate`), col = 'black') +
  scale_x_continuous(expand = c(0,0),
                     breaks = seq(0, 20, 4)) + 
  scale_y_discrete(expand = c(0,0)) + 
  facet_wrap(~ metric, scales = 'free_y') +
  theme_cowplot() + 
  theme(aspect.ratio = 1,
        strip.background = element_blank(),
        panel.spacing.x = unit(1, "lines"),
        axis.text = element_text(size = 9),
        axis.title = element_text(size = 12),
        axis.title.x = element_text(hjust = 0.5),
        legend.position = 'right',
        legend.title = element_text(size = 12),
        legend.key = element_rect(colour = "black")) + 
  labs(x = 'Mean Fold-Coverage of the Target Strain', 
       y = 'Noise Coverage') + 
  scale_fill_manual(values = c('white', 'gray75', 'gray50', 'gray25', 'black'))

plot + theme(legend.position = 'none')

legend <- cowplot::get_legend(plot)
grid.newpage()
grid.draw(legend)

Exporting Plot

output_name = 'AccuracyHeatmap.Mock'

ggsave(plot + theme(legend.position = 'none'), 
       device = 'pdf', dpi = 300, width = 5.5, height = 3,
       filename = paste0(results_dir, fig, output_name, '.Plot.pdf'))
ggsave(legend, 
       device = 'pdf', dpi = 300, width = 1.25, height = 1.5,
       filename = paste0(results_dir, fig, output_name, '.Legend.pdf'))

Comparison of average detection accuracy using Consensus vs. SameStr MVS with at least 5x coverage when target strain is dominant (>50% species population) or sub-dominant (15-50% species population)

df %>% 
  mutate(miss_rate = TP / (FN + TP)) %>% 
  filter(rel_abund.target > .5, total_coverage > 5) %>% 
  group_by(metric) %>% 
  summarize(Accuracy = 1 - mean(`Error Rate`, na.rm = T))

df %>% 
  filter(rel_abund.target > .15) %>% 
  filter(rel_abund.target <= .5, total_coverage > 5) %>%
  group_by(metric) %>% 
  summarize(Accuracy = 1 - mean(`Error Rate`, na.rm = T))

Phylogenetic Resolution of MP Markers

Comparing full-genome based Fast-ANI scores to MetaPhlAn2 based similarity

Reference Genomes

Similarity Distribution of Reference Genomes

percent_format_signif <- function(x, label.precision=3, label.percent=T) {
    if (label.percent == T) {
      paste0(signif(100 * x, label.precision), "%")
    } else {
      signif(digits = label.precision)
    }
}

species_ridge <-
  refs %>% 
  filter(distance > .97 | ani > .97) %>%
  pivot_longer(values_to = 'value', names_to = 'metric', cols = c('distance', 'ani')) %>% 
  mutate(metric = case_when(
    metric == 'distance' ~ 'Nucleotide Similarity of\nMetaPhlAn2 Markers',
    metric == 'ani' ~ 'fastANI Identity of\nFull Sequences',
    T ~ metric
  )) %>% 
  ggplot(aes(x = value, 
             y = fct_reorder(genus, genus_avg_marker_len), 
             group = species, 
             fill = species)) +
  geom_density_ridges(scale = 3, 
                     alpha = 0.5, stat = 'binline', show.legend = F,
                     ) + 
  scale_x_reverse(labels = percent_format_signif, 
                  breaks=c(min_similarity, 0.99, 0.98, 0.97),
                  limits = c(NA, 0.97), 
                  expand = c(0.1, 0)) +
  theme_cowplot() + 
  geom_vline(xintercept = min_similarity, show.legend = T, linetype = 'dashed') + 
  guides(fill = guide_legend(ncol = 1)) +
  labs(x = 'Sequence Identity') +
  facet_grid(cols = vars(metric)) +
  theme(strip.background = element_blank(),
        axis.title.y = element_blank())

species_ridge

refs %>% 
  pivot_longer(values_to = 'value', names_to = 'metric', cols = c('distance', 'ani')) %>% 
  group_by(metric) %>% 
  summarize(not_shared = sum(value < min_similarity, na.rm = T),
            shared = sum(value > min_similarity, na.rm = T)) %>% 
  mutate(total = not_shared + shared,
         rate = shared / total * 100)

Correlation of MP marker vs. fastANI based similarity

min = .97
refs.cor <- 
  cor.test(~ ani + distance,
           data = refs %>% 
             filter(!is.na(ani), !is.na(distance)) %>%
             filter(ani > min & distance > min),
           method = 'spearman') %>% 
  broom::tidy(.) %>% 
  mutate(estimate.round = round(estimate, 2), 
         sig = cut(p.value, 
                   breaks = c(-Inf, 0.001, 0.01, 0.05, Inf),
                   labels = c("***", "**", "*", "")))
refs.cor

plot.scatter <-
  ggscatter(data = refs %>% 
  filter(!is.na(ani), !is.na(distance)) %>%
  filter(ani > min & distance > min),
  'ani', 'distance', 
  color = 'black',  
  fill = 'black',
  alpha = 0.025,
  xlab = 'Full-Sequence fastANI', ylab = 'MetaPhlAn2 Markers',
  ggtheme = theme_cowplot()) + 
  theme(aspect.ratio = 1) +
  scale_x_continuous(labels = percent_format_signif, breaks=c(min_similarity, 0.99, 0.98, 0.97)) +
  scale_y_continuous(labels = percent_format_signif, breaks=c(min_similarity, 0.99, 0.98, 0.97)) + 
  annotate('text', label = paste0('r=', 
                                  refs.cor$estimate.round,
                                  ' ', 
                                  refs.cor$sig), 
           hjust = 0, 
           x = .97, y = .999, size = 6) + 
  geom_density_2d(col = 'black')

plot <-
  ggExtra::ggMarginal(
    plot.scatter,
    type = 'density',
    color = 'black',
    fill = 'grey75', 
    size = 10)
plot

在这里插入图片描述

Exporting Plots

output_name = 'MetaPhlAn2_FastANI.References'
ggsave(species_ridge, 
       device = 'pdf', dpi = 300, width = 6, height = 8,
       filename = paste0(results_dir, fig, output_name, '.RidgePlot.pdf'))

ggsave(plot, 
       device = 'pdf', dpi = 300, width = 4, height = 4,
       filename = paste0(results_dir, fig, output_name, '.CorrPlot.pdf'))