Last updated on Sun Feb 11 22:55:44 2024.
If you use the following figures in your own work, please cite:
# Load required packages
library(magrittr)
library(dplyr)
library(purrr)
library(jsonlite)
library(forcats)
library(ggplot2)
library(plotly)
library(here)
library(stringr)
# We need three queries against the PDBe search API. They differ only in the
# wildcard applied to the entry title: one for uppercase titles (*UCLEOSOM*),
# one for lowercase ones (*ucleosom*), and one for titles containing "NCP"
# (presumably "nucleosome core particle"). Every query is restricted to
# released entries (status:REL) and asks for the same set of fields as JSON.
pdb_search_endpoint <- "https://www.ebi.ac.uk/pdbe/search/pdb/select"
pdb_search_fields <- c(
    "pdb_id",
    "citation_year",
    "title",
    "experimental_method",
    "resolution",
    "organism_scientific_name",
    "molecule_name",
    "molecule_type",
    "number_of_protein_chains"
)
pdb_title_patterns <- c(
    uppercase = "UCLEOSOM",
    lowercase = "ucleosom",
    ncp = "NCP"
)
# The row limit is kept as a string so that it is never rendered in scientific
# notation (e.g. "1e+06") when pasted into the URL.
pdb_queries <- paste0(
    pdb_search_endpoint,
    "?q=title:*", pdb_title_patterns, "*%20AND%20status:REL",
    "&fl=", paste(pdb_search_fields, collapse = ","),
    "&rows=1000000&wt=json"
)
# paste0() drops names, so restore the query labels.
names(pdb_queries) <- names(pdb_title_patterns)
# Manually curated exclusion list: these PDB entries have the word
# "nucleosome" in their title but do not actually contain a nucleosome, so
# they are removed from the analysis. Unfortunately, there is no good way to
# automate this.
non_nucleosome_structures <- c(
    "1hst", "2z2r", "5x7v", "3uv2", "3fs3", "1wg3", "1nw3", "5ikf",
    "3gyw", "3gyv", "1ofc", "2ayu", "2iw5", "3hfd", "6uch", "5r4k",
    "5r4m", "5r4l", "5r4o", "5r4g", "5r4h", "5r4j", "5r4i", "5r4n",
    "1bj6", "6qds", "2iwj", "4dvk", "1a6b", "6qdu", "6ne8", "1tsu",
    "1esk", "7c4j"
)
# Helper to extract the data from one parsed search response: it takes the
# `docs` element nested under `response` and returns it as a tibble.
dig_up_data <- function(parsed_response) {
    as_tibble(parsed_response$response$docs)
}
# Helper to detect the presence of a binding factor in a single PDB entry.
#
# Arguments:
#   number_of_protein_chains  number of protein chains in the entry (scalar)
#   title                     title of the entry (scalar string)
#
# Returns TRUE or FALSE, or NA when the chain count is missing and the title
# alone cannot settle the question (instead of aborting with an error).
has_binding_factor_one <- function(number_of_protein_chains, title) {
    # Titles are matched as plain, case-sensitive substrings. grepl() yields
    # FALSE for a missing title, so the conditions below never see an NA title.
    # The overlapping dinucleosome has 14 protein chains, but they are all
    # histones (one octamer + one hexamer); it is recognised by "unusual" in
    # its title.
    is_overlapping_dinucleosome <- grepl("unusual", title, fixed = TRUE)

    # Multiples of 8 chains above 8 (i.e. 16, 24, 32, 40) can occur by chance
    # even with binding factors. There is no good way to find these cases
    # automatically, so the known ones are listed by title.
    is_known_complex <- grepl("COMPASS", title, fixed = TRUE) ||
        grepl("LSD1/CoREST", title, fixed = TRUE)

    # It takes 8 histone chains to make a nucleosome, so a chain count that is
    # not divisible by 8 means there is a binding factor, unless the entry is
    # the overlapping dinucleosome.
    chains_not_multiple_of_8 <- number_of_protein_chains %% 8 != 0

    # Scalar operators (&&, ||) are used on purpose: an NA chain count
    # propagates to an NA result unless the title already decides the answer.
    (chains_not_multiple_of_8 && !is_overlapping_dinucleosome) ||
        is_known_complex
}
# Vectorized wrapper: applies has_binding_factor_one() to each pair of chain
# count and title, and returns one logical value per pair.
has_binding_factor <- function(number_of_protein_chains_vector, title) {
    flags <- map2_lgl(
        .x = number_of_protein_chains_vector,
        .y = title,
        .f = has_binding_factor_one
    )
    flags
}
# Query the PDB and clean up data: download and parse every search query,
# stack the hits into one table, drop the known false positives, keep a single
# row per PDB entry, and coerce the columns used by the figures.
pdb_data <- pdb_queries %>%
    map(fromJSON) %>%
    map(dig_up_data) %>%
    bind_rows() %>%
    filter(!(pdb_id %in% non_nucleosome_structures)) %>%
    # The same entry can be returned by more than one query. Deduplicate
    # before the row-by-row binding-factor check so it runs once per entry.
    distinct(pdb_id, .keep_all = TRUE) %>%
    mutate(
        has_binding_factor  = has_binding_factor(number_of_protein_chains, title),
        experimental_method = as_factor(as.character(experimental_method)),
        citation_year       = as.integer(citation_year),
        molecule_name       = as.character(molecule_name),
        molecule_type       = as_factor(as.character(molecule_type))
    )

# All figures are interactive (you can zoom in, and hovering over elements will show more information).
# Helper shared by the per-species figures: keeps the entries obtained with
# `method_name` whose molecule names contain "Histone H", and converts the
# organism names into a factor.
histone_entries_for_method <- function(data, method_name) {
    data %>%
        filter(experimental_method == method_name) %>%
        mutate(is_histone = str_detect(molecule_name, pattern = "Histone H")) %>%
        filter(is_histone == TRUE) %>%
        mutate(
            organism_scientific_name =
                as_factor(as.character(organism_scientific_name))
        )
}

# Entries per publication year, coloured by experimental method
nucleosome_structures_year <- pdb_data %>%
    ggplot() +
    geom_bar(mapping = aes(x = citation_year, fill = experimental_method)) +
    guides(fill = guide_legend(title = "Experimental method")) +
    ggtitle("Structures of nucleosomes by year") +
    xlab("Publication year") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_structures_year)

# Entries per publication year, coloured by presence of a binding factor
nucleosome_structures_year_binding_factor <- pdb_data %>%
    ggplot() +
    geom_bar(mapping = aes(x = citation_year, fill = has_binding_factor)) +
    guides(fill = guide_legend(title = "Contains a binding factor")) +
    ggtitle("Structures of nucleosomes by year") +
    xlab("Publication year") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_structures_year_binding_factor)

# Entries per experimental method
nucleosome_structures_method <- pdb_data %>%
    ggplot() +
    geom_bar(mapping = aes(x = experimental_method, fill = experimental_method)) +
    guides(fill = guide_legend(title = "Experimental method")) +
    ggtitle("Structures of nucleosomes by experimental method") +
    xlab("") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_structures_method)

# Entries per experimental method, coloured by presence of a binding factor
nucleosome_binding_factors_methods <- pdb_data %>%
    ggplot() +
    geom_bar(mapping = aes(x = experimental_method, fill = has_binding_factor)) +
    guides(fill = guide_legend(title = "Presence of a binding factor")) +
    ggtitle("Structures of nucleosomes by presence of a binding factor") +
    xlab("Experimental method") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_binding_factors_methods)

# Entries with and without a binding factor
nucleosome_binding_factors <- pdb_data %>%
    ggplot() +
    geom_bar(mapping = aes(x = has_binding_factor, fill = has_binding_factor)) +
    guides(fill = guide_legend(title = "Presence of a binding factor")) +
    ggtitle("Structures of nucleosomes by presence of a binding factor") +
    xlab("Presence of a binding factor") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_binding_factors)

# Entries with and without a binding factor, coloured by experimental method
nucleosome_binding_factors_methods_2 <- pdb_data %>%
    ggplot() +
    geom_bar(mapping = aes(x = has_binding_factor, fill = experimental_method)) +
    guides(fill = guide_legend(title = "Experimental method")) +
    ggtitle("Structures of nucleosomes by presence of a binding factor") +
    xlab("Presence of a binding factor") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_binding_factors_methods_2)

# Crystal structures per histone species
nucleosome_xtal_species <- pdb_data %>%
    histone_entries_for_method("X-ray diffraction") %>%
    ggplot() +
    geom_bar(mapping = aes(x = organism_scientific_name,
                           fill = organism_scientific_name)) +
    guides(fill = guide_legend(title = "Histone species")) +
    ggtitle("Crystal structures of nucleosomes by histone species") +
    xlab("") +
    ylab("Number of PDB entries") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1))
ggplotly(nucleosome_xtal_species)

# Cryo-EM structures per histone species
nucleosome_cryoem_species <- pdb_data %>%
    histone_entries_for_method("Electron Microscopy") %>%
    ggplot() +
    geom_bar(mapping = aes(x = organism_scientific_name,
                           fill = organism_scientific_name)) +
    guides(fill = guide_legend(title = "Histone species")) +
    ggtitle("Cryo-EM structures of nucleosomes by histone species") +
    xlab("") +
    ylab("Number of PDB entries") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 30, hjust = 1))
ggplotly(nucleosome_cryoem_species)

# Summary of the resolution of crystal structures. The "##" lines are the
# output captured in the rendered document (presumably as of its last update).
pdb_data %>%
    dplyr::filter(experimental_method == "X-ray diffraction") %>%
    .$resolution %>%
    summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.580   2.450   2.773   2.899   3.085   9.700

# Resolution histogram of crystal structures (0.2-wide bins), by histone species
nucleosome_xtal_resolution_species <- pdb_data %>%
    histone_entries_for_method("X-ray diffraction") %>%
    select(resolution, organism_scientific_name) %>%
    ggplot() +
    geom_histogram(aes(x = resolution, fill = organism_scientific_name),
                   binwidth = 0.2) +
    guides(fill = guide_legend(title = "Histone species")) +
    ggtitle("Resolution of nucleosome crystal structures by histone species") +
    xlab("Resolution (Å)") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_xtal_resolution_species)

# Summary of the resolution of cryo-EM structures. The "##" lines are the
# output captured in the rendered document (presumably as of its last update).
pdb_data %>%
    dplyr::filter(experimental_method == "Electron Microscopy") %>%
    .$resolution %>%
    summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.890   3.200   3.700   4.219   4.312  20.000       2

# Resolution histogram of cryo-EM structures (0.2-wide bins), by histone species
nucleosome_cryoem_resolution_species <- pdb_data %>%
    histone_entries_for_method("Electron Microscopy") %>%
    select(resolution, organism_scientific_name) %>%
    ggplot() +
    geom_histogram(aes(x = resolution, fill = organism_scientific_name),
                   binwidth = 0.2) +
    guides(fill = guide_legend(title = "Histone species")) +
    ggtitle("Resolution of nucleosome cryo-EM structures by histone species") +
    xlab("Resolution (Å)") +
    ylab("Number of PDB entries") +
    theme_bw()
ggplotly(nucleosome_cryoem_resolution_species)

# The graphs presented above are derived from the following dataset:
# Format table for display: keep the four columns shown to the reader under
# human-readable names, most recent citation year first.
pdb_table <- pdb_data %>%
    select(
        `PDB code` = pdb_id,
        `Citation year` = citation_year,
        `Experimental method` = experimental_method,
        Title = title
    ) %>%
    arrange(desc(`Citation year`))
pdb_table