Last updated on Sun Feb 11 22:55:44 2024.
If you use the following figures in your own work, please cite:
# Load required packages
library(magrittr)
library(dplyr)
library(purrr)
library(jsonlite)
library(forcats)
library(ggplot2)
library(plotly)
library(here)
library(stringr)
# We need three queries, one per title pattern: uppercase titles, lowercase
# titles, and titles containing "NCP". The leading letter of "nucleosome" is
# left out of the first two patterns and covered by the wildcard instead.
# The queries differ only in the title pattern, so the shared parts of the URL
# are written once and every query is assembled from the same template.
pdb_queries <- vapply(
  c(uppercase = "*UCLEOSOM*", lowercase = "*ucleosom*", ncp = "*NCP*"),
  function(title_pattern) {
    paste0(
      "https://www.ebi.ac.uk/pdbe/search/pdb/select",
      # Only entries with status REL whose title matches the pattern
      "?q=title:", title_pattern, "%20AND%20status:REL",
      # Fields requested for each entry
      "&fl=",
      paste(
        c(
          "pdb_id", "citation_year", "title", "experimental_method",
          "resolution", "organism_scientific_name", "molecule_name",
          "molecule_type", "number_of_protein_chains"
        ),
        collapse = ","
      ),
      "&rows=1000000&wt=json"
    )
  },
  character(1)
)
# The following PDB entries have the word "nucleosome" in their title, but do
# not actually contain a nucleosome, so they are excluded from the analysis.
# This list is curated by hand: there is no good way to automate it.
non_nucleosome_structures <- c(
  "1hst", "2z2r", "5x7v", "3uv2", "3fs3", "1wg3", "1nw3", "5ikf",
  "3gyw", "3gyv", "1ofc", "2ayu", "2iw5", "3hfd", "6uch", "5r4k",
  "5r4m", "5r4l", "5r4o", "5r4g", "5r4h", "5r4j", "5r4i", "5r4n",
  "1bj6", "6qds", "2iwj", "4dvk", "1a6b", "6qdu", "6ne8", "1tsu",
  "1esk", "7c4j"
)
# Helper that extracts the result records from one parsed query response:
# it takes the `docs` element nested inside the `response` element and
# returns it as a tibble.
dig_up_data <- function(value) {
  as_tibble(value$response$docs)
}
# Helper function to detect the presence of a binding factor in ONE entry.
#
# Arguments:
#   number_of_protein_chains  number of protein chains in the entry (length 1)
#   title                     title of the entry (length 1)
#
# Returns TRUE when the entry appears to contain a binding factor, FALSE when
# it does not, and NA when the chain count is missing and the title alone is
# not decisive.
has_binding_factor_one <- function(number_of_protein_chains, title) {
  # This function works on a single entry; has_binding_factor() maps it over
  # whole columns.
  stopifnot(length(number_of_protein_chains) == 1L, length(title) == 1L)

  # Literal substring match; a missing title counts as "no match" rather than
  # propagating NA into the conditions below.
  title_mentions <- function(keyword) {
    isTRUE(grepl(keyword, title, fixed = TRUE))
  }

  # Multiples of 8 chains above 8 (i.e. 16, 24, 32, 40) can occur by chance
  # even with binding factors. There is no good way to find these cases
  # automatically, so the known ones are recognized by their title.
  if (title_mentions("COMPASS") || title_mentions("LSD1/CoREST")) {
    return(TRUE)
  }

  # The overlapping dinucleosome has 14 protein chains, but they are all
  # histones (one octamer + one hexamer), so it must not be counted as
  # carrying a binding factor.
  if (title_mentions("unusual")) {
    return(FALSE)
  }

  # Without a chain count the divisibility rule below cannot be applied.
  if (is.na(number_of_protein_chains)) {
    return(NA)
  }

  # It takes 8 histone chains to make a nucleosome, so a number of protein
  # chains that is not divisible by 8 means there is a binding factor.
  number_of_protein_chains %% 8 != 0
}
# Vectorized wrapper: applies has_binding_factor_one() to each pair of
# (number of protein chains, title) and returns a logical vector.
has_binding_factor <- function(number_of_protein_chains_vector, title) {
  map2_lgl(
    .x = number_of_protein_chains_vector,
    .y = title,
    .f = has_binding_factor_one
  )
}
# Query the PDB and clean up data:
#   1. download and parse each query, keeping only the result records;
#   2. stack the results of all queries into a single table;
#   3. drop the entries listed in non_nucleosome_structures;
#   4. flag binding factors and coerce the column types used downstream;
#   5. keep one row per PDB entry (the queries can return the same entry).
pdb_data <- pdb_queries %>%
  map(function(query_url) dig_up_data(fromJSON(query_url))) %>%
  bind_rows() %>%
  filter(!pdb_id %in% non_nucleosome_structures) %>%
  mutate(
    has_binding_factor = has_binding_factor(number_of_protein_chains, title),
    experimental_method = as_factor(as.character(experimental_method)),
    citation_year = as.integer(citation_year),
    molecule_name = as.character(molecule_name),
    molecule_type = as_factor(as.character(molecule_type))
  ) %>%
  distinct(pdb_id, .keep_all = TRUE)
All figures are interactive (you can zoom in, and hovering over elements will show more information).
# Bar chart of PDB entries per citation year, filled by experimental method
nucleosome_structures_year <- ggplot(pdb_data) +
  geom_bar(aes(x = citation_year, fill = experimental_method)) +
  guides(fill = guide_legend(title = "Experimental method")) +
  labs(
    title = "Structures of nucleosomes by year",
    x = "Publication year",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_structures_year)
# Bar chart of PDB entries per citation year, filled by the binding-factor flag
nucleosome_structures_year_binding_factor <- ggplot(pdb_data) +
  geom_bar(aes(x = citation_year, fill = has_binding_factor)) +
  guides(fill = guide_legend(title = "Contains a binding factor")) +
  labs(
    title = "Structures of nucleosomes by year",
    x = "Publication year",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_structures_year_binding_factor)
# Bar chart of PDB entries per experimental method (one colour per method)
nucleosome_structures_method <- ggplot(pdb_data) +
  geom_bar(aes(x = experimental_method, fill = experimental_method)) +
  guides(fill = guide_legend(title = "Experimental method")) +
  labs(
    title = "Structures of nucleosomes by experimental method",
    x = "",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_structures_method)
# Bar chart of PDB entries per experimental method, filled by the
# binding-factor flag
nucleosome_binding_factors_methods <- ggplot(pdb_data) +
  geom_bar(aes(x = experimental_method, fill = has_binding_factor)) +
  guides(fill = guide_legend(title = "Presence of a binding factor")) +
  labs(
    title = "Structures of nucleosomes by presence of a binding factor",
    x = "Experimental method",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_binding_factors_methods)
# Bar chart of PDB entries with and without a binding factor
nucleosome_binding_factors <- ggplot(pdb_data) +
  geom_bar(aes(x = has_binding_factor, fill = has_binding_factor)) +
  guides(fill = guide_legend(title = "Presence of a binding factor")) +
  labs(
    title = "Structures of nucleosomes by presence of a binding factor",
    x = "Presence of a binding factor",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_binding_factors)
# Bar chart of PDB entries with and without a binding factor, filled by
# experimental method
nucleosome_binding_factors_methods_2 <- ggplot(pdb_data) +
  geom_bar(aes(x = has_binding_factor, fill = experimental_method)) +
  guides(fill = guide_legend(title = "Experimental method")) +
  labs(
    title = "Structures of nucleosomes by presence of a binding factor",
    x = "Presence of a binding factor",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_binding_factors_methods_2)
# Bar chart of X-ray diffraction entries per organism, restricted to entries
# whose molecule name contains "Histone H"
nucleosome_xtal_species <- pdb_data %>%
  filter(experimental_method == "X-ray diffraction") %>%
  mutate(is_histone = str_detect(molecule_name, "Histone H")) %>%
  filter(is_histone) %>%
  # Build the factor only after filtering, so its levels come from the rows
  # that are actually plotted
  mutate(
    organism_scientific_name = as_factor(as.character(organism_scientific_name))
  ) %>%
  ggplot() +
  geom_bar(
    aes(x = organism_scientific_name, fill = organism_scientific_name)
  ) +
  guides(fill = guide_legend(title = "Histone species")) +
  labs(
    title = "Crystal structures of nucleosomes by histone species",
    x = "",
    y = "Number of PDB entries"
  ) +
  theme_bw() +
  # Tilt the x-axis labels
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_xtal_species)
# Bar chart of electron microscopy entries per organism, restricted to entries
# whose molecule name contains "Histone H"
nucleosome_cryoem_species <- pdb_data %>%
  filter(experimental_method == "Electron Microscopy") %>%
  mutate(is_histone = str_detect(molecule_name, "Histone H")) %>%
  filter(is_histone) %>%
  # Build the factor only after filtering, so its levels come from the rows
  # that are actually plotted
  mutate(
    organism_scientific_name = as_factor(as.character(organism_scientific_name))
  ) %>%
  ggplot() +
  geom_bar(
    aes(x = organism_scientific_name, fill = organism_scientific_name)
  ) +
  guides(fill = guide_legend(title = "Histone species")) +
  labs(
    title = "Cryo-EM structures of nucleosomes by histone species",
    x = "",
    y = "Number of PDB entries"
  ) +
  theme_bw() +
  # Tilt the x-axis labels
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_cryoem_species)
# Summary statistics of the resolution of the X-ray diffraction entries
pdb_data %>%
  dplyr::filter(experimental_method == "X-ray diffraction") %>%
  dplyr::pull(resolution) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.580 2.450 2.773 2.899 3.085 9.700
# Histogram of the resolution of X-ray diffraction entries, filled by
# organism, restricted to entries whose molecule name contains "Histone H"
nucleosome_xtal_resolution_species <- pdb_data %>%
  filter(experimental_method == "X-ray diffraction") %>%
  mutate(is_histone = str_detect(molecule_name, "Histone H")) %>%
  filter(is_histone) %>%
  # Build the factor only after filtering, so its levels come from the rows
  # that are actually plotted
  mutate(
    organism_scientific_name = as_factor(as.character(organism_scientific_name))
  ) %>%
  select(resolution, organism_scientific_name) %>%
  ggplot() +
  geom_histogram(
    aes(x = resolution, fill = organism_scientific_name),
    binwidth = 0.2
  ) +
  guides(fill = guide_legend(title = "Histone species")) +
  labs(
    title = "Resolution of nucleosome crystal structures by histone species",
    x = "Resolution (Å)",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_xtal_resolution_species)
# Summary statistics of the resolution of the electron microscopy entries
pdb_data %>%
  dplyr::filter(experimental_method == "Electron Microscopy") %>%
  dplyr::pull(resolution) %>%
  summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.890 3.200 3.700 4.219 4.312 20.000 2
# Histogram of the resolution of electron microscopy entries, filled by
# organism, restricted to entries whose molecule name contains "Histone H"
nucleosome_cryoem_resolution_species <- pdb_data %>%
  filter(experimental_method == "Electron Microscopy") %>%
  mutate(is_histone = str_detect(molecule_name, "Histone H")) %>%
  filter(is_histone) %>%
  # Build the factor only after filtering, so its levels come from the rows
  # that are actually plotted
  mutate(
    organism_scientific_name = as_factor(as.character(organism_scientific_name))
  ) %>%
  select(resolution, organism_scientific_name) %>%
  ggplot() +
  geom_histogram(
    aes(x = resolution, fill = organism_scientific_name),
    binwidth = 0.2
  ) +
  guides(fill = guide_legend(title = "Histone species")) +
  labs(
    title = "Resolution of nucleosome cryo-EM structures by histone species",
    x = "Resolution (Å)",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Convert the ggplot object to its interactive plotly version
ggplotly(nucleosome_cryoem_resolution_species)
The graphs presented above are derived from the following dataset:
# Format table for display: keep the four columns of interest under
# human-readable names, most recent citation year first
pdb_table <- pdb_data %>%
  select(
    `PDB code` = pdb_id,
    `Citation year` = citation_year,
    `Experimental method` = experimental_method,
    Title = title
  ) %>%
  arrange(desc(`Citation year`))
# Print the table
pdb_table