Last updated on Sun Feb 11 22:55:10 2024.
Structures of protein-DNA complexes by experimental method
All figures are interactive (you can zoom in, and hovering over
elements will show more information).
# Load required packages
library(magrittr)
library(jsonlite)
library(dplyr)
library(forcats)
library(stringr)
library(ggplot2)
library(plotly)
library(here)
# Ask the PDBe search service for the DNA molecules of every entry whose
# assembly is a DNA/protein complex (the query itself does not restrict the
# experimental method). Requested fields: PDB id, molecule sequence and
# experimental method.
pdb_query <- "https://www.ebi.ac.uk/pdbe/search/pdb/select?q=molecule_type:%22DNA%22%20AND%20assembly_composition:%22DNA/protein%20complex%22&fl=pdb_id,molecule_sequence,experimental_method&rows=1000000&wt=json"

# Parse the JSON reply and turn the returned documents into a tibble.
# distinct() keeps only the first row returned for each pdb_id, so every
# entry contributes a single DNA molecule to the statistics below.
pdb_data <- fromJSON(pdb_query)$response$docs %>%
  as_tibble() %>%
  distinct(pdb_id, .keep_all = TRUE) %>%
  # Restrict the dataset to the three methods plotted in this report
  filter(
    experimental_method %in%
      c("X-ray diffraction", "Electron Microscopy", "Solution NMR")
  ) %>%
  # DNA length is the number of characters in the sequence string
  mutate(dna_length = str_length(molecule_sequence)) %>%
  mutate(experimental_method = as_factor(as.character(experimental_method)))
# Summary: bar chart counting the retained PDB entries per experimental method
protein_dna_cplx_structures <- ggplot(pdb_data) +
  geom_bar(aes(x = experimental_method)) +
  labs(
    title = "Structures of protein-DNA complexes",
    x = "",
    y = "Number of PDB entries"
  ) +
  theme_bw()

# Display the chart as an interactive plotly widget
ggplotly(protein_dna_cplx_structures)
Download figure in SVG format
DNA length in crystal structures of protein-DNA complexes
Entire distribution
# Histogram (bins of width 1) of DNA length over all X-ray diffraction entries
dna_length_xtal <- ggplot(
  filter(pdb_data, experimental_method == "X-ray diffraction")
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in crystal structures of protein-DNA complexes",
    x = "DNA length (bp)",
    y = "Number of crystal structures"
  ) +
  theme_bw()

# Display the histogram as an interactive plotly widget
ggplotly(dna_length_xtal)
Download figure in SVG format
0-150 bp range
# Same histogram as for the full X-ray distribution, but limited to entries
# whose DNA length is at most 150
dna_length_xtal_150 <- ggplot(
  filter(
    pdb_data,
    experimental_method == "X-ray diffraction",
    dna_length <= 150
  )
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in crystal structures of protein-DNA complexes (0-150 bp)",
    x = "DNA length (bp)",
    y = "Number of crystal structures"
  ) +
  theme_bw()

# Display the histogram as an interactive plotly widget
ggplotly(dna_length_xtal_150)
Download figure in SVG format
DNA length in cryo-EM structures of protein-DNA complexes
# Histogram (bins of width 1) of DNA length over the Electron Microscopy entries
dna_length_cryoem <- ggplot(
  filter(pdb_data, experimental_method == "Electron Microscopy")
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in cryo-EM structures of protein-DNA complexes",
    x = "DNA length (bp)",
    y = "Number of cryo-EM structures"
  ) +
  theme_bw()

# Display the histogram as an interactive plotly widget
ggplotly(dna_length_cryoem)
Download figure in SVG format
DNA length in NMR structures of protein-DNA complexes
# Histogram (bins of width 1) of DNA length over the Solution NMR entries
dna_length_nmr <- ggplot(
  filter(pdb_data, experimental_method == "Solution NMR")
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in NMR structures of protein-DNA complexes",
    x = "DNA length (bp)",
    y = "Number of NMR structures"
  ) +
  theme_bw()

# Display the histogram as an interactive plotly widget
ggplotly(dna_length_nmr)
Download figure in SVG format
Dataset
The histograms presented above are derived from the following
dataset:
# Build the table shown in the report: keep only the PDB id and the DNA
# length under human-readable column names, longest DNA first
pdb_table <- pdb_data %>%
  select(
    `PDB code` = pdb_id,
    `DNA length` = dna_length
  ) %>%
  arrange(desc(`DNA length`))

# Auto-print the table so that it appears in the rendered document
pdb_table
Download raw dataset in JSON format
LS0tCnRpdGxlOiAiRE5BIGxlbmd0aCBpbiBwcm90ZWluLUROQSBjb21wbGV4ZXMiCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSkKYGBgCgoqKkxhc3QgdXBkYXRlZCBvbiBgciBkYXRlKClgLioqCgojIyBIb3cgdG8gcmUtdXNlIHRoaXMgd29yawoKSWYgeW91IHVzZSB0aGVzZSBmaWd1cmVzIGluIHlvdXIgb3duIHdvcmssIHBsZWFzZSBjaXRlIHRoaXMgd2Vic2l0ZTogPGh0dHBzOi8vZG9pLm9yZy8xMC41MjgxL3plbm9kby4zNDcwMTE5PgoKIyMgU3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMgYnkgZXhwZXJpbWVudGFsIG1ldGhvZAoKQWxsIGZpZ3VyZXMgYXJlIGludGVyYWN0aXZlICh5b3UgY2FuIHpvb20gaW4sIGFuZCBob3ZlcmluZyBvdmVyIGVsZW1lbnRzIHdpbGwKc2hvdyBtb3JlIGluZm9ybWF0aW9uKS4KCmBgYHtyIERvd25sb2FkIGFuZCBjbGVhbiB1cCBkYXRhLCBnZW5lcmF0ZSBzdW1tYXJ5fQojIExvYWQgcmVxdWlyZWQgcGFja2FnZXMKbGlicmFyeShtYWdyaXR0cikKbGlicmFyeShqc29ubGl0ZSkKbGlicmFyeShkcGx5cikKbGlicmFyeShmb3JjYXRzKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShwbG90bHkpCmxpYnJhcnkoaGVyZSkKCiMgUXVlcnkgdGhlIFBEQiBmb3IgYWxsIEROQSBtb2xlY3VsZXMgaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIG9mIHByb3RlaW4tRE5BCiMgY29tcGxleGVzCnBkYl9xdWVyeSA8LSAnaHR0cHM6Ly93d3cuZWJpLmFjLnVrL3BkYmUvc2VhcmNoL3BkYi9zZWxlY3Q/cT1tb2xlY3VsZV90eXBlOiUyMkROQSUyMiUyMEFORCUyMGFzc2VtYmx5X2NvbXBvc2l0aW9uOiUyMkROQS9wcm90ZWluJTIwY29tcGxleCUyMiZmbD1wZGJfaWQsbW9sZWN1bGVfc2VxdWVuY2UsZXhwZXJpbWVudGFsX21ldGhvZCZyb3dzPTEwMDAwMDAmd3Q9anNvbicKCnBkYl9kYXRhIDwtIHBkYl9xdWVyeSAlPiUgCiAgICBmcm9tSlNPTigpICU+JSAKICAgIC4kcmVzcG9uc2UgJT4lIAogICAgLiRkb2NzICU+JSAKICAgIGFzX3RpYmJsZSgpICU+JSAKICAgIGRpc3RpbmN0KHBkYl9pZCwgLmtlZXBfYWxsID0gVFJVRSkgJT4lIAogICAgZmlsdGVyKGV4cGVyaW1lbnRhbF9tZXRob2QgJWluJSBjKCJYLXJheSBkaWZmcmFjdGlvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIkVsZWN0cm9uIE1pY3Jvc2NvcHkiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJTb2x1dGlvbiBOTVIiKSkgJT4lIAogICAgbXV0YXRlKGRuYV9sZW5ndGggPSBzdHJfbGVuZ3RoKG1vbGVjdWxlX3NlcXVlbmNlKSwKICAgICAgICAgICBleHBlcmltZW50YWxfbWV0aG9kID0gYXNfZmFjdG9yKGFzLmNoYXJhY3RlcihleHBlcmltZW50YWxfbWV0aG9kKSkpCgojIFN1bW1hcnkgc3RhdGlzdGljcwpwcm90ZWluX2RuYV9jcGx4X3N0cnVjdHVy
ZXMgPC0gZ2dwbG90KGRhdGEgPSBwZGJfZGF0YSkgKwogICAgZ2VvbV9iYXIobWFwcGluZyA9IGFlcyh4ID0gZXhwZXJpbWVudGFsX21ldGhvZCkpICsKICAgIHRoZW1lX2J3KCkgKwogICAgeGxhYigiIikgKwogICAgeWxhYigiTnVtYmVyIG9mIFBEQiBlbnRyaWVzIikgKwogICAgZ2d0aXRsZSgiU3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMiKQpnZ3Bsb3RseShwcm90ZWluX2RuYV9jcGx4X3N0cnVjdHVyZXMpCmBgYAoKYGBge3IgU2F2ZSBTVkcgZmlsZSBtZXRob2RzLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gInByb3RlaW4tZG5hLWNwbHgtc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IHByb3RlaW5fZG5hX2NwbHhfc3RydWN0dXJlcywKICAgICAgIGRldmljZSA9ICJzdmciLAogICAgICAgcGF0aCA9IGhlcmUoImZpZ3VyZXMiKSkKYGBgCgpbKipEb3dubG9hZCBmaWd1cmUgaW4gU1ZHIGZvcm1hdCoqXShmaWd1cmVzL3Byb3RlaW4tZG5hLWNwbHgtc3RydWN0dXJlcy5zdmcpCgojIyBETkEgbGVuZ3RoIGluIGNyeXN0YWwgc3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMKIyMjIEVudGlyZSBkaXN0cmlidXRpb24KCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzfQpkbmFfbGVuZ3RoX3h0YWwgPC0gcGRiX2RhdGEgJT4lIAogICAgZmlsdGVyKGV4cGVyaW1lbnRhbF9tZXRob2QgPT0gIlgtcmF5IGRpZmZyYWN0aW9uIikgJT4lIAogICAgZ2dwbG90KCkgKwogICAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gZG5hX2xlbmd0aCksIGJpbndpZHRoID0gMSkgKwogICAgdGhlbWVfYncoKSArCiAgICBnZ3RpdGxlKCJETkEgbGVuZ3RoIGluIGNyeXN0YWwgc3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMiKSArCiAgICB4bGFiKCJETkEgbGVuZ3RoIChicCkiKSArCiAgICB5bGFiKCJOdW1iZXIgb2YgY3J5c3RhbCBzdHJ1Y3R1cmVzIikKZ2dwbG90bHkoZG5hX2xlbmd0aF94dGFsKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgeHRhbCwgaW5jbHVkZT1GQUxTRX0KIyBTYXZlIGZpZ3VyZSBmb3IgZG93bmxvYWQKaWYgKCFkaXIuZXhpc3RzKGhlcmUoImZpZ3VyZXMiKSkpIHsKICAgIGRpci5jcmVhdGUoaGVyZSgiZmlndXJlcyIpKQp9Cmdnc2F2ZShmaWxlbmFtZSA9ICJkbmEtbGVuZ3RoLWluLXByb3RlaW4tZG5hLWNwbHgteHRhbC1zdHJ1Y3R1cmVzLnN2ZyIsCiAgICAgICBwbG90ID0gZG5hX2xlbmd0aF94dGFsLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LXh0YWwtc3Ry
dWN0dXJlcy5zdmcpCgojIyMgMC0xNTAgYnAgcmFuZ2UKCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIGluIDAtMTUwIGJwIHJhbmdlfQpkbmFfbGVuZ3RoX3h0YWxfMTUwIDwtIHBkYl9kYXRhICU+JSAKICAgIGZpbHRlcihleHBlcmltZW50YWxfbWV0aG9kID09ICJYLXJheSBkaWZmcmFjdGlvbiIgJiBkbmFfbGVuZ3RoIDwgMTUxKSAlPiUgCiAgICBnZ3Bsb3QoKSArCiAgICBnZW9tX2hpc3RvZ3JhbShtYXBwaW5nID0gYWVzKHggPSBkbmFfbGVuZ3RoKSwgYmlud2lkdGggPSAxKSArCiAgICB0aGVtZV9idygpICsKICAgIGdndGl0bGUoIkROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIG9mIHByb3RlaW4tRE5BIGNvbXBsZXhlcyAoMC0xNTAgYnApIikgKwogICAgeGxhYigiRE5BIGxlbmd0aCAoYnApIikgKwogICAgeWxhYigiTnVtYmVyIG9mIGNyeXN0YWwgc3RydWN0dXJlcyIpCmdncGxvdGx5KGRuYV9sZW5ndGhfeHRhbF8xNTApCmBgYAoKYGBge3IgU2F2ZSBTVkcgZmlsZSB4dGFsIDAtMTUwIGJwIHJhbmdlLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tcHJvdGVpbi1kbmEtY3BseC14dGFsLXN0cnVjdHVyZXMtMC0xNTAuc3ZnIiwKICAgICAgIHBsb3QgPSBkbmFfbGVuZ3RoX3h0YWxfMTUwLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LXh0YWwtc3RydWN0dXJlcy0wLTE1MC5zdmcpCgojIyBETkEgbGVuZ3RoIGluIGNyeW8tRU0gc3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMKCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5by1FTSBzdHJ1Y3R1cmVzfQpkbmFfbGVuZ3RoX2NyeW9lbSA8LSBwZGJfZGF0YSAlPiUgCiAgICBmaWx0ZXIoZXhwZXJpbWVudGFsX21ldGhvZCA9PSAiRWxlY3Ryb24gTWljcm9zY29weSIpICU+JSAKICAgIGdncGxvdCgpICsKICAgIGdlb21faGlzdG9ncmFtKG1hcHBpbmcgPSBhZXMoeCA9IGRuYV9sZW5ndGgpLCBiaW53aWR0aCA9IDEpICsKICAgIHRoZW1lX2J3KCkgKwogICAgZ2d0aXRsZSgiRE5BIGxlbmd0aCBpbiBjcnlvLUVNIHN0cnVjdHVyZXMgb2YgcHJvdGVpbi1ETkEgY29tcGxleGVzIikgKwogICAgeGxhYigiRE5BIGxlbmd0aCAoYnApIikgKwogICAgeWxhYigiTnVtYmVyIG9mIGNyeW8tRU0gc3RydWN0dXJlcyIpCmdncGxvdGx5KGRuYV9sZW5ndGhfY3J5b2VtKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgY3J5b2VtLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJl
cyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tcHJvdGVpbi1kbmEtY3BseC1jcnlvZW0tc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IGRuYV9sZW5ndGhfY3J5b2VtLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LWNyeW9lbS1zdHJ1Y3R1cmVzLnN2ZykKCiMjIEROQSBsZW5ndGggaW4gTk1SIHN0cnVjdHVyZXMgb2YgcHJvdGVpbi1ETkEgY29tcGxleGVzCgpgYGB7ciBETkEgbGVuZ3RoIGluIE5NUiBzdHJ1Y3R1cmVzfQpkbmFfbGVuZ3RoX25tciA8LSBwZGJfZGF0YSAlPiUgCiAgICBmaWx0ZXIoZXhwZXJpbWVudGFsX21ldGhvZCA9PSAiU29sdXRpb24gTk1SIikgJT4lIAogICAgZ2dwbG90KCkgKwogICAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gZG5hX2xlbmd0aCksIGJpbndpZHRoID0gMSkgKwogICAgdGhlbWVfYncoKSArCiAgICBnZ3RpdGxlKCJETkEgbGVuZ3RoIGluIE5NUiBzdHJ1Y3R1cmVzIG9mIHByb3RlaW4tRE5BIGNvbXBsZXhlcyIpICsKICAgIHhsYWIoIkROQSBsZW5ndGggKGJwKSIpICsKICAgIHlsYWIoIk51bWJlciBvZiBOTVIgc3RydWN0dXJlcyIpCmdncGxvdGx5KGRuYV9sZW5ndGhfbm1yKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgbm1yLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tcHJvdGVpbi1kbmEtY3BseC1ubXItc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IGRuYV9sZW5ndGhfbm1yLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LW5tci1zdHJ1Y3R1cmVzLnN2ZykKCiMjIERhdGFzZXQKClRoZSBoaXN0b2dyYW1zIHByZXNlbnRlZCBhYm92ZSBhcmUgZGVyaXZlZCBmcm9tIHRoZSBmb2xsb3dpbmcgZGF0YXNldDoKCmBgYHtyIEVudGlyZSBkYXRhc2V0fQojIEZvcm1hdCB0YWJsZSBmb3IgZGlzcGxheQpwZGJfdGFibGUgPC0gcGRiX2RhdGEgJT4lIAogICAgYXJyYW5nZShkZXNjKGRuYV9sZW5ndGgpKSAlPiUgCiAgICBzZWxlY3QoYFBEQiBjb2RlYCA9IHBkYl9pZCwKICAgICAgICAgICBgRE5BIGxlbmd0aGAgPSBkbmFfbGVuZ3RoKQpwZGJfdGFibGUKYGBgCgoKYGBge3IgU2F2ZSBkYXRhc2V0IGFzIEpTT04sIGluY2x1ZGU9RkFMU0V9CiMgU2F2ZSBkYXRhc2V0IGZvciBkb3dubG9hZApp
ZiAoIWRpci5leGlzdHMoaGVyZSgiZGF0YXNldHMiKSkpIHsKICAgIGRpci5jcmVhdGUoaGVyZSgiZGF0YXNldHMiKSkKfQoKd3JpdGVfanNvbihwZGJfZGF0YSwgaGVyZSgiZGF0YXNldHMiLCAicHJvdGVpbi1kbmEtY3BseC1zdHJ1Y3R1cmVzLmpzb24iKSkKYGBgCgpbKipEb3dubG9hZCByYXcgZGF0YXNldCBpbiBKU09OIGZvcm1hdCoqXShkYXRhc2V0cy9wcm90ZWluLWRuYS1jcGx4LXN0cnVjdHVyZXMuanNvbikK