Last updated on Tue Dec 1 14:24:10 2020.

How to re-use this work

If you use these figures in your own work, please cite this website: https://doi.org/10.5281/zenodo.3470119

Structures of protein-DNA complexes by experimental method

All figures are interactive (you can zoom in, and hovering over elements will show more information).

# Load required packages
library(magrittr)
library(jsonlite)
library(dplyr)
library(forcats)
library(stringr)
library(ggplot2)
library(plotly)
library(here)

# Ask the PDBe search service for every DNA molecule that belongs to a
# protein-DNA complex. Only the PDB code, the molecule sequence and the
# experimental method are requested. The query itself is not restricted to a
# particular experimental method; that filtering happens further down.
pdb_query <- "https://www.ebi.ac.uk/pdbe/search/pdb/select?q=molecule_type:%22DNA%22%20AND%20assembly_composition:%22DNA/protein%20complex%22&fl=pdb_id,molecule_sequence,experimental_method&rows=1000000&wt=json"

# Download the JSON answer, drill down to the table of documents, keep the
# first returned row for each PDB code, retain only the three experimental
# methods plotted in this report, and derive the DNA length from the number of
# characters in the sequence string.
pdb_data <- fromJSON(pdb_query) %>%
    use_series(response) %>%
    use_series(docs) %>%
    as_tibble() %>%
    distinct(pdb_id, .keep_all = TRUE) %>%
    filter(
        experimental_method %in% c(
            "X-ray diffraction",
            "Electron Microscopy",
            "Solution NMR"
        )
    ) %>%
    mutate(
        # Stored as a factor so it can serve as a discrete plotting axis
        experimental_method = as_factor(as.character(experimental_method)),
        dna_length = str_length(molecule_sequence)
    )

# Summary: bar chart counting the PDB entries for each experimental method
protein_dna_cplx_structures <- pdb_data %>%
    ggplot() +
    geom_bar(aes(x = experimental_method)) +
    labs(
        title = "Structures of protein-DNA complexes",
        x = "",
        y = "Number of PDB entries"
    ) +
    theme_bw()
# Convert the static ggplot into an interactive plotly widget for display
ggplotly(p = protein_dna_cplx_structures)

Download figure in SVG format

DNA length in crystal structures of protein-DNA complexes

Entire distribution

# Histogram (one bin per unit of length) of DNA length over all entries whose
# experimental method is X-ray diffraction
dna_length_xtal <- ggplot(
    data = filter(pdb_data, experimental_method == "X-ray diffraction")
) +
    geom_histogram(aes(x = dna_length), binwidth = 1) +
    labs(
        title = "DNA length in crystal structures of protein-DNA complexes",
        x = "DNA length (bp)",
        y = "Number of crystal structures"
    ) +
    theme_bw()
# Convert the static ggplot into an interactive plotly widget for display
ggplotly(p = dna_length_xtal)

Download figure in SVG format

0-150 bp range

# Same histogram as the full X-ray distribution, restricted to entries whose
# DNA length is at most 150
dna_length_xtal_150 <- ggplot(
    data = filter(
        pdb_data,
        experimental_method == "X-ray diffraction",
        dna_length <= 150
    )
) +
    geom_histogram(aes(x = dna_length), binwidth = 1) +
    labs(
        title = "DNA length in crystal structures of protein-DNA complexes (0-150 bp)",
        x = "DNA length (bp)",
        y = "Number of crystal structures"
    ) +
    theme_bw()
# Convert the static ggplot into an interactive plotly widget for display
ggplotly(p = dna_length_xtal_150)

Download figure in SVG format

DNA length in cryo-EM structures of protein-DNA complexes

# Histogram (one bin per unit of length) of DNA length over all entries whose
# experimental method is Electron Microscopy
dna_length_cryoem <- ggplot(
    data = filter(pdb_data, experimental_method == "Electron Microscopy")
) +
    geom_histogram(aes(x = dna_length), binwidth = 1) +
    labs(
        title = "DNA length in cryo-EM structures of protein-DNA complexes",
        x = "DNA length (bp)",
        y = "Number of cryo-EM structures"
    ) +
    theme_bw()
# Convert the static ggplot into an interactive plotly widget for display
ggplotly(p = dna_length_cryoem)

Download figure in SVG format

DNA length in NMR structures of protein-DNA complexes

# Histogram (one bin per unit of length) of DNA length over all entries whose
# experimental method is Solution NMR
dna_length_nmr <- ggplot(
    data = filter(pdb_data, experimental_method == "Solution NMR")
) +
    geom_histogram(aes(x = dna_length), binwidth = 1) +
    labs(
        title = "DNA length in NMR structures of protein-DNA complexes",
        x = "DNA length (bp)",
        y = "Number of NMR structures"
    ) +
    theme_bw()
# Convert the static ggplot into an interactive plotly widget for display
ggplotly(p = dna_length_nmr)

Download figure in SVG format

Dataset

The histograms presented above are derived from the following dataset:

# Build the display table: keep only the PDB code and the DNA length under
# reader-friendly column names, listing the longest DNA first
pdb_table <- pdb_data %>%
    select(
        `PDB code` = pdb_id,
        `DNA length` = dna_length
    ) %>%
    arrange(desc(`DNA length`))
pdb_table

Download raw dataset in JSON format

LS0tCnRpdGxlOiAiRE5BIGxlbmd0aCBpbiBwcm90ZWluLUROQSBjb21wbGV4ZXMiCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSkKYGBgCgoqKkxhc3QgdXBkYXRlZCBvbiBgciBkYXRlKClgLioqCgojIyBIb3cgdG8gcmUtdXNlIHRoaXMgd29yawoKSWYgeW91IHVzZSB0aGVzZSBmaWd1cmVzIGluIHlvdXIgb3duIHdvcmssIHBsZWFzZSBjaXRlIHRoaXMgd2Vic2l0ZTogPGh0dHBzOi8vZG9pLm9yZy8xMC41MjgxL3plbm9kby4zNDcwMTE5PgoKIyMgU3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMgYnkgZXhwZXJpbWVudGFsIG1ldGhvZAoKQWxsIGZpZ3VyZXMgYXJlIGludGVyYWN0aXZlICh5b3UgY2FuIHpvb20gaW4sIGFuZCBob3ZlcmluZyBvdmVyIGVsZW1lbnRzIHdpbGwKc2hvdyBtb3JlIGluZm9ybWF0aW9uKS4KCmBgYHtyIERvd25sb2FkIGFuZCBjbGVhbiB1cCBkYXRhLCBnZW5lcmF0ZSBzdW1tYXJ5fQojIExvYWQgcmVxdWlyZWQgcGFja2FnZXMKbGlicmFyeShtYWdyaXR0cikKbGlicmFyeShqc29ubGl0ZSkKbGlicmFyeShkcGx5cikKbGlicmFyeShmb3JjYXRzKQpsaWJyYXJ5KHN0cmluZ3IpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShwbG90bHkpCmxpYnJhcnkoaGVyZSkKCiMgUXVlcnkgdGhlIFBEQiBmb3IgYWxsIEROQSBtb2xlY3VsZXMgaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIG9mIHByb3RlaW4tRE5BCiMgY29tcGxleGVzCnBkYl9xdWVyeSA8LSAnaHR0cHM6Ly93d3cuZWJpLmFjLnVrL3BkYmUvc2VhcmNoL3BkYi9zZWxlY3Q/cT1tb2xlY3VsZV90eXBlOiUyMkROQSUyMiUyMEFORCUyMGFzc2VtYmx5X2NvbXBvc2l0aW9uOiUyMkROQS9wcm90ZWluJTIwY29tcGxleCUyMiZmbD1wZGJfaWQsbW9sZWN1bGVfc2VxdWVuY2UsZXhwZXJpbWVudGFsX21ldGhvZCZyb3dzPTEwMDAwMDAmd3Q9anNvbicKCnBkYl9kYXRhIDwtIHBkYl9xdWVyeSAlPiUgCiAgICBmcm9tSlNPTigpICU+JSAKICAgIC4kcmVzcG9uc2UgJT4lIAogICAgLiRkb2NzICU+JSAKICAgIGFzX3RpYmJsZSgpICU+JSAKICAgIGRpc3RpbmN0KHBkYl9pZCwgLmtlZXBfYWxsID0gVFJVRSkgJT4lIAogICAgZmlsdGVyKGV4cGVyaW1lbnRhbF9tZXRob2QgJWluJSBjKCJYLXJheSBkaWZmcmFjdGlvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIkVsZWN0cm9uIE1pY3Jvc2NvcHkiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJTb2x1dGlvbiBOTVIiKSkgJT4lIAogICAgbXV0YXRlKGRuYV9sZW5ndGggPSBzdHJfbGVuZ3RoKG1vbGVjdWxlX3NlcXVlbmNlKSwKICAgICAgICAgICBleHBlcmltZW50YWxfbWV0aG9kID0gYXNfZmFjdG9yKGFzLmNoYXJhY3RlcihleHBlcmltZW50YWxfbWV0aG9kKSkpCgojIFN1bW1hcnkgc3RhdGlzdGljcwpwcm90ZWluX2RuYV9jcGx4X3N0cnVjdHVy
ZXMgPC0gZ2dwbG90KGRhdGEgPSBwZGJfZGF0YSkgKwogICAgZ2VvbV9iYXIobWFwcGluZyA9IGFlcyh4ID0gZXhwZXJpbWVudGFsX21ldGhvZCkpICsKICAgIHRoZW1lX2J3KCkgKwogICAgeGxhYigiIikgKwogICAgeWxhYigiTnVtYmVyIG9mIFBEQiBlbnRyaWVzIikgKwogICAgZ2d0aXRsZSgiU3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMiKQpnZ3Bsb3RseShwcm90ZWluX2RuYV9jcGx4X3N0cnVjdHVyZXMpCmBgYAoKYGBge3IgU2F2ZSBTVkcgZmlsZSBtZXRob2RzLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gInByb3RlaW4tZG5hLWNwbHgtc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IHByb3RlaW5fZG5hX2NwbHhfc3RydWN0dXJlcywKICAgICAgIGRldmljZSA9ICJzdmciLAogICAgICAgcGF0aCA9IGhlcmUoImZpZ3VyZXMiKSkKYGBgCgpbKipEb3dubG9hZCBmaWd1cmUgaW4gU1ZHIGZvcm1hdCoqXShmaWd1cmVzL3Byb3RlaW4tZG5hLWNwbHgtc3RydWN0dXJlcy5zdmcpCgojIyBETkEgbGVuZ3RoIGluIGNyeXN0YWwgc3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMKIyMjIEVudGlyZSBkaXN0cmlidXRpb24KCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzfQpkbmFfbGVuZ3RoX3h0YWwgPC0gcGRiX2RhdGEgJT4lIAogICAgZmlsdGVyKGV4cGVyaW1lbnRhbF9tZXRob2QgPT0gIlgtcmF5IGRpZmZyYWN0aW9uIikgJT4lIAogICAgZ2dwbG90KCkgKwogICAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gZG5hX2xlbmd0aCksIGJpbndpZHRoID0gMSkgKwogICAgdGhlbWVfYncoKSArCiAgICBnZ3RpdGxlKCJETkEgbGVuZ3RoIGluIGNyeXN0YWwgc3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMiKSArCiAgICB4bGFiKCJETkEgbGVuZ3RoIChicCkiKSArCiAgICB5bGFiKCJOdW1iZXIgb2YgY3J5c3RhbCBzdHJ1Y3R1cmVzIikKZ2dwbG90bHkoZG5hX2xlbmd0aF94dGFsKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgeHRhbCwgaW5jbHVkZT1GQUxTRX0KIyBTYXZlIGZpZ3VyZSBmb3IgZG93bmxvYWQKaWYgKCFkaXIuZXhpc3RzKGhlcmUoImZpZ3VyZXMiKSkpIHsKICAgIGRpci5jcmVhdGUoaGVyZSgiZmlndXJlcyIpKQp9Cmdnc2F2ZShmaWxlbmFtZSA9ICJkbmEtbGVuZ3RoLWluLXByb3RlaW4tZG5hLWNwbHgteHRhbC1zdHJ1Y3R1cmVzLnN2ZyIsCiAgICAgICBwbG90ID0gZG5hX2xlbmd0aF94dGFsLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LXh0YWwtc3Ry
dWN0dXJlcy5zdmcpCgojIyMgMC0xNTAgYnAgcmFuZ2UKCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIGluIDAtMTUwIGJwIHJhbmdlfQpkbmFfbGVuZ3RoX3h0YWxfMTUwIDwtIHBkYl9kYXRhICU+JSAKICAgIGZpbHRlcihleHBlcmltZW50YWxfbWV0aG9kID09ICJYLXJheSBkaWZmcmFjdGlvbiIgJiBkbmFfbGVuZ3RoIDwgMTUxKSAlPiUgCiAgICBnZ3Bsb3QoKSArCiAgICBnZW9tX2hpc3RvZ3JhbShtYXBwaW5nID0gYWVzKHggPSBkbmFfbGVuZ3RoKSwgYmlud2lkdGggPSAxKSArCiAgICB0aGVtZV9idygpICsKICAgIGdndGl0bGUoIkROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIG9mIHByb3RlaW4tRE5BIGNvbXBsZXhlcyAoMC0xNTAgYnApIikgKwogICAgeGxhYigiRE5BIGxlbmd0aCAoYnApIikgKwogICAgeWxhYigiTnVtYmVyIG9mIGNyeXN0YWwgc3RydWN0dXJlcyIpCmdncGxvdGx5KGRuYV9sZW5ndGhfeHRhbF8xNTApCmBgYAoKYGBge3IgU2F2ZSBTVkcgZmlsZSB4dGFsIDAtMTUwIGJwIHJhbmdlLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tcHJvdGVpbi1kbmEtY3BseC14dGFsLXN0cnVjdHVyZXMtMC0xNTAuc3ZnIiwKICAgICAgIHBsb3QgPSBkbmFfbGVuZ3RoX3h0YWxfMTUwLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LXh0YWwtc3RydWN0dXJlcy0wLTE1MC5zdmcpCgojIyBETkEgbGVuZ3RoIGluIGNyeW8tRU0gc3RydWN0dXJlcyBvZiBwcm90ZWluLUROQSBjb21wbGV4ZXMKCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5by1FTSBzdHJ1Y3R1cmVzfQpkbmFfbGVuZ3RoX2NyeW9lbSA8LSBwZGJfZGF0YSAlPiUgCiAgICBmaWx0ZXIoZXhwZXJpbWVudGFsX21ldGhvZCA9PSAiRWxlY3Ryb24gTWljcm9zY29weSIpICU+JSAKICAgIGdncGxvdCgpICsKICAgIGdlb21faGlzdG9ncmFtKG1hcHBpbmcgPSBhZXMoeCA9IGRuYV9sZW5ndGgpLCBiaW53aWR0aCA9IDEpICsKICAgIHRoZW1lX2J3KCkgKwogICAgZ2d0aXRsZSgiRE5BIGxlbmd0aCBpbiBjcnlvLUVNIHN0cnVjdHVyZXMgb2YgcHJvdGVpbi1ETkEgY29tcGxleGVzIikgKwogICAgeGxhYigiRE5BIGxlbmd0aCAoYnApIikgKwogICAgeWxhYigiTnVtYmVyIG9mIGNyeW8tRU0gc3RydWN0dXJlcyIpCmdncGxvdGx5KGRuYV9sZW5ndGhfY3J5b2VtKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgY3J5b2VtLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJl
cyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tcHJvdGVpbi1kbmEtY3BseC1jcnlvZW0tc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IGRuYV9sZW5ndGhfY3J5b2VtLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LWNyeW9lbS1zdHJ1Y3R1cmVzLnN2ZykKCiMjIEROQSBsZW5ndGggaW4gTk1SIHN0cnVjdHVyZXMgb2YgcHJvdGVpbi1ETkEgY29tcGxleGVzCgpgYGB7ciBETkEgbGVuZ3RoIGluIE5NUiBzdHJ1Y3R1cmVzfQpkbmFfbGVuZ3RoX25tciA8LSBwZGJfZGF0YSAlPiUgCiAgICBmaWx0ZXIoZXhwZXJpbWVudGFsX21ldGhvZCA9PSAiU29sdXRpb24gTk1SIikgJT4lIAogICAgZ2dwbG90KCkgKwogICAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gZG5hX2xlbmd0aCksIGJpbndpZHRoID0gMSkgKwogICAgdGhlbWVfYncoKSArCiAgICBnZ3RpdGxlKCJETkEgbGVuZ3RoIGluIE5NUiBzdHJ1Y3R1cmVzIG9mIHByb3RlaW4tRE5BIGNvbXBsZXhlcyIpICsKICAgIHhsYWIoIkROQSBsZW5ndGggKGJwKSIpICsKICAgIHlsYWIoIk51bWJlciBvZiBOTVIgc3RydWN0dXJlcyIpCmdncGxvdGx5KGRuYV9sZW5ndGhfbm1yKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgbm1yLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tcHJvdGVpbi1kbmEtY3BseC1ubXItc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IGRuYV9sZW5ndGhfbm1yLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1wcm90ZWluLWRuYS1jcGx4LW5tci1zdHJ1Y3R1cmVzLnN2ZykKCiMjIERhdGFzZXQKClRoZSBoaXN0b2dyYW1zIHByZXNlbnRlZCBhYm92ZSBhcmUgZGVyaXZlZCBmcm9tIHRoZSBmb2xsb3dpbmcgZGF0YXNldDoKCmBgYHtyIEVudGlyZSBkYXRhc2V0fQojIEZvcm1hdCB0YWJsZSBmb3IgZGlzcGxheQpwZGJfdGFibGUgPC0gcGRiX2RhdGEgJT4lIAogICAgYXJyYW5nZShkZXNjKGRuYV9sZW5ndGgpKSAlPiUgCiAgICBzZWxlY3QoYFBEQiBjb2RlYCA9IHBkYl9pZCwKICAgICAgICAgICBgRE5BIGxlbmd0aGAgPSBkbmFfbGVuZ3RoKQpwZGJfdGFibGUKYGBgCgoKYGBge3IgU2F2ZSBkYXRhc2V0IGFzIEpTT04sIGluY2x1ZGU9RkFMU0V9CiMgU2F2ZSBkYXRhc2V0IGZvciBkb3dubG9hZApp
ZiAoIWRpci5leGlzdHMoaGVyZSgiZGF0YXNldHMiKSkpIHsKICAgIGRpci5jcmVhdGUoaGVyZSgiZGF0YXNldHMiKSkKfQoKd3JpdGVfanNvbihwZGJfZGF0YSwgaGVyZSgiZGF0YXNldHMiLCAicHJvdGVpbi1kbmEtY3BseC1zdHJ1Y3R1cmVzLmpzb24iKSkKYGBgCgpbKipEb3dubG9hZCByYXcgZGF0YXNldCBpbiBKU09OIGZvcm1hdCoqXShkYXRhc2V0cy9wcm90ZWluLWRuYS1jcGx4LXN0cnVjdHVyZXMuanNvbikK