Last updated on Sun Feb 11 22:55:27 2024.
Structures of free DNA by experimental method
All figures are interactive: you can zoom in, and hovering over an
element shows more information about it.
# Load required packages
library(magrittr)
library(jsonlite)
library(dplyr)
library(forcats)
library(stringr)
library(ggplot2)
library(plotly)
library(here)
# Ask the PDBe search service for every DNA molecule that is part of a
# free-DNA structure. The request returns JSON and asks for three fields per
# hit: the PDB ID, the molecule sequence and the experimental method.
pdb_query <- "https://www.ebi.ac.uk/pdbe/search/pdb/select?q=molecule_type:%22DNA%22%20AND%20assembly_composition:%22DNA%20structure%22&fl=pdb_id,molecule_sequence,experimental_method&rows=1000000&wt=json"
# Build the working dataset:
#   * parse the JSON reply and extract the `docs` element of `response`;
#   * keep a single row per PDB ID (the first one encountered);
#   * retain only X-ray diffraction, electron microscopy and solution NMR
#     entries;
#   * add the sequence length and store the method as a factor.
pdb_data <- fromJSON(pdb_query) %>%
  use_series(response) %>%
  use_series(docs) %>%
  as_tibble() %>%
  distinct(pdb_id, .keep_all = TRUE) %>%
  filter(
    experimental_method %in%
      c("X-ray diffraction", "Electron Microscopy", "Solution NMR")
  ) %>%
  mutate(dna_length = str_length(molecule_sequence)) %>%
  mutate(experimental_method = as_factor(as.character(experimental_method)))
# Summary bar chart: number of PDB entries for each experimental method
free_dna_structures <- pdb_data %>%
  ggplot() +
  geom_bar(aes(x = experimental_method)) +
  labs(
    title = "Structures of free DNA by experimental method",
    x = "",
    y = "Number of PDB entries"
  ) +
  theme_bw()
# Render the chart as an interactive plotly figure
ggplotly(free_dna_structures)
Download figure in
SVG format
DNA length in crystal structures of free DNA
# Histogram of DNA length (bins of width 1) restricted to the entries whose
# experimental method is X-ray diffraction
free_dna_length_xtal <- ggplot(
  data = filter(pdb_data, experimental_method == "X-ray diffraction")
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in crystal structures of free DNA",
    x = "DNA length (bp)",
    y = "Number of crystal structures"
  ) +
  theme_bw()
# Render the histogram as an interactive plotly figure
ggplotly(free_dna_length_xtal)
Download
figure in SVG format
DNA length in cryo-EM structures of free DNA
# Histogram of DNA length (bins of width 1) restricted to the entries whose
# experimental method is electron microscopy
free_dna_length_cryoem <- ggplot(
  data = filter(pdb_data, experimental_method == "Electron Microscopy")
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in cryo-EM structures of free DNA",
    x = "DNA length (bp)",
    y = "Number of cryo-EM structures"
  ) +
  theme_bw()
# Render the histogram as an interactive plotly figure
ggplotly(free_dna_length_cryoem)
Download
figure in SVG format
DNA length in NMR structures of free DNA
# Histogram of DNA length (bins of width 1) restricted to the entries whose
# experimental method is solution NMR
free_dna_length_nmr <- ggplot(
  data = filter(pdb_data, experimental_method == "Solution NMR")
) +
  geom_histogram(aes(x = dna_length), binwidth = 1) +
  labs(
    title = "DNA length in NMR structures of free DNA",
    x = "DNA length (bp)",
    y = "Number of NMR structures"
  ) +
  theme_bw()
# Render the histogram as an interactive plotly figure
ggplotly(free_dna_length_nmr)
Download
figure in SVG format
Dataset
The histograms presented above are derived from the following
dataset:
# Table shown to the reader: one row per entry with its PDB code and DNA
# length, sorted from the longest DNA to the shortest
pdb_table <- pdb_data %>%
  select(`PDB code` = pdb_id, `DNA length` = dna_length) %>%
  arrange(desc(`DNA length`))
# Print the table
pdb_table
Download raw
dataset in JSON format
LS0tCnRpdGxlOiAiRnJlZSBETkEgbGVuZ3RoIGluIHRoZSBQREIiCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChtZXNzYWdlID0gRkFMU0UsIHdhcm5pbmcgPSBGQUxTRSkKYGBgCgoqKkxhc3QgdXBkYXRlZCBvbiBgciBkYXRlKClgLioqCgojIyBIb3cgdG8gcmUtdXNlIHRoaXMgd29yawoKSWYgeW91IHVzZSB0aGVzZSBmaWd1cmVzIGluIHlvdXIgb3duIHdvcmssIHBsZWFzZSBjaXRlIHRoaXMgd2Vic2l0ZTogPGh0dHBzOi8vZG9pLm9yZy8xMC41MjgxL3plbm9kby4zNDcwMTE5PgoKIyMgU3RydWN0dXJlcyBvZiBmcmVlIEROQSBieSBleHBlcmltZW50YWwgbWV0aG9kCgpBbGwgZmlndXJlcyBhcmUgaW50ZXJhY3RpdmUgKHlvdSBjYW4gem9vbSBpbiwgYW5kIGhvdmVyaW5nIG92ZXIgZWxlbWVudHMgd2lsbApzaG93IG1vcmUgaW5mb3JtYXRpb24pLgoKYGBge3IgRG93bmxvYWQgYW5kIGNsZWFuIHVwIGRhdGEsIGdlbmVyYXRlIHN1bW1hcnl9CiMgTG9hZCByZXF1aXJlZCBwYWNrYWdlcwpsaWJyYXJ5KG1hZ3JpdHRyKQpsaWJyYXJ5KGpzb25saXRlKQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGZvcmNhdHMpCmxpYnJhcnkoc3RyaW5ncikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShoZXJlKQoKIyBRdWVyeSB0aGUgUERCIGZvciBhbGwgRE5BIG1vbGVjdWxlcyBpbiBzdHJ1Y3R1cmVzIG9mIGZyZWUtRE5BCnBkYl9xdWVyeSA8LSAnaHR0cHM6Ly93d3cuZWJpLmFjLnVrL3BkYmUvc2VhcmNoL3BkYi9zZWxlY3Q/cT1tb2xlY3VsZV90eXBlOiUyMkROQSUyMiUyMEFORCUyMGFzc2VtYmx5X2NvbXBvc2l0aW9uOiUyMkROQSUyMHN0cnVjdHVyZSUyMiZmbD1wZGJfaWQsbW9sZWN1bGVfc2VxdWVuY2UsZXhwZXJpbWVudGFsX21ldGhvZCZyb3dzPTEwMDAwMDAmd3Q9anNvbicKCgpwZGJfZGF0YSA8LSBwZGJfcXVlcnkgJT4lIAogICAgZnJvbUpTT04oKSAlPiUgCiAgICAuJHJlc3BvbnNlICU+JSAKICAgIC4kZG9jcyAlPiUgCiAgICBhc190aWJibGUoKSAlPiUgCiAgICBkaXN0aW5jdChwZGJfaWQsIC5rZWVwX2FsbCA9IFRSVUUpICU+JSAKICAgIGZpbHRlcihleHBlcmltZW50YWxfbWV0aG9kICVpbiUgYygiWC1yYXkgZGlmZnJhY3Rpb24iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJFbGVjdHJvbiBNaWNyb3Njb3B5IiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAiU29sdXRpb24gTk1SIikpICU+JSAKICAgIG11dGF0ZShkbmFfbGVuZ3RoID0gc3RyX2xlbmd0aChtb2xlY3VsZV9zZXF1ZW5jZSksCiAgICAgICAgICAgZXhwZXJpbWVudGFsX21ldGhvZCA9IGFzX2ZhY3Rvcihhcy5jaGFyYWN0ZXIoZXhwZXJpbWVudGFsX21ldGhvZCkpKQoKIyBTdW1tYXJ5IHN0YXRpc3RpY3MKZnJlZV9kbmFfc3RydWN0dXJlcyA8LSBnZ3Bsb3QoZGF0YSA9IHBkYl9kYXRhKSArCiAgICBnZW9tX2JhcihtYXBwaW5nID0gYWVz
KHggPSBleHBlcmltZW50YWxfbWV0aG9kKSkgKwogICAgdGhlbWVfYncoKSArCiAgICB4bGFiKCIiKSArCiAgICB5bGFiKCJOdW1iZXIgb2YgUERCIGVudHJpZXMiKSArCiAgICBnZ3RpdGxlKCJTdHJ1Y3R1cmVzIG9mIGZyZWUgRE5BIGJ5IGV4cGVyaW1lbnRhbCBtZXRob2QiKQpnZ3Bsb3RseShmcmVlX2RuYV9zdHJ1Y3R1cmVzKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgbWV0aG9kcywgaW5jbHVkZT1GQUxTRX0KIyBTYXZlIGZpZ3VyZSBmb3IgZG93bmxvYWQKaWYgKCFkaXIuZXhpc3RzKGhlcmUoImZpZ3VyZXMiKSkpIHsKICAgIGRpci5jcmVhdGUoaGVyZSgiZmlndXJlcyIpKQp9Cmdnc2F2ZShmaWxlbmFtZSA9ICJmcmVlLWRuYS1zdHJ1Y3R1cmVzLnN2ZyIsCiAgICAgICBwbG90ID0gZnJlZV9kbmFfc3RydWN0dXJlcywKICAgICAgIGRldmljZSA9ICJzdmciLAogICAgICAgcGF0aCA9IGhlcmUoImZpZ3VyZXMiKSkKYGBgCgpbKipEb3dubG9hZCBmaWd1cmUgaW4gU1ZHIGZvcm1hdCoqXShmaWd1cmVzL2ZyZWUtZG5hLXN0cnVjdHVyZXMuc3ZnKQoKIyMgRE5BIGxlbmd0aCBpbiBjcnlzdGFsIHN0cnVjdHVyZXMgb2YgZnJlZSBETkEKCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzfQpmcmVlX2RuYV9sZW5ndGhfeHRhbCA8LSBwZGJfZGF0YSAlPiUgCiAgICBmaWx0ZXIoZXhwZXJpbWVudGFsX21ldGhvZCA9PSAiWC1yYXkgZGlmZnJhY3Rpb24iKSAlPiUgCiAgICBnZ3Bsb3QoKSArCiAgICBnZW9tX2hpc3RvZ3JhbShtYXBwaW5nID0gYWVzKHggPSBkbmFfbGVuZ3RoKSwgYmlud2lkdGggPSAxKSArCiAgICB0aGVtZV9idygpICsKICAgIGdndGl0bGUoIkROQSBsZW5ndGggaW4gY3J5c3RhbCBzdHJ1Y3R1cmVzIG9mIGZyZWUgRE5BIikgKwogICAgeGxhYigiRE5BIGxlbmd0aCAoYnApIikgKwogICAgeWxhYigiTnVtYmVyIG9mIGNyeXN0YWwgc3RydWN0dXJlcyIpCmdncGxvdGx5KGZyZWVfZG5hX2xlbmd0aF94dGFsKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgeHRhbCwgaW5jbHVkZT1GQUxTRX0KIyBTYXZlIGZpZ3VyZSBmb3IgZG93bmxvYWQKaWYgKCFkaXIuZXhpc3RzKGhlcmUoImZpZ3VyZXMiKSkpIHsKICAgIGRpci5jcmVhdGUoaGVyZSgiZmlndXJlcyIpKQp9Cmdnc2F2ZShmaWxlbmFtZSA9ICJkbmEtbGVuZ3RoLWluLWZyZWUtZG5hLXh0YWwtc3RydWN0dXJlcy5zdmciLAogICAgICAgcGxvdCA9IGZyZWVfZG5hX2xlbmd0aF94dGFsLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1mcmVlLWRuYS14dGFsLXN0cnVjdHVyZXMuc3ZnKQoKIyMgRE5BIGxlbmd0aCBpbiBjcnlvLUVNIHN0cnVjdHVyZXMgb2YgZnJlZSBETkEKCmBgYHtyIEROQSBsZW5ndGggaW4gY3J5by1FTSBzdHJ1Y3R1cmVzfQpmcmVlX2RuYV9sZW5ndGhfY3J5b2VtIDwtIHBk
Yl9kYXRhICU+JSAKICAgIGZpbHRlcihleHBlcmltZW50YWxfbWV0aG9kID09ICJFbGVjdHJvbiBNaWNyb3Njb3B5IikgJT4lIAogICAgZ2dwbG90KCkgKwogICAgZ2VvbV9oaXN0b2dyYW0obWFwcGluZyA9IGFlcyh4ID0gZG5hX2xlbmd0aCksIGJpbndpZHRoID0gMSkgKwogICAgdGhlbWVfYncoKSArCiAgICBnZ3RpdGxlKCJETkEgbGVuZ3RoIGluIGNyeW8tRU0gc3RydWN0dXJlcyBvZiBmcmVlIEROQSIpICsKICAgIHhsYWIoIkROQSBsZW5ndGggKGJwKSIpICsKICAgIHlsYWIoIk51bWJlciBvZiBjcnlvLUVNIHN0cnVjdHVyZXMiKQpnZ3Bsb3RseShmcmVlX2RuYV9sZW5ndGhfY3J5b2VtKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgY3J5b2VtLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tZnJlZS1kbmEtY3J5b2VtLXN0cnVjdHVyZXMuc3ZnIiwKICAgICAgIHBsb3QgPSBmcmVlX2RuYV9sZW5ndGhfY3J5b2VtLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZpZ3VyZXMvZG5hLWxlbmd0aC1pbi1mcmVlLWRuYS1jcnlvZW0tc3RydWN0dXJlcy5zdmcpCgojIyBETkEgbGVuZ3RoIGluIE5NUiBzdHJ1Y3R1cmVzIG9mIGZyZWUgRE5BCgpgYGB7ciBETkEgbGVuZ3RoIGluIE5NUiBzdHJ1Y3R1cmVzfQpmcmVlX2RuYV9sZW5ndGhfbm1yIDwtIHBkYl9kYXRhICU+JSAKICAgIGZpbHRlcihleHBlcmltZW50YWxfbWV0aG9kID09ICJTb2x1dGlvbiBOTVIiKSAlPiUgCiAgICBnZ3Bsb3QoKSArCiAgICBnZW9tX2hpc3RvZ3JhbShtYXBwaW5nID0gYWVzKHggPSBkbmFfbGVuZ3RoKSwgYmlud2lkdGggPSAxKSArCiAgICB0aGVtZV9idygpICsKICAgIGdndGl0bGUoIkROQSBsZW5ndGggaW4gTk1SIHN0cnVjdHVyZXMgb2YgZnJlZSBETkEiKSArCiAgICB4bGFiKCJETkEgbGVuZ3RoIChicCkiKSArCiAgICB5bGFiKCJOdW1iZXIgb2YgTk1SIHN0cnVjdHVyZXMiKQpnZ3Bsb3RseShmcmVlX2RuYV9sZW5ndGhfbm1yKQpgYGAKCmBgYHtyIFNhdmUgU1ZHIGZpbGUgbm1yLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZmlndXJlIGZvciBkb3dubG9hZAppZiAoIWRpci5leGlzdHMoaGVyZSgiZmlndXJlcyIpKSkgewogICAgZGlyLmNyZWF0ZShoZXJlKCJmaWd1cmVzIikpCn0KZ2dzYXZlKGZpbGVuYW1lID0gImRuYS1sZW5ndGgtaW4tZnJlZS1kbmEtbm1yLXN0cnVjdHVyZXMuc3ZnIiwKICAgICAgIHBsb3QgPSBmcmVlX2RuYV9sZW5ndGhfbm1yLAogICAgICAgZGV2aWNlID0gInN2ZyIsCiAgICAgICBwYXRoID0gaGVyZSgiZmlndXJlcyIpKQpgYGAKClsqKkRvd25sb2FkIGZpZ3VyZSBpbiBTVkcgZm9ybWF0KipdKGZp
Z3VyZXMvZG5hLWxlbmd0aC1pbi1mcmVlLWRuYS1ubXItc3RydWN0dXJlcy5zdmcpCgojIyBEYXRhc2V0CgpUaGUgaGlzdG9ncmFtcyBwcmVzZW50ZWQgYWJvdmUgYXJlIGRlcml2ZWQgZnJvbSB0aGUgZm9sbG93aW5nIGRhdGFzZXQ6CgpgYGB7ciBFbnRpcmUgZGF0YXNldH0KIyBGb3JtYXQgdGFibGUgZm9yIGRpc3BsYXkKcGRiX3RhYmxlIDwtIHBkYl9kYXRhICU+JSAKICAgIGFycmFuZ2UoZGVzYyhkbmFfbGVuZ3RoKSkgJT4lIAogICAgc2VsZWN0KGBQREIgY29kZWAgPSBwZGJfaWQsCiAgICAgICAgICAgYEROQSBsZW5ndGhgID0gZG5hX2xlbmd0aCkKcGRiX3RhYmxlCmBgYAoKCmBgYHtyIFNhdmUgZGF0YXNldCBhcyBKU09OLCBpbmNsdWRlPUZBTFNFfQojIFNhdmUgZGF0YXNldCBmb3IgZG93bmxvYWQKaWYgKCFkaXIuZXhpc3RzKGhlcmUoImRhdGFzZXRzIikpKSB7CiAgICBkaXIuY3JlYXRlKGhlcmUoImRhdGFzZXRzIikpCn0KCndyaXRlX2pzb24ocGRiX2RhdGEsIGhlcmUoImRhdGFzZXRzIiwgImZyZWUtZG5hLXN0cnVjdHVyZXMuanNvbiIpKQpgYGAKClsqKkRvd25sb2FkIHJhdyBkYXRhc2V0IGluIEpTT04gZm9ybWF0KipdKGRhdGFzZXRzL2ZyZWUtZG5hLXN0cnVjdHVyZXMuanNvbikK