This vignette has two layers.
The first layer is researcher-facing: a concrete compound-triage workflow that answers the question “what can I actually do with PubChemR?”.
The second layer is systems-facing: the transport, caching, async, batching, and typed-result machinery that makes those workflows reproducible and robust.
It is designed to be:
How to use this guide:
How to read each section:
Purpose tells you when to use a function, the Minimal example shows the smallest valid call, the Typical example shows a realistic workflow call, the Advanced example shows how the function behaves in robust pipeline code, and Interpretation explains how to read the returned object.
success, error, pending, from_cache, and request metadata from request_args().
CID, SID, AID), and whether the output is already suitable for joining, plotting, or modeling.
This section answers the practical question a typical PubChemR user starts with:
“Given a known compound, how do I triage related compounds by properties and assay activity?”
The example below uses an aspirin-like seed structure. In live mode it can query PubChem similarity results; in offline mode it falls back to exported deterministic example data so the workflow still runs end-to-end.
library(PubChemR)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tibble)
run_live <- identical(Sys.getenv("PUBCHEMR_RUN_LIVE"), "true")
# Null-default operator: the left operand wins unless it is NULL, in which
# case the right operand is used.
`%||%` <- function(x, y) {
  if (!is.null(x)) {
    return(x)
  }
  y
}
# Evaluate `expr` and return its value. If evaluation raises an error, return
# a list of class "pc_error" holding the condition message instead of letting
# the error propagate.
safe_call <- function(expr) {
  as_pc_error <- function(e) {
    structure(list(message = conditionMessage(e)), class = "pc_error")
  }
  tryCatch(expr, error = as_pc_error)
}
# Describe any vignette result as a one-row tibble with the fixed columns
# ok / class / note / rows / cols, so heterogeneous results can be row-bound.
#
# x: a "pc_error" (as produced by safe_call()), a PubChemResult, a
#    PubChemAsyncQuery, a data frame, or any other object.
# Returns a 1-row tibble; rows/cols are NA_integer_ when the input has no
# tabular shape to report.
summarize_any <- function(x) {
  if (inherits(x, "pc_error")) {
    return(tibble(ok = FALSE, class = "pc_error", note = x$message, rows = NA_integer_, cols = NA_integer_))
  }
  if (inherits(x, "PubChemResult")) {
    # Convert and test once; both the row and the column count come from the
    # same flattened table.
    succeeded <- isTRUE(x$success)
    flat <- as_tibble(x)
    return(tibble(
      ok = succeeded,
      class = class(x)[1],
      # On failure prefer the structured error code, falling back to "error".
      note = if (succeeded) "success" else (x$error$code %||% "error"),
      rows = nrow(flat),
      cols = ncol(flat)
    ))
  }
  if (inherits(x, "PubChemAsyncQuery")) {
    return(tibble(ok = TRUE, class = class(x)[1], note = "async query object", rows = NA_integer_, cols = NA_integer_))
  }
  if (is.data.frame(x)) {
    return(tibble(ok = TRUE, class = class(x)[1], note = "tabular output", rows = nrow(x), cols = ncol(x)))
  }
  tibble(ok = TRUE, class = class(x)[1], note = "non-tabular output", rows = NA_integer_, cols = NA_integer_)
}
Goal:
# Seed structure for the triage workflow (described in the text as aspirin-like).
research_seed_smiles <- "CC(=O)OC1=CC=CC=C1C(=O)O"
# Packaged example assay-summary payload; the workflow reshapes this below
# rather than querying assays live.
assay_payload <- pc_example_assaysummary_payload()
# Packaged example feature table with a CanonicalSMILES column attached.
# NOTE(review): the SMILES vector is positional, so this assumes the fixture
# has exactly three rows in a matching order -- confirm against the fixture.
feature_tbl_synthetic <- pc_example_feature_table() %>%
mutate(
CanonicalSMILES = c(
"CC(=O)OC1=CC=CC=C1C(=O)O",
"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
"CN1C=NC2=C1C(=O)N(C(=O)N2)C"
)
)
# Similarity search around the seed structure. Live and offline runs share a
# single call and differ only in the `offline` flag, so the two modes cannot
# drift apart (same pattern as the pc_feature_table() call in this workflow).
# A non-success result is handled by the CID-selection step that follows.
research_similarity <- pc_similarity_search(
  identifier = research_seed_smiles,
  namespace = "smiles",
  threshold = 90,
  max_records = 25,
  cache = TRUE,
  offline = !run_live
)
# Candidate CIDs for the rest of the workflow: the distinct, non-missing
# similarity hits (as character) when the search succeeded, otherwise the CID
# column of the synthetic fixture.
research_cids <- local({
  search_failed <- !inherits(research_similarity, "PubChemResult") ||
    !isTRUE(research_similarity$success)
  if (search_failed) {
    transmute(feature_tbl_synthetic, CID)
  } else {
    hits <- as_tibble(research_similarity)
    hits <- filter(hits, !is.na(CID))
    distinct(transmute(hits, CID = as.character(CID)))
  }
})
# Fetch descriptor columns for the candidate CIDs, offline unless run_live.
# NOTE(review): error_mode = "result" presumably makes failures come back as a
# return value instead of raising -- confirm against pc_feature_table().
research_features_try <- pc_feature_table(
identifier = research_cids$CID,
properties = c("MolecularWeight", "XLogP", "TPSA", "HBondDonorCount", "HBondAcceptorCount"),
namespace = "cid",
cache = TRUE,
offline = !run_live,
error_mode = "result"
)
# A PubChemResult here is treated as "no usable table", so the synthetic
# fixture is used instead; otherwise CID is normalised to character.
research_features <- if (inherits(research_features_try, "PubChemResult")) {
feature_tbl_synthetic
} else {
research_features_try %>%
mutate(CID = as.character(CID))
}
# Reshape the example assay payload into long form, requesting the extra
# outcome-value column that the summary below reads as ActivityOutcomeValue.
research_assay_long <- pc_assay_activity_long(
x = assay_payload,
add_outcome_value = TRUE
)
# One row per CID: assay-row count, the distinct outcome labels, an
# "any active" flag and the median activity value.
# NOTE(review): ActivityOutcomeValue == 1 presumably encodes "Active" --
# confirm against the outcome mapping.
research_activity_summary <- research_assay_long %>%
group_by(CID) %>%
summarise(
n_assays = n(),
outcomes = paste(sort(unique(ActivityOutcome)), collapse = ", "),
any_active = any(ActivityOutcomeValue == 1, na.rm = TRUE),
# Explicit NA_real_ when a CID has no non-missing activity values.
median_activity_uM = if (all(is.na(ActivityValue_uM))) NA_real_ else median(ActivityValue_uM, na.rm = TRUE),
.groups = "drop"
)
# Triage table: descriptor rows joined with the per-CID assay summary.
# CIDs without any assay rows get explicit defaults, then each row is flagged
# against the rule-of-five-like thresholds and bucketed by assay evidence.
research_triage <- left_join(research_features, research_activity_summary, by = "CID") %>%
  mutate(
    # Defaults for CIDs that had no match in the assay summary.
    n_assays = coalesce(as.integer(n_assays), 0L),
    any_active = coalesce(any_active, FALSE),
    outcomes = coalesce(outcomes, "No assay evidence")
  ) %>%
  mutate(
    passes_rule_of_five_like =
      MolecularWeight <= 500 &
      XLogP <= 5 &
      HBondDonorCount <= 5 &
      HBondAcceptorCount <= 10,
    activity_bucket = case_when(
      any_active ~ "Any Active",
      n_assays > 0 ~ "Assayed, no active hit",
      TRUE ~ "No assay evidence in fixture"
    )
  ) %>%
  # Actives first, then rule-passing candidates, then lightest molecules.
  arrange(desc(any_active), desc(passes_rule_of_five_like), MolecularWeight)
research_triage %>%
select(CID, MolecularWeight, XLogP, TPSA, n_assays, any_active, passes_rule_of_five_like, activity_bucket)
#> # A tibble: 3 × 8
#> CID MolecularWeight XLogP TPSA n_assays any_active passes_rule_of_five_like
#> <chr> <dbl> <dbl> <dbl> <int> <lgl> <lgl>
#> 1 2244 180. 1.2 63.6 2 TRUE TRUE
#> 2 5957 194. 0.1 86.6 0 FALSE TRUE
#> 3 3672 206. 3.5 37.3 1 FALSE TRUE
#> # ℹ 1 more variable: activity_bucket <chr>
What this shows:
run_live.
# Rank the candidates that pass the rule-of-five-like filter.
# potency_score: 0 when no median activity is available, otherwise
#   1 / (1 + median), so smaller median values score closer to 1.
# rank_score: 2 points for any active outcome, plus potency, plus 0.10 per
#   assay row, minus a penalty that grows with the distance of XLogP from 2.5.
research_ranked <- research_triage %>%
  filter(passes_rule_of_five_like) %>%
  mutate(
    potency_score = ifelse(is.na(median_activity_uM), 0, 1 / (1 + median_activity_uM)),
    rank_score =
      2 * as.numeric(any_active) +
      potency_score +
      0.10 * n_assays -
      abs(XLogP - 2.5) / 5
  ) %>%
  # Ties on rank_score are broken by the lighter molecule.
  arrange(desc(rank_score), MolecularWeight) %>%
  mutate(rank = row_number())
list(
total_candidates = nrow(research_triage),
druglike_candidates = nrow(research_ranked),
top_candidates = research_ranked %>%
select(rank, CID, MolecularWeight, XLogP, n_assays, any_active, median_activity_uM, rank_score)
)
#> $total_candidates
#> [1] 3
#>
#> $druglike_candidates
#> [1] 3
#>
#> $top_candidates
#> # A tibble: 3 × 8
#> rank CID MolecularWeight XLogP n_assays any_active median_activity_uM
#> <int> <chr> <dbl> <dbl> <int> <lgl> <dbl>
#> 1 1 2244 180. 1.2 2 TRUE 1.2
#> 2 2 3672 206. 3.5 1 FALSE 12.5
#> 3 3 5957 194. 0.1 0 FALSE NA
#> # ℹ 1 more variable: rank_score <dbl>
Interpretation:

Interpretation:
# Four-panel base-graphics overview of the triage results. The previous par()
# settings are kept in `op` and restored by the final par(op) call.
op <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))
# Panel 1: distribution of candidate molecular weights.
hist(
research_triage$MolecularWeight,
main = "Candidate Molecular Weight",
xlab = "MolecularWeight",
col = "#CDE8F6",
border = "white"
)
# Panel 2: number of assay rows per activity outcome label.
# NOTE(review): three fill colours are supplied; this presumably matches the
# number of outcome labels in the fixture -- confirm.
barplot(
table(research_assay_long$ActivityOutcome),
main = "Activity Outcome Counts",
ylab = "Count",
las = 2,
col = c("#DCECC9", "#F9E3B4", "#F6D6D0")
)
# Panel 3: XLogP against molecular weight, coloured by the any_active flag
# and labelled with the CID above each point.
plot(
research_triage$XLogP,
research_triage$MolecularWeight,
pch = 19,
col = ifelse(research_triage$any_active, "#58713A", "#9B4D3A"),
xlab = "XLogP",
ylab = "MolecularWeight",
main = "Property Space by Activity"
)
text(
research_triage$XLogP,
research_triage$MolecularWeight,
labels = research_triage$CID,
pos = 3,
cex = 0.8
)
# Panel 4: rank_score of each ranked candidate, one bar per CID.
barplot(
research_ranked$rank_score,
names.arg = research_ranked$CID,
main = "Ranked Candidate Scores",
ylab = "rank_score",
col = "#E4D8F5",
las = 2
)

# Restore the graphics parameters saved at the top of the panel.
par(op)
Interpretation:
The rest of this vignette explains the machinery behind the workflow above:
The next-generation API in PubChemR centers around typed result objects, policy-controlled transport, orchestration helpers, and analysis-layer tools.
Read this section as a map, not as something you need to memorize. Its purpose is to show where each function family belongs in a real workflow and why the same package contains both scientific helpers and infrastructure helpers.
# Roadmap of the next-generation API: one row per function, labelled with the
# family it belongs to. The family column is derived from the grouped list, so
# the number of labels always matches the number of functions in each family
# and the two columns cannot fall out of step when a function is added.
nextgen_function_map <- local({
  families <- list(
    Transport = c(
      "pc_profile", "pc_config", "pc_request", "pc_response",
      "pc_cache_info/pc_cache_clear", "pc_capabilities"
    ),
    Wrappers = c(
      "pc_compound", "pc_substance", "pc_assay", "pc_property",
      "pc_identifier_map", "pc_similarity_search", "pc_sdq_bioactivity"
    ),
    Async = c("pc_submit", "pc_poll", "pc_collect"),
    "Batch/Benchmark" = c(
      "pc_batch", "pc_resume_batch", "pc_benchmark", "pc_benchmark_harness"
    ),
    Analysis = c(
      "pc_assay_activity_long", "pc_activity_outcome_map", "pc_activity_matrix",
      "pc_cross_domain_join", "pc_feature_table", "pc_model_matrix",
      "pc_export_model_data", "pc_to_rcdk", "pc_to_chemminer",
      "pc_lifecycle_policy"
    ),
    Helpers = c(
      "pc_example_assaysummary_payload", "pc_example_feature_table",
      "as_tibble", "request_args", "has_hits", "retrieve"
    )
  )
  tibble(
    family = rep(names(families), times = lengths(families)),
    function_name = unlist(families, use.names = FALSE)
  )
})
nextgen_function_map
#> # A tibble: 36 × 2
#> family function_name
#> <chr> <chr>
#> 1 Transport pc_profile
#> 2 Transport pc_config
#> 3 Transport pc_request
#> 4 Transport pc_response
#> 5 Transport pc_cache_info/pc_cache_clear
#> 6 Transport pc_capabilities
#> 7 Wrappers pc_compound
#> 8 Wrappers pc_substance
#> 9 Wrappers pc_assay
#> 10 Wrappers pc_property
#> # ℹ 26 more rows
What to expect:
run_live.
How to interpret the roadmap:
Transport functions control how requests are made and how failures are represented.
Wrappers tell PubChem what kind of record you want.
Async and Batch/Benchmark functions become relevant when a workflow must scale or recover from interruptions.
Analysis functions turn retrieved data into tables, matrices, and exports a scientist can use directly.
Helpers are the glue for inspecting, standardizing, and reusing outputs.
Core object classes used throughout:
PubChemResult: typed transport result (success, data, error, metadata).
PubChemRecord/PubChemIdMap: specialized PubChemResult subclasses.
PubChemAsyncQuery: async container for submit/poll/collect workflows.
PubChemBatchResult: chunk-level execution summary and checkpoint metadata.
PubChemBenchmarkReport: scenario-level benchmark summary with pass/fail gates.
PubChemModelMatrix/PubChemSparseActivityMatrix: modeling-ready matrix wrappers.
A robust nextgen workflow starts with explicit transport policy and deterministic runtime settings.
Section contract:
cfg_before <- pc_config()
cfg_before
#> $rate_limit
#> [1] 5
#>
#> $timeout
#> [1] 60
#>
#> $retries
#> [1] 3
#>
#> $pause_base
#> [1] 1
#>
#> $pause_cap
#> [1] 8
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpHAl5DI/PubChemR_cache"
#>
#> $cache_ttl
#> [1] 86400
#>
#> $offline
#> [1] FALSE
pc_profile("default")
#> $rate_limit
#> [1] 5
#>
#> $timeout
#> [1] 60
#>
#> $retries
#> [1] 3
#>
#> $pause_base
#> [1] 1
#>
#> $pause_cap
#> [1] 8
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpHAl5DI/PubChemR_cache"
#>
#> $cache_ttl
#> [1] 86400
#>
#> $offline
#> [1] FALSE
cfg_after_profile <- pc_config()
cfg_after_profile[c("rate_limit", "timeout", "retries", "cache_ttl")]
#> $rate_limit
#> [1] 5
#>
#> $timeout
#> [1] 60
#>
#> $retries
#> [1] 3
#>
#> $cache_ttl
#> [1] 86400
# Keep cache in a temp path during vignette execution.
# The directory lives under tempdir() and is created up front; an existing
# directory is not reported as a warning (showWarnings = FALSE).
work_cache <- file.path(tempdir(), "pubchemr-vignette-cache")
dir.create(work_cache, recursive = TRUE, showWarnings = FALSE)
# Point the package configuration at that directory with offline set to FALSE.
pc_config(cache_dir = work_cache, offline = FALSE)
#> $rate_limit
#> [1] 5
#>
#> $timeout
#> [1] 60
#>
#> $retries
#> [1] 3
#>
#> $pause_base
#> [1] 1
#>
#> $pause_cap
#> [1] 8
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpjLjM6G/pubchemr-vignette-cache"
#>
#> $cache_ttl
#> [1] 86400
#>
#> $offline
#> [1] FALSE
pc_cache_clear(cache_dir = work_cache, memory = TRUE, disk = TRUE)
pc_cache_info(cache_dir = work_cache)
#> # A tibble: 1 × 4
#> memory_entries disk_entries disk_size_bytes cache_dir
#> <int> <int> <dbl> <chr>
#> 1 0 0 0 /var/folders/dj/y28dp44x303ggfg6r…
What to expect:
pc_profile() applies a baseline policy.
pc_config() confirms active runtime state.
tempdir() to avoid modifying user directories.
pc_capabilities(cache_dir = work_cache, check_network = FALSE)
#> # A tibble: 1 × 9
#> live_enabled offline_mode cache_dir cache_exists cache_writable optional_rcdk
#> <lgl> <lgl> <chr> <lgl> <lgl> <lgl>
#> 1 TRUE FALSE /var/fold… TRUE TRUE FALSE
#> # ℹ 3 more variables: optional_chemminer <lgl>, optional_matrix <lgl>,
#> # network_reachable <lgl>
Interpretation:
pc_capabilities() gives a preflight summary for optional dependencies, cache writability, and offline/live mode.
Key pitfalls and notes:
rate_limit and cache_ttl are validated; invalid values fail early.
pc_config(...)).
The transport contract is the foundation for every other nextgen function.
If results feel abstract later in the vignette, come back here. This section defines what a “good” result looks like, what a controlled failure looks like, and why the package often returns objects rather than immediately returning a data frame.
Purpose:
success, data, error, metadata).
ok_text <- '{"PropertyTable":{"Properties":[{"CID":2244,"MolecularWeight":180.16}]}}'
res_ok <- pc_response(
ok_text,
request = list(domain = "compound", namespace = "cid", identifier = 2244, operation = "property/MolecularWeight")
)
class(res_ok)
#> [1] "PubChemResult"
res_ok$success
#> [1] TRUE
as_tibble(res_ok)
#> # A tibble: 1 × 6
#> success status from_cache pending CID MolecularWeight
#> <lgl> <int> <lgl> <lgl> <dbl> <dbl>
#> 1 TRUE NA FALSE FALSE 2244 180.
Interpretation:
success is TRUE and as_tibble() exposes payload content as a flat table.
fault_text <- '{"Fault":{"Code":"PUGREST.BadRequest","Message":"Invalid request"}}'
res_fault <- pc_response(fault_text, request = list(domain = "compound"))
res_fault$success
#> [1] FALSE
res_fault$error$code
#> [1] "PUGREST.BadRequest"
res_fault$error$message
#> [1] "PUGREST.BadRequest" "Invalid request"
Interpretation:
success and error$code.
waiting_text <- '{"Waiting":{"ListKey":"example-listkey"}}'
res_wait <- pc_response(waiting_text, request = list(domain = "compound", operation = "cids"))
tibble(
success = res_wait$success,
pending = res_wait$pending,
listkey = res_wait$listkey
)
#> # A tibble: 1 × 3
#> success pending listkey
#> <lgl> <lgl> <chr>
#> 1 TRUE TRUE example-listkey
Interpretation:
listkey, enabling async flow (pc_poll() / pc_collect()).
Performance and reproducibility notes:
pc_response() is local parsing, so it is deterministic and fast.
Purpose:
request_args()), convert typed results (as_tibble()), and detect hit availability (has_hits()).
request_args(res_ok)
#> $domain
#> [1] "compound"
#>
#> $namespace
#> [1] "cid"
#>
#> $identifier
#> [1] 2244
#>
#> $operation
#> [1] "property/MolecularWeight"
request_args(res_ok, "identifier")
#> [1] 2244
Interpretation:
id_text <- '{"IdentifierList":{"CID":[2244,3672,5957]}}'
id_res <- pc_response(id_text, request = list(domain = "compound", namespace = "name", identifier = "aspirin", operation = "cids"))
as_tibble(id_res)
#> # A tibble: 3 × 5
#> success status from_cache pending CID
#> <lgl> <int> <lgl> <lgl> <dbl>
#> 1 TRUE NA FALSE FALSE 2244
#> 2 TRUE NA FALSE FALSE 3672
#> 3 TRUE NA FALSE FALSE 5957
Interpretation:
as_tibble() handles common payload families (PropertyTable, IdentifierList, InformationList).
if (exists("has_hits", mode = "function")) {
synthetic_hit_flags <- structure(
list(
request_args = list(identifier = c("aspirin", "unknown")),
has_hits = c(TRUE, FALSE),
success = c(TRUE, FALSE)
),
class = "PubChemInstance_CIDs"
)
has_hits(synthetic_hit_flags)
} else {
"has_hits() is unavailable in this installed package build."
}
#> aspirin unknown
#> TRUE FALSE
Interpretation:
has_hits() is useful when output formatting depends on non-empty match sets.
Purpose:
.slot = "data") or nested fields (.slot = "IdentifierList/CID", .slot = "error/code").
retrieve(id_res)
#> # A tibble: 3 × 1
#> CID
#> <dbl>
#> 1 2244
#> 2 3672
#> 3 5957
Interpretation:
PubChemResult, default retrieval returns payload data in a tabular form when possible.
retrieve(id_res, .slot = "IdentifierList/CID", .to.data.frame = FALSE)
#> [1] 2244 3672 5957
Interpretation:
retrieve(res_fault, .slot = "error/code", .to.data.frame = FALSE)
#> [1] "PUGREST.BadRequest"
Interpretation:
Purpose:
list(
result_class = class(res_ok),
result_request_fields = names(request_args(res_ok)),
result_error_fields = names(res_fault$error)
)
#> $result_class
#> [1] "PubChemResult"
#>
#> $result_request_fields
#> [1] "domain" "namespace" "identifier" "operation"
#>
#> $result_error_fields
#> [1] "code" "message" "status" "details"
capture.output(print(res_fault))
#> [1] ""
#> [2] " PubChemResult "
#> [3] ""
#> [4] " - Success: FALSE"
#> [5] " - Status: NA"
#> [6] " - Pending: FALSE"
#> [7] " - From cache: FALSE"
#> [8] " - Error Code: PUGREST.BadRequest"
#> [9] " - Error Message: PUGREST.BadRequestInvalid request"
typed_surface <- list(
ok = res_ok[c("success", "status", "pending", "from_cache")],
fault = res_fault[c("success", "status", "pending")],
wait = res_wait[c("success", "pending", "listkey")]
)
typed_surface
#> $ok
#> $ok$success
#> [1] TRUE
#>
#> $ok$status
#> [1] NA
#>
#> $ok$pending
#> [1] FALSE
#>
#> $ok$from_cache
#> [1] FALSE
#>
#>
#> $fault
#> $fault$success
#> [1] FALSE
#>
#> $fault$status
#> [1] NA
#>
#> $fault$pending
#> [1] FALSE
#>
#>
#> $wait
#> $wait$success
#> [1] TRUE
#>
#> $wait$pending
#> [1] TRUE
#>
#> $wait$listkey
#> [1] "example-listkey"
Interpretation:
success, pending, listkey, status, and error for automation.
request_args() is the stable provenance interface when objects move between orchestration, analysis, and export steps.
This section covers pc_profile(), pc_config(), pc_capabilities(), pc_request(), pc_cache_info(), and pc_cache_clear() with escalating complexity.
Section contract:
PubChemResult objects and cache diagnostics.
force_refresh in reproducible workflows.
rate_limit, timeout, and cache policy together; tuning only one parameter rarely yields stable gains.
Purpose:
default, cloud, high_throughput).
pc_profile("default")$rate_limit
#> [1] 5
pc_profile("cloud", retries = 4, pause_cap = 12)[c("rate_limit", "retries", "pause_cap")]
#> $rate_limit
#> [1] 3
#>
#> $retries
#> [1] 4
#>
#> $pause_cap
#> [1] 12
old_cfg <- pc_config()
pc_profile("high_throughput", cache_ttl = 3600, timeout = 45)
#> $rate_limit
#> [1] 10
#>
#> $timeout
#> [1] 45
#>
#> $retries
#> [1] 4
#>
#> $pause_base
#> [1] 0.5
#>
#> $pause_cap
#> [1] 10
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpjLjM6G/pubchemr-vignette-cache"
#>
#> $cache_ttl
#> [1] 3600
#>
#> $offline
#> [1] FALSE
new_cfg <- pc_config()
list(
before = old_cfg[c("rate_limit", "timeout", "cache_ttl")],
after = new_cfg[c("rate_limit", "timeout", "cache_ttl")]
)
#> $before
#> $before$rate_limit
#> [1] 3
#>
#> $before$timeout
#> [1] 120
#>
#> $before$cache_ttl
#> [1] 604800
#>
#>
#> $after
#> $after$rate_limit
#> [1] 10
#>
#> $after$timeout
#> [1] 45
#>
#> $after$cache_ttl
#> [1] 3600
Interpretation:
Purpose:
offline, retries, and cache policy.
pc_config()[c("rate_limit", "timeout", "cache_ttl", "offline")]
#> $rate_limit
#> [1] 10
#>
#> $timeout
#> [1] 45
#>
#> $cache_ttl
#> [1] 3600
#>
#> $offline
#> [1] FALSE
pc_config(rate_limit = 6, timeout = 50, cache_ttl = 1800, offline = FALSE)
#> $rate_limit
#> [1] 6
#>
#> $timeout
#> [1] 50
#>
#> $retries
#> [1] 4
#>
#> $pause_base
#> [1] 0.5
#>
#> $pause_cap
#> [1] 10
#>
#> $user_agent
#> [1] "PubChemR/3.0.0"
#>
#> $cache_dir
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpjLjM6G/pubchemr-vignette-cache"
#>
#> $cache_ttl
#> [1] 1800
#>
#> $offline
#> [1] FALSE
pc_config()[c("rate_limit", "timeout", "cache_ttl", "offline")]
#> $rate_limit
#> [1] 6
#>
#> $timeout
#> [1] 50
#>
#> $cache_ttl
#> [1] 1800
#>
#> $offline
#> [1] FALSE
safe_call(pc_config(rate_limit = 0))
#> $message
#> [1] "'rate_limit' must be a finite numeric scalar > 0."
#>
#> attr(,"class")
#> [1] "pc_error"
Interpretation:
pc_config() mutates package-level state, so scripts should set and restore it intentionally.
Purpose:
pc_capabilities(cache_dir = work_cache, check_network = FALSE)
#> # A tibble: 1 × 9
#> live_enabled offline_mode cache_dir cache_exists cache_writable optional_rcdk
#> <lgl> <lgl> <chr> <lgl> <lgl> <lgl>
#> 1 TRUE FALSE /var/fold… TRUE TRUE FALSE
#> # ℹ 3 more variables: optional_chemminer <lgl>, optional_matrix <lgl>,
#> # network_reachable <lgl>
pc_capabilities(
cache_dir = work_cache,
check_network = run_live,
network_timeout = 2
)
#> # A tibble: 1 × 9
#> live_enabled offline_mode cache_dir cache_exists cache_writable optional_rcdk
#> <lgl> <lgl> <chr> <lgl> <lgl> <lgl>
#> 1 TRUE FALSE /var/fold… TRUE TRUE FALSE
#> # ℹ 3 more variables: optional_chemminer <lgl>, optional_matrix <lgl>,
#> # network_reachable <lgl>
Interpretation:
check_network = FALSE keeps examples deterministic.
check_network = TRUE is a practical early warning before running longer live workflows.
Purpose:
pc_* wrappers.
PubChemResult with success, error, and replay metadata.
req_min <- pc_request(identifier = 2244, offline = TRUE, cache = TRUE)
summarize_any(req_min)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
Interpretation:
OfflineCacheMiss failure.
req_typed <- pc_request(
domain = "compound",
namespace = "cid",
identifier = 2244,
operation = "property/MolecularWeight,XLogP",
output = "JSON",
cache = TRUE,
offline = TRUE
)
summarize_any(req_typed)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
Interpretation:
req_post <- pc_request(
domain = "compound",
namespace = "smiles",
identifier = "CCO",
operation = "cids",
method = "POST",
body = list(smiles = "CCO"),
output = "JSON",
cache = TRUE,
offline = TRUE
)
list(
summary = summarize_any(req_post),
request = request_args(req_post)[c("method", "namespace", "identifier", "operation")]
)
#> $summary
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
#>
#> $request
#> $request$method
#> [1] "POST"
#>
#> $request$namespace
#> [1] "smiles"
#>
#> $request$identifier
#> [1] "CCO"
#>
#> $request$operation
#> [1] "cids"
Interpretation:
pc_request() supports both GET and POST; wrappers inherit that transport surface through ....
offline = TRUE and assert on structured error codes.
Purpose:
pc_cache_info().
pc_cache_info(cache_dir = work_cache)
#> # A tibble: 1 × 4
#> memory_entries disk_entries disk_size_bytes cache_dir
#> <int> <int> <dbl> <chr>
#> 1 0 0 0 /var/folders/dj/y28dp44x303ggfg6r…
pc_cache_clear(cache_dir = work_cache, memory = TRUE, disk = FALSE)
pc_cache_info(cache_dir = work_cache)
#> # A tibble: 1 × 4
#> memory_entries disk_entries disk_size_bytes cache_dir
#> <int> <int> <dbl> <chr>
#> 1 0 0 0 /var/folders/dj/y28dp44x303ggfg6r…
pc_cache_clear(cache_dir = work_cache, memory = TRUE, disk = TRUE)
pc_cache_info(cache_dir = work_cache)
#> # A tibble: 1 × 4
#> memory_entries disk_entries disk_size_bytes cache_dir
#> <int> <int> <dbl> <chr>
#> 1 0 0 0 /var/folders/dj/y28dp44x303ggfg6r…
Interpretation:
pc_capabilities() when debugging environment-specific behavior.
These wrappers express intent while preserving the PubChemResult contract. pc_similarity_search() and pc_sdq_bioactivity() are introduced here, then revisited in more detail in the analysis section because they often feed downstream modeling workflows.
Section contract:
PubChemRecord/PubChemIdMap) or table-shaped outputs for SDQ and higher-level analysis wrappers.
wrapper_minimal <- list(
pc_compound = safe_call(pc_compound(2244, offline = TRUE)),
pc_substance = safe_call(pc_substance(5360534, offline = TRUE)),
pc_assay = safe_call(pc_assay(367, offline = TRUE)),
pc_property = safe_call(pc_property(2244, properties = "MolecularWeight", offline = TRUE)),
pc_identifier_map = safe_call(pc_identifier_map("aspirin", namespace = "name", to = "cids", offline = TRUE)),
pc_similarity_search = safe_call(pc_similarity_search("CCO", namespace = "smiles", offline = TRUE)),
pc_sdq_bioactivity = if (run_live) {
safe_call(pc_sdq_bioactivity(2244, rate_limit = FALSE, limit = 200L))
} else {
structure(list(message = "Skipped (set PUBCHEMR_RUN_LIVE=true to run SDQ live examples)."), class = "pc_error")
}
)
bind_rows(lapply(wrapper_minimal, summarize_any), .id = "function")
#> # A tibble: 7 × 6
#> `function` ok class note rows cols
#> <chr> <lgl> <chr> <chr> <int> <int>
#> 1 pc_compound FALSE PubChemRecord OfflineCacheMi… 1 4
#> 2 pc_substance FALSE PubChemRecord OfflineCacheMi… 1 4
#> 3 pc_assay FALSE PubChemRecord OfflineCacheMi… 1 4
#> 4 pc_property FALSE PubChemRecord OfflineCacheMi… 1 4
#> 5 pc_identifier_map FALSE PubChemIdMap OfflineCacheMi… 1 5
#> 6 pc_similarity_search FALSE PubChemSimilarityResult OfflineCacheMi… 1 5
#> 7 pc_sdq_bioactivity FALSE pc_error Skipped (set P… NA NA
Interpretation:
pc_sdq_bioactivity() is inherently live-network; this vignette guards it.
retrieve() or as_tibble() immediately after wrapper calls to normalize downstream expectations.
wrapper_typical <- list(
pc_compound = safe_call(pc_compound(identifier = c(2244, 3672), namespace = "cid", operation = "record", offline = TRUE)),
pc_substance = safe_call(pc_substance(identifier = c(5360534, 5360535), namespace = "sid", operation = "record", offline = TRUE)),
pc_assay = safe_call(pc_assay(identifier = c(367, 2551), namespace = "aid", operation = "description", offline = TRUE)),
pc_property = safe_call(pc_property(identifier = c(2244, 3672), properties = c("MolecularWeight", "XLogP"), namespace = "cid", offline = TRUE)),
pc_identifier_map = safe_call(pc_identifier_map(identifier = c("aspirin", "caffeine"), namespace = "name", to = "cids", domain = "compound", offline = TRUE)),
pc_similarity_search = safe_call(pc_similarity_search(identifier = "CC(=O)OC1=CC=CC=C1C(=O)O", namespace = "smiles", threshold = 90, max_records = 25, offline = TRUE)),
pc_sdq_bioactivity = if (run_live) {
safe_call(pc_sdq_bioactivity(identifier = 2244, namespace = "cid", limit = 200L, order = "activity,asc", rate_limit = FALSE))
} else {
structure(list(message = "Skipped (no live network)."), class = "pc_error")
}
)
bind_rows(lapply(wrapper_typical, summarize_any), .id = "function")
#> # A tibble: 7 × 6
#> `function` ok class note rows cols
#> <chr> <lgl> <chr> <chr> <int> <int>
#> 1 pc_compound FALSE PubChemRecord OfflineCacheMi… 1 4
#> 2 pc_substance FALSE PubChemRecord OfflineCacheMi… 1 4
#> 3 pc_assay FALSE PubChemRecord OfflineCacheMi… 1 4
#> 4 pc_property FALSE PubChemRecord OfflineCacheMi… 1 4
#> 5 pc_identifier_map FALSE PubChemIdMap OfflineCacheMi… 1 5
#> 6 pc_similarity_search FALSE PubChemSimilarityResult OfflineCacheMi… 1 5
#> 7 pc_sdq_bioactivity FALSE pc_error Skipped (no li… NA NA
Interpretation:
operation and to arguments over defaults in shared code.
wrapper_advanced <- list(
pc_compound = safe_call(pc_compound(identifier = 2244, namespace = "cid", operation = "record", method = "GET", cache = TRUE, offline = TRUE)),
pc_substance = safe_call(pc_substance(identifier = 5360534, namespace = "sid", operation = "record", cache = TRUE, offline = TRUE)),
pc_assay = safe_call(pc_assay(identifier = 367, namespace = "aid", operation = "summary", cache = TRUE, offline = TRUE)),
pc_property = safe_call(pc_property(identifier = c(2244, 3672), properties = c("MolecularWeight", "TPSA", "HBondDonorCount"), namespace = "cid", cache = TRUE, offline = TRUE)),
pc_identifier_map = safe_call(pc_identifier_map(identifier = c("aspirin", "ibuprofen", "caffeine"), namespace = "name", to = "cids", cache = TRUE, offline = TRUE)),
pc_similarity_search = safe_call(pc_similarity_search(identifier = "CCO", namespace = "smiles", searchtype = "fastsimilarity_2d", threshold = 85, max_records = 50, cache = TRUE, offline = TRUE)),
pc_sdq_bioactivity = if (run_live) {
safe_call(pc_sdq_bioactivity(identifier = 2244, namespace = "cid", collection = "bioactivity", limit = 500L, cache = TRUE, cache_dir = work_cache, cache_ttl = 3600, force_refresh = TRUE, rate_limit = FALSE))
} else {
structure(list(message = "Skipped (no live network)."), class = "pc_error")
}
)
bind_rows(lapply(wrapper_advanced, summarize_any), .id = "function")
#> # A tibble: 7 × 6
#> `function` ok class note rows cols
#> <chr> <lgl> <chr> <chr> <int> <int>
#> 1 pc_compound FALSE PubChemRecord OfflineCacheMi… 1 4
#> 2 pc_substance FALSE PubChemRecord OfflineCacheMi… 1 4
#> 3 pc_assay FALSE PubChemRecord OfflineCacheMi… 1 4
#> 4 pc_property FALSE PubChemRecord OfflineCacheMi… 1 4
#> 5 pc_identifier_map FALSE PubChemIdMap OfflineCacheMi… 1 5
#> 6 pc_similarity_search FALSE PubChemSimilarityResult OfflineCacheMi… 1 5
#> 7 pc_sdq_bioactivity FALSE pc_error Skipped (no li… NA NA
Interpretation:
offline = TRUE for deterministic local replay checks.
pc_sdq_bioactivity() now supports error_mode = "result" for typed failure handling.
Common wrapper pitfalls:
pc_property() requires at least one property name.
pc_identifier_map() target must match the supported set (cids, sids, aids).
pc_similarity_search() validates threshold and search mode.
pc_sdq_bioactivity() is live-network by design; guard in reproducible docs and CI.
Purpose:
Section contract:
interval, max_attempts).
PubChemAsyncQuery objects followed by terminal PubChemResult objects.
pending/listkey status.
q_min <- pc_submit(
domain = "compound",
namespace = "name",
identifier = "aspirin",
operation = "cids",
searchtype = "similarity",
options = list(Threshold = 90),
offline = TRUE
)
class(q_min)
#> [1] "PubChemAsyncQuery"
summarize_any(q_min$initial)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
Interpretation:
pc_submit() always returns a PubChemAsyncQuery container.
poll_min <- pc_poll("dummy-listkey", max_attempts = 1, interval = 0, offline = TRUE)
summarize_any(poll_min)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
Interpretation:
collect_min <- pc_collect(q_min)
summarize_any(collect_min)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
Interpretation:
pc_collect() centralizes branching logic (listkey vs no listkey).
retrieve(..., .slot = "data") or as_tibble().
Performance and reproducibility notes:
interval and max_attempts according to endpoint latency.
This family includes pc_batch(), pc_resume_batch(), pc_benchmark(), and pc_benchmark_harness().
Section contract:
Purpose:
batch_min <- pc_batch(
ids = 1:6,
fn = function(chunk_ids, ...) tibble(id = chunk_ids, score = chunk_ids * 10),
chunk_size = 2
)
batch_min
#>
#> PubChemBatchResult
#>
#> - Chunks: 3
#> - Chunk size: 2
#> - Parallel: FALSE
#> - Successful chunks: 3/3
as_tibble(batch_min)
#> # A tibble: 3 × 4
#> chunk n_ids success error
#> <int> <int> <lgl> <chr>
#> 1 1 2 TRUE ""
#> 2 2 2 TRUE ""
#> 3 3 2 TRUE ""
Interpretation:
chunks, per-chunk success flags, and worker outputs.
cp_dir <- file.path(tempdir(), "pc-batch-checkpoints")
cp_id <- "vignette-batch-demo"
batch_cp <- pc_batch(
ids = 1:8,
fn = function(chunk_ids, ...) sum(chunk_ids),
chunk_size = 3,
checkpoint_dir = cp_dir,
checkpoint_id = cp_id
)
batch_cp$checkpoint
#> $enabled
#> [1] TRUE
#>
#> $id
#> [1] "vignette-batch-demo"
#>
#> $dir
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpjLjM6G/pc-batch-checkpoints"
#>
#> $manifest
#> [1] "/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T//RtmpjLjM6G/pc-batch-checkpoints/pc_batch_vignette-batch-demo_manifest.rds"
#>
#> $resumed
#> [1] FALSE
#>
#> $rerun_failed
#> [1] TRUE
Interpretation:
batch_adv <- pc_batch(
ids = letters[1:12],
fn = function(chunk_ids, ...) {
tibble(id = chunk_ids, nchar = nchar(chunk_ids), hash = as.integer(factor(chunk_ids)))
},
chunk_size = 4,
parallel = FALSE,
workers = 1
)
as_tibble(batch_adv)
#> # A tibble: 3 × 4
#> chunk n_ids success error
#> <int> <int> <lgl> <chr>
#> 1 1 4 TRUE ""
#> 2 2 4 TRUE ""
#> 3 3 4 TRUE ""
Interpretation:
Purpose:
resume_min <- pc_resume_batch(
fn = function(chunk_ids, ...) sum(chunk_ids),
checkpoint_dir = cp_dir,
checkpoint_id = cp_id
)
resume_min$checkpoint$resumed
#> [1] TRUE
resume_typical <- pc_resume_batch(
fn = function(chunk_ids, ...) sum(chunk_ids),
checkpoint_dir = cp_dir,
checkpoint_id = cp_id,
rerun_failed = TRUE
)
as_tibble(resume_typical)
#> # A tibble: 3 × 4
#> chunk n_ids success error
#> <int> <int> <lgl> <chr>
#> 1 1 3 TRUE ""
#> 2 2 3 TRUE ""
#> 3 3 2 TRUE ""
resume_adv <- pc_resume_batch(
fn = function(chunk_ids, ...) tibble(id = chunk_ids, value = as.numeric(chunk_ids)^2),
checkpoint_dir = cp_dir,
checkpoint_id = cp_id,
parallel = FALSE,
workers = 1,
rerun_failed = FALSE
)
resume_adv$chunk_status
#> [1] "success" "success" "success"
Interpretation:
rerun_failed controls whether historical failed chunks
are retried.
Purpose:
bm_min <- pc_benchmark(
ids = 1:20,
fn = function(chunk_ids, ...) sum(chunk_ids),
chunk_sizes = c(5, 10),
parallel_options = FALSE
)
bm_min
#> # A tibble: 2 × 7
#> chunk_size parallel workers elapsed_sec chunks successful_chunks failed_chunks
#> <int> <lgl> <dbl> <dbl> <int> <int> <int>
#> 1 5 FALSE 1 0 4 4 0
#> 2 10 FALSE 1 0.00100 2 2 0
bm_typical <- pc_benchmark(
ids = 1:40,
fn = function(chunk_ids, ...) {
out <- sum(chunk_ids)
out
},
chunk_sizes = c(4, 8, 20),
parallel_options = c(FALSE)
)
bm_typical %>% arrange(elapsed_sec)
#> # A tibble: 3 × 7
#> chunk_size parallel workers elapsed_sec chunks successful_chunks failed_chunks
#> <int> <lgl> <dbl> <dbl> <int> <int> <int>
#> 1 20 FALSE 1 0 2 2 0
#> 2 4 FALSE 1 0.001000 10 10 0
#> 3 8 FALSE 1 0.00100 5 5 0
bm_adv <- pc_benchmark(
ids = rep(2244, 100),
fn = function(chunk_ids, ...) {
# transport-style worker without network side effects
pc_response(
'{"IdentifierList":{"CID":[2244]}}',
request = list(domain = "compound", namespace = "cid", identifier = chunk_ids)
)
},
chunk_sizes = c(10, 25),
parallel_options = c(FALSE)
)
bm_adv
#> # A tibble: 2 × 7
#> chunk_size parallel workers elapsed_sec chunks successful_chunks failed_chunks
#> <int> <lgl> <dbl> <dbl> <int> <int> <int>
#> 1 10 FALSE 1 0.00100 10 10 0
#> 2 25 FALSE 1 0.00100 4 4 0
Interpretation:
Purpose:
harness_min <- pc_benchmark_harness(
fn = function(chunk_ids, ...) sum(chunk_ids),
ids = 1:60,
scenario_sizes = c(10L, 20L),
chunk_sizes = c(5L),
parallel_options = FALSE
)
harness_min$summary
#> # A tibble: 2 × 9
#> scenario_size runs min_elapsed_sec max_elapsed_sec max_failed_chunk_ratio
#> <int> <int> <dbl> <dbl> <dbl>
#> 1 10 1 0.00100 0.00100 0
#> 2 20 1 0.00100 0.00100 0
#> # ℹ 4 more variables: elapsed_threshold <dbl>, failed_ratio_threshold <dbl>,
#> # all_runs_pass <lgl>, any_run_pass <lgl>
harness_path <- file.path(tempdir(), "pubchemr-benchmark-report.md")
harness_typical <- pc_benchmark_harness(
fn = function(chunk_ids, ...) sum(chunk_ids),
ids = 1:100,
scenario_sizes = c(10L, 30L),
chunk_sizes = c(5L, 10L),
parallel_options = FALSE,
report_path = harness_path,
report_format = "markdown"
)
file.exists(harness_path)
#> [1] TRUE
custom_thresholds <- list(
elapsed_sec = c(`10` = 60, `30` = 120),
failed_chunk_ratio = c(`10` = 0, `30` = 0)
)
rds_report_path <- file.path(tempdir(), "pubchemr-benchmark-report.rds")
harness_adv <- pc_benchmark_harness(
fn = function(chunk_ids, ...) sum(chunk_ids),
ids = NULL,
scenario_sizes = c(10L, 30L),
chunk_sizes = c(5L),
parallel_options = FALSE,
thresholds = custom_thresholds,
id_generator = function(n) rep(2244L, n),
report_path = rds_report_path,
report_format = "rds"
)
list(
summary = harness_adv$summary %>% select(scenario_size, all_runs_pass, elapsed_threshold, failed_ratio_threshold),
report_exists = file.exists(rds_report_path),
report_class = class(readRDS(rds_report_path))[1]
)
#> $summary
#> # A tibble: 2 × 4
#> scenario_size all_runs_pass elapsed_threshold failed_ratio_threshold
#> <int> <lgl> <dbl> <dbl>
#> 1 10 TRUE 60 0
#> 2 30 TRUE 120 0
#>
#> $report_exists
#> [1] TRUE
#>
#> $report_class
#> [1] "PubChemBenchmarkReport"
Interpretation:
id_generator is useful when you want scenario sizes to
scale without recycling a short seed identifier vector.
This section covers pc_assay_activity_long(),
pc_activity_outcome_map(),
pc_activity_matrix(), pc_cross_domain_join(),
pc_feature_table(), pc_model_matrix(),
pc_export_model_data(), pc_to_rcdk(),
pc_to_chemminer(), and
pc_lifecycle_policy().
Section contract:
CID,
AID, SID) and silent join-key mismatches.
assay_payload <- pc_example_assaysummary_payload()
feature_tbl_synthetic <- pc_example_feature_table() %>%
mutate(
XLogP = c(1.2, 3.1, 2.7),
TPSA = c(63.6, 37.3, 48.4),
CanonicalSMILES = c(
"CC(=O)OC1=CC=CC=C1C(=O)O",
"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O",
"CN1C=NC2=C1C(=O)N(C(=O)N2)C"
)
)
assay_long <- pc_assay_activity_long(x = assay_payload)
list(
assay_rows = nrow(assay_long),
assay_cols = names(assay_long),
feature_rows = nrow(feature_tbl_synthetic),
feature_cols = names(feature_tbl_synthetic)
)
#> $assay_rows
#> [1] 3
#>
#> $assay_cols
#> [1] "AID" "SID" "CID"
#> [4] "ActivityOutcome" "ActivityValue_uM" "ActivityOutcomeValue"
#>
#> $feature_rows
#> [1] 3
#>
#> $feature_cols
#> [1] "CID" "MolecularWeight" "XLogP"
#> [4] "TPSA" "HBondDonorCount" "HBondAcceptorCount"
#> [7] "CanonicalSMILES"
Interpretation:
assay_long is the canonical long format for downstream
matrix/modeling operations.
Purpose:
x (payload or PubChemResult)
or identifier + namespace.
PubChemTable with canonical columns
(CID, AID, ActivityOutcome,
optional ActivityOutcomeValue).
pc_assay_activity_long(x = assay_payload)
#> # A tibble: 3 × 6
#> AID SID CID ActivityOutcome ActivityValue_uM ActivityOutcomeValue
#> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1001 9001 2244 Active 1.2 1
#> 2 1002 9002 2244 Inactive NA 0
#> 3 1003 9003 3672 Inconclusive 12.5 NA
pc_assay_activity_long(
x = assay_payload,
unique_rows = TRUE,
add_outcome_value = TRUE
)
#> # A tibble: 3 × 6
#> AID SID CID ActivityOutcome ActivityValue_uM ActivityOutcomeValue
#> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1001 9001 2244 Active 1.2 1
#> 2 1002 9002 2244 Inactive NA 0
#> 3 1003 9003 3672 Inconclusive 12.5 NA
pc_assay_activity_long(
x = assay_payload,
add_outcome_value = TRUE,
strict_outcome = FALSE,
unknown_outcome = -1
)
#> # A tibble: 3 × 6
#> AID SID CID ActivityOutcome ActivityValue_uM ActivityOutcomeValue
#> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 1001 9001 2244 Active 1.2 1
#> 2 1002 9002 2244 Inactive NA 0
#> 3 1003 9003 3672 Inconclusive 12.5 NA
Interpretation:
ActivityOutcomeValue is derived with explicit mapping
controls.
assay_long_result <- pc_assay_activity_long(
x = 1,
error_mode = "result"
)
summarize_any(assay_long_result)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult InvalidInput 1 4
Interpretation:
error_mode = "result" is the right choice when
long-table normalization is part of a larger orchestrated pipeline that
should not stop immediately.
Pitfalls:
identifier, failed chunks
now error explicitly.
error_mode = "result" when you need failure objects
instead of hard stops.
Purpose:
map.
pc_activity_outcome_map(c("Active", "Inactive", "Inconclusive"))
#> [1] 1 0 NA
pc_activity_outcome_map(
c("Hit", "Non-Hit", "UnknownLabel"),
strict = FALSE,
unknown = -1
)
#> [1] 1 0 -1
safe_call(
pc_activity_outcome_map(
c("Active", "mystery-state"),
strict = TRUE
)
)
#> $message
#> [1] "Unknown activity outcome label(s): mystery-state. Provide 'map' entries or set strict = FALSE."
#>
#> attr(,"class")
#> [1] "pc_error"
Interpretation:
strict = TRUE in production scoring to fail on
unexpected labels.
strict = FALSE with explicit
unknown makes data-loss tradeoffs explicit.
Purpose:
CID,
AID, and mapped numeric outcome.
PubChemSparseActivityMatrix.
mat_dense <- pc_activity_matrix(assay_long)
mat_dense
#> # A tibble: 2 × 4
#> CID AID_1001 AID_1002 AID_1003
#> <chr> <dbl> <dbl> <dbl>
#> 1 2244 1 0 NA
#> 2 3672 NA NA NA
mat_sparse <- pc_activity_matrix(
assay_long,
output = "sparse",
aggregate = "max"
)
mat_sparse
#>
#> PubChemSparseActivityMatrix
#>
#> - Rows (compounds): 2
#> - Columns (assays): 3
#> - Non-zero entries: 3
#> - Implicit fill: NA
assay_with_dupes <- bind_rows(
assay_long,
tibble(CID = "2244", AID = "1001", ActivityOutcome = "Inactive", ActivityValue_uM = 0.5, ActivityOutcomeValue = 0)
)
pc_activity_matrix(
assay_with_dupes,
aggregate = "mean",
output = "tibble",
fill = NA_real_,
prefix = "AID_"
)
#> # A tibble: 2 × 4
#> CID AID_1001 AID_1002 AID_1003
#> <chr> <dbl> <dbl> <dbl>
#> 1 2244 0.5 0 NA
#> 2 3672 NA NA NA
Interpretation:
aggregate based on assay repeat semantics
(max, mean, first).
Purpose:
by list and
join type).
pc_cross_domain_join(
compounds = feature_tbl_synthetic,
assays = assay_long %>% select(CID, AID, ActivityOutcome)
)
#> # A tibble: 4 × 9
#> CID MolecularWeight XLogP TPSA HBondDonorCount HBondAcceptorCount
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2244 180. 1.2 63.6 1 4
#> 2 2244 180. 1.2 63.6 1 4
#> 3 3672 206. 3.1 37.3 0 2
#> 4 5957 194. 2.7 48.4 1 4
#> # ℹ 3 more variables: CanonicalSMILES <chr>, AID <chr>, ActivityOutcome <chr>
substances_tbl <- tibble(CID = c("2244", "3672"), SID = c("111", "222"))
target_tbl <- tibble(AID = c("1001", "1002"), target_symbol = c("PTGS1", "PTGS2"))
pc_cross_domain_join(
compounds = feature_tbl_synthetic,
substances = substances_tbl,
assays = assay_long %>% select(CID, AID, ActivityOutcomeValue),
targets = target_tbl,
join = "left"
)
#> # A tibble: 4 × 11
#> CID MolecularWeight XLogP TPSA HBondDonorCount HBondAcceptorCount
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2244 180. 1.2 63.6 1 4
#> 2 2244 180. 1.2 63.6 1 4
#> 3 3672 206. 3.1 37.3 0 2
#> 4 5957 194. 2.7 48.4 1 4
#> # ℹ 5 more variables: CanonicalSMILES <chr>, SID <chr>, AID <chr>,
#> # ActivityOutcomeValue <dbl>, target_symbol <chr>
pc_cross_domain_join(
compounds = feature_tbl_synthetic,
assays = assay_long %>% select(CID, AID, ActivityOutcomeValue),
by = list(compound_assay = "CID"),
join = "full"
)
#> # A tibble: 4 × 9
#> CID MolecularWeight XLogP TPSA HBondDonorCount HBondAcceptorCount
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2244 180. 1.2 63.6 1 4
#> 2 2244 180. 1.2 63.6 1 4
#> 3 3672 206. 3.1 37.3 0 2
#> 4 5957 194. 2.7 48.4 1 4
#> # ℹ 3 more variables: CanonicalSMILES <chr>, AID <chr>,
#> # ActivityOutcomeValue <dbl>
Interpretation:
character) before joining to
prevent accidental row loss.
Purpose:
feature_try <- safe_call(
pc_feature_table(
identifier = c(2244, 3672),
properties = c("MolecularWeight", "XLogP", "TPSA"),
namespace = "cid",
offline = TRUE
)
)
summarize_any(feature_try)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE pc_error Property retrieval failed: Offline mode enabled an… NA NA
if (run_live) {
feature_live <- pc_feature_table(
identifier = c(2244, 3672),
properties = c("MolecularWeight", "XLogP", "TPSA"),
namespace = "cid",
numeric_only = TRUE,
cache = TRUE
)
feature_live
} else {
feature_tbl_synthetic
}
#> # A tibble: 3 × 7
#> CID MolecularWeight XLogP TPSA HBondDonorCount HBondAcceptorCount
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2244 180. 1.2 63.6 1 4
#> 2 3672 206. 3.1 37.3 0 2
#> 3 5957 194. 2.7 48.4 1 4
#> # ℹ 1 more variable: CanonicalSMILES <chr>
if (run_live) {
feature_live_adv <- pc_feature_table(
identifier = c(2244, 3672, 5957),
properties = c("MolecularWeight", "XLogP", "TPSA", "HBondDonorCount", "HBondAcceptorCount"),
namespace = "cid",
numeric_only = TRUE,
cache = TRUE,
force_refresh = TRUE
)
feature_live_adv
} else {
feature_result_adv <- pc_feature_table(
identifier = c(2244, 3672, 5957),
properties = c("MolecularWeight", "XLogP", "TPSA", "HBondDonorCount", "HBondAcceptorCount"),
namespace = "cid",
offline = TRUE,
error_mode = "result"
)
summarize_any(feature_result_adv)
}
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemRecord OfflineCacheMiss 1 4
Interpretation:
pc_feature_table() intentionally fails
on cache miss.
error_mode = "result" when pipeline orchestration
requires typed failures.
Purpose:
outcome, ID columns, NA and
scaling policy.
PubChemModelMatrix with x,
optional y, and ID metadata.
mm_min <- pc_model_matrix(
x = assay_long %>%
select(CID, AID, ActivityOutcomeValue) %>%
mutate(dummy = as.numeric(factor(AID))),
outcome = "ActivityOutcomeValue",
id_cols = c("CID", "AID")
)
mm_min
#>
#> PubChemModelMatrix
#>
#> - Rows: 3
#> - Features: 1
#> - Outcome: Provided
joined_tbl <- pc_cross_domain_join(
compounds = feature_tbl_synthetic,
assays = assay_long %>% select(CID, AID, ActivityOutcomeValue)
)
mm_typical <- pc_model_matrix(
x = joined_tbl,
outcome = "ActivityOutcomeValue",
id_cols = c("CID", "AID"),
na_fill = 0,
scale = TRUE
)
dim(mm_typical$x)
#> [1] 4 5
mm_adv <- pc_model_matrix(
x = joined_tbl,
outcome = NULL,
id_cols = c("CID", "AID", "ActivityOutcomeValue"),
na_fill = 0,
scale = FALSE
)
list(features = length(mm_adv$feature_names), has_outcome = !is.null(mm_adv$y), has_ids = !is.null(mm_adv$ids))
#> $features
#> [1] 5
#>
#> $has_outcome
#> [1] FALSE
#>
#> $has_ids
#> [1] TRUE
Interpretation:
pc_model_matrix() is strict about non-empty numeric
predictor sets.
feature_names before modeling to ensure
expected predictors survived preprocessing.
Purpose:
PubChemModelMatrix or data frame, output
path, format flags.
path,
format, rows, cols).
out_csv <- file.path(tempdir(), "pubchemr_mm_min.csv")
meta_csv <- pc_export_model_data(mm_typical, path = out_csv, format = "csv")
meta_csv
#> $path
#> [1] "/private/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T/RtmpjLjM6G/pubchemr_mm_min.csv"
#>
#> $format
#> [1] "csv"
#>
#> $rows
#> [1] 4
#>
#> $cols
#> [1] 8
out_rds <- file.path(tempdir(), "pubchemr_mm_typical.rds")
meta_rds <- pc_export_model_data(mm_typical, path = out_rds, format = "rds")
meta_rds
#> $path
#> [1] "/private/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T/RtmpjLjM6G/pubchemr_mm_typical.rds"
#>
#> $format
#> [1] "rds"
#>
#> $rows
#> [1] 4
#>
#> $cols
#> [1] 8
out_csv_no_ids <- file.path(tempdir(), "pubchemr_mm_no_ids.csv")
meta_csv_no_ids <- pc_export_model_data(
mm_typical,
path = out_csv_no_ids,
format = "csv",
include_ids = FALSE,
include_outcome = FALSE
)
meta_csv_no_ids
#> $path
#> [1] "/private/var/folders/dj/y28dp44x303ggfg6rg8n2v0h0000gn/T/RtmpjLjM6G/pubchemr_mm_no_ids.csv"
#>
#> $format
#> [1] "csv"
#>
#> $rows
#> [1] 4
#>
#> $cols
#> [1] 5
Interpretation:
tempdir() in reproducible examples; in projects,
write to explicit versioned artifact directories.
Purpose:
to).
as_tibble() or retrieve().
sim_min <- safe_call(
pc_similarity_search(
identifier = "CC(=O)OC1=CC=CC=C1C(=O)O",
namespace = "smiles",
offline = TRUE
)
)
summarize_any(sim_min)
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemSimilarityResult OfflineCacheMiss 1 5
if (run_live) {
sim_live <- pc_similarity_search(
identifier = "CC(=O)OC1=CC=CC=C1C(=O)O",
namespace = "smiles",
threshold = 90,
max_records = 50,
cache = TRUE
)
as_tibble(sim_live) %>% head(10)
} else {
tibble(note = "Skipped live similarity retrieval.")
}
#> # A tibble: 1 × 1
#> note
#> <chr>
#> 1 Skipped live similarity retrieval.
sim_adv <- pc_similarity_search(
identifier = "CCO",
namespace = "smiles",
searchtype = "fastsimilarity_2d",
threshold = 80,
to = "aids",
max_records = 100,
offline = TRUE
)
list(
summary = summarize_any(sim_adv),
request = list(
searchtype = request_args(sim_adv, "searchtype"),
to = request_args(sim_adv, "to"),
threshold = request_args(sim_adv, "options")$Threshold,
max_records = request_args(sim_adv, "options")$MaxRecords
)
)
#> $summary
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemSimilarityResult OfflineCacheMiss 1 5
#>
#> $request
#> $request$searchtype
#> [1] "fastsimilarity_2d"
#>
#> $request$to
#> [1] "aids"
#>
#> $request$threshold
#> [1] 80
#>
#> $request$max_records
#> [1] 100
Interpretation:
searchtype, threshold, and
to intentionally; these strongly affect downstream payload
shape.
SID/AID
targets before returning.
Purpose:
PubChemTable with SDQ-specific columns
(variable by record).
if (run_live) {
sdq_min <- pc_sdq_bioactivity(2244, namespace = "cid", limit = 100L, rate_limit = FALSE)
summarize_any(sdq_min)
} else {
tibble(note = "Skipped (live-only function).")
}
#> # A tibble: 1 × 1
#> note
#> <chr>
#> 1 Skipped (live-only function).
if (run_live) {
sdq_typical <- pc_sdq_bioactivity(
identifier = 2244,
namespace = "cid",
collection = "bioactivity",
limit = 200L,
order = "activity,asc",
cache = TRUE,
cache_dir = work_cache,
cache_ttl = 3600,
rate_limit = FALSE
)
sdq_typical %>% head(5)
} else {
tibble(note = "Skipped (live-only function).")
}
#> # A tibble: 1 × 1
#> note
#> <chr>
#> 1 Skipped (live-only function).
if (run_live) {
sdq_adv <- pc_sdq_bioactivity(
identifier = "aspirin",
namespace = "name",
limit = 250L,
cache = TRUE,
cache_dir = work_cache,
force_refresh = TRUE,
rate_limit = FALSE
)
summarize_any(sdq_adv)
} else {
sdq_adv <- pc_sdq_bioactivity(
identifier = "aspirin",
namespace = "name",
cache = TRUE,
cache_dir = work_cache,
offline = TRUE,
error_mode = "result"
)
list(
summary = summarize_any(sdq_adv),
request = request_args(sdq_adv)[c("identifier", "namespace", "caller")]
)
}
#> $summary
#> # A tibble: 1 × 5
#> ok class note rows cols
#> <lgl> <chr> <chr> <int> <int>
#> 1 FALSE PubChemResult OfflineCacheMiss 1 4
#>
#> $request
#> $request$identifier
#> [1] "aspirin"
#>
#> $request$namespace
#> [1] "name"
#>
#> $request$<NA>
#> NULL
Interpretation:
error_mode = "result" is recommended for robust
pipelines that must continue after some retrieval failures.
Purpose:
rcdk molecule lists or
ChemmineR SDF objects.
rcdk_min <- safe_call(pc_to_rcdk(feature_tbl_synthetic, smiles_col = "CanonicalSMILES", id_col = "CID"))
chemminer_min <- safe_call(pc_to_chemminer(feature_tbl_synthetic, smiles_col = "CanonicalSMILES"))
bind_rows(
list(pc_to_rcdk = summarize_any(rcdk_min), pc_to_chemminer = summarize_any(chemminer_min)),
.id = "function"
)
#> # A tibble: 2 × 6
#> `function` ok class note rows cols
#> <chr> <lgl> <chr> <chr> <int> <int>
#> 1 pc_to_rcdk FALSE pc_error Package 'rcdk' is required for pc_… NA NA
#> 2 pc_to_chemminer FALSE pc_error Package 'ChemmineR' is required fo… NA NA
if (requireNamespace("rcdk", quietly = TRUE)) {
mols <- pc_to_rcdk(feature_tbl_synthetic, smiles_col = "CanonicalSMILES", id_col = "CID")
length(mols)
} else {
"rcdk not installed; conversion skipped."
}
#> [1] "rcdk not installed; conversion skipped."
if (requireNamespace("ChemmineR", quietly = TRUE)) {
sdf_obj <- pc_to_chemminer(feature_tbl_synthetic, smiles_col = "CanonicalSMILES")
class(sdf_obj)
} else {
"ChemmineR not installed; conversion skipped."
}
#> [1] "ChemmineR not installed; conversion skipped."
if (requireNamespace("rcdk", quietly = TRUE)) {
mols_named <- pc_to_rcdk(feature_tbl_synthetic, smiles_col = "CanonicalSMILES", id_col = "CID")
names(mols_named)
} else {
"rcdk not installed; advanced conversion skipped."
}
#> [1] "rcdk not installed; advanced conversion skipped."
if (requireNamespace("ChemmineR", quietly = TRUE)) {
sdf_adv <- pc_to_chemminer(feature_tbl_synthetic, smiles_col = "CanonicalSMILES")
length(sdf_adv)
} else {
"ChemmineR not installed; advanced conversion skipped."
}
#> [1] "ChemmineR not installed; advanced conversion skipped."
Interpretation:
pc_capabilities() early to detect optional
dependency availability.
Purpose:
pc_lifecycle_policy()
#> # A tibble: 2 × 5
#> stream stability support_window deprecation_notice breaking_change_window
#> <chr> <chr> <chr> <chr> <chr>
#> 1 legacy maintenance bugfix-only >= 1 minor release major release only
#> 2 nextgen stable minor+patch >= 2 minor releases major release only
pc_lifecycle_policy() %>% filter(stream == "nextgen")
#> # A tibble: 1 × 5
#> stream stability support_window deprecation_notice breaking_change_window
#> <chr> <chr> <chr> <chr> <chr>
#> 1 nextgen stable minor+patch >= 2 minor releases major release only
policy <- pc_lifecycle_policy()
stopifnot(any(policy$stream == "nextgen"), any(policy$stream == "legacy"))
policy
#> # A tibble: 2 × 5
#> stream stability support_window deprecation_notice breaking_change_window
#> <chr> <chr> <chr> <chr> <chr>
#> 1 legacy maintenance bugfix-only >= 1 minor release major release only
#> 2 nextgen stable minor+patch >= 2 minor releases major release only
Interpretation:
This section provides two coherent pipelines users can adapt directly.
Section contract:
Goal:
# 1) Start from assay payload normalization.
assay_long_a <- pc_assay_activity_long(x = assay_payload, add_outcome_value = TRUE)
# 2) Build dense and sparse activity matrices.
activity_dense_a <- pc_activity_matrix(assay_long_a, output = "tibble")
activity_sparse_a <- pc_activity_matrix(assay_long_a, output = "sparse")
# 3) Join synthetic features with assay outcomes.
joined_a <- pc_cross_domain_join(
compounds = feature_tbl_synthetic,
assays = assay_long_a %>% select(CID, AID, ActivityOutcomeValue)
)
# 4) Create model matrix.
mm_a <- pc_model_matrix(
x = joined_a,
outcome = "ActivityOutcomeValue",
id_cols = c("CID", "AID"),
na_fill = 0,
scale = TRUE
)
# 5) Export artifacts.
out_a_csv <- file.path(tempdir(), "pipeline_a_model.csv")
out_a_rds <- file.path(tempdir(), "pipeline_a_model.rds")
meta_a_csv <- pc_export_model_data(mm_a, path = out_a_csv, format = "csv")
meta_a_rds <- pc_export_model_data(mm_a, path = out_a_rds, format = "rds")
list(
assay_rows = nrow(assay_long_a),
dense_dim = dim(activity_dense_a),
sparse_dim = dim(activity_sparse_a$x),
model_dim = dim(mm_a$x),
csv_exists = file.exists(meta_a_csv$path),
rds_exists = file.exists(meta_a_rds$path)
)
#> $assay_rows
#> [1] 3
#>
#> $dense_dim
#> [1] 2 4
#>
#> $sparse_dim
#> [1] 2 3
#>
#> $model_dim
#> [1] 4 5
#>
#> $csv_exists
#> [1] TRUE
#>
#> $rds_exists
#> [1] TRUE
Interpretation:
Goal:
pipe_b_cp_dir <- file.path(tempdir(), "pipeline_b_checkpoints")
pipe_b_cp_id <- "pipeline-b-demo"
# Chunked workflow using a deterministic local worker.
pipe_b_batch <- pc_batch(
ids = 1:24,
fn = function(chunk_ids, ...) {
tibble(id = chunk_ids, value = chunk_ids * 2, parity = ifelse(chunk_ids %% 2 == 0, "even", "odd"))
},
chunk_size = 6,
checkpoint_dir = pipe_b_cp_dir,
checkpoint_id = pipe_b_cp_id
)
# Resume using the same checkpoint.
pipe_b_resume <- pc_resume_batch(
fn = function(chunk_ids, ...) tibble(id = chunk_ids, value = chunk_ids * 2),
checkpoint_dir = pipe_b_cp_dir,
checkpoint_id = pipe_b_cp_id
)
# Benchmark and harness on local worker.
pipe_b_bm <- pc_benchmark(
ids = 1:120,
fn = function(chunk_ids, ...) sum(chunk_ids),
chunk_sizes = c(10, 20, 40),
parallel_options = FALSE
)
pipe_b_harness <- pc_benchmark_harness(
fn = function(chunk_ids, ...) sum(chunk_ids),
ids = 1:240,
scenario_sizes = c(20L, 60L),
chunk_sizes = c(10L, 20L),
parallel_options = FALSE,
report_path = file.path(tempdir(), "pipeline_b_harness.md"),
report_format = "markdown"
)
list(
batch_chunks = length(pipe_b_batch$chunks),
resumed = pipe_b_resume$checkpoint$resumed,
benchmark_rows = nrow(pipe_b_bm),
harness_rows = nrow(pipe_b_harness$summary)
)
#> $batch_chunks
#> [1] 4
#>
#> $resumed
#> [1] TRUE
#>
#> $benchmark_rows
#> [1] 3
#>
#> $harness_rows
#> [1] 2
Interpretation:
pc_batch(), pc_resume_batch(),
pc_benchmark(), and
pc_benchmark_harness().
Section contract:
success = FALSE.
error$code, error$message,
pending, from_cache) before changing the
workflow.
Cause:
offline = TRUE with no cached response for that
request.
Diagnostic:
x <- pc_request(identifier = 2244, cache = TRUE, offline = TRUE)
tibble(success = x$success, error_code = x$error$code, error_message = x$error$message)
#> # A tibble: 1 × 3
#> success error_code error_message
#> <lgl> <chr> <chr>
#> 1 FALSE OfflineCacheMiss Offline mode enabled and no cached response found fo…
Best practice:
cache = TRUE, then replay with
offline = TRUE.
Diagnostic pattern:
x <- safe_call(pc_property(2244, properties = "MolecularWeight", offline = TRUE))
if (inherits(x, "pc_error")) {
x$message
} else {
list(success = x$success, error = if (x$success) NA_character_ else x$error$message)
}
#> $success
#> [1] FALSE
#>
#> $error
#> [1] "Offline mode enabled and no cached response found for this request."
Best practice:
success and inspect
error$code/error$message.
retrieve(x, "error/code", .to.data.frame = FALSE)
when building programmatic routing logic.
Cause:
Diagnostic:
safe_call(
pc_feature_table(
identifier = c(2244, 3672),
properties = c("MolecularWeight", "XLogP"),
offline = TRUE
)
)
#> $message
#> [1] "Property retrieval failed: Offline mode enabled and no cached response found for this request."
#>
#> attr(,"class")
#> [1] "pc_error"
Best practice:
error_mode = "result" for typed failures in
orchestration code.
Best practice checklist:
chunk_size.
pc_config().
pc_resume_batch().
PubChemBatchResult$error.
Checklist:
set.seed(...)) for local
stochastic steps.
pc_profile() + explicit
pc_config() overrides.
tempdir() for vignette/testing artifacts.
run_live).
For production nextgen workflows:
pc_profile() and explicit
pc_config() overrides.
success,
error, metadata).
retrieve() (or as_tibble())
consistently to normalize pc_* outputs.