{
  "version": "1.0.0",
  "generated": "2026-05-12",
  "count": 31,
  "total_benchmarks_tracked": 1749,
  "initiatives": [
    {
      "id": "tdc",
      "name": "Therapeutics Data Commons (TDC)",
      "kind": "meta-platform",
      "url": "https://tdcommons.ai/",
      "github": "https://github.com/mims-harvard/TDC",
      "description": "Open-science platform curating ML datasets/tasks across the drug discovery pipeline with unified API, splits, and leaderboards.",
      "benchmarks_tracked": 83,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Scraped tdcommons.ai single_pred/multi_pred/generation overview pages 2026-05-12: single-pred ~38 datasets (ADME/Tox/HTS/QM/Yields/Epitope/Develop/CRISPROutcome), multi-pred ~32 datasets (DTI/DDI/PPI/GDA/DrugRes/DrugSyn/PeptideMHC/AntibodyAff/MTI/Catalyst/TCREpitope/TrialOutcome/ProteinPeptide/PerturbOutcome/scDTI), generation ~13 (MolGen/RetroSyn/Reaction/SBDD). 8 named leaderboard groups.",
      "breakdown": {
        "single_prediction": 38,
        "multi_prediction": 32,
        "generation": 13,
        "leaderboard_groups": 8
      },
      "host_organization": "Zitnik Lab, Harvard Medical School (+ MIT, Stanford, Georgia Tech collaborators)",
      "primary_contacts": [
        "Marinka Zitnik",
        "Kexin Huang",
        "Tianfan Fu"
      ],
      "founded": "2021-02",
      "license_model": "MIT (code); per-dataset licenses for data",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Most comprehensive ML-ready therapeutics benchmark hub. NeurIPS 2021 + Nat Chem Bio 2022.",
      "composite_score": 100.0
    },
    {
      "id": "casp",
      "name": "CASP (Critical Assessment of Structure Prediction)",
      "kind": "competition",
      "url": "https://predictioncenter.org/",
      "github": "N/A",
      "description": "Biennial blind evaluation of protein structure prediction; drove AlphaFold's validation.",
      "benchmarks_tracked": 16,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "predictioncenter.org archives: CASP1 (1994) through CASP16 (2024) = 16 editions; ~100 targets \u00d7 ~5 categories per edition.",
      "breakdown": {
        "editions": 16,
        "categories_per_edition": 5,
        "targets_avg": 100
      },
      "host_organization": "Prediction Center, UC Davis",
      "primary_contacts": [
        "John Moult",
        "Andriy Kryshtafovych"
      ],
      "founded": "1994",
      "license_model": "Public",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Historical gold standard for blind evaluation. CASP15 added ligands; CASP16 added multimer + RNA.",
      "composite_score": 100.0
    },
    {
      "id": "proteingym",
      "name": "ProteinGym",
      "kind": "meta-platform",
      "url": "https://proteingym.org/",
      "github": "https://github.com/OATML-Markslab/ProteinGym",
      "description": "Large-scale benchmark for protein fitness prediction from DMS + clinical variant effects.",
      "benchmarks_tracked": 217,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "ProteinGym v1.2 README + NeurIPS 2023 paper: 217 DMS substitution assays + 66 indel assays + 2525 ClinVar clinical variants.",
      "breakdown": {
        "dms_substitutions": 217,
        "dms_indels": 66,
        "mutations": 2700000,
        "clinical_variants": 2525
      },
      "host_organization": "Marks Lab (Harvard) + OATML (Oxford) + DeepMind",
      "primary_contacts": [
        "Debora Marks",
        "Pascal Notin",
        "Yarin Gal"
      ],
      "founded": "2022",
      "license_model": "MIT",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "notes": "De facto standard for variant effect prediction. Clinical track enables ESM/EVE/AlphaMissense fair comparison.",
      "composite_score": 97.5
    },
    {
      "id": "elixir",
      "name": "ELIXIR Infrastructure",
      "kind": "consortium",
      "url": "https://elixir-europe.org/",
      "github": "N/A",
      "description": "European life-science data infrastructure hosting benchmark-relevant resources (UniProt, Ensembl, ChEMBL, PDBe, IntAct).",
      "benchmarks_tracked": 18,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "ELIXIR Core Data Resources list 2026-05: ~18 resources with benchmark/leaderboard components.",
      "breakdown": {
        "core_data_resources": 18,
        "member_nodes": 23
      },
      "host_organization": "EMBL-EBI + 23 EU member nodes",
      "primary_contacts": [
        "Niklas Blomberg",
        "Andrew Smith"
      ],
      "founded": "2013",
      "license_model": "Mostly CC-BY",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "notes": "Meta-resource of meta-resources.",
      "composite_score": 97.5
    },
    {
      "id": "cameo",
      "name": "CAMEO",
      "kind": "competition",
      "url": "https://www.cameo3d.org/",
      "github": "N/A",
      "description": "Continuous weekly blind eval of protein 3D / multimer / ligand prediction using pre-release PDB structures.",
      "benchmarks_tracked": 4,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "cameo3d.org 2026-05: 4 active categories \u2014 3D monomer, 3D multimer, model quality, ligand pocket; ~1000 targets/year.",
      "breakdown": {
        "monomer": 1,
        "multimer": 1,
        "quality_estimation": 1,
        "ligand": 1
      },
      "host_organization": "Biozentrum Basel + SIB",
      "primary_contacts": [
        "Torsten Schwede",
        "J\u00fcrgen Haas"
      ],
      "founded": "2013",
      "license_model": "CC-BY 4.0",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "notes": "Excellent continuous cadence complementing CASP.",
      "composite_score": 94.4
    },
    {
      "id": "posebusters-initiative",
      "name": "PoseBusters Evaluation Suite",
      "kind": "meta-platform",
      "url": "https://posebusters.readthedocs.io/",
      "github": "https://github.com/maabuu/posebusters",
      "description": "Physics-aware validation of docking/co-folding poses; 19 checks + curated test sets.",
      "benchmarks_tracked": 3,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "GitHub README: PoseBusters v1 (308 complexes), v2 (428), Astex Diverse Set (85) = 3 canonical suites.",
      "breakdown": {
        "test_sets": 3,
        "validation_checks": 19
      },
      "host_organization": "Oxford OPIG (Deane Lab)",
      "primary_contacts": [
        "Charlotte Deane",
        "Martin Buttenschoen"
      ],
      "founded": "2023-08",
      "license_model": "BSD-3-Clause",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Changed pose-prediction evaluation norms; default pharma filter now.",
      "composite_score": 93.9
    },
    {
      "id": "plinder-initiative",
      "name": "PLINDER / PINDER",
      "kind": "meta-platform",
      "url": "https://www.plinder.sh/",
      "github": "https://github.com/plinder-org/plinder",
      "description": "Leakage-controlled protein-ligand (PLINDER) and protein-protein (PINDER) docking datasets.",
      "benchmarks_tracked": 2,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "plinder.sh + pinder.sh: 2 major benchmarks (PLINDER 460k systems, PINDER 267k systems).",
      "breakdown": {
        "plinder_systems": 460000,
        "pinder_systems": 267498
      },
      "host_organization": "Biozentrum Basel + VantAI + Isomorphic Labs + EPFL",
      "primary_contacts": [
        "Torsten Schwede",
        "Max Jaderberg",
        "Andreas Fischer"
      ],
      "founded": "2024-07",
      "license_model": "CC-BY 4.0",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Replacing PDBbind/CASF for modern docking ML eval.",
      "composite_score": 93.9
    },
    {
      "id": "openproblems",
      "name": "Open Problems in Single-Cell Analysis",
      "kind": "consortium",
      "url": "https://openproblems.bio/",
      "github": "https://github.com/openproblems-bio/openproblems",
      "description": "Community benchmark suite for single-cell analysis with reproducible Viash/Nextflow pipelines and NeurIPS tracks.",
      "benchmarks_tracked": 29,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "openproblems.bio task registry + Luecken et al. Nat Biotech 2025: 29 benchmark tasks (batch integration, denoising, dim-reduction, label projection, perturbation, spatial, multimodal).",
      "breakdown": {
        "batch_integration": 3,
        "perturbation": 4,
        "multimodal": 6,
        "label_transfer": 4,
        "spatial": 5,
        "other": 7
      },
      "host_organization": "CZI + Helmholtz Munich + Yale + HMS",
      "primary_contacts": [
        "Fabian Theis",
        "Malte Luecken",
        "Daniel Burkhardt",
        "Sandrine Dudoit"
      ],
      "founded": "2021-06",
      "license_model": "MIT",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "Gold-standard single-cell benchmarking rigor; Nat Biotech 2025.",
      "composite_score": 91.9
    },
    {
      "id": "polaris",
      "name": "Polaris Hub",
      "kind": "meta-platform",
      "url": "https://polarishub.io/",
      "github": "https://github.com/polaris-hub/polaris",
      "description": "Industry-curated small-molecule benchmarks with working groups on method-comparison standards.",
      "benchmarks_tracked": 48,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "polarishub.io/benchmarks public listing 2026-05: ~48 public benchmarks across Recursion, Valence, Novartis, AstraZeneca, Polaris Small Molecule Steering Committee orgs.",
      "breakdown": {
        "public_benchmarks": 48,
        "datasets": 60,
        "competitions": 4
      },
      "host_organization": "Polaris consortium (Valence Labs, Recursion, Novartis, Pfizer, Merck, AstraZeneca)",
      "primary_contacts": [
        "Cas Wognum",
        "Emmanuel Noutahi",
        "Jonathan Hsu"
      ],
      "founded": "2023-10",
      "license_model": "CC-BY or Polaris Community License per benchmark",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "notes": "Industry-led counterweight to academic benchmarks. Strong on method-comparison rigor.",
      "composite_score": 91.4
    },
    {
      "id": "insilico-scienceaibench",
      "name": "ScienceAIBench",
      "kind": "meta-platform",
      "url": "https://scienceaibench.insilico.com/",
      "github": "N/A \u2014 hosted portal",
      "description": "Insilico Medicine's public scientific-AI benchmark portal. Spans biology (longevity, target ID), affinity/binding, ADMET, clinical trials, biologics, materials; leaderboards benchmark frontier LLMs (GPT-5.x, Claude Opus/Sonnet 4.x, Gemini 3, Grok 4.1, DeepSeek v3.2, Kimi K2.x).",
      "benchmarks_tracked": 227,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Fetched https://scienceaibench.insilico.com/api/benchmarks on 2026-05-12; meta.totalBenchmarks=227 across 7 taxonomy categories \u00d7 17 suites. Leaderboard submitters are external frontier LLMs (top entries: Grok 4.1, GPT 5.1/5.2, Claude Opus 4.5/4.6, Gemini 3 Flash, DeepSeek v3.2, Kimi K2.5). Not self-referential \u2014 Insilico's own models are not on the leaderboards.",
      "breakdown": {
        "Biology (TargetBench + Longevity)": 29,
        "Affinity and Binding": 94,
        "Chemical Synthesis (Retrosynthesis)": 2,
        "ADMET, PK & Safety": 50,
        "Clinical Trials (ClinBench Quarterly)": 25,
        "Biologics": 6,
        "Materials (MatBench + others)": 21
      },
      "host_organization": "Insilico Medicine",
      "primary_contacts": [
        "Alex Zhavoronkov",
        "Alex Aliper",
        "Alex Zhebrak"
      ],
      "founded": "2025",
      "license_model": "CC-BY (per portal); academic-friendly",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Biggest of the three Insilico portals. Live leaderboards regenerate against frontier LLMs \u2014 therefore NOT flagged self-referential. Strong longevity / aging benchmark slice (unique). Moves up the aging-relevance ranking.",
      "composite_score": 90.6
    },
    {
      "id": "dream",
      "name": "DREAM Challenges",
      "kind": "competition",
      "url": "https://dreamchallenges.org/",
      "github": "https://github.com/dreamchallenges",
      "description": "Long-running crowd-sourced biomedical prediction challenges, many pharma-sponsored.",
      "benchmarks_tracked": 74,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "dreamchallenges.org/closed-challenges + /active as of 2026-05: 74 completed/active challenges; ~38 drug-discovery-relevant.",
      "breakdown": {
        "drug_sensitivity": 9,
        "target_prediction": 6,
        "toxicity": 5,
        "disease_subtyping": 12,
        "other_biomed": 42
      },
      "host_organization": "Sage Bionetworks + IBM + academic partners",
      "primary_contacts": [
        "Gustavo Stolovitzky",
        "Justin Guinney",
        "Pablo Meyer"
      ],
      "founded": "2006",
      "license_model": "Per-challenge (mostly CC-BY-NC)",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 3,
        "adoption": 5,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 5
      },
      "notes": "Historical impact on field norms. Cadence has slowed 2022+.",
      "composite_score": 89.4
    },
    {
      "id": "mimic",
      "name": "MIMIC-IV / eICU",
      "kind": "data-platform",
      "url": "https://physionet.org/content/mimiciv/",
      "github": "https://github.com/MIT-LCP/mimic-code",
      "description": "ICU EHR datasets used for clinical outcome, adverse-event, and PK/PD benchmarks.",
      "benchmarks_tracked": 14,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "PhysioNet + BigBio MIMIC-IV benchmarks 2026-05: 14 derived benchmarks (mortality, LOS, readmission, sepsis, AKI, drug dosing, phenotyping).",
      "breakdown": {
        "outcome_prediction": 6,
        "drug_dosing": 3,
        "adverse_event": 3,
        "phenotyping": 2
      },
      "host_organization": "MIT Lab for Computational Physiology",
      "primary_contacts": [
        "Leo Anthony Celi",
        "Alistair Johnson",
        "Roger Mark"
      ],
      "founded": "2016 / 2020 (v4)",
      "license_model": "PhysioNet credentialed",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 5,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "notes": "Canonical for clinical ML. US-centric.",
      "composite_score": 89.4
    },
    {
      "id": "czi-virtual-cell",
      "name": "CZI Virtual Cell / CellxGene / VCC",
      "kind": "consortium",
      "url": "https://chanzuckerberg.com/science/programs-resources/virtual-cells/",
      "github": "https://github.com/chanzuckerberg",
      "description": "Umbrella for CZI-funded virtual-cell benchmark initiatives: CellxGene, Virtual Cell Challenge, Tabula atlases.",
      "benchmarks_tracked": 12,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "chanzuckerberg.com/science 2026-05: Virtual Cell Challenge (4 tracks), CellxGene Census benchmarks (4), Tabula Sapiens-derived eval suites (4).",
      "breakdown": {
        "virtual_cell_challenge": 4,
        "cellxgene": 4,
        "tabula": 4
      },
      "host_organization": "Chan Zuckerberg Initiative / CZ Biohub",
      "primary_contacts": [
        "Jonah Cool",
        "Stephen Quake",
        "Ambrose Carr"
      ],
      "founded": "2016 / 2024 (VCC)",
      "license_model": "CC-BY 4.0",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "notes": "VCC is becoming the canonical virtual-cell benchmark.",
      "composite_score": 88.9
    },
    {
      "id": "open-reaction-database",
      "name": "Open Reaction Database (ORD)",
      "kind": "data-platform",
      "url": "https://open-reaction-database.org/",
      "github": "https://github.com/open-reaction-database",
      "description": "Open reaction repository in a schema-validated format; enables reaction / yield / retrosynthesis benchmarks.",
      "benchmarks_tracked": 1,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "open-reaction-database.org 2026-05: ~2.1M reactions as single versioned benchmark corpus.",
      "breakdown": {
        "reactions": 2100000,
        "contributing_orgs": 30
      },
      "host_organization": "ORD consortium (Doyle, Coley, Pfizer, Merck, BASF)",
      "primary_contacts": [
        "Connor Coley",
        "Abigail Doyle",
        "Steven Kearnes"
      ],
      "founded": "2021-07",
      "license_model": "CC-BY-SA 4.0",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "notes": "Biggest open reaction corpus; industry donations accelerating.",
      "composite_score": 88.9
    },
    {
      "id": "insilico-ddb",
      "name": "Drug Discovery Benchmarks (DDB)",
      "kind": "meta-platform",
      "url": "https://ddb.insilico.com/",
      "github": "N/A \u2014 hosted portal",
      "description": "Insilico's drug-discovery-specific benchmark portal: TargetBench, Longevity Benchmark, GPCR affinity, PDBbind-style tasks, ISM ADMET, TDC ADMET mirror, ClinBench, biologics.",
      "benchmarks_tracked": 206,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Fetched https://ddb.insilico.com/api/benchmarks on 2026-05-12; meta.totalBenchmarks=206 across 6 categories \u00d7 15 suites.",
      "breakdown": {
        "Biology (TargetBench + Longevity)": 29,
        "Affinity and Binding": 94,
        "Chemical Synthesis": 2,
        "ADMET, PK & Safety": 50,
        "Clinical Trials": 25,
        "Biologics": 6
      },
      "host_organization": "Insilico Medicine",
      "primary_contacts": [
        "Alex Zhavoronkov",
        "Alex Aliper"
      ],
      "founded": "2025",
      "license_model": "CC-BY (per portal)",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 5,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Drug-discovery focused cut. Includes a mirror of TDC ADMET for cross-platform comparability.",
      "composite_score": 87.6
    },
    {
      "id": "cafa",
      "name": "CAFA",
      "kind": "competition",
      "url": "https://biofunctionprediction.org/",
      "github": "N/A",
      "description": "Blind eval of protein function prediction against time-delayed UniProt-GOA.",
      "benchmarks_tracked": 6,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "biofunctionprediction.org archives: CAFA1\u20135 (2010\u20132023) + CAFA6 announced 2025 = 6 editions.",
      "breakdown": {
        "editions": 6,
        "cafa5_targets": 142000
      },
      "host_organization": "Radivojac / Friedberg / Jiang consortium",
      "primary_contacts": [
        "Predrag Radivojac",
        "Iddo Friedberg"
      ],
      "founded": "2010",
      "license_model": "Public",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 5,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "CAFA5 (Kaggle, 2023) drew 1625 teams.",
      "composite_score": 86.8
    },
    {
      "id": "capri",
      "name": "CAPRI",
      "kind": "competition",
      "url": "https://www.ebi.ac.uk/pdbe/complex-pred/capri/",
      "github": "N/A",
      "description": "Blind prediction of protein-protein complexes, protein-peptide, and protein-ligand assemblies.",
      "benchmarks_tracked": 56,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "EBI CAPRI archive: Round 1 (2001) through Round 56 (2024).",
      "breakdown": {
        "rounds": 56,
        "targets_total_approx": 300
      },
      "host_organization": "EBI + CCP4",
      "primary_contacts": [
        "Marc Lensink",
        "Shoshana Wodak"
      ],
      "founded": "2001",
      "license_model": "Public",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "notes": "Oldest PPI prediction benchmark.",
      "composite_score": 86.3
    },
    {
      "id": "faers",
      "name": "FAERS / SIDER / OffSides / TWOSIDES",
      "kind": "data-platform",
      "url": "https://www.fda.gov/drugs/surveillance/questions-and-answers-fdas-adverse-event-reporting-system-faers",
      "github": "N/A",
      "description": "FDA adverse event reports + SIDER/OffSides/TWOSIDES derivatives for post-market signal detection.",
      "benchmarks_tracked": 4,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "FAERS (19M+ reports) + 3 derived benchmarks (SIDER, OffSides, TWOSIDES) = 4.",
      "breakdown": {
        "faers_reports": 19000000,
        "sider_pairs": 139000,
        "offsides_signals": 438000,
        "twosides_combo": 870000
      },
      "host_organization": "FDA CDER + Tatonetti Lab",
      "primary_contacts": [
        "Nick Tatonetti",
        "FDA CDER"
      ],
      "founded": "1969 / 2012",
      "license_model": "Public / CC-BY",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Essential for pharmacovigilance ML. Known reporting biases.",
      "composite_score": 85.6
    },
    {
      "id": "insilico-insilicobench",
      "name": "InsilicoBench",
      "kind": "meta-platform",
      "url": "https://insilicobench.insilico.com/",
      "github": "N/A \u2014 hosted portal",
      "description": "Compact cut of the Insilico benchmark stack focused on biology (longevity), GPCR affinity, retrosynthesis, ADMET, and clinical trials.",
      "benchmarks_tracked": 162,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Fetched https://insilicobench.insilico.com/api/benchmarks on 2026-05-12; meta.totalBenchmarks=162 across 5 categories (Biology 19, Affinity/Binding 88, Chemical Synthesis 2, ADMET 28, Clinical Trials 25).",
      "breakdown": {
        "Biology (Longevity)": 19,
        "Affinity and Binding": 88,
        "Chemical Synthesis": 2,
        "ADMET, PK & Safety": 28,
        "Clinical Trials": 25
      },
      "host_organization": "Insilico Medicine",
      "primary_contacts": [
        "Alex Zhavoronkov",
        "Alex Aliper"
      ],
      "founded": "2025",
      "license_model": "CC-BY (per portal)",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 5,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 5
      },
      "notes": "Curated subset of ScienceAIBench. Same leaderboard model pool \u2192 also NOT self-referential.",
      "composite_score": 84.6
    },
    {
      "id": "flip",
      "name": "FLIP",
      "kind": "meta-platform",
      "url": "https://benchmark.protein.properties/",
      "github": "https://github.com/J-SNACKKB/FLIP",
      "description": "Protein fitness benchmarks focused on realistic train/test splits (AAV, GB1, Meltome, SCL, Bind).",
      "benchmarks_tracked": 15,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "FLIP README: 5 landscapes \u00d7 3 splits = 15 benchmarks.",
      "breakdown": {
        "landscapes": 5,
        "splits_per_landscape": 3
      },
      "host_organization": "Rostlab TUM + AlQuraishi Lab Columbia",
      "primary_contacts": [
        "Burkhard Rost",
        "Mohammed AlQuraishi",
        "Christian Dallago"
      ],
      "founded": "2021-12",
      "license_model": "CC-BY 4.0",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 3,
        "adoption": 4,
        "quality": 5,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "Complementary to ProteinGym (smaller but careful splits).",
      "composite_score": 80.8
    },
    {
      "id": "cptac",
      "name": "CPTAC",
      "kind": "consortium",
      "url": "https://proteomics.cancer.gov/programs/cptac",
      "github": "https://github.com/PayneLab/cptac",
      "description": "Integrated proteogenomic datasets across 10 tumor types; hosts DREAM proteogenomic benchmarks.",
      "benchmarks_tracked": 10,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "CPTAC data portal 2026-05: 10 tumor types with full proteogenomic characterization (BR, CO, EN, GBM, HNSCC, LSCC, LUAD, OV, PDAC, CCRCC).",
      "breakdown": {
        "tumor_types": 10,
        "samples": 1600,
        "omics_layers": 6
      },
      "host_organization": "NCI Office of Cancer Clinical Proteomics Research",
      "primary_contacts": [
        "Henry Rodriguez",
        "Amanda Paulovich",
        "Bing Zhang"
      ],
      "founded": "2011",
      "license_model": "dbGaP controlled / public tiers",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 4,
        "quality": 5,
        "accessibility": 3,
        "industry_relevance": 4
      },
      "notes": "Deep but narrow (oncology).",
      "composite_score": 80.8
    },
    {
      "id": "deepchem",
      "name": "DeepChem",
      "kind": "meta-platform",
      "url": "https://deepchem.io/",
      "github": "https://github.com/deepchem/deepchem",
      "description": "OSS library bundling molecular ML benchmark datasets and baselines; hosts MoleculeNet.",
      "benchmarks_tracked": 40,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "deepchem.molnet module listing: ~40 packaged datasets (MoleculeNet core + extensions).",
      "breakdown": {
        "moleculenet_core": 17,
        "adme_tox_extensions": 10,
        "protein": 5,
        "materials": 4,
        "misc": 4
      },
      "host_organization": "DeepChem community",
      "primary_contacts": [
        "Bharath Ramsundar",
        "Peter Eastman"
      ],
      "founded": "2016",
      "license_model": "MIT",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 4,
        "adoption": 4,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "Excellent reproducibility \u2014 one-liner dataset loaders.",
      "composite_score": 80.0
    },
    {
      "id": "moleculenet",
      "name": "MoleculeNet",
      "kind": "meta-platform",
      "url": "https://moleculenet.org/",
      "github": "https://github.com/deepchem/deepchem",
      "description": "Benchmark suite covering quantum, physical, biophysical, physiological molecular ML tasks.",
      "benchmarks_tracked": 17,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Wu et al. 2018 Chem Sci + DeepChem repo enumeration: QM7/QM7b/QM8/QM9, ESOL, FreeSolv, Lipophilicity, PCBA, MUV, HIV, BACE, BBBP, Tox21, ToxCast, SIDER, ClinTox, PDBbind (17).",
      "breakdown": {
        "quantum": 4,
        "physical_chem": 3,
        "biophysics": 4,
        "physiology": 6
      },
      "host_organization": "DeepChem community (Pande Lab alumni)",
      "primary_contacts": [
        "Bharath Ramsundar",
        "Vijay Pande"
      ],
      "founded": "2018-03",
      "license_model": "MIT",
      "flags": [
        "data-leakage-known"
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "notes": "Historically foundational; many splits have documented leakage. Community has largely moved to TDC / Polaris for new work.",
      "composite_score": 78.0
    },
    {
      "id": "trialbench",
      "name": "TrialBench / HINT / TOP",
      "kind": "meta-platform",
      "url": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "github": "https://github.com/futianfan/clinical-trial-outcome-prediction",
      "description": "Suite of benchmarks for clinical trial outcome prediction.",
      "benchmarks_tracked": 4,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Fu et al. 2022-2024: HINT (17k trials), TOP (17k), TrialBench (21k trials, 12k drugs), CT-Outcome.",
      "breakdown": {
        "hint_trials": 17000,
        "top_trials": 17000,
        "trialbench_trials": 21000,
        "task_variants": 12
      },
      "host_organization": "Fu/Sun Lab, Georgia Tech + HMS",
      "primary_contacts": [
        "Tianfan Fu",
        "Jimeng Sun",
        "Marinka Zitnik"
      ],
      "founded": "2022",
      "license_model": "MIT",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 4
      },
      "notes": "First rigorous ML benchmarks on trial outcomes. Limited by CTgov quality.",
      "composite_score": 76.5
    },
    {
      "id": "clawbio",
      "name": "ClawBio Benchmarks",
      "kind": "meta-platform",
      "url": "https://clawbio.ai/benchmarks.html",
      "github": "https://github.com/biostochastics/clawbio_bench",
      "description": "Public scientific-correctness leaderboard for bio-analysis skills. Independent third-party benchmark (clawbio_bench, authored by Biostochastics LLC) tests ClawBio skills on safety, correctness, honesty. Public failure surface with remediation tasks.",
      "benchmarks_tracked": 10,
      "benchmark_count_asof": "2026-05-03",
      "count_methodology": "Scraped https://clawbio.ai/benchmarks.html on 2026-05-12; last bench run 2026-05-03 against ClawBio commit 7820473 using clawbio_bench v0.1.5. 10 skills audited: claw-metagenomics, equity-scorer, nutrigx-advisor, bio-orchestrator, pharmgx-reporter, fine-mapping, clinical-variant-reporter, cvr-acmg-correctness, gwas-prs, cvr-variant-identity. 168/182 tests passing (92.3%).",
      "breakdown": {
        "skills_audited": 10,
        "tests_total": 182,
        "tests_passing": 168,
        "pass_rate_pct": 92.3
      },
      "host_organization": "ClawBio (open source, MIT)",
      "primary_contacts": [
        "ClawBio maintainers",
        "Biostochastics LLC (bench author)"
      ],
      "founded": "2026-04",
      "license_model": "MIT",
      "flags": [],
      "rubric": {
        "rigor": 5,
        "coverage": 2,
        "maintenance": 5,
        "adoption": 2,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "Independent third-party bench in a separate repo \u2014 structurally NOT self-referential. Coverage narrow (bio-analysis skills) but rigor is exemplary (safety \u00d7 correctness \u00d7 honesty tri-dimensional). Model for how skill/agent correctness should be audited.",
      "composite_score": 74.2
    },
    {
      "id": "euos",
      "name": "EU-OPENSCREEN / EUbOPEN",
      "kind": "consortium",
      "url": "https://www.eu-openscreen.eu/",
      "github": "N/A",
      "description": "EU chemical biology ERIC compound libraries + EUbOPEN chemogenomic probes.",
      "benchmarks_tracked": 5,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "eu-openscreen.eu + eubopen.org 2026-05: ECBD (1), Bioactivity sets (2), EUbOPEN probe set (1), EUOS solubility (1).",
      "breakdown": {
        "chem_libraries": 2,
        "bioactivity_benchmarks": 2,
        "probe_sets": 1
      },
      "host_organization": "EU-OPENSCREEN ERIC + IMI EUbOPEN",
      "primary_contacts": [
        "Philip Gribbon",
        "Susanne M\u00fcller-Knapp"
      ],
      "founded": "2018",
      "license_model": "CC-BY",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "notes": "EUOS solubility benchmark (on Polaris) is most ML-ready.",
      "composite_score": 73.9
    },
    {
      "id": "pku-aidd",
      "name": "PKU-AIDD / ChinaDrug Benchmarks",
      "kind": "consortium",
      "url": "https://aidd.pku.edu.cn/",
      "github": "https://github.com/pku-aidd",
      "description": "PKU AI Drug Discovery + SIMM CAS + Tsinghua + Baidu + Huawei benchmark releases.",
      "benchmarks_tracked": 7,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "PKU-AIDD + SIMM CAS + BDBench GitHub 2026-05: 7 public releases (PocketBench, ProteinInvBench, GeoMol-CN, HelixFold-Bench, UniMol-Bench, BDBench, PDBbind-China).",
      "breakdown": {
        "pocket": 1,
        "inverse_folding": 1,
        "molecular_geometry": 1,
        "foundation_models": 3,
        "pdbbind": 1
      },
      "host_organization": "PKU + SIMM CAS + Tsinghua + Baidu + Huawei",
      "primary_contacts": [
        "Jianfeng Pei",
        "Luhua Lai",
        "Jianzhu Ma"
      ],
      "founded": "2020",
      "license_model": "Apache-2.0 / MIT",
      "flags": [
        "self_referential"
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 4,
        "adoption": 3,
        "quality": 4,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "Growing Chinese benchmark ecosystem. Flagged self-referential: some releases evaluate the host's own models (e.g. HelixFold on HelixFold-Bench).",
      "composite_score": 73.9
    },
    {
      "id": "kaggle-bio",
      "name": "Kaggle \u2014 Pharma / Bio Competitions",
      "kind": "competition",
      "url": "https://www.kaggle.com/competitions",
      "github": "N/A",
      "description": "Industry-sponsored ML competitions (Merck Molecular Activity Challenge 2012, Open Problems \u00d73, Novozymes, BMS, CAFA 5).",
      "benchmarks_tracked": 23,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "Kaggle search for bio/chem/pharma competitions 2010-2025: 23 distinct drug-discovery-adjacent competitions identified.",
      "breakdown": {
        "molecule_activity": 5,
        "single_cell": 4,
        "protein_function": 4,
        "histopathology": 6,
        "other": 4
      },
      "host_organization": "Google / Kaggle + sponsoring companies",
      "primary_contacts": [
        "Competition hosts vary"
      ],
      "founded": "2010",
      "license_model": "Per-competition",
      "flags": [],
      "rubric": {
        "rigor": 4,
        "coverage": 3,
        "maintenance": 2,
        "adoption": 4,
        "quality": 4,
        "accessibility": 4,
        "industry_relevance": 4
      },
      "notes": "Impactful one-off events; leaderboards go stale post-close.",
      "composite_score": 71.9
    },
    {
      "id": "papers-with-code-drug",
      "name": "Papers With Code \u2014 Drug Discovery",
      "kind": "meta-platform",
      "url": "https://paperswithcode.com/area/medical",
      "github": "N/A",
      "description": "Aggregates published ML benchmarks with linked code; crowd-curated.",
      "benchmarks_tracked": 120,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "paperswithcode.com/area/medical + /task search 2026-05: ~120 drug-discovery-adjacent benchmarks (DTI, generation, ADMET, structure, drug response, etc.).",
      "breakdown": {
        "dti": 18,
        "molecule_generation": 15,
        "admet": 14,
        "protein_structure": 22,
        "drug_response": 11,
        "other": 40
      },
      "host_organization": "Meta AI / Papers With Code community",
      "primary_contacts": [
        "PwC community moderators"
      ],
      "founded": "2018",
      "license_model": "Per benchmark",
      "flags": [],
      "rubric": {
        "rigor": 3,
        "coverage": 5,
        "maintenance": 3,
        "adoption": 4,
        "quality": 2,
        "accessibility": 5,
        "industry_relevance": 3
      },
      "notes": "Useful for discovery; curation quality varies sharply.",
      "composite_score": 71.6
    },
    {
      "id": "pdbbind-casf",
      "name": "PDBbind / CASF",
      "kind": "meta-platform",
      "url": "http://www.pdbbind.org.cn/",
      "github": "N/A",
      "description": "Curated experimental binding affinities for PDB complexes + CASF scoring power tests.",
      "benchmarks_tracked": 6,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "pdbbind.org.cn: 2 splits (refined + general) \u00d7 3 CASF editions (2007, 2013, 2016) = 6 configurations.",
      "breakdown": {
        "pdbbind_refined_2020": 5316,
        "pdbbind_general_2020": 19443,
        "casf_editions": 3
      },
      "host_organization": "SIMM, Chinese Academy of Sciences",
      "primary_contacts": [
        "Renxiao Wang"
      ],
      "founded": "2004",
      "license_model": "Academic-only",
      "flags": [
        "data-leakage-known"
      ],
      "rubric": {
        "rigor": 4,
        "coverage": 4,
        "maintenance": 2,
        "adoption": 5,
        "quality": 3,
        "accessibility": 3,
        "industry_relevance": 3
      },
      "notes": "Known leakage; still dominant in published benchmarks. Academic-only licensing limits pharma use.",
      "composite_score": 70.4
    },
    {
      "id": "huggingface-biobench",
      "name": "HuggingFace \u2014 Bio/Chem Datasets",
      "kind": "data-platform",
      "url": "https://huggingface.co/datasets",
      "github": "N/A",
      "description": "HuggingFace Datasets hub filtered for bio/chem benchmarks (tdc, bigbio, InstaDeep).",
      "benchmarks_tracked": 310,
      "benchmark_count_asof": "2026-05-12",
      "count_methodology": "huggingface.co/datasets tag search (biology/chemistry/medical/drug-discovery) + curated orgs tdc/bigbio/InstaDeepAI 2026-05: ~310 entries, with duplication.",
      "breakdown": {
        "molecular": 90,
        "protein": 70,
        "clinical_text": 80,
        "genomic": 40,
        "other": 30
      },
      "host_organization": "HuggingFace + community uploaders",
      "primary_contacts": [
        "HF community"
      ],
      "founded": "2020",
      "license_model": "Per-dataset",
      "flags": [],
      "rubric": {
        "rigor": 2,
        "coverage": 5,
        "maintenance": 4,
        "adoption": 4,
        "quality": 2,
        "accessibility": 5,
        "industry_relevance": 2
      },
      "notes": "High discoverability, low quality floor.",
      "composite_score": 67.8
    }
  ]
}